5824: Merge branch 'master' into 5824-keep-web-workbench
authorTom Clegg <tom@curoverse.com>
Fri, 6 Nov 2015 21:53:01 +0000 (16:53 -0500)
committerTom Clegg <tom@curoverse.com>
Fri, 6 Nov 2015 21:53:01 +0000 (16:53 -0500)
25 files changed:
doc/install/install-compute-node.html.textile.liquid
doc/install/install-crunch-dispatch.html.textile.liquid
doc/install/install-keep-web.html.textile.liquid
doc/install/install-keepproxy.html.textile.liquid
sdk/cli/bin/crunch-job
sdk/go/arvadosclient/arvadosclient.go
sdk/go/arvadosclient/arvadosclient_test.go
sdk/go/arvadostest/fixtures.go
sdk/python/tests/run_test_server.py
services/api/test/fixtures/api_client_authorizations.yml
services/datamanager/datamanager_test.go
services/dockercleaner/arvados_docker/cleaner.py
services/dockercleaner/tests/test_cleaner.py
services/keep-web/anonymous.go
services/keep-web/doc.go
services/keep-web/main.go
services/keep-web/server.go
services/keepproxy/keepproxy_test.go
services/keepstore/keepstore.go
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
services/nodemanager/arvnodeman/daemon.py
services/nodemanager/tests/test_computenode_dispatch.py
services/nodemanager/tests/test_daemon.py
tools/keep-rsync/keep-rsync_test.go

index 250d1dcc40363bb4fc73c116ee6edb345bd3a280..aa4f37d639704f33dc10b2f9e71db1c36a6c129a 100644 (file)
@@ -78,6 +78,10 @@ h2. Configure the Docker cleaner
 
 The arvados-docker-cleaner program removes least recently used docker images as needed to keep disk usage below a configured limit.
 
+{% include 'notebox_begin' %}
+This also removes all containers as soon as they exit, as if they were run with `docker run --rm`. If you need to debug or inspect containers after they stop, temporarily stop arvados-docker-cleaner or run it with `--remove-stopped-containers never`.
+{% include 'notebox_end' %}
+
 On Debian-based systems, install runit:
 
 <notextile>
index 370a6e7c3d04fce578ccc1c231e82ec0e44960f6..d632f9bbd61974966499090bed0c71949633edac 100644 (file)
@@ -185,7 +185,7 @@ export RAILS_ENV=production
 export CRUNCH_JOB_DOCKER_BIN=<span class="userinput">docker.io</span>
 
 fuser -TERM -k $CRUNCH_DISPATCH_LOCKFILE || true
-cd /var/www/arvados-api/services/api
+cd /var/www/arvados-api/current
 exec $rvmexec bundle exec ./script/crunch-dispatch.rb 2>&1
 </code></pre>
 </notextile>
index 0a00ca85f1d87348103e27b949575682e70e6174..11a425d3476d5ae69ba74f0e81ab8dcd14d219fa 100644 (file)
@@ -36,12 +36,12 @@ Verify that @keep-web@ is functional:
 <notextile>
 <pre><code>~$ <span class="userinput">keep-web -h</span>
 Usage of keep-web:
-  -address string
-        Address to listen on: "host:port", or ":port" to listen on all interfaces. (default ":80")
-  -anonymous-token value
-        API token to try when none of the tokens provided in an HTTP request succeed in reading the desired collection. If this flag is used more than once, each token will be attempted in turn until one works. (default [])
+  -allow-anonymous
+        Serve public data to anonymous clients. Try the token supplied in the ARVADOS_API_TOKEN environment variable when none of the tokens provided in an HTTP request succeed in reading the desired collection. (default false)
   -attachment-only-host string
         Accept credentials, and add "Content-Disposition: attachment" response headers, for requests at this hostname:port. Prohibiting inline display makes it possible to serve untrusted and non-public content from a single origin, i.e., without wildcard DNS or SSL.
+  -listen string
+        Address to listen on: "host:port", or ":port" to listen on all interfaces. (default ":80")
   -trust-all-content
         Serve non-public content from a single origin. Dangerous: read docs before using!
 </code></pre>
@@ -58,11 +58,12 @@ We recommend running @keep-web@ under "runit":https://packages.debian.org/search
 
 <notextile>
 <pre><code>export ARVADOS_API_HOST=<span class="userinput">uuid_prefix</span>.your.domain
-exec sudo -u nobody keep-web -address=<span class="userinput">:9002</span> -anonymous-token=<span class="userinput">hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r</span> 2&gt;&amp;1
+export ARVADOS_API_TOKEN="<span class="userinput">hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r</span>"
+exec sudo -u nobody keep-web -listen=<span class="userinput">:9002</span> -allow-anonymous 2&gt;&amp;1
 </code></pre>
 </notextile>
 
-Omit the @-anonymous-token@ arguments if you do not want to serve public data.
+Omit the @-allow-anonymous@ argument if you do not want to serve public data.
 
 Set @ARVADOS_API_HOST_INSECURE=1@ if your API server's SSL certificate is not signed by a recognized CA.
 
index e6e2b103ae286317e2ba88910d5897adeee72068..5a5b66aaaef98c1ee2e42525df2c881655baf3ec 100644 (file)
@@ -56,7 +56,7 @@ The Keepproxy server needs a token to talk to the API server.
 On the <strong>API server</strong>, use the following command to create the token:
 
 <notextile>
-<pre><code>~/arvados/services/api/script$ <span class="userinput">RAILS_ENV=production bundle exec ./get_anonymous_user_token.rb</span>
+<pre><code>/var/www/arvados-api/current/script$ <span class="userinput">RAILS_ENV=production bundle exec ./get_anonymous_user_token.rb</span>
 hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r
 </code></pre></notextile>
 
index 555c4d19a613037e36395338a9de32e15ee53c6a..c2ea186ef5114f17a3198c1aee872103d5c0dc95 100755 (executable)
@@ -467,7 +467,7 @@ fi
     }
     srun(["srun", "--nodelist=" . $node[0]],
          ["/bin/sh", "-ec",
-          "a=`$docker_bin run --rm $try_user_arg $docker_hash id --user` && " .
+          "a=`$docker_bin run $try_user_arg $docker_hash id --user` && " .
           " test \$a -ne 0"],
          {fork => 1});
     if ($? == 0) {
@@ -896,9 +896,10 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
     $command .= "&& exec arv-mount --by-id --allow-other $ENV{TASK_KEEPMOUNT} --exec ";
     if ($docker_hash)
     {
-      my $cidfile = "$ENV{CRUNCH_TMP}/$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}.cid";
+      my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
+      my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
       $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
-      $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
+      $command .= "$docker_bin run --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
       # We only set memory limits if Docker lets us limit both memory and swap.
       # Memory limits alone have been supported longer, but subprocesses tend
       # to get SIGKILL if they exceed that without any swap limit set.
index 1cce0a7fc92d24e21fa694add86c75c63952eb46..18e1074bf6f6c801c21593bc57586a902807f8b4 100644 (file)
@@ -14,6 +14,7 @@ import (
        "os"
        "regexp"
        "strings"
+       "time"
 )
 
 type StringMatcher func(string) bool
@@ -25,6 +26,12 @@ var MissingArvadosApiHost = errors.New("Missing required environment variable AR
 var MissingArvadosApiToken = errors.New("Missing required environment variable ARVADOS_API_TOKEN")
 var ErrInvalidArgument = errors.New("Invalid argument")
 
+// A common failure mode is to reuse a keepalive connection that has been
+// terminated (in a way that we can't detect) for being idle too long.
+// POST and DELETE are not safe to retry automatically, so we minimize
+// such failures by always using a new or recently active socket.
+var MaxIdleConnectionDuration = 30 * time.Second
+
 // Indicates an error that was returned by the API server.
 type APIServerError struct {
        // Address of server returning error, of the form "host:port".
@@ -76,6 +83,8 @@ type ArvadosClient struct {
 
        // Discovery document
        DiscoveryDoc Dict
+
+       lastClosedIdlesAt time.Time
 }
 
 // Create a new ArvadosClient, initialized with standard Arvados environment
@@ -101,6 +110,8 @@ func MakeArvadosClient() (ac ArvadosClient, err error) {
                return ac, MissingArvadosApiToken
        }
 
+       ac.lastClosedIdlesAt = time.Now()
+
        return ac, err
 }
 
@@ -158,6 +169,15 @@ func (c ArvadosClient) CallRaw(method string, resourceType string, uuid string,
                req.Header.Add("X-External-Client", "1")
        }
 
+       // POST and DELETE are not safe to retry automatically, so we minimize
+       // such failures by always using a new or recently active socket
+       if method == "POST" || method == "DELETE" {
+               if time.Since(c.lastClosedIdlesAt) > MaxIdleConnectionDuration {
+                       c.lastClosedIdlesAt = time.Now()
+                       c.Client.Transport.(*http.Transport).CloseIdleConnections()
+               }
+       }
+
        // Make the request
        var resp *http.Response
        if resp, err = c.Client.Do(req); err != nil {
index 2c508dcb4a1100cf4609fc3ff3387ac089c6ab26..75af3ca51960a1b47a8ce9406ad3200823aac5db 100644 (file)
@@ -6,6 +6,7 @@ import (
        "net/http"
        "os"
        "testing"
+       "time"
 )
 
 // Gocheck boilerplate
@@ -102,39 +103,50 @@ func (s *ServerRequiredSuite) TestInvalidResourceType(c *C) {
 func (s *ServerRequiredSuite) TestCreatePipelineTemplate(c *C) {
        arv, err := MakeArvadosClient()
 
-       getback := make(Dict)
-       err = arv.Create("pipeline_templates",
-               Dict{"pipeline_template": Dict{
-                       "name": "tmp",
-                       "components": Dict{
-                               "c1": map[string]string{"script": "script1"},
-                               "c2": map[string]string{"script": "script2"}}}},
-               &getback)
-       c.Assert(err, Equals, nil)
-       c.Assert(getback["name"], Equals, "tmp")
-       c.Assert(getback["components"].(map[string]interface{})["c2"].(map[string]interface{})["script"], Equals, "script2")
-
-       uuid := getback["uuid"].(string)
-
-       getback = make(Dict)
-       err = arv.Get("pipeline_templates", uuid, nil, &getback)
-       c.Assert(err, Equals, nil)
-       c.Assert(getback["name"], Equals, "tmp")
-       c.Assert(getback["components"].(map[string]interface{})["c1"].(map[string]interface{})["script"], Equals, "script1")
-
-       getback = make(Dict)
-       err = arv.Update("pipeline_templates", uuid,
-               Dict{
-                       "pipeline_template": Dict{"name": "tmp2"}},
-               &getback)
-       c.Assert(err, Equals, nil)
-       c.Assert(getback["name"], Equals, "tmp2")
-
-       c.Assert(getback["uuid"].(string), Equals, uuid)
-       getback = make(Dict)
-       err = arv.Delete("pipeline_templates", uuid, nil, &getback)
-       c.Assert(err, Equals, nil)
-       c.Assert(getback["name"], Equals, "tmp2")
+       for _, idleConnections := range []bool{
+               false,
+               true,
+       } {
+               if idleConnections {
+                       arv.lastClosedIdlesAt = time.Now().Add(-time.Minute)
+               } else {
+                       arv.lastClosedIdlesAt = time.Now()
+               }
+
+               getback := make(Dict)
+               err = arv.Create("pipeline_templates",
+                       Dict{"pipeline_template": Dict{
+                               "name": "tmp",
+                               "components": Dict{
+                                       "c1": map[string]string{"script": "script1"},
+                                       "c2": map[string]string{"script": "script2"}}}},
+                       &getback)
+               c.Assert(err, Equals, nil)
+               c.Assert(getback["name"], Equals, "tmp")
+               c.Assert(getback["components"].(map[string]interface{})["c2"].(map[string]interface{})["script"], Equals, "script2")
+
+               uuid := getback["uuid"].(string)
+
+               getback = make(Dict)
+               err = arv.Get("pipeline_templates", uuid, nil, &getback)
+               c.Assert(err, Equals, nil)
+               c.Assert(getback["name"], Equals, "tmp")
+               c.Assert(getback["components"].(map[string]interface{})["c1"].(map[string]interface{})["script"], Equals, "script1")
+
+               getback = make(Dict)
+               err = arv.Update("pipeline_templates", uuid,
+                       Dict{
+                               "pipeline_template": Dict{"name": "tmp2"}},
+                       &getback)
+               c.Assert(err, Equals, nil)
+               c.Assert(getback["name"], Equals, "tmp2")
+
+               c.Assert(getback["uuid"].(string), Equals, uuid)
+               getback = make(Dict)
+               err = arv.Delete("pipeline_templates", uuid, nil, &getback)
+               c.Assert(err, Equals, nil)
+               c.Assert(getback["name"], Equals, "tmp2")
+       }
 }
 
 func (s *ServerRequiredSuite) TestErrorResponse(c *C) {
index d0270a6a71f79643bd4fdfbdd621b514449353f2..3256ec27a2572c0d9889ab1067dc43845075c540 100644 (file)
@@ -4,7 +4,9 @@ package arvadostest
 const (
        SpectatorToken        = "zw2f4gwx8hw8cjre7yp6v1zylhrhn3m5gvjq73rtpwhmknrybu"
        ActiveToken           = "3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi"
+       AdminToken            = "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h"
        AnonymousToken        = "4kg6k6lzmp9kj4cpkcoxie964cmvjahbt4fod9zru44k4jqdmi"
+       DataManagerToken      = "320mkve8qkswstz7ff61glpk3mhgghmg67wmic7elw4z41pke1"
        FooCollection         = "zzzzz-4zz18-fy296fx3hot09f7"
        NonexistentCollection = "zzzzz-4zz18-totallynotexist"
        HelloWorldCollection  = "zzzzz-4zz18-4en62shvi99lxd4"
index cdfd93a0a5cce4bb933facf49eec3e915e7035e8..b58c6117dd56265b411340c9a700d3a660672817 100644 (file)
@@ -354,7 +354,7 @@ def run_keep(blob_signing_key=None, enforce_permissions=False, num_servers=2):
         keep_args['-enforce-permissions'] = 'true'
     with open(os.path.join(TEST_TMPDIR, "keep.data-manager-token-file"), "w") as f:
         keep_args['-data-manager-token-file'] = f.name
-        f.write(os.environ['ARVADOS_API_TOKEN'])
+        f.write(auth_token('data_manager'))
     keep_args['-never-delete'] = 'false'
 
     api = arvados.api(
index cb96295064a244073bbe1865f436dd951db890bb..85c2c791a379552caabc10f6665e1dfca10a2190 100644 (file)
@@ -18,6 +18,18 @@ admin_trustedclient:
   api_token: 1a9ffdcga2o7cw8q12dndskomgs1ygli3ns9k2o9hgzgmktc78
   expires_at: 2038-01-01 00:00:00
 
+data_manager:
+  api_client: untrusted
+  user: system_user
+  api_token: 320mkve8qkswstz7ff61glpk3mhgghmg67wmic7elw4z41pke1
+  expires_at: 2038-01-01 00:00:00
+  scopes:
+    - GET /arvados/v1/collections
+    - GET /arvados/v1/keep_services
+    - GET /arvados/v1/keep_services/accessible
+    - GET /arvados/v1/users/current
+    - POST /arvados/v1/logs
+
 miniadmin:
   api_client: untrusted
   user: miniadmin
index c2cb762d52b625b625634f24d385ddbf9ad4e7d8..685f94c88f3a35c33f6aa986e85701a5853e5d32 100644 (file)
@@ -16,11 +16,6 @@ import (
        "time"
 )
 
-const (
-       ActiveUserToken = "3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi"
-       AdminToken      = "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h"
-)
-
 var arv arvadosclient.ArvadosClient
 var keepClient *keepclient.KeepClient
 var keepServers []string
@@ -34,6 +29,7 @@ func SetupDataManagerTest(t *testing.T) {
        arvadostest.StartKeep(2, false)
 
        arv = makeArvadosClient()
+       arv.ApiToken = arvadostest.DataManagerToken
 
        // keep client
        keepClient = &keepclient.KeepClient{
@@ -124,7 +120,18 @@ func getFirstLocatorFromCollection(t *testing.T, uuid string) string {
        return match[1] + "+" + match[2]
 }
 
+func switchToken(t string) func() {
+       orig := arv.ApiToken
+       restore := func() {
+               arv.ApiToken = orig
+       }
+       arv.ApiToken = t
+       return restore
+}
+
 func getCollection(t *testing.T, uuid string) Dict {
+       defer switchToken(arvadostest.AdminToken)()
+
        getback := make(Dict)
        err := arv.Get("collections", uuid, nil, &getback)
        if err != nil {
@@ -138,6 +145,8 @@ func getCollection(t *testing.T, uuid string) Dict {
 }
 
 func updateCollection(t *testing.T, uuid string, paramName string, paramValue string) {
+       defer switchToken(arvadostest.AdminToken)()
+
        err := arv.Update("collections", uuid, arvadosclient.Dict{
                "collection": arvadosclient.Dict{
                        paramName: paramValue,
@@ -152,6 +161,8 @@ func updateCollection(t *testing.T, uuid string, paramName string, paramValue st
 type Dict map[string]interface{}
 
 func deleteCollection(t *testing.T, uuid string) {
+       defer switchToken(arvadostest.AdminToken)()
+
        getback := make(Dict)
        err := arv.Delete("collections", uuid, nil, &getback)
        if err != nil {
@@ -175,7 +186,7 @@ func getBlockIndexesForServer(t *testing.T, i int) []string {
        path := keepServers[i] + "/index"
        client := http.Client{}
        req, err := http.NewRequest("GET", path, nil)
-       req.Header.Add("Authorization", "OAuth2 "+AdminToken)
+       req.Header.Add("Authorization", "OAuth2 "+arvadostest.DataManagerToken)
        req.Header.Add("Content-Type", "application/octet-stream")
        resp, err := client.Do(req)
        defer resp.Body.Close()
@@ -297,7 +308,7 @@ func backdateBlocks(t *testing.T, oldUnusedBlockLocators []string) {
 func getStatus(t *testing.T, path string) interface{} {
        client := http.Client{}
        req, err := http.NewRequest("GET", path, nil)
-       req.Header.Add("Authorization", "OAuth2 "+AdminToken)
+       req.Header.Add("Authorization", "OAuth2 "+arvadostest.DataManagerToken)
        req.Header.Add("Content-Type", "application/octet-stream")
        resp, err := client.Do(req)
        if err != nil {
@@ -504,7 +515,7 @@ func TestRunDatamanagerAsNonAdminUser(t *testing.T) {
        defer TearDownDataManagerTest(t)
        SetupDataManagerTest(t)
 
-       arv.ApiToken = ActiveUserToken
+       arv.ApiToken = arvadostest.ActiveToken
 
        err := singlerun(arv)
        if err == nil {
index 191cb55601053d1f58a284ce08847149d04d2522..2435e6de806043b4e014071e8f9e05f06b159c4e 100755 (executable)
@@ -177,9 +177,10 @@ class DockerImageUseRecorder(DockerEventListener):
 class DockerImageCleaner(DockerImageUseRecorder):
     event_handlers = DockerImageUseRecorder.event_handlers.copy()
 
-    def __init__(self, images, docker_client, events):
+    def __init__(self, images, docker_client, events, remove_containers_onexit=False):
         super().__init__(images, docker_client, events)
         self.logged_unknown = set()
+        self.remove_containers_onexit = remove_containers_onexit
 
     def new_container(self, event, container_hash):
         container_image_id = container_hash['Image']
@@ -188,6 +189,29 @@ class DockerImageCleaner(DockerImageUseRecorder):
             self.images.add_image(image_hash)
         return super().new_container(event, container_hash)
 
+    def _remove_container(self, cid):
+        try:
+            self.docker_client.remove_container(cid)
+        except docker.errors.APIError as error:
+            logger.warning("Failed to remove container %s: %s", cid, error)
+        else:
+            logger.info("Removed container %s", cid)
+
+    @event_handlers.on('die')
+    def clean_container(self, event=None):
+        if self.remove_containers_onexit:
+            self._remove_container(event['id'])
+
+    def check_stopped_containers(self, remove=False):
+        logger.info("Checking for stopped containers")
+        for c in self.docker_client.containers(filters={'status': 'exited'}):
+            logger.info("Container %s %s", c['Id'], c['Status'])
+            if c['Status'][:6] != 'Exited':
+                logger.error("Unexpected status %s for container %s",
+                             c['Status'], c['Id'])
+            elif remove:
+                self._remove_container(c['Id'])
+
     @event_handlers.on('destroy')
     def clean_images(self, event=None):
         for image_id in self.images.should_delete():
@@ -225,6 +249,12 @@ def parse_arguments(arguments):
     parser.add_argument(
         '--quota', action='store', type=human_size, required=True,
         help="space allowance for Docker images, suffixed with K/M/G/T")
+    parser.add_argument(
+        '--remove-stopped-containers', type=str, default='always',
+        choices=['never', 'onexit', 'always'],
+        help="""when to remove stopped containers (default: always, i.e., remove
+        stopped containers found at startup, and remove containers as
+        soon as they exit)""")
     parser.add_argument(
         '--verbose', '-v', action='count', default=0,
         help="log more information")
@@ -246,9 +276,13 @@ def run(args, docker_client):
         images, docker_client, docker_client.events(since=1, until=start_time))
     use_recorder.run()
     cleaner = DockerImageCleaner(
-        images, docker_client, docker_client.events(since=start_time))
-    logger.info("Starting cleanup loop")
+        images, docker_client, docker_client.events(since=start_time),
+        remove_containers_onexit=args.remove_stopped_containers != 'never')
+    cleaner.check_stopped_containers(
+        remove=args.remove_stopped_containers == 'always')
+    logger.info("Checking image quota at startup")
     cleaner.clean_images()
+    logger.info("Listening for docker events")
     cleaner.run()
 
 def main(arguments):
index fd959de7762dd2b0a54f5b08b1107e4aedc45d5a..43abe4f636dcb1076ee65e90f7a549f6616af834 100644 (file)
@@ -223,13 +223,14 @@ class DockerImagesTestCase(unittest.TestCase):
 
 class DockerImageUseRecorderTestCase(unittest.TestCase):
     TEST_CLASS = cleaner.DockerImageUseRecorder
+    TEST_CLASS_INIT_KWARGS = {}
 
     def setUp(self):
         self.images = mock.MagicMock(name='images')
         self.docker_client = mock.MagicMock(name='docker_client')
         self.events = []
         self.recorder = self.TEST_CLASS(self.images, self.docker_client,
-                                        self.encoded_events)
+                                        self.encoded_events, **self.TEST_CLASS_INIT_KWARGS)
 
     @property
     def encoded_events(self):
@@ -310,6 +311,23 @@ class DockerImageCleanerTestCase(DockerImageUseRecorderTestCase):
         self.assertFalse(self.images.del_image.called)
 
 
+class DockerContainerCleanerTestCase(DockerImageUseRecorderTestCase):
+    TEST_CLASS = cleaner.DockerImageCleaner
+    TEST_CLASS_INIT_KWARGS = {'remove_containers_onexit': True}
+
+    @mock.patch('arvados_docker.cleaner.logger')
+    def test_failed_container_deletion_handling(self, mockLogger):
+        cid = MockDockerId()
+        self.docker_client.remove_container.side_effect = MockException(500)
+        self.events.append(MockEvent('die', docker_id=cid))
+        self.recorder.run()
+        self.docker_client.remove_container.assert_called_with(cid)
+        self.assertEqual("Failed to remove container %s: %s",
+                         mockLogger.warning.call_args[0][0])
+        self.assertEqual(cid,
+                         mockLogger.warning.call_args[0][1])
+
+
 class HumanSizeTestCase(unittest.TestCase):
     def check(self, human_str, count, exp):
         self.assertEqual(count * (1024 ** exp),
@@ -354,3 +372,57 @@ class RunTestCase(unittest.TestCase):
         self.assertLessEqual(test_start_time, event_kwargs[0]['until'])
         self.assertIn('since', event_kwargs[1])
         self.assertEqual(event_kwargs[0]['until'], event_kwargs[1]['since'])
+
+
+class ContainerRemovalTestCase(unittest.TestCase):
+    LIFECYCLE = ['create', 'attach', 'start', 'resize', 'die', 'destroy']
+
+    def setUp(self):
+        self.args = mock.MagicMock(name='args')
+        self.docker_client = mock.MagicMock(name='docker_client')
+        self.existingCID = MockDockerId()
+        self.docker_client.containers.return_value = [{
+            'Id': self.existingCID,
+            'Status': 'Exited (0) 6 weeks ago',
+        }, {
+            # If docker_client.containers() returns non-exited
+            # containers for some reason, do not remove them.
+            'Id': MockDockerId(),
+            'Status': 'Running',
+        }]
+        self.newCID = MockDockerId()
+        self.docker_client.events.return_value = [
+            MockEvent(e, docker_id=self.newCID).encoded()
+            for e in self.LIFECYCLE]
+
+    def test_remove_onexit(self):
+        self.args.remove_stopped_containers = 'onexit'
+        cleaner.run(self.args, self.docker_client)
+        self.docker_client.remove_container.assert_called_once_with(self.newCID)
+
+    def test_remove_always(self):
+        self.args.remove_stopped_containers = 'always'
+        cleaner.run(self.args, self.docker_client)
+        self.docker_client.remove_container.assert_any_call(self.existingCID)
+        self.docker_client.remove_container.assert_any_call(self.newCID)
+        self.assertEqual(2, self.docker_client.remove_container.call_count)
+
+    def test_remove_never(self):
+        self.args.remove_stopped_containers = 'never'
+        cleaner.run(self.args, self.docker_client)
+        self.assertEqual(0, self.docker_client.remove_container.call_count)
+
+    def test_container_exited_between_subscribe_events_and_check_existing(self):
+        self.args.remove_stopped_containers = 'always'
+        self.docker_client.events.return_value = [
+            MockEvent(e, docker_id=self.existingCID).encoded()
+            for e in ['die', 'destroy']]
+        cleaner.run(self.args, self.docker_client)
+        # Subscribed to events before getting the list of existing
+        # exited containers?
+        self.docker_client.assert_has_calls([
+            mock.call.events(since=mock.ANY),
+            mock.call.containers(filters={'status':'exited'})])
+        # Asked to delete the container twice?
+        self.docker_client.remove_container.assert_has_calls([mock.call(self.existingCID)] * 2)
+        self.assertEqual(2, self.docker_client.remove_container.call_count)
index bfc716f2b2b865d9a12577bcca41812e7252f4e9..15a98c2f361ef62f2ddfd313da5d5402ea718c56 100644 (file)
@@ -3,22 +3,33 @@ package main
 import (
        "flag"
        "fmt"
+       "os"
+       "strconv"
 )
 
 var anonymousTokens tokenSet
 
 type tokenSet []string
 
-func (ts *tokenSet) Set(t string) error {
-       *ts = append(*ts, t)
-       return nil
+func (ts *tokenSet) Set(s string) error {
+       v, err := strconv.ParseBool(s)
+       if v && len(*ts) == 0 {
+               *ts = append(*ts, os.Getenv("ARVADOS_API_TOKEN"))
+       } else if !v {
+               *ts = (*ts)[:0]
+       }
+       return err
 }
 
 func (ts *tokenSet) String() string {
-       return fmt.Sprintf("%+v", (*ts)[:])
+       return fmt.Sprintf("%v", len(*ts) > 0)
+}
+
+func (ts *tokenSet) IsBoolFlag() bool {
+       return true
 }
 
 func init() {
-       flag.Var(&anonymousTokens, "anonymous-token",
-               "API token to try when none of the tokens provided in an HTTP request succeed in reading the desired collection. Multiple anonymous tokens can be provided by using this flag more than once; each token will be attempted in turn until one works.")
+       flag.Var(&anonymousTokens, "allow-anonymous",
+               "Serve public data to anonymous clients. Try the token supplied in the ARVADOS_API_TOKEN environment variable when none of the tokens provided in an HTTP request succeed in reading the desired collection.")
 }
index 5aa09e20a455b979954855cd4367224b36900c69..4207d7bfc7344cb72dfedbb176000bced1966077 100644 (file)
 //
 // Serve HTTP requests at port 1234 on all interfaces:
 //
-//   keep-web -address=:1234
+//   keep-web -listen=:1234
 //
 // Serve HTTP requests at port 1234 on the interface with IP address 1.2.3.4:
 //
-//   keep-web -address=1.2.3.4:1234
+//   keep-web -listen=1.2.3.4:1234
 //
 // Proxy configuration
 //
 //
 // Anonymous downloads
 //
-// Use the -anonymous-token option to specify a token to use when clients
-// try to retrieve files without providing their own Arvados API token.
+// Use the -allow-anonymous flag with an ARVADOS_API_TOKEN environment
+// variable to specify a token to use when clients try to retrieve
+// files without providing their own Arvados API token.
 //
-//   keep-web [...] -anonymous-token=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+//   export ARVADOS_API_TOKEN=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+//   keep-web [...] -allow-anonymous
 //
 // See http://doc.arvados.org/install/install-keep-web.html for examples.
 //
 // only when the designated origin matches exactly the Host header
 // provided by the client or downstream proxy.
 //
-//   keep-web -address :9999 -attachment-only-host domain.example:9999
+//   keep-web -listen :9999 -attachment-only-host domain.example:9999
 //
 // Trust All Content mode
 //
 //
 // In such cases you can enable trust-all-content mode.
 //
-//   keep-web -address :9999 -trust-all-content
+//   keep-web -listen :9999 -trust-all-content
 //
 // When using trust-all-content mode, the only effect of the
 // -attachment-only-host option is to add a "Content-Disposition:
 // attachment" header.
 //
-//   keep-web -address :9999 -attachment-only-host domain.example:9999 -trust-all-content
+//   keep-web -listen :9999 -attachment-only-host domain.example:9999 -trust-all-content
 //
 package main
index 751543e8fa81a0af389a5dcfc2c744a3c66ea503..135f01b394720efba18dc8082744637bfaf3a7c1 100644 (file)
@@ -12,7 +12,9 @@ func init() {
        // different token before doing anything with the client). We
        // set this dummy value during init so it doesn't clobber the
        // one used by "run test servers".
-       os.Setenv("ARVADOS_API_TOKEN", "xxx")
+       if os.Getenv("ARVADOS_API_TOKEN") == "" {
+               os.Setenv("ARVADOS_API_TOKEN", "xxx")
+       }
 }
 
 func main() {
index 2359f23c761504cca5d8116db420b35607e4a311..100900830f5d6808563a928fdd3e501ca660d501 100644 (file)
@@ -10,7 +10,7 @@ import (
 var address string
 
 func init() {
-       flag.StringVar(&address, "address", ":80",
+       flag.StringVar(&address, "listen", ":80",
                "Address to listen on: \"host:port\", or \":port\" to listen on all interfaces.")
 }
 
index e4f09b4e74a071b62797da22c3649ddd1f423938..9c33948330262fe9912cd3ce6257ddbc5172060a 100644 (file)
@@ -367,6 +367,8 @@ func (s *ServerRequiredSuite) TestGetIndex(c *C) {
        _, rep, err = kc.PutB([]byte("some-more-index-data"))
        c.Check(err, Equals, nil)
 
+       kc.Arvados.ApiToken = arvadostest.DataManagerToken
+
        // Invoke GetIndex
        for _, spec := range []struct {
                prefix         string
index 2528f6d6a6c4dbf2f4b509e670c834aa10b9e618..7525441aaec995d4a524a7ff20f954ee06ae50c9 100644 (file)
@@ -33,10 +33,6 @@ const BlockSize = 64 * 1024 * 1024
 // in order to permit writes.
 const MinFreeKilobytes = BlockSize / 1024
 
-// Until #6221 is resolved, never_delete must be true.
-// However, allow it to be false in testing with TestDataManagerToken
-const TestDataManagerToken = "4axaw8zxe0qm22wa6urpp5nskcne8z88cvbupv653y1njyi05h"
-
 // ProcMounts /proc/mounts
 var ProcMounts = "/proc/mounts"
 
@@ -159,8 +155,9 @@ func main() {
                &neverDelete,
                "never-delete",
                true,
-               "If set, nothing will be deleted. HTTP 405 will be returned "+
-                       "for valid DELETE requests.")
+               "If true, nothing will be deleted. "+
+                       "Warning: the relevant features in keepstore and data manager have not been extensively tested. "+
+                       "You should leave this option alone unless you can afford to lose data.")
        flag.StringVar(
                &blobSigningKeyFile,
                "permission-key-file",
@@ -257,8 +254,9 @@ func main() {
                }
        }
 
-       if neverDelete != true && dataManagerToken != TestDataManagerToken {
-               log.Fatal("never_delete must be true, see #6221")
+       if neverDelete != true {
+               log.Print("never-delete is not set. Warning: the relevant features in keepstore and data manager have not " +
+                       "been extensively tested. You should leave this option alone unless you can afford to lose data.")
        }
 
        if blobSigningKeyFile != "" {
index 1c828c13c3b5033aa92353c2b403c179e00120c6..3c708778d0c274b17c0d9c2c42b8c0b768da331b 100644 (file)
@@ -154,6 +154,10 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
 
     This actor simply destroys a cloud node, retrying as needed.
     """
+    # Reasons for a shutdown to be cancelled.
+    WINDOW_CLOSED = "shutdown window closed"
+    NODE_BROKEN = "cloud failed to shut down broken node"
+
     def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor,
                  cancellable=True, retry_wait=1, max_retry_wait=180):
         # If a ShutdownActor is cancellable, it will ask the
@@ -167,6 +171,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self._monitor = node_monitor.proxy()
         self.cloud_node = self._monitor.cloud_node.get()
         self.cancellable = cancellable
+        self.cancel_reason = None
         self.success = None
 
     def on_start(self):
@@ -180,7 +185,10 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
             self.success = success_flag
         return super(ComputeNodeShutdownActor, self)._finished()
 
-    def cancel_shutdown(self):
+    def cancel_shutdown(self, reason):
+        self.cancel_reason = reason
+        self._logger.info("Cloud node %s shutdown cancelled: %s.",
+                          self.cloud_node.id, reason)
         self._finished(success_flag=False)
 
     def _stop_if_window_closed(orig_func):
@@ -188,10 +196,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         def stop_wrapper(self, *args, **kwargs):
             if (self.cancellable and
                   (not self._monitor.shutdown_eligible().get())):
-                self._logger.info(
-                    "Cloud node %s shutdown cancelled - no longer eligible.",
-                    self.cloud_node.id)
-                self._later.cancel_shutdown()
+                self._later.cancel_shutdown(self.WINDOW_CLOSED)
                 return None
             else:
                 return orig_func(self, *args, **kwargs)
@@ -201,8 +206,11 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
     @ComputeNodeStateChangeBase._retry()
     def shutdown_node(self):
         if not self._cloud.destroy_node(self.cloud_node):
-            # Force a retry.
-            raise cloud_types.LibcloudError("destroy_node failed")
+            if self._cloud.broken(self.cloud_node):
+                self._later.cancel_shutdown(self.NODE_BROKEN)
+            else:
+                # Force a retry.
+                raise cloud_types.LibcloudError("destroy_node failed")
         self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
         arv_node = self._arvados_node()
         if arv_node is None:
index ec5014e9f9cf1e8848353cf3c755e22875227850..919b57f42c8973bab91de742d1fee48598296f35 100644 (file)
@@ -43,7 +43,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     # error are still being investigated.
 
     @ShutdownActorBase._retry((subprocess.CalledProcessError, OSError))
-    def cancel_shutdown(self):
+    def cancel_shutdown(self, reason):
         if self._nodename:
             if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
                 # Resume from "drng" or "drain"
@@ -52,7 +52,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
                 # Node is in a state such as 'idle' or 'alloc' so don't
                 # try to resume it because that will just raise an error.
                 pass
-        return super(ComputeNodeShutdownActor, self).cancel_shutdown()
+        return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
 
     @ShutdownActorBase._retry((subprocess.CalledProcessError, OSError))
     @ShutdownActorBase._stop_if_window_closed
index 1d52073ce5ff7a362e0b8e9cc13b12c0f2d3b6a2..a65e9a0705d1cd5941140a20dbd4c6d4e1f5fd57 100644 (file)
@@ -25,6 +25,7 @@ class _BaseNodeTracker(object):
     def __init__(self):
         self.nodes = {}
         self.orphans = {}
+        self._blacklist = set()
 
     # Proxy the methods listed below to self.nodes.
     def _proxy_method(name):
@@ -43,6 +44,9 @@ class _BaseNodeTracker(object):
     def add(self, record):
         self.nodes[self.record_key(record)] = record
 
+    def blacklist(self, key):
+        self._blacklist.add(key)
+
     def update_record(self, key, item):
         setattr(self.nodes[key], self.RECORD_ATTR, item)
 
@@ -50,7 +54,9 @@ class _BaseNodeTracker(object):
         unseen = set(self.nodes.iterkeys())
         for item in response:
             key = self.item_key(item)
-            if key in unseen:
+            if key in self._blacklist:
+                continue
+            elif key in unseen:
                 unseen.remove(key)
                 self.update_record(key, item)
             else:
@@ -346,11 +352,13 @@ class NodeManagerDaemonActor(actor_class):
             self._begin_node_shutdown(record.actor, cancellable=False)
 
     def node_finished_shutdown(self, shutdown_actor):
-        success, cloud_node = self._get_actor_attrs(shutdown_actor, 'success',
-                                                    'cloud_node')
+        cloud_node, success, cancel_reason = self._get_actor_attrs(
+            shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
         shutdown_actor.stop()
         cloud_node_id = cloud_node.id
         if not success:
+            if cancel_reason == self._node_shutdown.NODE_BROKEN:
+                self.cloud_nodes.blacklist(cloud_node_id)
             del self.shutdowns[cloud_node_id]
         elif cloud_node_id in self.booted:
             self.booted.pop(cloud_node_id).actor.stop()
index e718fc134b7a20723f7f33ef5aeefd5095763eb6..0bdb2cba471f4beac0c4b8a90f17c12a93c7bdf8 100644 (file)
@@ -116,11 +116,12 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
 
 class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
     def make_mocks(self, cloud_node=None, arvados_node=None,
-                   shutdown_open=True):
+                   shutdown_open=True, node_broken=False):
         self.timer = testutil.MockTimer()
         self.shutdowns = testutil.MockShutdownTimer()
         self.shutdowns._set_state(shutdown_open, 300)
         self.cloud_client = mock.MagicMock(name='cloud_client')
+        self.cloud_client.broken.return_value = node_broken
         self.arvados_client = mock.MagicMock(name='arvados_client')
         self.updates = mock.MagicMock(name='update_mock')
         if cloud_node is None:
@@ -201,6 +202,8 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.make_actor()
         self.check_success_flag(False, 2)
         self.assertFalse(self.cloud_client.destroy_node.called)
+        self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
+                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
 
     def test_shutdown_retries_when_cloud_fails(self):
         self.make_mocks()
@@ -210,6 +213,15 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.cloud_client.destroy_node.return_value = True
         self.check_success_flag(True)
 
+    def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self):
+        self.make_mocks(node_broken=True)
+        self.cloud_client.destroy_node.return_value = False
+        self.make_actor(start_time=0)
+        self.check_success_flag(False, 2)
+        self.assertEqual(1, self.cloud_client.destroy_node.call_count)
+        self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN,
+                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
+
     def test_late_subscribe(self):
         self.make_actor()
         subscriber = mock.Mock(name='subscriber_mock')
index 16f560457765e3878e8e4c2a1bae6d46f8615a43..bbfbe4b7452504ad9935729b1654821bddbd3903 100644 (file)
@@ -449,6 +449,26 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
         self.assertTrue(shutdown_proxy.called)
 
+    def test_broken_node_blackholed_after_cancelled_shutdown(self):
+        cloud_node = testutil.cloud_node_mock(8)
+        wishlist = [testutil.MockSize(8)]
+        self.make_daemon([cloud_node], [testutil.arvados_node_mock(8)],
+                         wishlist)
+        self.assertEqual(1, self.alive_monitor_count())
+        self.assertFalse(self.node_setup.start.called)
+        monitor = self.monitor_list()[0].proxy()
+        shutdown_proxy = self.node_shutdown.start().proxy
+        shutdown_proxy().cloud_node.get.return_value = cloud_node
+        shutdown_proxy().success.get.return_value = False
+        shutdown_proxy().cancel_reason.get.return_value = self.node_shutdown.NODE_BROKEN
+        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
+        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+        self.daemon.node_finished_shutdown(shutdown_proxy()).get(self.TIMEOUT)
+        self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
+        self.daemon.update_server_wishlist(wishlist).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.node_setup.start.call_count)
+
     def test_nodes_shutting_down_replaced_below_max_nodes(self):
         cloud_node = testutil.cloud_node_mock(6)
         self.make_daemon([cloud_node], [testutil.arvados_node_mock(6)])
index e72889038850631bb8205ee1602326d307d729fc..9432a0d383b534236ff506c83ac98ed0c196c324 100644 (file)
@@ -81,13 +81,13 @@ func setupRsync(c *C, enforcePermissions bool, replications int) {
        // srcConfig
        var srcConfig apiConfig
        srcConfig.APIHost = os.Getenv("ARVADOS_API_HOST")
-       srcConfig.APIToken = os.Getenv("ARVADOS_API_TOKEN")
+       srcConfig.APIToken = arvadostest.DataManagerToken
        srcConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
 
        // dstConfig
        var dstConfig apiConfig
        dstConfig.APIHost = os.Getenv("ARVADOS_API_HOST")
-       dstConfig.APIToken = os.Getenv("ARVADOS_API_TOKEN")
+       dstConfig.APIToken = arvadostest.DataManagerToken
        dstConfig.APIHostInsecure = matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE"))
 
        if enforcePermissions {
@@ -389,7 +389,7 @@ func (s *ServerNotRequiredSuite) TestLoadConfig(c *C) {
        c.Check(err, IsNil)
 
        c.Assert(srcConfig.APIHost, Equals, os.Getenv("ARVADOS_API_HOST"))
-       c.Assert(srcConfig.APIToken, Equals, os.Getenv("ARVADOS_API_TOKEN"))
+       c.Assert(srcConfig.APIToken, Equals, arvadostest.DataManagerToken)
        c.Assert(srcConfig.APIHostInsecure, Equals, matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE")))
        c.Assert(srcConfig.ExternalClient, Equals, false)
 
@@ -397,7 +397,7 @@ func (s *ServerNotRequiredSuite) TestLoadConfig(c *C) {
        c.Check(err, IsNil)
 
        c.Assert(dstConfig.APIHost, Equals, os.Getenv("ARVADOS_API_HOST"))
-       c.Assert(dstConfig.APIToken, Equals, os.Getenv("ARVADOS_API_TOKEN"))
+       c.Assert(dstConfig.APIToken, Equals, arvadostest.DataManagerToken)
        c.Assert(dstConfig.APIHostInsecure, Equals, matchTrue.MatchString(os.Getenv("ARVADOS_API_HOST_INSECURE")))
        c.Assert(dstConfig.ExternalClient, Equals, false)
 
@@ -422,7 +422,7 @@ func setupConfigFile(c *C, name string) *os.File {
        c.Check(err, IsNil)
 
        fileContent := "ARVADOS_API_HOST=" + os.Getenv("ARVADOS_API_HOST") + "\n"
-       fileContent += "ARVADOS_API_TOKEN=" + os.Getenv("ARVADOS_API_TOKEN") + "\n"
+       fileContent += "ARVADOS_API_TOKEN=" + arvadostest.DataManagerToken + "\n"
        fileContent += "ARVADOS_API_HOST_INSECURE=" + os.Getenv("ARVADOS_API_HOST_INSECURE") + "\n"
        fileContent += "ARVADOS_EXTERNAL_CLIENT=false\n"
        fileContent += "ARVADOS_BLOB_SIGNING_KEY=abcdefg"