7444: Clean stopped containers at startup.
author Tom Clegg <tom@curoverse.com>
Mon, 2 Nov 2015 20:46:22 +0000 (15:46 -0500)
committer Tom Clegg <tom@curoverse.com>
Mon, 2 Nov 2015 20:46:22 +0000 (15:46 -0500)
doc/install/install-compute-node.html.textile.liquid
services/dockercleaner/arvados_docker/cleaner.py
services/dockercleaner/tests/test_cleaner.py

index 25d047343b1f9cfe4919ad92c1f3852ff6a661db..aa4f37d639704f33dc10b2f9e71db1c36a6c129a 100644 (file)
@@ -79,7 +79,7 @@ h2. Configure the Docker cleaner
 The arvados-docker-cleaner program removes least recently used docker images as needed to keep disk usage below a configured limit.
 
 {% include 'notebox_begin' %}
-This also removes all containers as soon as they exit, as if they were run with `docker run --rm`. If you need to debug or inspect containers after they stop, temporarily stop arvados-docker-cleaner or run it with the `--no-remove-stopped-containers` flag.
+This also removes all containers as soon as they exit, as if they were run with `docker run --rm`. If you need to debug or inspect containers after they stop, temporarily stop arvados-docker-cleaner or run it with `--remove-stopped-containers never`.
 {% include 'notebox_end' %}
 
 On Debian-based systems, install runit:
index 89d65216dc52f600b6ccd52aaa7c65aa5203ee67..f9d727f2ec54ec96d5e006e187a58b3969507b89 100755 (executable)
@@ -189,11 +189,7 @@ class DockerImageCleaner(DockerImageUseRecorder):
             self.images.add_image(image_hash)
         return super().new_container(event, container_hash)
 
-    @event_handlers.on('die')
-    def clean_container(self, event=None):
-        if not self.remove_stopped_containers:
-            return
-        cid = event['id']
+    def _remove_container(self, cid):
         try:
             self.docker_client.remove_container(cid)
         except docker.errors.APIError as error:
@@ -201,6 +197,22 @@ class DockerImageCleaner(DockerImageUseRecorder):
         else:
             logger.info("Removed container %s", cid)
 
+    @event_handlers.on('die')
+    def clean_container(self, event=None):
+        if not self.remove_stopped_containers:
+            return
+        self._remove_container(event['id'])
+
+    def check_stopped_containers(self, remove=False):
+        logger.info("Checking for stopped containers")
+        for c in self.docker_client.containers(filters={'status': 'exited'}):
+            logger.info("Container %s %s", c['Id'], c['Status'])
+            if c['Status'][:6] != 'Exited':
+                logger.error("Unexpected status %s for container %s",
+                             c['Status'], c['Id'])
+            elif remove:
+                self._remove_container(c['Id'])
+
     @event_handlers.on('destroy')
     def clean_images(self, event=None):
         for image_id in self.images.should_delete():
@@ -239,9 +251,11 @@ def parse_arguments(arguments):
         '--quota', action='store', type=human_size, required=True,
         help="space allowance for Docker images, suffixed with K/M/G/T")
     parser.add_argument(
-        '--no-remove-stopped-containers', action='store_false', default=True,
-        dest='remove_stopped_containers',
-        help="do not remove containers (default: remove on exit)")
+        '--remove-stopped-containers', type=str, default='always',
+        choices=['never', 'onexit', 'always'],
+        help="""when to remove stopped containers (default: always, i.e., remove
+        stopped containers found at startup, and remove containers as
+        soon as they exit)""")
     parser.add_argument(
         '--verbose', '-v', action='count', default=0,
         help="log more information")
@@ -264,9 +278,12 @@ def run(args, docker_client):
     use_recorder.run()
     cleaner = DockerImageCleaner(
         images, docker_client, docker_client.events(since=start_time),
-        remove_stopped_containers=args.remove_stopped_containers)
-    logger.info("Starting cleanup loop")
+        remove_stopped_containers=args.remove_stopped_containers != 'never')
+    cleaner.check_stopped_containers(
+        remove=args.remove_stopped_containers == 'always')
+    logger.info("Checking image quota at startup")
     cleaner.clean_images()
+    logger.info("Listening for docker events")
     cleaner.run()
 
 def main(arguments):
index 6793923762e4fe5a7cae620d49ba397c515ccc0c..a9ecc92441f700532370cdce7b6b61c01e03bd76 100644 (file)
@@ -375,20 +375,54 @@ class RunTestCase(unittest.TestCase):
 
 
 class ContainerRemovalTestCase(unittest.TestCase):
+    LIFECYCLE = ['create', 'attach', 'start', 'resize', 'die', 'destroy']
+
     def setUp(self):
         self.args = mock.MagicMock(name='args')
         self.docker_client = mock.MagicMock(name='docker_client')
-
-    def test_remove_on_die(self):
-        mockID = MockDockerId()
+        self.existingCID = MockDockerId()
+        self.docker_client.containers.return_value = [{
+            'Id': self.existingCID,
+            'Status': 'Exited (0) 6 weeks ago',
+        }, {
+            # If docker_client.containers() returns non-exited
+            # containers for some reason, do not remove them.
+            'Id': MockDockerId(),
+            'Status': 'Running',
+        }]
+        self.newCID = MockDockerId()
         self.docker_client.events.return_value = [
-            MockEvent(x, docker_id=mockID).encoded()
-            for x in ['create', 'attach', 'start', 'resize', 'die', 'destroy']]
+            MockEvent(e, docker_id=self.newCID).encoded()
+            for e in self.LIFECYCLE]
+
+    def test_remove_onexit(self):
+        self.args.remove_stopped_containers = 'onexit'
+        cleaner.run(self.args, self.docker_client)
+        self.docker_client.remove_container.assert_called_once_with(self.newCID)
+
+    def test_remove_always(self):
+        self.args.remove_stopped_containers = 'always'
         cleaner.run(self.args, self.docker_client)
-        self.docker_client.remove_container.assert_called_once_with(mockID)
+        self.docker_client.remove_container.assert_any_call(self.existingCID)
+        self.docker_client.remove_container.assert_any_call(self.newCID)
+        self.assertEqual(2, self.docker_client.remove_container.call_count)
 
-    def test_disabled_flag(self):
-        self.args.remove_stopped_containers = False
-        self.docker_client.events.return_value = [MockEvent('die').encoded()]
+    def test_remove_never(self):
+        self.args.remove_stopped_containers = 'never'
         cleaner.run(self.args, self.docker_client)
         self.assertEqual(0, self.docker_client.remove_container.call_count)
+
+    def test_container_exited_between_subscribe_events_and_check_existing(self):
+        self.args.remove_stopped_containers = 'always'
+        self.docker_client.events.return_value = [
+            MockEvent(e, docker_id=self.existingCID).encoded()
+            for e in ['die', 'destroy']]
+        cleaner.run(self.args, self.docker_client)
+        # Subscribed to events before getting the list of existing
+        # exited containers?
+        self.docker_client.assert_has_calls([
+            mock.call.events(since=mock.ANY),
+            mock.call.containers(filters={'status':'exited'})])
+        # Asked to delete the container twice?
+        self.docker_client.remove_container.assert_has_calls([mock.call(self.existingCID)] * 2)
+        self.assertEqual(2, self.docker_client.remove_container.call_count)