13804: Smarter shutdown behavior WIP
authorPeter Amstutz <pamstutz@veritasgenetics.com>
Tue, 17 Jul 2018 13:51:23 +0000 (09:51 -0400)
committerPeter Amstutz <pamstutz@veritasgenetics.com>
Tue, 17 Jul 2018 13:51:23 +0000 (09:51 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz@veritasgenetics.com>

services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/config.py
services/nodemanager/arvnodeman/daemon.py
services/nodemanager/tests/test_computenode_dispatch.py

index bb83a193fa30bae46dc7a7f6979843eb65fd42d1..30ca16805d50690f923c2d6223d17d25341e7365 100644 (file)
@@ -339,7 +339,7 @@ class ComputeNodeMonitorActor(config.actor_class):
     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
                  timer_actor, update_actor, cloud_client,
                  arvados_node=None, poll_stale_after=600, node_stale_after=3600,
-                 boot_fail_after=1800
+                 boot_fail_after=1800, consecutive_idle_count=0
     ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.tell_proxy()
@@ -354,6 +354,7 @@ class ComputeNodeMonitorActor(config.actor_class):
         self.boot_fail_after = boot_fail_after
         self.subscribers = set()
         self.arvados_node = None
+        self.consecutive_idle_count = consecutive_idle_count
         self.consecutive_idle = 0
         self._later.update_arvados_node(arvados_node)
         self.last_shutdown_opening = None
@@ -458,7 +459,7 @@ class ComputeNodeMonitorActor(config.actor_class):
 
         if crunch_worker_state == "idle":
             # Must report as "idle" at least two consecutive times
-            if self.consecutive_idle < 2:
+            if self.consecutive_idle < self.consecutive_idle_count:
                 idle_grace = 'idle wait'
             else:
                 idle_grace = 'idle exceeded'
index e47f9fcb1d036b78f94af0af25e8c37dc17b5ad0..8d974872f2b73203ed6403f30fa1eca411736456 100644 (file)
@@ -56,7 +56,8 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
                        'boot_fail_after': str(sys.maxint),
                        'node_stale_after': str(60 * 60 * 2),
                        'watchdog': '600',
-                       'node_mem_scaling': '0.95'},
+                       'node_mem_scaling': '0.95',
+                       'consecutive_idle_count': 2},
             'Manage': {'address': '127.0.0.1',
                        'port': '-1',
                        'ManagementToken': ''},
index 2e71b316df1459c3255f0235889e554eea5fc728..f34260adb9d5f689b2ca3c24f8333221dd04fc22 100644 (file)
@@ -112,7 +112,8 @@ class NodeManagerDaemonActor(actor_class):
                  node_setup_class=dispatch.ComputeNodeSetupActor,
                  node_shutdown_class=dispatch.ComputeNodeShutdownActor,
                  node_actor_class=dispatch.ComputeNodeMonitorActor,
-                 max_total_price=0):
+                 max_total_price=0,
+                 consecutive_idle_count=1):
         super(NodeManagerDaemonActor, self).__init__()
         self._node_setup = node_setup_class
         self._node_shutdown = node_shutdown_class
@@ -133,6 +134,7 @@ class NodeManagerDaemonActor(actor_class):
         self.poll_stale_after = poll_stale_after
         self.boot_fail_after = boot_fail_after
         self.node_stale_after = node_stale_after
+        self.consecutive_idle_count = consecutive_idle_count
         self.last_polls = {}
         for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
             poll_actor = locals()[poll_name + '_actor']
@@ -173,7 +175,8 @@ class NodeManagerDaemonActor(actor_class):
             poll_stale_after=self.poll_stale_after,
             node_stale_after=self.node_stale_after,
             cloud_client=self._cloud_driver,
-            boot_fail_after=self.boot_fail_after)
+            boot_fail_after=self.boot_fail_after,
+            consecutive_idle_count=self.consecutive_idle_count)
         actorTell = actor.tell_proxy()
         actorTell.subscribe(self._later.node_can_shutdown)
         self._cloud_nodes_actor.subscribe_to(cloud_node.id,
index 5775aa659a31391f13a5071929d9f5562ba3969d..ee30502a192829be284a145b180d63d8358f1fba 100644 (file)
@@ -424,12 +424,12 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor()
         self.shutdowns._set_state(True, 600)
         self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
+                          (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing(self):
@@ -438,7 +438,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_running_broken(self):
@@ -447,7 +447,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing_broken(self):
@@ -457,7 +457,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
@@ -467,7 +467,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_when_node_state_unknown(self):
@@ -481,7 +481,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state='fail'))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_state_stale(self):