8953: Drained SLURM nodes can be eligible for shutdown. 8953-node-manager-prevent-shutdown-eligible-flapping-wip
authorBrett Smith <brett@curoverse.com>
Wed, 13 Apr 2016 01:54:50 +0000 (21:54 -0400)
committerBrett Smith <brett@curoverse.com>
Wed, 13 Apr 2016 01:54:50 +0000 (21:54 -0400)
Without this, shutdown_eligible() will report "node is draining" as
ComputeNodeShutdownActor runs its course, cancelling the shutdown.

services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
services/nodemanager/tests/test_computenode_dispatch_slurm.py

index 41919db07e12efe7a262c4635e9a8432febdec2f..6d979b6c5cbd08c3c6db27ffda4895e14234c11b 100644 (file)
@@ -76,11 +76,9 @@ class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
             state = self._get_slurm_state(self.arvados_node['hostname'])
             # Automatically eligible for shutdown if it's down or failed, but
             # not drain to avoid a race condition with resume_node().
-            if state in self.SLURM_END_STATES:
-                if state in self.SLURM_DRAIN_STATES:
-                    return "node is draining"
-                else:
-                    return True
+            if ((state in self.SLURM_END_STATES) and
+                  (state not in self.SLURM_DRAIN_STATES)):
+                return True
         return super(ComputeNodeMonitorActor, self).shutdown_eligible()
 
     def resume_node(self):
index 135b817d91b725d26f712a8f2b1f5a5bb1f93144..0ec9612bcaa80c99df12502ea23b5b8cc8dc0295 100644 (file)
@@ -147,7 +147,14 @@ class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     @mock.patch("subprocess.check_output")
-    def test_no_shutdown_drain_node(self, check_output):
+    def test_no_shutdown_ineligible_drain_node(self, check_output):
         check_output.return_value = "drain\n"
         self.make_actor(arv_node=testutil.arvados_node_mock())
-        self.assertEquals('node is draining', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertIsNot(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+    @mock.patch("subprocess.check_output")
+    def test_shutdown_eligible_drain_node(self, check_output):
+        check_output.return_value = "drain\n"
+        self.make_actor(arv_node=testutil.arvados_node_mock())
+        self.shutdowns._set_state(True, 300)
+        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))