4380: Node Manager SLURM dispatcher proceeds from more states.
authorBrett Smith <brett@curoverse.com>
Fri, 5 Dec 2014 22:45:13 +0000 (17:45 -0500)
committerBrett Smith <brett@curoverse.com>
Wed, 10 Dec 2014 13:03:50 +0000 (08:03 -0500)
Per discussion with Ward.  Our main concern is that Node Manager
shouldn't shut down nodes that are doing work.  We feel comfortable
broadening the definition of "not doing work" to this set of states.

services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
services/nodemanager/tests/test_computenode_dispatch_slurm.py

index 27397e5d5015f7eca875520238066b2625149396..6eaa8b937b979939c584f9b31927442aa4461d18 100644 (file)
@@ -10,6 +10,8 @@ from . import \
 from . import ComputeNodeShutdownActor as ShutdownActorBase
 
 class ComputeNodeShutdownActor(ShutdownActorBase):
+    SLURM_END_STATES = frozenset(['down\n', 'down*\n', 'drain\n', 'fail\n'])
+
     def on_start(self):
         arv_node = self._monitor.arvados_node.get()
         if arv_node is None:
@@ -42,7 +44,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     def await_slurm_drain(self):
         output = subprocess.check_output(
             ['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
-        if output == 'drain\n':
+        if output in self.SLURM_END_STATES:
             self._later.shutdown_node()
         else:
             self._timer.schedule(time.time() + 10,
index ccac8b2449b1c54abbda22e4323256ec3de6834a..93cc60d4e8c93bb2d124b1bbe05ca722c3736541 100644 (file)
@@ -22,21 +22,31 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         for s in args:
             self.assertIn(s, slurm_cmd)
 
-    def check_success_after_reset(self, proc_mock):
+    def check_success_after_reset(self, proc_mock, end_state='drain\n'):
         self.make_mocks(arvados_node=testutil.arvados_node_mock(63))
         self.make_actor()
         self.check_success_flag(None, 0)
         self.check_success_flag(None, 0)
         # Order is critical here: if the mock gets called when no return value
         # or side effect is set, we may invoke a real subprocess.
-        proc_mock.return_value = 'drain\n'
+        proc_mock.return_value = end_state
         proc_mock.side_effect = None
         self.check_success_flag(True, 3)
         self.check_slurm_got_args(proc_mock, 'compute63')
 
-    def test_wait_for_drained_state(self, proc_mock):
-        proc_mock.return_value = 'drng\n'
-        self.check_success_after_reset(proc_mock)
+    def make_wait_state_test(start_state='drng\n', end_state='drain\n'):
+        def test(self, proc_mock):
+            proc_mock.return_value = start_state
+            self.check_success_after_reset(proc_mock, end_state)
+        return test
+
+    for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+        locals()['test_wait_while_' + wait_state.strip()
+                 ] = make_wait_state_test(start_state=wait_state)
+
+    for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+        locals()['test_wait_until_' + end_state.strip()
+                 ] = make_wait_state_test(end_state=end_state)
 
     def test_retry_failed_slurm_calls(self, proc_mock):
         proc_mock.side_effect = subprocess.CalledProcessError(1, ["mock"])