From db32ebf3b3bf067870e5c4a1883a08bc1e77e6b8 Mon Sep 17 00:00:00 2001
From: Brett Smith
Date: Fri, 5 Dec 2014 17:45:13 -0500
Subject: [PATCH] 4380: Node Manager SLURM dispatcher proceeds from more states.

Per discussion with Ward. Our main concern is that Node Manager
shouldn't shut down nodes that are doing work. We feel comfortable
broadening the definition of "not doing work" to this set of states.
---
 .../arvnodeman/computenode/dispatch/slurm.py  |  4 +++-
 .../tests/test_computenode_dispatch_slurm.py  | 20 +++++++++++++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 27397e5d50..6eaa8b937b 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -10,6 +10,8 @@ from . import \
 from . import ComputeNodeShutdownActor as ShutdownActorBase
 
 class ComputeNodeShutdownActor(ShutdownActorBase):
+    SLURM_END_STATES = frozenset(['down\n', 'down*\n', 'drain\n', 'fail\n'])
+
     def on_start(self):
         arv_node = self._monitor.arvados_node.get()
         if arv_node is None:
@@ -42,7 +44,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     def await_slurm_drain(self):
         output = subprocess.check_output(
             ['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
-        if output == 'drain\n':
+        if output in self.SLURM_END_STATES:
             self._later.shutdown_node()
         else:
             self._timer.schedule(time.time() + 10,
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index ccac8b2449..93cc60d4e8 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -22,21 +22,31 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         for s in args:
             self.assertIn(s, slurm_cmd)
 
-    def check_success_after_reset(self, proc_mock):
+    def check_success_after_reset(self, proc_mock, end_state='drain\n'):
         self.make_mocks(arvados_node=testutil.arvados_node_mock(63))
         self.make_actor()
         self.check_success_flag(None, 0)
         self.check_success_flag(None, 0)
         # Order is critical here: if the mock gets called when no return value
         # or side effect is set, we may invoke a real subprocess.
-        proc_mock.return_value = 'drain\n'
+        proc_mock.return_value = end_state
         proc_mock.side_effect = None
         self.check_success_flag(True, 3)
         self.check_slurm_got_args(proc_mock, 'compute63')
 
-    def test_wait_for_drained_state(self, proc_mock):
-        proc_mock.return_value = 'drng\n'
-        self.check_success_after_reset(proc_mock)
+    def make_wait_state_test(start_state='drng\n', end_state='drain\n'):
+        def test(self, proc_mock):
+            proc_mock.return_value = start_state
+            self.check_success_after_reset(proc_mock, end_state)
+        return test
+
+    for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+        locals()['test_wait_while_' + wait_state.strip()
+                 ] = make_wait_state_test(start_state=wait_state)
+
+    for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+        locals()['test_wait_until_' + end_state.strip()
+                 ] = make_wait_state_test(end_state=end_state)
 
     def test_retry_failed_slurm_calls(self, proc_mock):
         proc_mock.side_effect = subprocess.CalledProcessError(1, ["mock"])
-- 
2.30.2
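
Note: for readers trying out the behaviour outside the Node Manager source
tree, below is a minimal standalone sketch of the check this patch
introduces. It is not the Node Manager actor itself: the SLURM_END_STATES
set and the sinfo invocation are copied from the diff above, while the
helper name node_reached_end_state and the node name "compute63" are
illustrative only, and running it assumes a host with SLURM's sinfo on its
PATH.

    #!/usr/bin/env python
    # Minimal sketch of the check this patch introduces -- not the Node
    # Manager actor itself.  Requires a host where SLURM's sinfo is on
    # PATH; the node name "compute63" is only an example.
    import subprocess

    # Same set the patch adds: short sinfo state codes, kept with their
    # trailing newline because that is exactly what
    # `sinfo --noheader -o %t` prints.
    SLURM_END_STATES = frozenset(['down\n', 'down*\n', 'drain\n', 'fail\n'])

    def node_reached_end_state(nodename):
        """Return True if sinfo reports the node in a "not doing work" state."""
        output = subprocess.check_output(
            ['sinfo', '--noheader', '-o', '%t', '-n', nodename])
        if not isinstance(output, str):
            # On Python 3, check_output returns bytes; the original code
            # targets Python 2, where it is already a str.
            output = output.decode()
        return output in SLURM_END_STATES

    if __name__ == '__main__':
        print(node_reached_end_state('compute63'))

Node Manager itself performs this check from await_slurm_drain and, as the
diff shows, reschedules the check every 10 seconds until the node reaches
one of the end states, at which point it proceeds to shut the node down.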