12199: Improve race handling in busywait.
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch_slurm.py
1 #!/usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 from __future__ import absolute_import, print_function
7
8 import subprocess
9 import time
10 import unittest
11
12 import mock
13
14 import arvnodeman.computenode.dispatch.slurm as slurm_dispatch
15 from . import testutil
16 from .test_computenode_dispatch import \
17     ComputeNodeShutdownActorMixin, \
18     ComputeNodeSetupActorTestCase, \
19     ComputeNodeUpdateActorTestCase
20
@mock.patch('subprocess.check_output')
class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
                                            unittest.TestCase):
    """Tests for the SLURM variant of ComputeNodeShutdownActor.

    subprocess.check_output is patched for the whole class; the test
    methods defined here take the resulting mock as ``proc_mock``.
    """

    ACTOR_CLASS = slurm_dispatch.ComputeNodeShutdownActor

    def check_slurm_got_args(self, proc_mock, *args):
        """Assert the most recent mocked call's command contains each arg.

        Fails if ``proc_mock`` was never called, or if any item of
        ``args`` is missing from the first positional argument of the
        last recorded call.
        """
        self.assertTrue(proc_mock.called)
        # call_args is (positional args, keyword args); the command is
        # the first positional argument.
        command = proc_mock.call_args[0][0]
        for expected in args:
            self.assertIn(expected, command)

    def check_success_after_reset(self, proc_mock, end_state='drain\n',
                                  timer=False):
        """Run the actor through three tries, expecting success on the 3rd.

        The caller must configure ``proc_mock`` for the first two tries
        beforehand.  Before the third try the mock is switched to return
        ``end_state``.  Unless ``timer`` is true, ``self.timer`` is
        replaced with ``testutil.MockTimer(False)`` so that retries only
        run when this method calls ``deliver()``.
        """
        self.make_mocks(arvados_node=testutil.arvados_node_mock(63))
        if not timer:
            self.timer = testutil.MockTimer(False)
        self.make_actor()
        # First try should be done by now.
        self.check_success_flag(None, 0)

        self.timer.deliver()
        # Second try should be done by now.
        self.check_success_flag(None, 0)

        # Set the return value BEFORE clearing the side effect: if the
        # mock were called with neither configured, a real subprocess
        # might be invoked.
        proc_mock.return_value = end_state
        proc_mock.side_effect = None

        # Third try.
        self.timer.deliver()

        self.check_success_flag(True, 3)
        self.check_slurm_got_args(proc_mock, 'NodeName=compute63')

    def make_wait_state_test(start_state='drng\n', end_state='drain\n'):
        """Build a test method; called while the class body executes.

        There is no ``self`` parameter because this is invoked as a plain
        function below.  The generated test makes the mock report
        ``start_state`` first and hands ``end_state`` to
        check_success_after_reset.
        """
        def test(self, proc_mock):
            proc_mock.return_value = start_state
            self.check_success_after_reset(proc_mock, end_state)
        return test

    # Generate test_wait_while_<state>: one test per initial state.
    for wait_state in ('alloc\n', 'drng\n'):
        locals()['test_wait_while_{}'.format(wait_state.strip())] = \
            make_wait_state_test(start_state=wait_state)

    # Generate test_wait_until_<state>: one test per final state.
    for end_state in ('idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n'):
        locals()['test_wait_until_{}'.format(end_state.strip())] = \
            make_wait_state_test(end_state=end_state)

    def test_retry_failed_slurm_calls(self, proc_mock):
        # Every mocked call raises until check_success_after_reset
        # clears the side effect ahead of the third try.
        failure = subprocess.CalledProcessError(1, ["mock"])
        proc_mock.side_effect = failure
        self.check_success_after_reset(proc_mock)

    def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
        # A node that failed to bootstrap must be handled without any
        # call to the mocked subprocess.
        proc_mock.return_value = 'down\n'
        self.make_actor(start_time=0)
        self.check_success_flag(True)
        self.assertFalse(proc_mock.called)

    def test_node_resumed_when_shutdown_cancelled(self, proc_mock):
        # Expected first (drain) and last (resume) scontrol commands.
        drain_cmd = ['scontrol', 'update', 'NodeName=compute99',
                     'State=DRAIN', 'Reason=Node Manager shutdown']
        resume_cmd = ['scontrol', 'update', 'NodeName=compute99',
                      'State=RESUME']
        try:
            proc_mock.side_effect = iter(['', 'drng\n', 'drng\n', ''])
            self.make_mocks(
                arvados_node=testutil.arvados_node_mock(job_uuid=True))
            self.timer = testutil.MockTimer(False)
            self.make_actor()
            # Do not cancel until the mock has recorded at least one call.
            self.busywait(lambda: proc_mock.call_args is not None)
            self.shutdown_actor.cancel_shutdown("test")
            self.check_success_flag(False, 2)
            recorded = proc_mock.call_args_list
            self.assertEqual(recorded[0], mock.call(drain_cmd))
            self.assertEqual(recorded[-1], mock.call(resume_cmd))

        finally:
            self.shutdown_actor.actor_ref.stop()

    def test_cancel_shutdown_retry(self, proc_mock):
        # OSError entries are raised by the mock; strings are returned.
        outcomes = [OSError, 'drain\n', OSError, 'idle\n', 'idle\n']
        proc_mock.side_effect = iter(outcomes)
        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
        self.make_actor()
        self.check_success_flag(False, 5)

    def test_issue_slurm_drain_retry(self, proc_mock):
        # OSError entries are raised by the mock; strings are returned.
        outcomes = [OSError, OSError, 'drng\n', 'drain\n']
        proc_mock.side_effect = iter(outcomes)
        self.check_success_after_reset(proc_mock, timer=False)

    def test_arvados_node_cleaned_after_shutdown(self, proc_mock):
        # Run the inherited test with the mock reporting 'drain'.
        proc_mock.return_value = 'drain\n'
        inherited = super(SLURMComputeNodeShutdownActorTestCase, self)
        inherited.test_arvados_node_cleaned_after_shutdown()

    def test_cancellable_shutdown(self, proc_mock):
        # Run the inherited test with the mock reporting 'other'.
        proc_mock.return_value = 'other\n'
        inherited = super(SLURMComputeNodeShutdownActorTestCase, self)
        inherited.test_cancellable_shutdown()

    def test_uncancellable_shutdown(self, proc_mock):
        # Run the inherited test with the mock reporting 'other'.
        proc_mock.return_value = 'other\n'
        inherited = super(SLURMComputeNodeShutdownActorTestCase, self)
        inherited.test_uncancellable_shutdown()
119
@mock.patch('subprocess.check_output')
class SLURMComputeNodeUpdateActorTestCase(ComputeNodeUpdateActorTestCase):
    """Tests for the SLURM variant of ComputeNodeUpdateActor.

    subprocess.check_output is patched for the whole class; the test
    method defined here takes the resulting mock as ``check_output``.
    """

    ACTOR_CLASS = slurm_dispatch.ComputeNodeUpdateActor

    def test_update_node_weight(self, check_output):
        # The scontrol command sync_node is expected to issue last.
        expected_cmd = ['scontrol', 'update', 'NodeName=compute99',
                        'Weight=99000', 'Features=instancetype=z99.test']
        self.make_actor()
        cloud = testutil.cloud_node_mock()
        arvados = testutil.arvados_node_mock()
        pending = self.updater.sync_node(cloud, arvados)
        # Presumably blocks (up to self.TIMEOUT) until sync_node has
        # finished, so the mock is inspected only afterwards.
        pending.get(self.TIMEOUT)
        check_output.assert_called_with(expected_cmd)
130
class SLURMComputeNodeSetupActorTestCase(ComputeNodeSetupActorTestCase):
    """Tests for the SLURM variant of ComputeNodeSetupActor."""

    ACTOR_CLASS = slurm_dispatch.ComputeNodeSetupActor

    @mock.patch('subprocess.check_output')
    def test_update_node_features(self, check_output):
        # ComputeNodeSetupActorTestCase.make_mocks uses node mocks whose
        # hostnames are scrubbed, but `scontrol update` only happens when
        # the Arvados node record has a hostname.  Supply the default
        # testutil.arvados_node_mock instead.
        arvados_node = testutil.arvados_node_mock()
        expected_cmd = ['scontrol', 'update', 'NodeName=compute99',
                        'Weight=1000', 'Features=instancetype=z1.test']
        self.make_mocks(arvados_effect=[arvados_node])
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        check_output.assert_called_with(expected_cmd)