7661: added test with only_pdh (not working yet)
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch.py
1 #!/usr/bin/env python
2
3 from __future__ import absolute_import, print_function
4
5 import time
6 import unittest
7
8 import arvados.errors as arverror
9 import httplib2
10 import mock
11 import pykka
12
13 import arvnodeman.computenode.dispatch as dispatch
14 from . import testutil
15
class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin,
                                    unittest.TestCase):
    """Exercise dispatch.ComputeNodeSetupActor against mock clients.

    The Arvados API client and the cloud client are MagicMock objects, so
    each test steers the actor by setting return values and side effects
    on them.
    """

    def make_mocks(self, arvados_effect=None):
        """Create the timer, API client and cloud client mocks.

        arvados_effect is installed as the side_effect of both
        nodes().create().execute and nodes().update().execute.  Tests in
        this class pass either a list of successive outcomes or a single
        exception instance.  When it is None, a list holding one mock
        Arvados node record is used instead.
        """
        if arvados_effect is None:
            arvados_effect = [testutil.arvados_node_mock()]
        self.arvados_effect = arvados_effect
        self.timer = testutil.MockTimer()
        self.api_client = mock.MagicMock(name='api_client')
        self.api_client.nodes().create().execute.side_effect = arvados_effect
        self.api_client.nodes().update().execute.side_effect = arvados_effect
        self.cloud_client = mock.MagicMock(name='cloud_client')
        self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)

    def make_actor(self, arv_node=None):
        """Start the setup actor and store its proxy in self.setup_actor.

        If the test has not called make_mocks itself, the mocks are built
        here.  arv_node, when given, is both the record handed to the
        actor and the single result the API mocks will return.
        """
        if not hasattr(self, 'timer'):
            # Only pin the API result when a node record was supplied.
            # Passing [None] would make the mocked API calls return None,
            # bypass make_mocks' default record, and turn comparisons
            # against arvados_effect[-1] into a trivial None == None.
            self.make_mocks(
                arvados_effect=None if arv_node is None else [arv_node])
        self.setup_actor = dispatch.ComputeNodeSetupActor.start(
            self.timer, self.api_client, self.cloud_client,
            testutil.MockSize(1), arv_node).proxy()

    def test_creation_without_arvados_node(self):
        # No node record is supplied, so the actor is expected to go
        # through nodes().create() and end up with the mocked result.
        self.make_actor()
        self.assertEqual(self.arvados_effect[-1],
                         self.setup_actor.arvados_node.get(self.TIMEOUT))
        self.assertTrue(self.api_client.nodes().create().execute.called)
        self.assertEqual(self.cloud_client.create_node(),
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_creation_with_arvados_node(self):
        # An existing node record is supplied, so the actor is expected
        # to go through nodes().update() rather than create().
        self.make_actor(testutil.arvados_node_mock())
        self.assertEqual(self.arvados_effect[-1],
                         self.setup_actor.arvados_node.get(self.TIMEOUT))
        self.assertTrue(self.api_client.nodes().update().execute.called)
        self.assertEqual(self.cloud_client.create_node(),
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_failed_arvados_calls_retried(self):
        # First API call raises a 500 ApiError, the second returns a node.
        self.make_mocks([
                arverror.ApiError(httplib2.Response({'status': '500'}), ""),
                testutil.arvados_node_mock(),
                ])
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'arvados_node')

    def test_failed_cloud_calls_retried(self):
        # First create_node call raises, the second returns the mock node.
        self.make_mocks()
        self.cloud_client.create_node.side_effect = [
            Exception("test cloud creation error"),
            self.cloud_client.create_node.return_value,
            ]
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')

    def test_failed_post_create_retried(self):
        # post_create_node raises once and then succeeds; the subscriber
        # future is set once the actor notifies, after which exactly two
        # post_create_node calls must have been made.
        self.make_mocks()
        self.cloud_client.post_create_node.side_effect = [
            Exception("test cloud post-create error"), None]
        self.make_actor()
        done = self.FUTURE_CLASS()
        self.setup_actor.subscribe(done.set)
        done.get(self.TIMEOUT)
        self.assertEqual(2, self.cloud_client.post_create_node.call_count)

    def test_stop_when_no_cloud_node(self):
        # A single exception as side_effect is raised on every call, so
        # the mocked Arvados API never succeeds in this test.
        self.make_mocks(
            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
        self.make_actor()
        self.assertTrue(
            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
        self.assertTrue(
            self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))

    def test_no_stop_when_cloud_node(self):
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.assertFalse(
            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
        # NOTE(review): stop_proxy presumably returns a true value only
        # when the actor was still running -- confirm in testutil.
        self.assertTrue(self.stop_proxy(self.setup_actor),
                        "actor was stopped by stop_if_no_cloud_node")

    def test_subscribe(self):
        # Start with an API that always fails, subscribe, and only then
        # let nodes().create() succeed so setup can proceed.
        self.make_mocks(
            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.setup_actor.subscribe(subscriber)
        self.api_client.nodes().create().execute.side_effect = [
            testutil.arvados_node_mock()]
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
                         subscriber.call_args[0][0].actor_ref.actor_urn)

    def test_late_subscribe(self):
        # Subscribe only after the cloud node has been assigned; the
        # subscriber must still be called with this actor's proxy.
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
        self.stop_proxy(self.setup_actor)
        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
                         subscriber.call_args[0][0].actor_ref.actor_urn)
115
116
class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
    """Fixtures and tests shared by shutdown actor test cases.

    make_actor starts the actor through self.ACTOR_CLASS, which this
    mixin does not define; the class that mixes it in must provide it.
    """

    def make_mocks(self, cloud_node=None, arvados_node=None,
                   shutdown_open=True, node_broken=False):
        """Build the mock collaborators used by make_actor.

        cloud_node defaults to a fresh testutil.cloud_node_mock().
        shutdown_open is passed to the mock shutdown timer together with
        a value of 300; node_broken is what cloud_client.broken returns.
        """
        self.timer = testutil.MockTimer()
        shutdown_timer = testutil.MockShutdownTimer()
        shutdown_timer._set_state(shutdown_open, 300)
        self.shutdowns = shutdown_timer
        cloud_client = mock.MagicMock(name='cloud_client')
        cloud_client.broken.return_value = node_broken
        self.cloud_client = cloud_client
        self.arvados_client = mock.MagicMock(name='arvados_client')
        self.updates = mock.MagicMock(name='update_mock')
        self.cloud_node = (testutil.cloud_node_mock()
                           if cloud_node is None else cloud_node)
        self.arvados_node = arvados_node

    def make_actor(self, cancellable=True, start_time=None):
        """Start a monitor actor and the shutdown actor under test.

        Proxies are stored in self.monitor_actor and self.shutdown_actor.
        Mocks are created on demand, and start_time defaults to now.
        """
        if not hasattr(self, 'timer'):
            self.make_mocks()
        if start_time is None:
            start_time = time.time()
        monitor_ref = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_node, start_time, self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates,
            self.cloud_client, self.arvados_node)
        shutdown_ref = self.ACTOR_CLASS.start(
            self.timer, self.cloud_client, self.arvados_client,
            monitor_ref, cancellable)
        self.shutdown_actor = shutdown_ref.proxy()
        self.monitor_actor = monitor_ref.proxy()

    def check_success_flag(self, expected, allow_msg_count=1):
        """Fail unless the actor's success flag becomes `expected`.

        The flag is read up to 1 + allow_msg_count times, because the
        actor may need to handle that many internal messages before
        shutdown finishes.  The comparison is by identity.
        """
        attempts = 1 + allow_msg_count
        for _ in range(attempts):
            last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
            if last_flag is expected:
                return
        self.fail("success flag {} is not {}".format(last_flag, expected))

    def test_uncancellable_shutdown(self, *mocks):
        # With the window closed and destroy_node failing, the flag
        # stays None; once both are flipped the shutdown must succeed.
        self.make_mocks(shutdown_open=False)
        destroy_node = self.cloud_client.destroy_node
        destroy_node.return_value = False
        self.make_actor(cancellable=False)
        self.check_success_flag(None, 0)
        self.shutdowns._set_state(True, 600)
        destroy_node.return_value = True
        self.check_success_flag(True)

    def test_arvados_node_cleaned_after_shutdown(self, *mocks):
        # After a successful shutdown the first nodes().update() call
        # must target the paired node and null out its identity fields.
        node_num = 62
        cloud_node = testutil.cloud_node_mock(node_num)
        arv_node = testutil.arvados_node_mock(node_num)
        self.make_mocks(cloud_node, arv_node)
        self.make_actor()
        self.check_success_flag(True, 3)
        update_mock = self.arvados_client.nodes().update
        self.assertTrue(update_mock.called)
        update_kwargs = update_mock.call_args_list[0][1]
        self.assertEqual(arv_node['uuid'], update_kwargs.get('uuid'))
        self.assertIn('body', update_kwargs)
        body = update_kwargs['body']
        cleared_keys = ('slot_number', 'hostname', 'ip_address',
                        'first_ping_at', 'last_ping_at')
        for clear_key in cleared_keys:
            self.assertIn(clear_key, body)
            self.assertIsNone(body[clear_key])
        self.assertTrue(update_mock().execute.called)

    def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
        # A cancellable shutdown with the window closed reports False
        # and must leave the Arvados node record untouched.
        node_num = 61
        cloud_node = testutil.cloud_node_mock(node_num)
        arv_node = testutil.arvados_node_mock(node_num)
        self.make_mocks(cloud_node, arv_node, shutdown_open=False)
        self.make_actor(cancellable=True)
        self.check_success_flag(False, 2)
        self.assertFalse(self.arvados_client.nodes().update.called)
189
190
class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
                                       unittest.TestCase):
    """Run the shared shutdown tests on dispatch.ComputeNodeShutdownActor."""

    ACTOR_CLASS = dispatch.ComputeNodeShutdownActor

    def test_easy_shutdown(self):
        # Default mocks and a start time of 0: shutdown must succeed and
        # the cloud client must have been asked to destroy the node.
        self.make_actor(start_time=0)
        self.check_success_flag(True)
        destroy_node = self.cloud_client.destroy_node
        self.assertTrue(destroy_node.called)

    def test_shutdown_cancelled_when_window_closes(self):
        # Closed shutdown window: no destroy call, and the cancel reason
        # must be the class's WINDOW_CLOSED value.
        self.make_mocks(shutdown_open=False)
        self.make_actor()
        self.check_success_flag(False, 2)
        self.assertFalse(self.cloud_client.destroy_node.called)
        cancel_reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
        self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED, cancel_reason)

    def test_shutdown_retries_when_cloud_fails(self):
        # destroy_node fails at first, leaving the flag None; after it
        # starts returning True the shutdown must succeed.
        self.make_mocks()
        destroy_node = self.cloud_client.destroy_node
        destroy_node.return_value = False
        self.make_actor(start_time=0)
        first_flag = self.shutdown_actor.success.get(self.TIMEOUT)
        self.assertIsNone(first_flag)
        destroy_node.return_value = True
        self.check_success_flag(True)

    def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self):
        # On a node reported broken, one failed destroy_node call must
        # cancel the shutdown with the NODE_BROKEN reason.
        self.make_mocks(node_broken=True)
        self.cloud_client.destroy_node.return_value = False
        self.make_actor(start_time=0)
        self.check_success_flag(False, 2)
        self.assertEqual(1, self.cloud_client.destroy_node.call_count)
        cancel_reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
        self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN, cancel_reason)

    def test_late_subscribe(self):
        # A subscriber added after the actor started must still be
        # called, with a proxy whose actor URN matches the actor's.
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
        self.stop_proxy(self.shutdown_actor)
        self.assertTrue(subscriber.called)
        notified_proxy = subscriber.call_args[0][0]
        self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
                         notified_proxy.actor_ref.actor_urn)
233
234
class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
                                     unittest.TestCase):
    """Tests for dispatch.ComputeNodeUpdateActor with a mock driver."""

    def make_actor(self):
        """Start the update actor with a MagicMock driver factory."""
        self.driver = mock.MagicMock(name='driver_mock')
        actor_ref = dispatch.ComputeNodeUpdateActor.start(self.driver)
        self.updater = actor_ref.proxy()

    def test_node_sync(self):
        # sync_node on the actor must reach sync_node on the object
        # returned by calling the driver mock, with the same arguments.
        self.make_actor()
        cloud_node = testutil.cloud_node_mock()
        arv_node = testutil.arvados_node_mock()
        sync_future = self.updater.sync_node(cloud_node, arv_node)
        sync_future.get(self.TIMEOUT)
        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
247
248
class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                      unittest.TestCase):
    """Tests for dispatch.ComputeNodeMonitorActor with mock collaborators."""

    def make_mocks(self, node_num):
        """Build the mocks used by make_actor.

        The shutdown timer starts in state (False, 300), the cloud node
        mock is built for node_num, and cloud_client.broken returns
        False.
        """
        shutdown_timer = testutil.MockShutdownTimer()
        shutdown_timer._set_state(False, 300)
        self.shutdowns = shutdown_timer
        self.timer = mock.MagicMock(name='timer_mock')
        self.updates = mock.MagicMock(name='update_mock')
        self.cloud_mock = testutil.cloud_node_mock(node_num)
        self.subscriber = mock.Mock(name='subscriber_mock')
        cloud_client = mock.MagicMock(name='cloud_client')
        cloud_client.broken.return_value = False
        self.cloud_client = cloud_client

    def make_actor(self, node_num=1, arv_node=None, start_time=None):
        """Start the monitor actor and subscribe self.subscriber to it.

        Mocks are created for node_num unless the test already built
        them.  start_time defaults to the current time.  The proxy is
        stored in self.node_actor.
        """
        if not hasattr(self, 'cloud_mock'):
            self.make_mocks(node_num)
        if start_time is None:
            start_time = time.time()
        actor_ref = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_mock, start_time, self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates,
            self.cloud_client, arv_node, boot_fail_after=300)
        self.node_actor = actor_ref.proxy()
        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)

    def node_state(self, *states):
        """Return the actor's in_state(*states) result."""
        state_future = self.node_actor.in_state(*states)
        return state_future.get(self.TIMEOUT)

    def test_in_state_when_unpaired(self):
        # No Arvados node is paired, so in_state must answer None.
        self.make_actor()
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_state_when_pairing_stale(self):
        stale_node = testutil.arvados_node_mock(job_uuid=None, age=90000)
        self.make_actor(arv_node=stale_node)
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_state_when_no_state_available(self):
        stateless_node = testutil.arvados_node_mock(crunch_worker_state=None)
        self.make_actor(arv_node=stateless_node)
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_idle_state(self):
        idle_node = testutil.arvados_node_mock(job_uuid=None)
        self.make_actor(2, arv_node=idle_node)
        self.assertTrue(self.node_state('idle'))
        self.assertFalse(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_in_busy_state(self):
        busy_node = testutil.arvados_node_mock(job_uuid=True)
        self.make_actor(3, arv_node=busy_node)
        self.assertFalse(self.node_state('idle'))
        self.assertTrue(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_init_shutdown_scheduling(self):
        # Starting the actor must schedule something on the timer, using
        # the 300 configured on the shutdown timer as first argument.
        self.make_actor()
        schedule = self.timer.schedule
        self.assertTrue(schedule.called)
        self.assertEqual(300, schedule.call_args[0][0])

    def test_shutdown_window_close_scheduling(self):
        # With the window closed, consider_shutdown must reschedule for
        # the new value (600) and must not notify the subscriber.
        self.make_actor()
        self.shutdowns._set_state(False, 600)
        schedule = self.timer.schedule
        schedule.reset_mock()
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.stop_proxy(self.node_actor)
        self.assertTrue(schedule.called)
        self.assertEqual(600, schedule.call_args[0][0])
        self.assertFalse(self.subscriber.called)

    def test_shutdown_subscription(self):
        # With the window open and start_time=0, consider_shutdown must
        # call the subscriber with a proxy for this same actor.
        self.make_actor(start_time=0)
        self.shutdowns._set_state(True, 600)
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.assertTrue(self.subscriber.called)
        notified_proxy = self.subscriber.call_args[0][0]
        self.assertEqual(self.node_actor.actor_ref.actor_urn,
                         notified_proxy.actor_ref.actor_urn)

    def test_no_shutdown_booting(self):
        # Actor started "now" with no Arvados node: not eligible.
        self.make_actor()
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_shutdown_without_arvados_node(self):
        # Same as above but with start_time=0: eligible.
        self.make_actor(start_time=0)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertTrue(eligible)

    def test_no_shutdown_missing(self):
        arv_node = testutil.arvados_node_mock(
            10, job_uuid=None, crunch_worker_state="down",
            last_ping_at='1970-01-01T01:02:03.04050607Z')
        self.make_actor(10, arv_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_running_broken(self):
        arv_node = testutil.arvados_node_mock(
            12, job_uuid=None, crunch_worker_state="down")
        self.make_actor(12, arv_node)
        self.shutdowns._set_state(True, 600)
        self.cloud_client.broken.return_value = True
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_shutdown_missing_broken(self):
        arv_node = testutil.arvados_node_mock(
            11, job_uuid=None, crunch_worker_state="down",
            last_ping_at='1970-01-01T01:02:03.04050607Z')
        self.make_actor(11, arv_node)
        self.shutdowns._set_state(True, 600)
        self.cloud_client.broken.return_value = True
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertTrue(eligible)

    def test_no_shutdown_when_window_closed(self):
        # make_mocks leaves the shutdown window closed by default.
        idle_node = testutil.arvados_node_mock(3, job_uuid=None)
        self.make_actor(3, idle_node)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_when_node_running_job(self):
        busy_node = testutil.arvados_node_mock(4, job_uuid=True)
        self.make_actor(4, busy_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_when_node_state_unknown(self):
        unknown_node = testutil.arvados_node_mock(5, crunch_worker_state=None)
        self.make_actor(5, unknown_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_when_node_state_stale(self):
        stale_node = testutil.arvados_node_mock(6, age=90000)
        self.make_actor(6, stale_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_arvados_node_match(self):
        # A matching offer must return the cloud node's id and trigger
        # a sync_node call with the cloud node and the offered record.
        self.make_actor(2)
        arv_node = testutil.arvados_node_mock(
            2, hostname='compute-two.zzzzz.arvadosapi.com')
        pair_future = self.node_actor.offer_arvados_pair(arv_node)
        pair_id = pair_future.get(self.TIMEOUT)
        self.assertEqual(self.cloud_mock.id, pair_id)
        self.stop_proxy(self.node_actor)
        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)

    def test_arvados_node_mismatch(self):
        # Cloud node 3 is offered the record for node 1: no pairing.
        self.make_actor(3)
        arv_node = testutil.arvados_node_mock(1)
        pair_id = self.node_actor.offer_arvados_pair(arv_node).get(
            self.TIMEOUT)
        self.assertIsNone(pair_id)

    def test_arvados_node_mismatch_first_ping_too_early(self):
        # Same node number, but first_ping_at is set to a 1971 date.
        self.make_actor(4)
        arv_node = testutil.arvados_node_mock(
            4, first_ping_at='1971-03-02T14:15:16.1717282Z')
        pair_id = self.node_actor.offer_arvados_pair(arv_node).get(
            self.TIMEOUT)
        self.assertIsNone(pair_id)

    def test_update_cloud_node(self):
        # make_mocks(2) replaces self.cloud_mock with a node-2 mock after
        # the actor was started on node 1; its id is then set to '1'
        # (presumably so it counts as the same cloud node -- confirm).
        self.make_actor(1)
        self.make_mocks(2)
        self.cloud_mock.id = '1'
        self.node_actor.update_cloud_node(self.cloud_mock)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        expected_ips = [testutil.ip_address_mock(2)]
        self.assertEqual(expected_ips, current_cloud.private_ips)

    def test_missing_cloud_node_update(self):
        # Updating with None must leave the original cloud node in place.
        self.make_actor(1)
        self.node_actor.update_cloud_node(None)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        expected_ips = [testutil.ip_address_mock(1)]
        self.assertEqual(expected_ips, current_cloud.private_ips)

    def test_update_arvados_node(self):
        self.make_actor(3)
        job_uuid = 'zzzzz-jjjjj-updatejobnode00'
        new_arvados = testutil.arvados_node_mock(3, job_uuid)
        self.node_actor.update_arvados_node(new_arvados)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        self.assertEqual(job_uuid, current_arvados['job_uuid'])

    def test_missing_arvados_node_update(self):
        # Updating with None must leave the original record in place.
        self.make_actor(4, testutil.arvados_node_mock(4))
        self.node_actor.update_arvados_node(None)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        expected_ip = testutil.ip_address_mock(4)
        self.assertEqual(expected_ip, current_arvados['ip_address'])

    def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
        # The cloud node's 'testname' extra is set to a fixed name
        # before the update; exactly one sync_node call is expected.
        self.make_mocks(5)
        self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
        self.make_actor()
        arv_node = testutil.arvados_node_mock(5)
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(1, self.updates.sync_node.call_count)

    def test_update_arvados_node_skips_sync_when_fqdn_match(self):
        # The cloud node's 'testname' extra is built from the record's
        # own hostname and domain; no sync_node call is expected.
        self.make_mocks(6)
        arv_node = testutil.arvados_node_mock(6)
        matching_fqdn = '{n[hostname]}.{n[domain]}'.format(n=arv_node)
        self.cloud_mock.extra['testname'] = matching_fqdn
        self.make_actor()
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(0, self.updates.sync_node.call_count)