Merge branch 'master' into 7490-datamanager-dont-die-return-error
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch.py
1 #!/usr/bin/env python
2
3 from __future__ import absolute_import, print_function
4
5 import time
6 import unittest
7
8 import arvados.errors as arverror
9 import httplib2
10 import mock
11 import pykka
12
13 import arvnodeman.computenode.dispatch as dispatch
14 from . import testutil
15
class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
    """Exercise dispatch.ComputeNodeSetupActor against mocked clients.

    The Arvados API client and the cloud client are both MagicMocks, so
    each test decides what the node create/update calls return or raise.
    """

    def make_mocks(self, arvados_effect=None):
        """Create the timer, API client and cloud client mocks.

        arvados_effect is installed as the side_effect of both the
        nodes().create().execute and nodes().update().execute mocks.
        When it is None, a list holding one mock Arvados node is used.
        """
        if arvados_effect is None:
            arvados_effect = [testutil.arvados_node_mock()]
        self.arvados_effect = arvados_effect
        self.timer = testutil.MockTimer()

        api_client = mock.MagicMock(name='api_client')
        api_client.nodes().create().execute.side_effect = arvados_effect
        api_client.nodes().update().execute.side_effect = arvados_effect
        self.api_client = api_client

        cloud_client = mock.MagicMock(name='cloud_client')
        cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
        self.cloud_client = cloud_client

    def make_actor(self, arv_node=None):
        """Start a ComputeNodeSetupActor and keep its proxy in setup_actor.

        Mocks are created on demand (detected through the 'timer'
        attribute); in that case a truthy arv_node becomes the only
        Arvados API response.
        """
        mocks_ready = hasattr(self, 'timer')
        if not mocks_ready:
            effect = [arv_node] if arv_node else None
            self.make_mocks(arvados_effect=effect)
        actor_ref = dispatch.ComputeNodeSetupActor.start(
            self.timer, self.api_client, self.cloud_client,
            testutil.MockSize(1), arv_node)
        self.setup_actor = actor_ref.proxy()

    def assert_node_properties_updated(self, uuid=None,
                                       size=testutil.MockSize(1)):
        """Assert some nodes().update call recorded the cloud size and price.

        uuid defaults to the uuid of the last entry in arvados_effect.
        """
        expected_uuid = uuid or self.arvados_effect[-1]['uuid']
        cloud_props = {'size': size.id, 'price': size.price}
        self.api_client.nodes().update.assert_any_call(
            uuid=expected_uuid,
            body={'properties': {'cloud_node': cloud_props}})

    def test_creation_without_arvados_node(self):
        # With no Arvados node supplied, the actor is expected to make
        # exactly one create call and one update call.
        self.make_actor()
        arvados_node = self.setup_actor.arvados_node.get(self.TIMEOUT)
        self.assertEqual(self.arvados_effect[-1], arvados_node)
        create_execute = self.api_client.nodes().create().execute
        self.assertEqual(1, create_execute.call_count)
        update_execute = self.api_client.nodes().update().execute
        self.assertEqual(1, update_execute.call_count)
        self.assert_node_properties_updated()
        expected_cloud = self.cloud_client.create_node()
        self.assertEqual(expected_cloud,
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_creation_with_arvados_node(self):
        # The same mock node object is queued twice as the API response;
        # the test expects two update executions in this scenario.
        queued_node = testutil.arvados_node_mock()
        self.make_mocks(arvados_effect=[queued_node, queued_node])
        self.make_actor(testutil.arvados_node_mock())
        arvados_node = self.setup_actor.arvados_node.get(self.TIMEOUT)
        self.assertEqual(self.arvados_effect[-1], arvados_node)
        self.assert_node_properties_updated()
        update_execute = self.api_client.nodes().update().execute
        self.assertEqual(2, update_execute.call_count)
        expected_cloud = self.cloud_client.create_node()
        self.assertEqual(expected_cloud,
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_failed_arvados_calls_retried(self):
        # First API call raises a 500 ApiError, the second one succeeds.
        api_error = arverror.ApiError(httplib2.Response({'status': '500'}), "")
        responses = [
            api_error,
            testutil.arvados_node_mock(),
        ]
        self.make_mocks(responses)
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'arvados_node')

    def test_failed_cloud_calls_retried(self):
        # create_node raises once, then yields the normal mock cloud node.
        self.make_mocks()
        good_node = self.cloud_client.create_node.return_value
        self.cloud_client.create_node.side_effect = [
            Exception("test cloud creation error"),
            good_node,
        ]
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')

    def test_failed_post_create_retried(self):
        # post_create_node raises once and then returns None; the actor
        # should notify its subscriber after a second attempt.
        self.make_mocks()
        post_create_results = [
            Exception("test cloud post-create error"), None]
        self.cloud_client.post_create_node.side_effect = post_create_results
        self.make_actor()
        finished = self.FUTURE_CLASS()
        self.setup_actor.subscribe(finished.set)
        finished.get(self.TIMEOUT)
        self.assertEqual(2, self.cloud_client.post_create_node.call_count)

    def test_stop_when_no_cloud_node(self):
        # An exception instance as side_effect makes every API call
        # raise, so setup never gets as far as producing a cloud node.
        api_error = arverror.ApiError(httplib2.Response({'status': '500'}), "")
        self.make_mocks(api_error)
        self.make_actor()
        stopped = self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)
        self.assertTrue(stopped)
        actor_stopped = self.setup_actor.actor_ref.actor_stopped
        self.assertTrue(actor_stopped.wait(self.TIMEOUT))

    def test_no_stop_when_cloud_node(self):
        # Once a cloud node exists, stop_if_no_cloud_node must decline.
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        stopped = self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)
        self.assertFalse(stopped)
        self.assertTrue(self.stop_proxy(self.setup_actor),
                        "actor was stopped by stop_if_no_cloud_node")

    def test_subscribe(self):
        # The API starts out failing; after subscribing, the mocks are
        # switched to a successful response so setup can complete.
        api_error = arverror.ApiError(httplib2.Response({'status': '500'}), "")
        self.make_mocks(api_error)
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.setup_actor.subscribe(subscriber)
        retry_resp = [testutil.arvados_node_mock()]
        self.api_client.nodes().create().execute.side_effect = retry_resp
        self.api_client.nodes().update().execute.side_effect = retry_resp
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        expected_urn = self.setup_actor.actor_ref.actor_urn
        notified_with = subscriber.call_args[0][0]
        self.assertEqual(expected_urn, notified_with.actor_ref.actor_urn)

    def test_late_subscribe(self):
        # Subscribing after the cloud node is assigned must still
        # deliver a notification carrying this actor.
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
        self.stop_proxy(self.setup_actor)
        expected_urn = self.setup_actor.actor_ref.actor_urn
        notified_with = subscriber.call_args[0][0]
        self.assertEqual(expected_urn, notified_with.actor_ref.actor_urn)
130
131
class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
    """Shared fixtures and tests for shutdown actor implementations.

    Concrete test cases set ACTOR_CLASS to the shutdown actor class that
    make_actor should start.
    """

    def make_mocks(self, cloud_node=None, arvados_node=None,
                   shutdown_open=True, node_broken=False):
        """Create the timers, clients and node records used by a test.

        shutdown_open is passed to the mock shutdown timer together with
        300; node_broken is what cloud_client.broken() returns.  A mock
        cloud node is created when cloud_node is None.
        """
        if cloud_node is None:
            cloud_node = testutil.cloud_node_mock()
        self.cloud_node = cloud_node
        self.arvados_node = arvados_node

        self.timer = testutil.MockTimer()
        shutdown_timer = testutil.MockShutdownTimer()
        shutdown_timer._set_state(shutdown_open, 300)
        self.shutdowns = shutdown_timer

        cloud_client = mock.MagicMock(name='cloud_client')
        cloud_client.broken.return_value = node_broken
        self.cloud_client = cloud_client
        self.arvados_client = mock.MagicMock(name='arvados_client')
        self.updates = mock.MagicMock(name='update_mock')

    def make_actor(self, cancellable=True, start_time=None):
        """Start a monitor actor plus an ACTOR_CLASS shutdown actor for it.

        Mocks are created on demand.  start_time defaults to the current
        time.  Proxies are stored in shutdown_actor and monitor_actor.
        """
        if not hasattr(self, 'timer'):
            self.make_mocks()
        if start_time is None:
            start_time = time.time()
        monitor_ref = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_node, start_time, self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates,
            self.cloud_client, self.arvados_node)
        shutdown_ref = self.ACTOR_CLASS.start(
            self.timer, self.cloud_client, self.arvados_client,
            monitor_ref, cancellable)
        self.shutdown_actor = shutdown_ref.proxy()
        self.monitor_actor = monitor_ref.proxy()

    def check_success_flag(self, expected, allow_msg_count=1):
        """Poll the actor's success flag until it is `expected`, or fail.

        allow_msg_count is the number of internal messages that may
        need to be handled for shutdown to finish, so the flag is read
        up to 1 + allow_msg_count times.
        """
        for _ in range(1 + allow_msg_count):
            flag = self.shutdown_actor.success.get(self.TIMEOUT)
            if flag is expected:
                return
        self.fail("success flag {} is not {}".format(flag, expected))

    def test_uncancellable_shutdown(self, *mocks):
        # With the window closed and destroy_node returning False, an
        # uncancellable shutdown stays undecided (success is None) until
        # the window opens and destroy_node starts returning True.
        self.make_mocks(shutdown_open=False)
        self.cloud_client.destroy_node.return_value = False
        self.make_actor(cancellable=False)
        self.check_success_flag(None, 0)
        self.shutdowns._set_state(True, 600)
        self.cloud_client.destroy_node.return_value = True
        self.check_success_flag(True)

    def test_arvados_node_cleaned_after_shutdown(self, *mocks):
        # After a successful shutdown the first nodes().update call must
        # target the paired Arvados node and null out these fields.
        cloud_node = testutil.cloud_node_mock(62)
        arv_node = testutil.arvados_node_mock(62)
        self.make_mocks(cloud_node, arv_node)
        self.make_actor()
        self.check_success_flag(True, 3)
        update_mock = self.arvados_client.nodes().update
        self.assertTrue(update_mock.called)
        first_call_kwargs = update_mock.call_args_list[0][1]
        self.assertEqual(arv_node['uuid'], first_call_kwargs.get('uuid'))
        self.assertIn('body', first_call_kwargs)
        cleared_fields = ('slot_number', 'hostname', 'ip_address',
                          'first_ping_at', 'last_ping_at')
        for field in cleared_fields:
            self.assertIn(field, first_call_kwargs['body'])
            self.assertIsNone(first_call_kwargs['body'][field])
        self.assertTrue(update_mock().execute.called)

    def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
        # A cancelled shutdown must leave the Arvados node record alone.
        cloud_node = testutil.cloud_node_mock(61)
        arv_node = testutil.arvados_node_mock(61)
        self.make_mocks(cloud_node, arv_node, shutdown_open=False)
        self.make_actor(cancellable=True)
        self.check_success_flag(False, 2)
        update_mock = self.arvados_client.nodes().update
        self.assertFalse(update_mock.called)
204
205
class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
                                       unittest.TestCase):
    """Run the shared shutdown tests, plus extras, on ComputeNodeShutdownActor."""

    ACTOR_CLASS = dispatch.ComputeNodeShutdownActor

    def test_easy_shutdown(self):
        # Default mocks: the shutdown window is open and the node was
        # started at time 0, so the node should simply be destroyed.
        self.make_actor(start_time=0)
        self.check_success_flag(True)
        destroy_node = self.cloud_client.destroy_node
        self.assertTrue(destroy_node.called)

    def test_shutdown_cancelled_when_window_closes(self):
        # A closed window cancels the shutdown without touching the
        # cloud, and the reason is reported as WINDOW_CLOSED.
        self.make_mocks(shutdown_open=False)
        self.make_actor()
        self.check_success_flag(False, 2)
        self.assertFalse(self.cloud_client.destroy_node.called)
        reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
        self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED, reason)

    def test_shutdown_retries_when_cloud_fails(self):
        # While destroy_node returns False the outcome stays undecided;
        # once it returns True the shutdown succeeds.
        self.make_mocks()
        self.cloud_client.destroy_node.return_value = False
        self.make_actor(start_time=0)
        pending_flag = self.shutdown_actor.success.get(self.TIMEOUT)
        self.assertIsNone(pending_flag)
        self.cloud_client.destroy_node.return_value = True
        self.check_success_flag(True)

    def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self):
        # For a node the cloud client reports as broken, one failed
        # destroy attempt cancels the shutdown with NODE_BROKEN.
        self.make_mocks(node_broken=True)
        self.cloud_client.destroy_node.return_value = False
        self.make_actor(start_time=0)
        self.check_success_flag(False, 2)
        self.assertEqual(1, self.cloud_client.destroy_node.call_count)
        reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
        self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN, reason)

    def test_late_subscribe(self):
        # A subscriber added after the actor was started is still
        # notified, and receives a reference to this shutdown actor.
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
        self.stop_proxy(self.shutdown_actor)
        self.assertTrue(subscriber.called)
        expected_urn = self.shutdown_actor.actor_ref.actor_urn
        notified_with = subscriber.call_args[0][0]
        self.assertEqual(expected_urn, notified_with.actor_ref.actor_urn)
248
249
class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
                                     unittest.TestCase):
    """Check that ComputeNodeUpdateActor forwards work to its driver."""

    def make_actor(self):
        """Start an update actor around a MagicMock driver factory."""
        self.driver = mock.MagicMock(name='driver_mock')
        actor_ref = dispatch.ComputeNodeUpdateActor.start(self.driver)
        self.updater = actor_ref.proxy()

    def test_node_sync(self):
        # sync_node on the actor must reach sync_node on the object the
        # driver mock returns when called, with the same two records.
        self.make_actor()
        cloud_node = testutil.cloud_node_mock()
        arv_node = testutil.arvados_node_mock()
        sync_future = self.updater.sync_node(cloud_node, arv_node)
        sync_future.get(self.TIMEOUT)
        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
262
263
class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                      unittest.TestCase):
    """Tests for dispatch.ComputeNodeMonitorActor state and shutdown logic."""

    def make_mocks(self, node_num):
        """Create fixtures for cloud node number `node_num`.

        The mock shutdown timer starts with state (False, 300) and the
        cloud client reports the node as not broken.
        """
        shutdown_timer = testutil.MockShutdownTimer()
        shutdown_timer._set_state(False, 300)
        self.shutdowns = shutdown_timer
        self.timer = mock.MagicMock(name='timer_mock')
        self.updates = mock.MagicMock(name='update_mock')
        self.cloud_mock = testutil.cloud_node_mock(node_num)
        self.subscriber = mock.Mock(name='subscriber_mock')
        cloud_client = mock.MagicMock(name='cloud_client')
        cloud_client.broken.return_value = False
        self.cloud_client = cloud_client

    def make_actor(self, node_num=1, arv_node=None, start_time=None):
        """Start the monitor actor and subscribe the mock subscriber.

        Mocks are created on demand (detected through 'cloud_mock').
        start_time defaults to the current time.  The actor is started
        with boot_fail_after=300.
        """
        if not hasattr(self, 'cloud_mock'):
            self.make_mocks(node_num)
        if start_time is None:
            start_time = time.time()
        actor_ref = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_mock, start_time, self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates,
            self.cloud_client, arv_node, boot_fail_after=300)
        self.node_actor = actor_ref.proxy()
        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)

    def node_state(self, *states):
        """Return the actor's in_state(*states) answer."""
        state_future = self.node_actor.in_state(*states)
        return state_future.get(self.TIMEOUT)

    def test_in_state_when_unpaired(self):
        # No Arvados node paired: the state query has no answer.
        self.make_actor()
        answer = self.node_state('idle', 'busy')
        self.assertIsNone(answer)

    def test_in_state_when_pairing_stale(self):
        # Arvados record created with age=90000: no answer either.
        stale_node = testutil.arvados_node_mock(job_uuid=None, age=90000)
        self.make_actor(arv_node=stale_node)
        answer = self.node_state('idle', 'busy')
        self.assertIsNone(answer)

    def test_in_state_when_no_state_available(self):
        # Arvados record carries crunch_worker_state=None.
        stateless_node = testutil.arvados_node_mock(crunch_worker_state=None)
        self.make_actor(arv_node=stateless_node)
        answer = self.node_state('idle', 'busy')
        self.assertIsNone(answer)

    def test_in_idle_state(self):
        idle_node = testutil.arvados_node_mock(job_uuid=None)
        self.make_actor(2, arv_node=idle_node)
        self.assertTrue(self.node_state('idle'))
        self.assertFalse(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_in_busy_state(self):
        busy_node = testutil.arvados_node_mock(job_uuid=True)
        self.make_actor(3, arv_node=busy_node)
        self.assertFalse(self.node_state('idle'))
        self.assertTrue(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_init_shutdown_scheduling(self):
        # Starting the actor schedules a timer call whose first argument
        # is the 300 configured on the mock shutdown timer.
        self.make_actor()
        schedule = self.timer.schedule
        self.assertTrue(schedule.called)
        self.assertEqual(300, schedule.call_args[0][0])

    def test_shutdown_window_close_scheduling(self):
        # With the window still closed, considering shutdown only
        # reschedules (now for 600) and does not notify the subscriber.
        self.make_actor()
        self.shutdowns._set_state(False, 600)
        self.timer.schedule.reset_mock()
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.stop_proxy(self.node_actor)
        schedule = self.timer.schedule
        self.assertTrue(schedule.called)
        self.assertEqual(600, schedule.call_args[0][0])
        self.assertFalse(self.subscriber.called)

    def test_shutdown_subscription(self):
        # An eligible node notifies the subscriber with itself.
        self.make_actor(start_time=0)
        self.shutdowns._set_state(True, 600)
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.assertTrue(self.subscriber.called)
        expected_urn = self.node_actor.actor_ref.actor_urn
        notified_with = self.subscriber.call_args[0][0]
        self.assertEqual(expected_urn, notified_with.actor_ref.actor_urn)

    def test_no_shutdown_booting(self):
        # Unpaired node whose start_time is "now": not eligible even
        # though the shutdown window is open.
        self.make_actor()
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_shutdown_without_arvados_node(self):
        # Unpaired node started at time 0: eligible once the window opens.
        self.make_actor(start_time=0)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertTrue(eligible)

    def test_no_shutdown_missing(self):
        # State "down" with a 1970 last ping, while the cloud client
        # says the node is not broken: not eligible.
        arv_node = testutil.arvados_node_mock(
            10, job_uuid=None,
            crunch_worker_state="down",
            last_ping_at='1970-01-01T01:02:03.04050607Z')
        self.make_actor(10, arv_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_running_broken(self):
        # Cloud client reports broken, but the record uses the default
        # last_ping_at from arvados_node_mock: not eligible.
        arv_node = testutil.arvados_node_mock(
            12, job_uuid=None,
            crunch_worker_state="down")
        self.make_actor(12, arv_node)
        self.shutdowns._set_state(True, 600)
        self.cloud_client.broken.return_value = True
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_shutdown_missing_broken(self):
        # Both conditions together (1970 last ping and a cloud client
        # that reports broken) make the node eligible.
        arv_node = testutil.arvados_node_mock(
            11, job_uuid=None,
            crunch_worker_state="down",
            last_ping_at='1970-01-01T01:02:03.04050607Z')
        self.make_actor(11, arv_node)
        self.shutdowns._set_state(True, 600)
        self.cloud_client.broken.return_value = True
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertTrue(eligible)

    def test_no_shutdown_when_window_closed(self):
        # The window is left in its initial closed state from make_mocks.
        idle_node = testutil.arvados_node_mock(3, job_uuid=None)
        self.make_actor(3, idle_node)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_when_node_running_job(self):
        busy_node = testutil.arvados_node_mock(4, job_uuid=True)
        self.make_actor(4, busy_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_when_node_state_unknown(self):
        stateless_node = testutil.arvados_node_mock(
            5, crunch_worker_state=None)
        self.make_actor(5, stateless_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_no_shutdown_when_node_state_stale(self):
        stale_node = testutil.arvados_node_mock(6, age=90000)
        self.make_actor(6, stale_node)
        self.shutdowns._set_state(True, 600)
        eligible = self.node_actor.shutdown_eligible().get(self.TIMEOUT)
        self.assertFalse(eligible)

    def test_arvados_node_match(self):
        # A matching offer returns the cloud node id and triggers a sync
        # of the cloud node with the offered Arvados record.
        self.make_actor(2)
        arv_node = testutil.arvados_node_mock(
            2, hostname='compute-two.zzzzz.arvadosapi.com')
        pair_future = self.node_actor.offer_arvados_pair(arv_node)
        pair_id = pair_future.get(self.TIMEOUT)
        self.assertEqual(self.cloud_mock.id, pair_id)
        self.stop_proxy(self.node_actor)
        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)

    def test_arvados_node_mismatch(self):
        # Cloud node 3 is offered the Arvados record for node 1.
        self.make_actor(3)
        arv_node = testutil.arvados_node_mock(1)
        pair_future = self.node_actor.offer_arvados_pair(arv_node)
        self.assertIsNone(pair_future.get(self.TIMEOUT))

    def test_arvados_node_mismatch_first_ping_too_early(self):
        # Same node number, but the record's first ping is from 1971.
        self.make_actor(4)
        arv_node = testutil.arvados_node_mock(
            4, first_ping_at='1971-03-02T14:15:16.1717282Z')
        pair_future = self.node_actor.offer_arvados_pair(arv_node)
        self.assertIsNone(pair_future.get(self.TIMEOUT))

    def test_update_cloud_node(self):
        # Rebuild the mocks for node 2 after the actor exists, give the
        # new cloud record id '1', and feed it to the running actor.
        self.make_actor(1)
        self.make_mocks(2)
        self.cloud_mock.id = '1'
        self.node_actor.update_cloud_node(self.cloud_mock)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        expected_ips = [testutil.ip_address_mock(2)]
        self.assertEqual(expected_ips, current_cloud.private_ips)

    def test_missing_cloud_node_update(self):
        # Updating with None keeps the existing cloud node record.
        self.make_actor(1)
        self.node_actor.update_cloud_node(None)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        expected_ips = [testutil.ip_address_mock(1)]
        self.assertEqual(expected_ips, current_cloud.private_ips)

    def test_update_arvados_node(self):
        self.make_actor(3)
        job_uuid = 'zzzzz-jjjjj-updatejobnode00'
        new_arvados = testutil.arvados_node_mock(3, job_uuid)
        self.node_actor.update_arvados_node(new_arvados)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        self.assertEqual(job_uuid, current_arvados['job_uuid'])

    def test_missing_arvados_node_update(self):
        # Updating with None keeps the existing Arvados node record.
        original_node = testutil.arvados_node_mock(4)
        self.make_actor(4, original_node)
        self.node_actor.update_arvados_node(None)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        expected_ip = testutil.ip_address_mock(4)
        self.assertEqual(expected_ip, current_arvados['ip_address'])

    def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
        # The cloud record's 'testname' extra differs from the Arvados
        # record, so exactly one sync is expected.
        self.make_mocks(5)
        self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
        self.make_actor()
        arv_node = testutil.arvados_node_mock(5)
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(1, self.updates.sync_node.call_count)

    def test_update_arvados_node_skips_sync_when_fqdn_match(self):
        # The 'testname' extra is built from the Arvados record's own
        # hostname and domain, so no sync is expected.
        self.make_mocks(6)
        arv_node = testutil.arvados_node_mock(6)
        matching_fqdn = '{n[hostname]}.{n[domain]}'.format(n=arv_node)
        self.cloud_mock.extra['testname'] = matching_fqdn
        self.make_actor()
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(0, self.updates.sync_node.call_count)