Merge branch '8953-node-manager-FSM' closes #8953
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch.py
1 #!/usr/bin/env python
2
3 from __future__ import absolute_import, print_function
4
5 import time
6 import unittest
7
8 import arvados.errors as arverror
9 import httplib2
10 import mock
11 import pykka
12 import threading
13
14 import arvnodeman.computenode.dispatch as dispatch
15 from . import testutil
16
class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
    """Exercise dispatch.ComputeNodeSetupActor against mocked clients."""

    def make_mocks(self, arvados_effect=None):
        """Create the mock timer, Arvados API client and cloud client.

        arvados_effect is installed as the side_effect of the mocked
        nodes create/update execute() calls.  It defaults to a list
        holding a single mock Arvados node record.
        """
        if arvados_effect is None:
            arvados_effect = [testutil.arvados_node_mock()]
        self.arvados_effect = arvados_effect
        self.timer = testutil.MockTimer()
        self.api_client = mock.MagicMock(name='api_client')
        self.api_client.nodes().create().execute.side_effect = arvados_effect
        self.api_client.nodes().update().execute.side_effect = arvados_effect
        self.cloud_client = mock.MagicMock(name='cloud_client')
        self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)

    def make_actor(self, arv_node=None):
        """Start a setup actor and keep its proxy in self.setup_actor.

        Mocks are built on demand when make_mocks() has not run yet; in
        that case a given arv_node is used as the only API response.
        """
        if not hasattr(self, 'timer'):
            self.make_mocks(arvados_effect=[arv_node] if arv_node else None)
        self.setup_actor = dispatch.ComputeNodeSetupActor.start(
            self.timer, self.api_client, self.cloud_client,
            testutil.MockSize(1), arv_node).proxy()

    def assert_node_properties_updated(self, uuid=None, size=None):
        """Assert the node record was updated with cloud size properties.

        uuid defaults to the uuid of the last configured API response.
        size defaults to a fresh testutil.MockSize(1), matching the size
        that make_actor() hands to the actor.
        """
        # Build the default per call rather than once at definition time,
        # so no size object is shared between tests.
        if size is None:
            size = testutil.MockSize(1)
        self.api_client.nodes().update.assert_any_call(
            uuid=(uuid or self.arvados_effect[-1]['uuid']),
            body={
                'properties': {
                    'cloud_node': {
                        'size': size.id,
                        'price': size.price}}})

    def test_creation_without_arvados_node(self):
        # With no Arvados node supplied, the actor creates one record and
        # then updates it once.
        self.make_actor()
        finished = threading.Event()
        self.setup_actor.subscribe(lambda _: finished.set())
        self.assertEqual(self.arvados_effect[-1],
                         self.setup_actor.arvados_node.get(self.TIMEOUT))
        # assertTrue (not a bare assert) so the check survives `python -O`.
        self.assertTrue(finished.wait(self.TIMEOUT))
        self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
        self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
        self.assert_node_properties_updated()
        self.assertEqual(self.cloud_client.create_node(),
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_creation_with_arvados_node(self):
        # With an existing Arvados node, the record is updated twice.
        self.make_mocks(arvados_effect=[testutil.arvados_node_mock()]*2)
        self.make_actor(testutil.arvados_node_mock())
        finished = threading.Event()
        self.setup_actor.subscribe(lambda _: finished.set())
        self.assertEqual(self.arvados_effect[-1],
                         self.setup_actor.arvados_node.get(self.TIMEOUT))
        # assertTrue (not a bare assert) so the check survives `python -O`.
        self.assertTrue(finished.wait(self.TIMEOUT))
        self.assert_node_properties_updated()
        self.assertEqual(2, self.api_client.nodes().update().execute.call_count)
        self.assertEqual(self.cloud_client.create_node(),
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_failed_arvados_calls_retried(self):
        # First API call raises a 500 ApiError, the second one succeeds.
        self.make_mocks([
                arverror.ApiError(httplib2.Response({'status': '500'}), ""),
                testutil.arvados_node_mock(),
                ])
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'arvados_node')

    def test_failed_cloud_calls_retried(self):
        # First create_node call raises, the second returns the mock node.
        self.make_mocks()
        self.cloud_client.create_node.side_effect = [
            Exception("test cloud creation error"),
            self.cloud_client.create_node.return_value,
            ]
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')

    def test_failed_post_create_retried(self):
        # post_create_node fails once, so it must be called exactly twice.
        self.make_mocks()
        self.cloud_client.post_create_node.side_effect = [
            Exception("test cloud post-create error"), None]
        self.make_actor()
        done = self.FUTURE_CLASS()
        self.setup_actor.subscribe(done.set)
        done.get(self.TIMEOUT)
        self.assertEqual(2, self.cloud_client.post_create_node.call_count)

    def test_stop_when_no_cloud_node(self):
        # A bare exception as side_effect makes every API call fail, so
        # no cloud node is ever assigned.
        self.make_mocks(
            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
        self.make_actor()
        self.assertTrue(
            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
        self.assertTrue(
            self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))

    def test_no_stop_when_cloud_node(self):
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.assertFalse(
            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
        self.assertTrue(self.stop_proxy(self.setup_actor),
                        "actor was stopped by stop_if_no_cloud_node")

    def test_subscribe(self):
        # Subscribe while API calls are still failing, then let them
        # succeed and check the subscriber is handed this actor.
        self.make_mocks(
            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.setup_actor.subscribe(subscriber)
        retry_resp = [testutil.arvados_node_mock()]
        self.api_client.nodes().create().execute.side_effect = retry_resp
        self.api_client.nodes().update().execute.side_effect = retry_resp
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
                         subscriber.call_args[0][0].actor_ref.actor_urn)

    def test_late_subscribe(self):
        # Subscribing after the cloud node is assigned still notifies.
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
        self.stop_proxy(self.setup_actor)
        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
                         subscriber.call_args[0][0].actor_ref.actor_urn)
137
138
class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
    """Fixtures and shared tests for shutdown actor test cases.

    The actor under test is started from self.ACTOR_CLASS, which the
    concrete test case class is expected to define.
    """

    def make_mocks(self, cloud_node=None, arvados_node=None,
                   shutdown_open=True, node_broken=False):
        """Prepare the mock timers, clients and node records.

        cloud_node defaults to testutil.cloud_node_mock().  shutdown_open
        is the initial state given to the mock shutdown timer, and
        node_broken is what the cloud client's broken() mock returns.
        """
        self.timer = testutil.MockTimer()
        shutdown_timer = testutil.MockShutdownTimer()
        shutdown_timer._set_state(shutdown_open, 300)
        self.shutdowns = shutdown_timer
        cloud_client = mock.MagicMock(name='cloud_client')
        cloud_client.broken.return_value = node_broken
        self.cloud_client = cloud_client
        self.arvados_client = mock.MagicMock(name='arvados_client')
        self.updates = mock.MagicMock(name='update_mock')
        self.cloud_node = (testutil.cloud_node_mock()
                           if cloud_node is None else cloud_node)
        self.arvados_node = arvados_node

    def make_actor(self, cancellable=True, start_time=None):
        """Start a monitor actor plus the shutdown actor under test.

        Proxies are stored in self.monitor_actor and self.shutdown_actor.
        Mocks are created on demand, and start_time defaults to now.
        """
        if not hasattr(self, 'timer'):
            self.make_mocks()
        boot_time = time.time() if start_time is None else start_time
        monitor_ref = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_node, boot_time, self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates,
            self.cloud_client, self.arvados_node)
        shutdown_ref = self.ACTOR_CLASS.start(
            self.timer, self.cloud_client, self.arvados_client, monitor_ref,
            cancellable)
        self.shutdown_actor = shutdown_ref.proxy()
        self.monitor_actor = monitor_ref.proxy()

    def check_success_flag(self, expected, allow_msg_count=1):
        """Fail unless the actor's success flag becomes `expected`.

        allow_msg_count is the number of internal messages that may need
        to be handled for shutdown to finish, so the flag is read up to
        1 + allow_msg_count times.
        """
        for _ in range(1 + allow_msg_count):
            last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
            if last_flag is expected:
                return
        self.fail("success flag {} is not {}".format(last_flag, expected))

    # The tests below accept and ignore extra positional arguments
    # (presumably mocks injected by patch decorators in subclasses --
    # TODO confirm).

    def test_uncancellable_shutdown(self, *mocks):
        # While the window is closed and destroy_node fails, the flag
        # stays None; once both flip, the shutdown completes.
        self.make_mocks(shutdown_open=False)
        destroy = self.cloud_client.destroy_node
        destroy.return_value = False
        self.make_actor(cancellable=False)
        self.check_success_flag(None, 0)
        self.shutdowns._set_state(True, 600)
        destroy.return_value = True
        self.check_success_flag(True)

    def test_arvados_node_cleaned_after_shutdown(self, *mocks):
        cloud_node = testutil.cloud_node_mock(62)
        arv_node = testutil.arvados_node_mock(62)
        self.make_mocks(cloud_node, arv_node)
        self.make_actor()
        self.check_success_flag(True, 3)
        update_mock = self.arvados_client.nodes().update
        self.assertTrue(update_mock.called)
        # Keyword arguments of the first recorded update() call.
        first_kwargs = update_mock.call_args_list[0][1]
        self.assertEqual(arv_node['uuid'], first_kwargs.get('uuid'))
        self.assertIn('body', first_kwargs)
        cleared_keys = ('slot_number', 'hostname', 'ip_address',
                        'first_ping_at', 'last_ping_at')
        for key in cleared_keys:
            self.assertIn(key, first_kwargs['body'])
            self.assertIsNone(first_kwargs['body'][key])
        self.assertTrue(update_mock().execute.called)

    def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
        cloud_node = testutil.cloud_node_mock(61)
        arv_node = testutil.arvados_node_mock(61)
        self.make_mocks(cloud_node, arv_node, shutdown_open=False)
        self.cloud_client.destroy_node.return_value = False
        self.make_actor(cancellable=True)
        self.shutdown_actor.cancel_shutdown("test")
        self.check_success_flag(False, 2)
        node_update = self.arvados_client.nodes().update
        self.assertFalse(node_update.called)
213
214
class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
                                       unittest.TestCase):
    """Run the shared shutdown tests, plus a few extras, against
    dispatch.ComputeNodeShutdownActor."""

    ACTOR_CLASS = dispatch.ComputeNodeShutdownActor

    def test_easy_shutdown(self):
        self.make_actor(start_time=0)
        self.check_success_flag(True)
        destroy = self.cloud_client.destroy_node
        self.assertTrue(destroy.called)

    def test_shutdown_retries_when_cloud_fails(self):
        # The flag stays None while destroy_node reports failure, and
        # turns True once it reports success.
        self.make_mocks()
        destroy = self.cloud_client.destroy_node
        destroy.return_value = False
        self.make_actor(start_time=0)
        first_flag = self.shutdown_actor.success.get(self.TIMEOUT)
        self.assertIsNone(first_flag)
        destroy.return_value = True
        self.check_success_flag(True)

    def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self):
        # On a broken node a failed destroy is not retried: the shutdown
        # is cancelled with the NODE_BROKEN reason.
        self.make_mocks(node_broken=True)
        self.cloud_client.destroy_node.return_value = False
        self.make_actor(start_time=0)
        self.check_success_flag(False, 2)
        self.assertEqual(1, self.cloud_client.destroy_node.call_count)
        reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
        self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN, reason)

    def test_late_subscribe(self):
        self.make_actor()
        late_subscriber = mock.Mock(name='subscriber_mock')
        self.shutdown_actor.subscribe(late_subscriber).get(self.TIMEOUT)
        self.stop_proxy(self.shutdown_actor)
        self.assertTrue(late_subscriber.called)
        notified = late_subscriber.call_args[0][0]
        self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
                         notified.actor_ref.actor_urn)
249
250
class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
                                     unittest.TestCase):
    """Tests for dispatch.ComputeNodeUpdateActor using a mocked driver."""

    def make_actor(self):
        """Start an update actor around a mock driver class and keep its
        proxy in self.updater."""
        self.driver = mock.MagicMock(name='driver_mock')
        actor_ref = dispatch.ComputeNodeUpdateActor.start(self.driver)
        self.updater = actor_ref.proxy()

    def test_node_sync(self):
        self.make_actor()
        cloud = testutil.cloud_node_mock()
        arvados = testutil.arvados_node_mock()
        self.updater.sync_node(cloud, arvados).get(self.TIMEOUT)
        self.driver().sync_node.assert_called_with(cloud, arvados)

    @testutil.no_sleep
    def test_node_sync_error(self):
        # The first two driver calls raise and the third returns True;
        # each sync_node request must still complete.
        self.make_actor()
        cloud = testutil.cloud_node_mock()
        arvados = testutil.arvados_node_mock()
        self.driver().sync_node.side_effect = (IOError, Exception, True)
        for _ in range(3):
            self.updater.sync_node(cloud, arvados).get(self.TIMEOUT)
        self.driver().sync_node.assert_called_with(cloud, arvados)
274
class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                      unittest.TestCase):
    """Tests for dispatch.ComputeNodeMonitorActor: state reporting,
    shutdown eligibility, Arvados pairing and record updates."""

    def make_mocks(self, node_num):
        """Create the mock timers, cloud node number node_num, the
        subscriber and the cloud client (broken() returns False)."""
        self.shutdowns = testutil.MockShutdownTimer()
        self.shutdowns._set_state(False, 300)
        self.timer = mock.MagicMock(name='timer_mock')
        self.updates = mock.MagicMock(name='update_mock')
        self.cloud_mock = testutil.cloud_node_mock(node_num)
        self.subscriber = mock.Mock(name='subscriber_mock')
        self.cloud_client = mock.MagicMock(name='cloud_client')
        self.cloud_client.broken.return_value = False

    def make_actor(self, node_num=1, arv_node=None, start_time=None):
        """Start a monitor actor and keep its proxy in self.node_actor.

        Mocks are created on demand for node_num, start_time defaults to
        now, and self.subscriber is subscribed before returning.
        """
        if not hasattr(self, 'cloud_mock'):
            self.make_mocks(node_num)
        if start_time is None:
            start_time = time.time()
        self.node_actor = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_mock, start_time, self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
            arv_node, boot_fail_after=300).proxy()
        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)

    def node_state(self, *states):
        """Return the actor's in_state() answer for the given states."""
        return self.node_actor.in_state(*states).get(self.TIMEOUT)

    def assert_shutdown_eligible(self, eligible, reason):
        """Assert shutdown_eligible() returns exactly (eligible, reason)."""
        self.assertEqual(
            self.node_actor.shutdown_eligible().get(self.TIMEOUT),
            (eligible, reason))

    def test_in_state_when_unpaired(self):
        self.make_actor()
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_state_when_pairing_stale(self):
        self.make_actor(arv_node=testutil.arvados_node_mock(
                job_uuid=None, age=90000))
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_state_when_no_state_available(self):
        self.make_actor(arv_node=testutil.arvados_node_mock(
                crunch_worker_state=None))
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_idle_state(self):
        self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
        self.assertTrue(self.node_state('idle'))
        self.assertFalse(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_in_busy_state(self):
        self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True))
        self.assertFalse(self.node_state('idle'))
        self.assertTrue(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_init_shutdown_scheduling(self):
        # 300 is the window time configured in make_mocks().
        self.make_actor()
        self.assertTrue(self.timer.schedule.called)
        self.assertEqual(300, self.timer.schedule.call_args[0][0])

    def test_shutdown_window_close_scheduling(self):
        self.make_actor()
        self.shutdowns._set_state(False, 600)
        self.timer.schedule.reset_mock()
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.stop_proxy(self.node_actor)
        self.assertTrue(self.timer.schedule.called)
        self.assertEqual(600, self.timer.schedule.call_args[0][0])
        self.assertFalse(self.subscriber.called)

    def test_shutdown_subscription(self):
        self.make_actor(start_time=0)
        self.shutdowns._set_state(True, 600)
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.assertTrue(self.subscriber.called)
        self.assertEqual(self.node_actor.actor_ref.actor_urn,
                         self.subscriber.call_args[0][0].actor_ref.actor_urn)

    def test_no_shutdown_booting(self):
        self.make_actor()
        self.shutdowns._set_state(True, 600)
        self.assert_shutdown_eligible(
            False,
            "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')")

    def test_shutdown_without_arvados_node(self):
        self.make_actor(start_time=0)
        self.shutdowns._set_state(True, 600)
        self.assert_shutdown_eligible(
            True,
            "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')")

    def test_shutdown_missing(self):
        arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                              crunch_worker_state="down",
                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
        self.make_actor(10, arv_node)
        self.shutdowns._set_state(True, 600)
        self.assert_shutdown_eligible(
            True,
            "node state is ('down', 'open', 'boot wait', 'idle exceeded')")

    def test_shutdown_running_broken(self):
        arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                              crunch_worker_state="down")
        self.make_actor(12, arv_node)
        self.shutdowns._set_state(True, 600)
        self.cloud_client.broken.return_value = True
        self.assert_shutdown_eligible(
            True,
            "node state is ('down', 'open', 'boot wait', 'idle exceeded')")

    def test_shutdown_missing_broken(self):
        arv_node = testutil.arvados_node_mock(11, job_uuid=None,
                                              crunch_worker_state="down",
                                              last_ping_at='1970-01-01T01:02:03.04050607Z')
        self.make_actor(11, arv_node)
        self.shutdowns._set_state(True, 600)
        self.cloud_client.broken.return_value = True
        self.assert_shutdown_eligible(
            True,
            "node state is ('down', 'open', 'boot wait', 'idle exceeded')")

    def test_no_shutdown_when_window_closed(self):
        # make_mocks() leaves the shutdown window closed.
        self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
        self.assert_shutdown_eligible(
            False,
            "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')")

    def test_no_shutdown_when_node_running_job(self):
        self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
        self.shutdowns._set_state(True, 600)
        self.assert_shutdown_eligible(
            False,
            "node state is ('busy', 'open', 'boot wait', 'idle exceeded')")

    def test_no_shutdown_when_node_state_unknown(self):
        self.make_actor(5, testutil.arvados_node_mock(
            5, crunch_worker_state=None))
        self.shutdowns._set_state(True, 600)
        self.assert_shutdown_eligible(
            False, "node is paired but crunch_worker_state is 'None'")

    def test_no_shutdown_when_node_state_stale(self):
        self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
        self.shutdowns._set_state(True, 600)
        self.assert_shutdown_eligible(False, "node state is stale")

    def test_arvados_node_match(self):
        self.make_actor(2)
        arv_node = testutil.arvados_node_mock(
            2, hostname='compute-two.zzzzz.arvadosapi.com')
        self.cloud_client.node_id.return_value = '2'
        pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
        self.assertEqual(self.cloud_mock.id, pair_id)
        self.stop_proxy(self.node_actor)
        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)

    def test_arvados_node_mismatch(self):
        self.make_actor(3)
        arv_node = testutil.arvados_node_mock(1)
        self.assertIsNone(
            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))

    def test_arvados_node_mismatch_first_ping_too_early(self):
        self.make_actor(4)
        arv_node = testutil.arvados_node_mock(
            4, first_ping_at='1971-03-02T14:15:16.1717282Z')
        self.assertIsNone(
            self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))

    def test_update_cloud_node(self):
        self.make_actor(1)
        # Rebuild the mocks as node 2, then give that record node 1's id
        # so it is offered as an update to the running actor.
        self.make_mocks(2)
        self.cloud_mock.id = '1'
        self.node_actor.update_cloud_node(self.cloud_mock)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        self.assertEqual([testutil.ip_address_mock(2)],
                         current_cloud.private_ips)

    def test_missing_cloud_node_update(self):
        # Updating with None must leave the existing cloud record alone.
        self.make_actor(1)
        self.node_actor.update_cloud_node(None)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        self.assertEqual([testutil.ip_address_mock(1)],
                         current_cloud.private_ips)

    def test_update_arvados_node(self):
        self.make_actor(3)
        job_uuid = 'zzzzz-jjjjj-updatejobnode00'
        new_arvados = testutil.arvados_node_mock(3, job_uuid)
        self.node_actor.update_arvados_node(new_arvados)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        self.assertEqual(job_uuid, current_arvados['job_uuid'])

    def test_missing_arvados_node_update(self):
        # Updating with None must leave the existing Arvados record alone.
        self.make_actor(4, testutil.arvados_node_mock(4))
        self.node_actor.update_arvados_node(None)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        self.assertEqual(testutil.ip_address_mock(4),
                         current_arvados['ip_address'])

    def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
        self.make_mocks(5)
        self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
        self.make_actor()
        arv_node = testutil.arvados_node_mock(5)
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(1, self.updates.sync_node.call_count)

    def test_update_arvados_node_skips_sync_when_fqdn_match(self):
        self.make_mocks(6)
        arv_node = testutil.arvados_node_mock(6)
        self.cloud_mock.extra['testname'] = '{n[hostname]}.{n[domain]}'.format(
            n=arv_node)
        self.make_actor()
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(0, self.updates.sync_node.call_count)