2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: AGPL-3.0
6 from __future__ import absolute_import, print_function
11 import arvados.errors as arverror
17 from libcloud.common.exceptions import BaseHTTPError
19 import arvnodeman.computenode.dispatch as dispatch
20 from arvnodeman.computenode.driver import BaseComputeNodeDriver
21 from . import testutil
23 class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
def make_mocks(self, arvados_effect=None):
    """Create the timer, API client and cloud client mocks for a test.

    arvados_effect is installed as the side_effect of both
    nodes().create().execute and nodes().update().execute on the mock
    API client.  When omitted, it is a one-element list holding
    testutil.arvados_node_mock().
    """
    effects = ([testutil.arvados_node_mock()]
               if arvados_effect is None else arvados_effect)
    self.arvados_effect = effects
    self.timer = testutil.MockTimer()

    api_client = mock.MagicMock(name='api_client')
    # nodes() is deliberately called once per verb, exactly like writing
    # the two assignments out by hand.
    for verb in ('create', 'update'):
        getattr(api_client.nodes(), verb)().execute.side_effect = effects
    self.api_client = api_client

    cloud_client = mock.MagicMock(name='cloud_client')
    cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
    self.cloud_client = cloud_client
def make_actor(self, arv_node=None):
    """Start a ComputeNodeSetupActor and keep its proxy in self.setup_actor.

    If make_mocks() has not run yet (no self.timer), it is called first,
    with [arv_node] as the Arvados responses when arv_node is truthy.
    """
    if not hasattr(self, 'timer'):
        effect = [arv_node] if arv_node else None
        self.make_mocks(arvados_effect=effect)
    actor_ref = dispatch.ComputeNodeSetupActor.start(
        self.timer, self.api_client, self.cloud_client,
        testutil.MockSize(1), arv_node)
    self.setup_actor = actor_ref.proxy()
42 def assert_node_properties_updated(self, uuid=None,
43 size=testutil.MockSize(1)):
44 self.api_client.nodes().update.assert_any_call(
45 uuid=(uuid or self.arvados_effect[-1]['uuid']),
50 'price': size.price}}})
52 def test_creation_without_arvados_node(self):
54 finished = threading.Event()
55 self.setup_actor.subscribe(lambda _: finished.set())
56 self.assertEqual(self.arvados_effect[-1],
57 self.setup_actor.arvados_node.get(self.TIMEOUT))
58 assert(finished.wait(self.TIMEOUT))
59 self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
60 self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
61 self.assert_node_properties_updated()
62 self.assertEqual(self.cloud_client.create_node(),
63 self.setup_actor.cloud_node.get(self.TIMEOUT))
def test_creation_with_arvados_node(self):
    # Set up a node against an existing Arvados node record.  Two
    # update() responses are queued; the test expects both to be
    # consumed and the last one to end up as the actor's arvados_node.
    self.make_mocks(arvados_effect=[testutil.arvados_node_mock()]*2)
    self.make_actor(testutil.arvados_node_mock())
    finished = threading.Event()
    self.setup_actor.subscribe(lambda _: finished.set())
    self.assertEqual(self.arvados_effect[-1],
                     self.setup_actor.arvados_node.get(self.TIMEOUT))
    # Use a unittest assertion rather than a bare assert so the check
    # still runs under "python -O" and reports a useful message.
    self.assertTrue(finished.wait(self.TIMEOUT),
                    "setup actor did not notify its subscriber in time")
    self.assert_node_properties_updated()
    self.assertEqual(2, self.api_client.nodes().update().execute.call_count)
    self.assertEqual(self.cloud_client.create_node(),
                     self.setup_actor.cloud_node.get(self.TIMEOUT))
78 def test_failed_arvados_calls_retried(self):
80 arverror.ApiError(httplib2.Response({'status': '500'}), ""),
81 testutil.arvados_node_mock(),
84 self.wait_for_assignment(self.setup_actor, 'arvados_node')
86 def test_failed_cloud_calls_retried(self):
88 self.cloud_client.create_node.side_effect = [
89 Exception("test cloud creation error"),
90 self.cloud_client.create_node.return_value,
93 self.wait_for_assignment(self.setup_actor, 'cloud_node')
95 def test_basehttperror_retried(self):
97 self.cloud_client.create_node.side_effect = [
98 BaseHTTPError(500, "Try again"),
99 self.cloud_client.create_node.return_value,
102 self.wait_for_assignment(self.setup_actor, 'cloud_node')
103 self.assertEqual(1, self.cloud_client.post_create_node.call_count)
105 def test_instance_exceeded_not_retried(self):
107 self.cloud_client.create_node.side_effect = [
108 BaseHTTPError(400, "InstanceLimitExceeded"),
109 self.cloud_client.create_node.return_value,
112 done = self.FUTURE_CLASS()
113 self.setup_actor.subscribe(done.set)
114 done.get(self.TIMEOUT)
115 self.assertEqual(0, self.cloud_client.post_create_node.call_count)
117 def test_failed_post_create_retried(self):
119 self.cloud_client.post_create_node.side_effect = [
120 Exception("test cloud post-create error"), None]
122 done = self.FUTURE_CLASS()
123 self.setup_actor.subscribe(done.set)
124 done.get(self.TIMEOUT)
125 self.assertEqual(2, self.cloud_client.post_create_node.call_count)
127 def test_stop_when_no_cloud_node(self):
129 arverror.ApiError(httplib2.Response({'status': '500'}), ""))
132 self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
134 self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
136 def test_no_stop_when_cloud_node(self):
138 self.wait_for_assignment(self.setup_actor, 'cloud_node')
140 self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
141 self.assertTrue(self.stop_proxy(self.setup_actor),
142 "actor was stopped by stop_if_no_cloud_node")
144 def test_subscribe(self):
146 arverror.ApiError(httplib2.Response({'status': '500'}), ""))
148 subscriber = mock.Mock(name='subscriber_mock')
149 self.setup_actor.subscribe(subscriber)
150 retry_resp = [testutil.arvados_node_mock()]
151 self.api_client.nodes().create().execute.side_effect = retry_resp
152 self.api_client.nodes().update().execute.side_effect = retry_resp
153 self.wait_for_assignment(self.setup_actor, 'cloud_node')
154 self.assertEqual(self.setup_actor.actor_ref.actor_urn,
155 subscriber.call_args[0][0].actor_ref.actor_urn)
157 def test_late_subscribe(self):
159 subscriber = mock.Mock(name='subscriber_mock')
160 self.wait_for_assignment(self.setup_actor, 'cloud_node')
161 self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
162 self.stop_proxy(self.setup_actor)
163 self.assertEqual(self.setup_actor.actor_ref.actor_urn,
164 subscriber.call_args[0][0].actor_ref.actor_urn)
167 class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
def make_mocks(self, cloud_node=None, arvados_node=None,
               shutdown_open=True, node_broken=False):
    """Create the mocks shared by the shutdown-actor tests.

    cloud_node defaults to testutil.cloud_node_mock(); arvados_node is
    stored as given.  shutdown_open is passed to the mock shutdown
    timer's _set_state() together with 300, and node_broken becomes the
    return value of cloud_client.broken().
    """
    self.timer = testutil.MockTimer()

    shutdown_timer = testutil.MockShutdownTimer()
    shutdown_timer._set_state(shutdown_open, 300)
    self.shutdowns = shutdown_timer

    cloud_client = mock.MagicMock(name='cloud_client')
    cloud_client.broken.return_value = node_broken
    self.cloud_client = cloud_client

    self.arvados_client = mock.MagicMock(name='arvados_client')
    self.updates = mock.MagicMock(name='update_mock')
    self.cloud_node = (testutil.cloud_node_mock()
                       if cloud_node is None else cloud_node)
    self.arvados_node = arvados_node
182 def make_actor(self, cancellable=True, start_time=None):
183 if not hasattr(self, 'timer'):
185 if start_time is None:
186 start_time = time.time()
187 monitor_actor = dispatch.ComputeNodeMonitorActor.start(
188 self.cloud_node, start_time, self.shutdowns,
189 testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
191 self.shutdown_actor = self.ACTOR_CLASS.start(
192 self.timer, self.cloud_client, self.arvados_client, monitor_actor,
194 self.monitor_actor = monitor_actor.proxy()
196 def check_success_flag(self, expected, allow_msg_count=1):
197 # allow_msg_count is the number of internal messages that may
198 # need to be handled for shutdown to finish.
199 for try_num in range(1 + allow_msg_count):
200 last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
201 if last_flag is expected:
204 self.fail("success flag {} is not {}".format(last_flag, expected))
def test_cancellable_shutdown(self, *mocks):
    # With a "busy" Arvados node and a cancellable shutdown, the success
    # flag is expected to be False and destroy_node must not be called.
    busy_node = testutil.arvados_node_mock(crunch_worker_state="busy")
    self.make_mocks(shutdown_open=True, arvados_node=busy_node)
    self.cloud_client.destroy_node.return_value = True
    self.make_actor(cancellable=True)
    self.check_success_flag(False)
    destroy_node = self.cloud_client.destroy_node
    self.assertFalse(destroy_node.called)
def test_uncancellable_shutdown(self, *mocks):
    # With a "busy" Arvados node but a non-cancellable shutdown, the
    # success flag is expected to be True and destroy_node to be called.
    busy_node = testutil.arvados_node_mock(crunch_worker_state="busy")
    self.make_mocks(shutdown_open=True, arvados_node=busy_node)
    self.cloud_client.destroy_node.return_value = True
    self.make_actor(cancellable=False)
    self.check_success_flag(True, 2)
    destroy_node = self.cloud_client.destroy_node
    self.assertTrue(destroy_node.called)
220 def test_arvados_node_cleaned_after_shutdown(self, *mocks):
221 cloud_node = testutil.cloud_node_mock(62)
222 arv_node = testutil.arvados_node_mock(62)
223 self.make_mocks(cloud_node, arv_node)
225 self.check_success_flag(True, 3)
226 update_mock = self.arvados_client.nodes().update
227 self.assertTrue(update_mock.called)
228 update_kwargs = update_mock.call_args_list[0][1]
229 self.assertEqual(arv_node['uuid'], update_kwargs.get('uuid'))
230 self.assertIn('body', update_kwargs)
231 for clear_key in ['slot_number', 'hostname', 'ip_address',
232 'first_ping_at', 'last_ping_at']:
233 self.assertIn(clear_key, update_kwargs['body'])
234 self.assertIsNone(update_kwargs['body'][clear_key])
235 self.assertTrue(update_mock().execute.called)
def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
    # After cancel_shutdown() the success flag is expected to be False
    # and the Arvados node record must not have been updated.
    node_num = 61
    self.make_mocks(testutil.cloud_node_mock(node_num),
                    testutil.arvados_node_mock(node_num),
                    shutdown_open=False)
    self.cloud_client.destroy_node.return_value = False
    self.make_actor(cancellable=True)
    self.shutdown_actor.cancel_shutdown("test")
    self.check_success_flag(False, 2)
    update_mock = self.arvados_client.nodes().update
    self.assertFalse(update_mock.called)
248 class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
250 ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
def test_easy_shutdown(self):
    # An actor created with start_time=0 is expected to finish with a
    # True success flag and to have called destroy_node.
    self.make_actor(start_time=0)
    self.check_success_flag(True)
    destroy_node = self.cloud_client.destroy_node
    self.assertTrue(destroy_node.called)
def test_shutdown_cancelled_when_destroy_node_fails(self):
    # destroy_node returns False on a broken node: the shutdown should
    # end unsuccessfully after exactly one destroy attempt, with
    # DESTROY_FAILED recorded as the cancel reason.
    self.make_mocks(node_broken=True)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = False
    self.make_actor(start_time=0)
    self.check_success_flag(False, 2)
    self.assertEqual(1, self.cloud_client.destroy_node.call_count)
    expected_reason = self.ACTOR_CLASS.DESTROY_FAILED
    actual_reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
    self.assertEqual(expected_reason, actual_reason)
266 def test_late_subscribe(self):
268 subscriber = mock.Mock(name='subscriber_mock')
269 self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
270 self.stop_proxy(self.shutdown_actor)
271 self.assertTrue(subscriber.called)
272 self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
273 subscriber.call_args[0][0].actor_ref.actor_urn)
276 class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
278 ACTOR_CLASS = dispatch.ComputeNodeUpdateActor
def make_actor(self):
    """Start ACTOR_CLASS with mock driver and timer; keep its proxy.

    The mocks are stored as self.driver and self.timer, and the actor
    proxy as self.updater.
    """
    driver = mock.MagicMock(name='driver_mock')
    timer = mock.MagicMock(name='timer_mock')
    self.driver = driver
    self.timer = timer
    actor_ref = self.ACTOR_CLASS.start(driver, timer)
    self.updater = actor_ref.proxy()
285 def test_node_sync(self, *args):
287 cloud_node = testutil.cloud_node_mock()
288 arv_node = testutil.arvados_node_mock()
289 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
290 self.driver().sync_node.assert_called_with(cloud_node, arv_node)
293 def test_node_sync_error(self, *args):
295 cloud_node = testutil.cloud_node_mock()
296 arv_node = testutil.arvados_node_mock()
297 self.driver().sync_node.side_effect = (IOError, Exception, True)
298 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
299 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
300 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
301 self.driver().sync_node.assert_called_with(cloud_node, arv_node)
303 class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def make_mocks(self, node_num):
    """Create the mocks used by the monitor-actor tests.

    node_num is passed to testutil.cloud_node_mock() to build
    self.cloud_mock.  The mock shutdown timer is put in state
    (False, 300) and cloud_client.broken() is set to return False.
    """
    shutdown_timer = testutil.MockShutdownTimer()
    shutdown_timer._set_state(False, 300)
    self.shutdowns = shutdown_timer

    self.timer = mock.MagicMock(name='timer_mock')
    self.updates = mock.MagicMock(name='update_mock')
    self.cloud_mock = testutil.cloud_node_mock(node_num)
    self.subscriber = mock.Mock(name='subscriber_mock')

    cloud_client = mock.MagicMock(name='cloud_client')
    cloud_client.broken.return_value = False
    self.cloud_client = cloud_client
def make_actor(self, node_num=1, arv_node=None, start_time=None):
    """Start a ComputeNodeMonitorActor and subscribe self.subscriber to it.

    make_mocks(node_num) is called first if self.cloud_mock does not
    exist yet.  start_time defaults to the current time.  The actor
    proxy is stored in self.node_actor, and the subscribe call is waited
    on for up to self.TIMEOUT.
    """
    if not hasattr(self, 'cloud_mock'):
        self.make_mocks(node_num)
    started_at = time.time() if start_time is None else start_time
    actor_ref = dispatch.ComputeNodeMonitorActor.start(
        self.cloud_mock, started_at, self.shutdowns,
        testutil.cloud_node_fqdn, self.timer, self.updates,
        self.cloud_client, arv_node, boot_fail_after=300)
    self.node_actor = actor_ref.proxy()
    subscription = self.node_actor.subscribe(self.subscriber)
    subscription.get(self.TIMEOUT)
def node_state(self, *states):
    """Return the result of node_actor.in_state(*states).

    The call goes through the actor proxy; its result is waited on for
    up to self.TIMEOUT.
    """
    pending = self.node_actor.in_state(*states)
    return pending.get(self.TIMEOUT)
329 def test_in_state_when_unpaired(self):
331 self.assertTrue(self.node_state('unpaired'))
def test_in_state_when_pairing_stale(self):
    # An Arvados node mocked with age=90000 and no job is expected to be
    # reported in the 'down' state.
    stale_node = testutil.arvados_node_mock(job_uuid=None, age=90000)
    self.make_actor(arv_node=stale_node)
    is_down = self.node_state('down')
    self.assertTrue(is_down)
def test_in_state_when_no_state_available(self):
    # An Arvados node with crunch_worker_state=None is expected to be
    # reported in the 'idle' state.
    self.make_actor(arv_node=testutil.arvados_node_mock(
        crunch_worker_state=None))
    # Diagnostic output only.  Bound the wait with self.TIMEOUT like
    # every other actor call here, so a stuck actor fails the test
    # instead of hanging the whole run.
    print(self.node_actor.get_state().get(self.TIMEOUT))
    self.assertTrue(self.node_state('idle'))
def test_in_state_when_no_state_available_old(self):
    # An Arvados node with crunch_worker_state=None and age=90000 is
    # expected to be reported in the 'down' state.
    self.make_actor(arv_node=testutil.arvados_node_mock(
        crunch_worker_state=None, age=90000))
    # Diagnostic output only.  Bound the wait with self.TIMEOUT like
    # every other actor call here, so a stuck actor fails the test
    # instead of hanging the whole run.
    print(self.node_actor.get_state().get(self.TIMEOUT))
    self.assertTrue(self.node_state('down'))
def test_in_idle_state(self):
    # A node mocked with job_uuid=None should match 'idle', not 'busy',
    # and should match when 'idle' is one of several states asked for.
    idle_node = testutil.arvados_node_mock(job_uuid=None)
    self.make_actor(2, arv_node=idle_node)
    in_idle = self.node_state('idle')
    self.assertTrue(in_idle)
    in_busy = self.node_state('busy')
    self.assertFalse(in_busy)
    in_either = self.node_state('idle', 'busy')
    self.assertTrue(in_either)
def test_in_busy_state(self):
    # A node mocked with job_uuid=True should match 'busy', not 'idle',
    # and should match when 'busy' is one of several states asked for.
    busy_node = testutil.arvados_node_mock(job_uuid=True)
    self.make_actor(3, arv_node=busy_node)
    in_idle = self.node_state('idle')
    self.assertFalse(in_idle)
    in_busy = self.node_state('busy')
    self.assertTrue(in_busy)
    in_either = self.node_state('idle', 'busy')
    self.assertTrue(in_either)
362 def test_init_shutdown_scheduling(self):
364 self.assertTrue(self.timer.schedule.called)
365 self.assertEqual(300, self.timer.schedule.call_args[0][0])
367 def test_shutdown_window_close_scheduling(self):
369 self.shutdowns._set_state(False, 600)
370 self.timer.schedule.reset_mock()
371 self.node_actor.consider_shutdown().get(self.TIMEOUT)
372 self.stop_proxy(self.node_actor)
373 self.assertTrue(self.timer.schedule.called)
374 self.assertEqual(600, self.timer.schedule.call_args[0][0])
375 self.assertFalse(self.subscriber.called)
def test_shutdown_subscription(self):
    # With the shutdown timer set to (True, 600), consider_shutdown()
    # should call the subscriber, passing an object that refers to this
    # same actor.
    self.make_actor(start_time=0)
    self.shutdowns._set_state(True, 600)
    decision = self.node_actor.consider_shutdown()
    decision.get(self.TIMEOUT)
    self.assertTrue(self.subscriber.called)
    own_urn = self.node_actor.actor_ref.actor_urn
    notified = self.subscriber.call_args[0][0]
    self.assertEqual(own_urn, notified.actor_ref.actor_urn)
385 def test_no_shutdown_booting(self):
387 self.shutdowns._set_state(True, 600)
388 self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
389 (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
def test_shutdown_without_arvados_node(self):
    # No Arvados node is paired and the actor was started at time 0;
    # with the shutdown timer set to (True, 600) the node should be
    # eligible for shutdown.
    self.make_actor(start_time=0)
    self.shutdowns._set_state(True, 600)
    expected = (True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing(self):
    # An Arvados node in the "down" state whose last ping is dated 1970
    # should be eligible for shutdown.
    arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                          crunch_worker_state="down",
                                          last_ping_at='1970-01-01T01:02:03.04050607Z')
    self.make_actor(10, arv_node)
    self.shutdowns._set_state(True, 600)
    expected = (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_running_broken(self):
    # An Arvados node in the "down" state whose cloud node is reported
    # broken by the cloud client should be eligible for shutdown.
    arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                          crunch_worker_state="down")
    self.make_actor(12, arv_node)
    self.shutdowns._set_state(True, 600)
    self.cloud_client.broken.return_value = True
    expected = (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing_broken(self):
    # An Arvados node in the "down" state with a 1970 last ping, whose
    # cloud node is also reported broken, should be eligible for
    # shutdown.
    arv_node = testutil.arvados_node_mock(11, job_uuid=None,
                                          crunch_worker_state="down",
                                          last_ping_at='1970-01-01T01:02:03.04050607Z')
    self.make_actor(11, arv_node)
    self.shutdowns._set_state(True, 600)
    self.cloud_client.broken.return_value = True
    expected = (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias; the
    # expected value goes first, as in the neighbouring tests.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_window_closed(self):
    # The shutdown timer keeps the state given by make_mocks(), so an
    # idle node is expected to be reported with a 'closed' window and
    # not be eligible for shutdown.
    self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
    expected = (False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_running_job(self):
    # A node mocked with job_uuid=True is expected to be reported as
    # 'busy' and not be eligible for shutdown.
    self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
    self.shutdowns._set_state(True, 600)
    expected = (False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_when_node_state_unknown(self):
    # A node with crunch_worker_state=None is expected to be reported as
    # 'idle' and be eligible for shutdown.
    self.make_actor(5, testutil.arvados_node_mock(
        5, crunch_worker_state=None))
    self.shutdowns._set_state(True, 600)
    expected = (True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_state_stale(self):
    # A node mocked with age=90000 is expected to be refused shutdown
    # with the "node state is stale" reason.
    self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
    self.shutdowns._set_state(True, 600)
    expected = (False, "node state is stale")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
448 def test_arvados_node_match(self):
450 arv_node = testutil.arvados_node_mock(
451 2, hostname='compute-two.zzzzz.arvadosapi.com')
452 self.cloud_client.node_id.return_value = '2'
453 pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
454 self.assertEqual(self.cloud_mock.id, pair_id)
455 self.stop_proxy(self.node_actor)
456 self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
458 def test_arvados_node_mismatch(self):
460 arv_node = testutil.arvados_node_mock(1)
462 self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
464 def test_arvados_node_mismatch_first_ping_too_early(self):
466 arv_node = testutil.arvados_node_mock(
467 4, first_ping_at='1971-03-02T14:15:16.1717282Z')
469 self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
471 def test_update_cloud_node(self):
474 self.cloud_mock.id = '1'
475 self.node_actor.update_cloud_node(self.cloud_mock)
476 current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
477 self.assertEqual([testutil.ip_address_mock(2)],
478 current_cloud.private_ips)
480 def test_missing_cloud_node_update(self):
482 self.node_actor.update_cloud_node(None)
483 current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
484 self.assertEqual([testutil.ip_address_mock(1)],
485 current_cloud.private_ips)
487 def test_update_arvados_node(self):
489 job_uuid = 'zzzzz-jjjjj-updatejobnode00'
490 new_arvados = testutil.arvados_node_mock(3, job_uuid)
491 self.node_actor.update_arvados_node(new_arvados)
492 current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
493 self.assertEqual(job_uuid, current_arvados['job_uuid'])
def test_missing_arvados_node_update(self):
    # Calling update_arvados_node(None) is expected to leave the actor's
    # Arvados node record in place: its ip_address still matches node 4.
    node_num = 4
    self.make_actor(node_num, testutil.arvados_node_mock(node_num))
    self.node_actor.update_arvados_node(None)
    kept_node = self.node_actor.arvados_node.get(self.TIMEOUT)
    expected_ip = testutil.ip_address_mock(node_num)
    self.assertEqual(expected_ip, kept_node['ip_address'])
502 def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
504 self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
506 arv_node = testutil.arvados_node_mock(5)
507 self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
508 self.assertEqual(1, self.updates.sync_node.call_count)
510 def test_update_arvados_node_skips_sync_when_fqdn_match(self):
512 arv_node = testutil.arvados_node_mock(6)
513 self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format(
516 self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
517 self.assertEqual(0, self.updates.sync_node.call_count)