2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: AGPL-3.0
6 from __future__ import absolute_import, print_function
11 import arvados.errors as arverror
17 from libcloud.common.exceptions import BaseHTTPError
19 import arvnodeman.computenode.dispatch as dispatch
20 from arvnodeman.computenode.driver import BaseComputeNodeDriver
21 from . import testutil
23 class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
24 ACTOR_CLASS = dispatch.ComputeNodeSetupActor
26 def make_mocks(self, arvados_effect=None):
27 if arvados_effect is None:
28 arvados_effect = [testutil.arvados_node_mock(
34 self.arvados_effect = arvados_effect
35 self.timer = testutil.MockTimer()
36 self.api_client = mock.MagicMock(name='api_client')
37 self.api_client.nodes().create().execute.side_effect = arvados_effect
38 self.api_client.nodes().update().execute.side_effect = arvados_effect
39 self.cloud_client = mock.MagicMock(name='cloud_client')
40 self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
def make_actor(self, arv_node=None):
    """Start the setup actor under test and keep its proxy on self.setup_actor.

    When the test has not built its mocks yet (no ``timer`` attribute),
    ``make_mocks`` is called first; a truthy ``arv_node`` is handed to it
    as a one-element ``arvados_effect`` list, otherwise ``None`` is passed.
    """
    mocks_missing = not hasattr(self, 'timer')
    if mocks_missing:
        effect = None
        if arv_node:
            effect = [arv_node]
        self.make_mocks(arvados_effect=effect)
    actor_ref = self.ACTOR_CLASS.start(
        self.timer,
        self.api_client,
        self.cloud_client,
        testutil.MockSize(1),
        arv_node,
    )
    self.setup_actor = actor_ref.proxy()
49 def assert_node_properties_updated(self, uuid=None,
50 size=testutil.MockSize(1)):
51 self.api_client.nodes().update.assert_any_call(
52 uuid=(uuid or self.arvados_effect[-1]['uuid']),
57 'price': size.price}}})
59 def test_creation_without_arvados_node(self):
61 finished = threading.Event()
62 self.setup_actor.subscribe(lambda _: finished.set())
63 self.assertEqual(self.arvados_effect[-1],
64 self.setup_actor.arvados_node.get(self.TIMEOUT))
65 assert(finished.wait(self.TIMEOUT))
66 self.api_client.nodes().create.called_with(body={}, assign_slot=True)
67 self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
68 self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
69 self.assert_node_properties_updated()
70 self.assertEqual(self.cloud_client.create_node(),
71 self.setup_actor.cloud_node.get(self.TIMEOUT))
def test_creation_with_arvados_node(self):
    # Setup is started with an existing Arvados node record; the actor is
    # expected to finish, expose the last mocked API response as its
    # arvados_node, and hand back the cloud node from the cloud client.
    self.make_mocks(arvados_effect=[testutil.arvados_node_mock()]*2)
    self.make_actor(testutil.arvados_node_mock())
    finished = threading.Event()
    self.setup_actor.subscribe(lambda _: finished.set())
    self.assertEqual(self.arvados_effect[-1],
                     self.setup_actor.arvados_node.get(self.TIMEOUT))
    # assertTrue, unlike a bare assert statement, still runs under
    # ``python -O`` and reports through the unittest machinery.
    self.assertTrue(finished.wait(self.TIMEOUT))
    self.assert_node_properties_updated()
    # The previous ``create.called_with(...)`` line was not an assertion:
    # it only fetched an auto-created mock attribute and verified nothing.
    # NOTE(review): this assumes the actor never executes a create request
    # when it is given an existing node -- confirm against the actor.
    self.assertEqual(0, self.api_client.nodes().create().execute.call_count)
    self.assertEqual(3, self.api_client.nodes().update().execute.call_count)
    self.assertEqual(self.cloud_client.create_node(),
                     self.setup_actor.cloud_node.get(self.TIMEOUT))
87 def test_failed_arvados_calls_retried(self):
89 arverror.ApiError(httplib2.Response({'status': '500'}), ""),
90 testutil.arvados_node_mock(),
93 self.wait_for_assignment(self.setup_actor, 'arvados_node')
95 def test_failed_cloud_calls_retried(self):
97 self.cloud_client.create_node.side_effect = [
98 Exception("test cloud creation error"),
99 self.cloud_client.create_node.return_value,
102 self.wait_for_assignment(self.setup_actor, 'cloud_node')
104 def test_basehttperror_retried(self):
106 self.cloud_client.create_node.side_effect = [
107 BaseHTTPError(500, "Try again"),
108 self.cloud_client.create_node.return_value,
111 self.wait_for_assignment(self.setup_actor, 'cloud_node')
112 self.setup_actor.ping().get(self.TIMEOUT)
113 self.assertEqual(1, self.cloud_client.post_create_node.call_count)
115 def test_instance_exceeded_not_retried(self):
117 self.cloud_client.create_node.side_effect = [
118 BaseHTTPError(400, "InstanceLimitExceeded"),
119 self.cloud_client.create_node.return_value,
122 done = self.FUTURE_CLASS()
123 self.setup_actor.subscribe(done.set)
124 done.get(self.TIMEOUT)
125 self.assertEqual(0, self.cloud_client.post_create_node.call_count)
127 def test_failed_post_create_retried(self):
129 self.cloud_client.post_create_node.side_effect = [
130 Exception("test cloud post-create error"), None]
132 done = self.FUTURE_CLASS()
133 self.setup_actor.subscribe(done.set)
134 done.get(self.TIMEOUT)
135 self.assertEqual(2, self.cloud_client.post_create_node.call_count)
137 def test_stop_when_no_cloud_node(self):
139 arverror.ApiError(httplib2.Response({'status': '500'}), ""))
142 self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
144 self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
146 def test_no_stop_when_cloud_node(self):
148 self.wait_for_assignment(self.setup_actor, 'cloud_node')
150 self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
151 self.assertTrue(self.stop_proxy(self.setup_actor),
152 "actor was stopped by stop_if_no_cloud_node")
154 def test_subscribe(self):
156 arverror.ApiError(httplib2.Response({'status': '500'}), ""))
158 subscriber = mock.Mock(name='subscriber_mock')
159 self.setup_actor.subscribe(subscriber)
160 retry_resp = [testutil.arvados_node_mock()]
161 self.api_client.nodes().create().execute.side_effect = retry_resp
162 self.api_client.nodes().update().execute.side_effect = retry_resp
163 self.wait_for_assignment(self.setup_actor, 'cloud_node')
164 self.setup_actor.ping().get(self.TIMEOUT)
165 self.assertEqual(self.setup_actor.actor_ref.actor_urn,
166 subscriber.call_args[0][0].actor_ref.actor_urn)
168 def test_late_subscribe(self):
170 subscriber = mock.Mock(name='subscriber_mock')
171 self.wait_for_assignment(self.setup_actor, 'cloud_node')
172 self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
173 self.stop_proxy(self.setup_actor)
174 self.assertEqual(self.setup_actor.actor_ref.actor_urn,
175 subscriber.call_args[0][0].actor_ref.actor_urn)
178 class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
def make_mocks(self, cloud_node=None, arvados_node=None,
               shutdown_open=True, node_broken=False):
    """Build the collaborators a shutdown-actor test needs and store them on self.

    cloud_node -- cloud node record to shut down; a default
        ``testutil.cloud_node_mock()`` is used when None.
    arvados_node -- stored unchanged on ``self.arvados_node``.
    shutdown_open -- first argument given to the mock shutdown timer's
        ``_set_state`` (the second is always 300).
    node_broken -- value the mocked ``cloud_client.broken`` returns.
    """
    self.timer = testutil.MockTimer()

    shutdown_timer = testutil.MockShutdownTimer()
    shutdown_timer._set_state(shutdown_open, 300)
    self.shutdowns = shutdown_timer

    cloud_client = mock.MagicMock(name='cloud_client')
    cloud_client.broken.return_value = node_broken
    self.cloud_client = cloud_client

    self.arvados_client = mock.MagicMock(name='arvados_client')
    self.updates = mock.MagicMock(name='update_mock')

    self.cloud_node = (
        testutil.cloud_node_mock() if cloud_node is None else cloud_node)
    self.arvados_node = arvados_node
193 def make_actor(self, cancellable=True, start_time=None):
194 if not hasattr(self, 'timer'):
196 if start_time is None:
197 start_time = time.time()
198 monitor_actor = dispatch.ComputeNodeMonitorActor.start(
199 self.cloud_node, start_time, self.shutdowns,
200 self.timer, self.updates, self.cloud_client,
202 self.shutdown_actor = self.ACTOR_CLASS.start(
203 self.timer, self.cloud_client, self.arvados_client, monitor_actor,
205 self.monitor_actor = monitor_actor.proxy()
207 def check_success_flag(self, expected, allow_msg_count=1):
208 # allow_msg_count is the number of internal messages that may
209 # need to be handled for shutdown to finish.
210 for try_num in range(1 + allow_msg_count):
211 last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
212 if last_flag is expected:
215 self.fail("success flag {} is not {}".format(last_flag, expected))
def test_cancellable_shutdown(self, *mocks):
    # With an Arvados record whose crunch_worker_state is "busy", a
    # cancellable shutdown must report failure and never destroy the node.
    busy_node = testutil.arvados_node_mock(crunch_worker_state="busy")
    self.make_mocks(shutdown_open=True, arvados_node=busy_node)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = True
    self.make_actor(cancellable=True)
    self.check_success_flag(False, 2)
    self.assertFalse(destroy_node.called)
def test_uncancellable_shutdown(self, *mocks):
    # Same "busy" Arvados record as the cancellable case, but with
    # cancellable=False the shutdown must succeed and destroy the node.
    busy_node = testutil.arvados_node_mock(crunch_worker_state="busy")
    self.make_mocks(shutdown_open=True, arvados_node=busy_node)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = True
    self.make_actor(cancellable=False)
    self.check_success_flag(True, 4)
    self.assertTrue(destroy_node.called)
231 def test_arvados_node_cleaned_after_shutdown(self, *mocks):
233 mocks[0].return_value = "drain\n"
234 cloud_node = testutil.cloud_node_mock(62)
235 arv_node = testutil.arvados_node_mock(62)
236 self.make_mocks(cloud_node, arv_node)
238 self.check_success_flag(True, 3)
239 update_mock = self.arvados_client.nodes().update
240 self.assertTrue(update_mock.called)
241 update_kwargs = update_mock.call_args_list[0][1]
242 self.assertEqual(arv_node['uuid'], update_kwargs.get('uuid'))
243 self.assertIn('body', update_kwargs)
244 for clear_key in ['slot_number', 'hostname', 'ip_address',
245 'first_ping_at', 'last_ping_at']:
246 self.assertIn(clear_key, update_kwargs['body'])
247 self.assertIsNone(update_kwargs['body'][clear_key])
248 self.assertTrue(update_mock().execute.called)
250 def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
252 mocks[0].return_value = "idle\n"
253 cloud_node = testutil.cloud_node_mock(61)
254 arv_node = testutil.arvados_node_mock(61)
255 self.make_mocks(cloud_node, arv_node, shutdown_open=False)
256 self.cloud_client.destroy_node.return_value = False
257 self.make_actor(cancellable=True)
258 self.shutdown_actor.cancel_shutdown("test")
259 self.shutdown_actor.ping().get(self.TIMEOUT)
260 self.check_success_flag(False, 2)
261 self.assertFalse(self.arvados_client.nodes().update.called)
264 class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
266 ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
def test_easy_shutdown(self):
    # Default mocks and start_time=0: shutdown is expected to succeed
    # and to have asked the cloud client to destroy the node.
    self.make_actor(start_time=0)
    self.check_success_flag(True)
    destroy_was_called = self.cloud_client.destroy_node.called
    self.assertTrue(destroy_was_called)
def test_shutdown_cancelled_when_destroy_node_fails(self):
    # destroy_node returning False on a broken node: the success flag must
    # end up False after exactly one destroy attempt, with DESTROY_FAILED
    # recorded as the cancel reason.
    self.make_mocks(node_broken=True)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = False
    self.make_actor(start_time=0)
    self.check_success_flag(False, 2)
    self.assertEqual(1, destroy_node.call_count)
    expected_reason = self.ACTOR_CLASS.DESTROY_FAILED
    actual_reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
    self.assertEqual(expected_reason, actual_reason)
282 def test_late_subscribe(self):
284 subscriber = mock.Mock(name='subscriber_mock')
285 self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
286 self.stop_proxy(self.shutdown_actor)
287 self.assertTrue(subscriber.called)
288 self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
289 subscriber.call_args[0][0].actor_ref.actor_urn)
292 class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
294 ACTOR_CLASS = dispatch.ComputeNodeUpdateActor
def make_actor(self):
    """Start the update actor with mock driver and timer; keep its proxy on self.updater."""
    driver = mock.MagicMock(name='driver_mock')
    timer = mock.MagicMock(name='timer_mock')
    self.driver = driver
    self.timer = timer
    actor_ref = self.ACTOR_CLASS.start(driver, timer)
    self.updater = actor_ref.proxy()
301 def test_node_sync(self, *args):
303 cloud_node = testutil.cloud_node_mock()
304 arv_node = testutil.arvados_node_mock()
305 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
306 self.driver().sync_node.assert_called_with(cloud_node, arv_node)
309 def test_node_sync_error(self, *args):
311 cloud_node = testutil.cloud_node_mock()
312 arv_node = testutil.arvados_node_mock()
313 self.driver().sync_node.side_effect = (IOError, Exception, True)
314 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
315 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
316 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
317 self.driver().sync_node.assert_called_with(cloud_node, arv_node)
319 class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def make_mocks(self, node_num):
    """Create the mock collaborators for a monitor-actor test.

    node_num -- forwarded to ``testutil.cloud_node_mock`` to build
        ``self.cloud_mock``.

    The mock shutdown timer is set with ``_set_state(False, 300)`` and the
    mocked ``cloud_client.broken`` returns False.
    """
    shutdown_timer = testutil.MockShutdownTimer()
    shutdown_timer._set_state(False, 300)
    self.shutdowns = shutdown_timer

    self.timer = mock.MagicMock(name='timer_mock')
    self.updates = mock.MagicMock(name='update_mock')
    self.cloud_mock = testutil.cloud_node_mock(node_num)
    self.subscriber = mock.Mock(name='subscriber_mock')

    cloud_client = mock.MagicMock(name='cloud_client')
    cloud_client.broken.return_value = False
    self.cloud_client = cloud_client
def make_actor(self, node_num=1, arv_node=None, start_time=None):
    """Start a ComputeNodeMonitorActor and subscribe the mock subscriber to it.

    node_num -- only used to build mocks when none exist yet.
    arv_node -- Arvados node record passed to the actor (may be None).
    start_time -- start time passed to the actor; ``time.time()`` when None.

    The actor proxy is stored on ``self.node_actor``.
    """
    if not hasattr(self, 'cloud_mock'):
        self.make_mocks(node_num)
    boot_time = time.time() if start_time is None else start_time
    actor_ref = dispatch.ComputeNodeMonitorActor.start(
        self.cloud_mock,
        boot_time,
        self.shutdowns,
        self.timer,
        self.updates,
        self.cloud_client,
        arv_node,
        boot_fail_after=300,
    )
    self.node_actor = actor_ref.proxy()
    # Block until the subscription has been processed by the actor.
    subscribed = self.node_actor.subscribe(self.subscriber)
    subscribed.get(self.TIMEOUT)
def node_state(self, *states):
    """Ask the monitor actor's ``in_state(*states)`` and return the resolved result."""
    pending = self.node_actor.in_state(*states)
    return pending.get(self.TIMEOUT)
345 def test_in_state_when_unpaired(self):
347 self.assertTrue(self.node_state('unpaired'))
def test_in_state_when_pairing_stale(self):
    # An Arvados record built with job_uuid=None and age=90000 must be
    # reported as 'down'.
    stale_node = testutil.arvados_node_mock(job_uuid=None, age=90000)
    self.make_actor(arv_node=stale_node)
    is_down = self.node_state('down')
    self.assertTrue(is_down)
def test_in_state_when_no_state_available(self):
    # With crunch_worker_state=None on the Arvados record the node is
    # expected to be reported as 'idle'.
    stateless_node = testutil.arvados_node_mock(crunch_worker_state=None)
    self.make_actor(arv_node=stateless_node)
    is_idle = self.node_state('idle')
    self.assertTrue(is_idle)
def test_in_state_when_no_state_available_old(self):
    # crunch_worker_state=None combined with age=90000 must be reported
    # as 'down'.
    old_stateless_node = testutil.arvados_node_mock(
        crunch_worker_state=None, age=90000)
    self.make_actor(arv_node=old_stateless_node)
    is_down = self.node_state('down')
    self.assertTrue(is_down)
def test_in_idle_state(self):
    # A record with job_uuid=None is 'idle', is not 'busy', and matches
    # when both states are offered together.
    jobless_node = testutil.arvados_node_mock(job_uuid=None)
    self.make_actor(2, arv_node=jobless_node)
    matches_idle = self.node_state('idle')
    self.assertTrue(matches_idle)
    matches_busy = self.node_state('busy')
    self.assertFalse(matches_busy)
    matches_either = self.node_state('idle', 'busy')
    self.assertTrue(matches_either)
def test_in_busy_state(self):
    # A record with job_uuid=True is 'busy', is not 'idle', and matches
    # when both states are offered together.
    working_node = testutil.arvados_node_mock(job_uuid=True)
    self.make_actor(3, arv_node=working_node)
    matches_idle = self.node_state('idle')
    self.assertFalse(matches_idle)
    matches_busy = self.node_state('busy')
    self.assertTrue(matches_busy)
    matches_either = self.node_state('idle', 'busy')
    self.assertTrue(matches_either)
376 def test_init_shutdown_scheduling(self):
378 self.assertTrue(self.timer.schedule.called)
379 self.assertEqual(300, self.timer.schedule.call_args[0][0])
381 def test_shutdown_window_close_scheduling(self):
383 self.shutdowns._set_state(False, 600)
384 self.timer.schedule.reset_mock()
385 self.node_actor.consider_shutdown().get(self.TIMEOUT)
386 self.stop_proxy(self.node_actor)
387 self.assertTrue(self.timer.schedule.called)
388 self.assertEqual(600, self.timer.schedule.call_args[0][0])
389 self.assertFalse(self.subscriber.called)
def test_shutdown_subscription(self):
    # After consider_shutdown() with the shutdown timer set to
    # (True, 600), the subscriber must have been called with an object
    # whose actor_ref.actor_urn equals the monitor actor's own.
    self.make_actor(start_time=0)
    self.shutdowns._set_state(True, 600)
    considered = self.node_actor.consider_shutdown()
    considered.get(self.TIMEOUT)
    self.assertTrue(self.subscriber.called)
    own_urn = self.node_actor.actor_ref.actor_urn
    notified_with = self.subscriber.call_args[0][0]
    self.assertEqual(own_urn, notified_with.actor_ref.actor_urn)
399 def test_no_shutdown_booting(self):
401 self.shutdowns._set_state(True, 600)
402 self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
403 (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
def test_shutdown_without_arvados_node(self):
    # No Arvados record and start_time=0: the node must be eligible for
    # shutdown with the exact state tuple message below.
    self.make_actor(start_time=0)
    self.shutdowns._set_state(True, 600)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node state is ('down', 'open', 'boot exceeded', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing(self):
    # A "down" record with a 1970 last_ping_at must be eligible for
    # shutdown once the shutdown timer is set to (True, 600).
    arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                          crunch_worker_state="down",
                                          last_ping_at='1970-01-01T01:02:03.04050607Z')
    self.make_actor(10, arv_node)
    self.shutdowns._set_state(True, 600)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_running_broken(self):
    # A "down" record on a node the cloud client reports as broken must
    # be eligible for shutdown.
    arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                          crunch_worker_state="down")
    self.make_actor(12, arv_node)
    self.shutdowns._set_state(True, 600)
    self.cloud_client.broken.return_value = True
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing_broken(self):
    # A "down" record with a 1970 last_ping_at on a node the cloud client
    # reports as broken must be eligible for shutdown.
    arv_node = testutil.arvados_node_mock(11, job_uuid=None,
                                          crunch_worker_state="down",
                                          last_ping_at='1970-01-01T01:02:03.04050607Z')
    self.make_actor(11, arv_node)
    self.shutdowns._set_state(True, 600)
    self.cloud_client.broken.return_value = True
    # assertEqual replaces the deprecated assertEquals alias; arguments
    # are now (expected, actual) like the neighbouring shutdown tests.
    self.assertEqual((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_window_closed(self):
    # The shutdown timer is left as configured by make_mocks; the
    # expected message reports the window as 'closed' and the node as
    # not eligible.
    self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_running_job(self):
    # A record with job_uuid=True is reported 'busy' and must not be
    # eligible for shutdown.
    self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
    self.shutdowns._set_state(True, 600)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_when_node_state_unknown(self):
    # crunch_worker_state=None is reported as 'idle' and the node must
    # be eligible for shutdown.
    self.make_actor(5, testutil.arvados_node_mock(
        5, crunch_worker_state=None))
    self.shutdowns._set_state(True, 600)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_when_node_state_fail(self):
    # crunch_worker_state='fail' must leave the node eligible for
    # shutdown, with 'fail' appearing in the state message.
    self.make_actor(5, testutil.arvados_node_mock(
        5, crunch_worker_state='fail'))
    self.shutdowns._set_state(True, 600)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((True, "node state is ('fail', 'open', 'boot wait', 'idle exceeded')"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_state_stale(self):
    # A record built with age=90000 must be rejected with the
    # "node state is stale" message.
    self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
    self.shutdowns._set_state(True, 600)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual((False, "node state is stale"),
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
469 def test_arvados_node_match(self):
471 arv_node = testutil.arvados_node_mock(
472 2, hostname='compute-two.zzzzz.arvadosapi.com')
473 self.cloud_client.node_id.return_value = '2'
474 pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
475 self.assertEqual(self.cloud_mock.id, pair_id)
476 self.stop_proxy(self.node_actor)
477 self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
479 def test_arvados_node_mismatch(self):
481 arv_node = testutil.arvados_node_mock(1)
483 self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
485 def test_arvados_node_mismatch_first_ping_too_early(self):
487 arv_node = testutil.arvados_node_mock(
488 4, first_ping_at='1971-03-02T14:15:16.1717282Z')
490 self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
492 def test_update_cloud_node(self):
495 self.cloud_mock.id = '1'
496 self.node_actor.update_cloud_node(self.cloud_mock)
497 current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
498 self.assertEqual([testutil.ip_address_mock(2)],
499 current_cloud.private_ips)
501 def test_missing_cloud_node_update(self):
503 self.node_actor.update_cloud_node(None)
504 current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
505 self.assertEqual([testutil.ip_address_mock(1)],
506 current_cloud.private_ips)
508 def test_update_arvados_node(self):
510 job_uuid = 'zzzzz-jjjjj-updatejobnode00'
511 new_arvados = testutil.arvados_node_mock(3, job_uuid)
512 self.node_actor.update_arvados_node(new_arvados)
513 current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
514 self.assertEqual(job_uuid, current_arvados['job_uuid'])
def test_missing_arvados_node_update(self):
    # Calling update_arvados_node(None) must leave the previously paired
    # record in place: its ip_address still matches ip_address_mock(4).
    original_record = testutil.arvados_node_mock(4)
    self.make_actor(4, original_record)
    self.node_actor.update_arvados_node(None)
    current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
    expected_ip = testutil.ip_address_mock(4)
    self.assertEqual(expected_ip, current_arvados['ip_address'])
523 def test_update_arvados_node_calls_sync_node(self):
525 self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
527 arv_node = testutil.arvados_node_mock(5)
528 self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
529 self.assertEqual(1, self.updates.sync_node.call_count)