3 from __future__ import absolute_import, print_function
8 import arvados.errors as arverror
14 from libcloud.common.exceptions import BaseHTTPError
16 import arvnodeman.computenode.dispatch as dispatch
17 from arvnodeman.computenode.driver import BaseComputeNodeDriver
18 from . import testutil
20 class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
def make_mocks(self, arvados_effect=None):
    """Create the mock timer, API client and cloud client for a test.

    arvados_effect is installed as the side_effect of both the node
    create and node update ``execute()`` mocks.  When it is None, a
    one-element list holding testutil.arvados_node_mock() is used
    instead.  The list in use is also stored on self.arvados_effect.
    """
    effect = arvados_effect
    if effect is None:
        effect = [testutil.arvados_node_mock()]
    self.arvados_effect = effect
    self.timer = testutil.MockTimer()

    api_client = mock.MagicMock(name='api_client')
    self.api_client = api_client
    # Create and update requests are both given the same side-effect list.
    for verb in ('create', 'update'):
        request = getattr(api_client.nodes(), verb)()
        request.execute.side_effect = effect

    cloud_client = mock.MagicMock(name='cloud_client')
    self.cloud_client = cloud_client
    cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
    # Use the driver's real exception classifier instead of a mock.
    cloud_client.is_cloud_exception = (
        BaseComputeNodeDriver.is_cloud_exception)
def make_actor(self, arv_node=None):
    """Start a ComputeNodeSetupActor and keep its proxy on self.setup_actor.

    If make_mocks() has not run yet (no ``timer`` attribute), it is called
    first, using [arv_node] as the Arvados side effect when arv_node is
    truthy and the make_mocks() default otherwise.
    """
    mocks_ready = hasattr(self, 'timer')
    if not mocks_ready:
        if arv_node:
            effect = [arv_node]
        else:
            effect = None
        self.make_mocks(arvados_effect=effect)
    actor_ref = dispatch.ComputeNodeSetupActor.start(
        self.timer, self.api_client, self.cloud_client,
        testutil.MockSize(1), arv_node)
    self.setup_actor = actor_ref.proxy()
40 def assert_node_properties_updated(self, uuid=None,
41 size=testutil.MockSize(1)):
42 self.api_client.nodes().update.assert_any_call(
43 uuid=(uuid or self.arvados_effect[-1]['uuid']),
48 'price': size.price}}})
50 def test_creation_without_arvados_node(self):
52 finished = threading.Event()
53 self.setup_actor.subscribe(lambda _: finished.set())
54 self.assertEqual(self.arvados_effect[-1],
55 self.setup_actor.arvados_node.get(self.TIMEOUT))
56 assert(finished.wait(self.TIMEOUT))
57 self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
58 self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
59 self.assert_node_properties_updated()
60 self.assertEqual(self.cloud_client.create_node(),
61 self.setup_actor.cloud_node.get(self.TIMEOUT))
def test_creation_with_arvados_node(self):
    # Setup is given an existing Arvados node record, so the API mock is
    # primed with two responses and update().execute is expected to be
    # called twice.
    self.make_mocks(arvados_effect=[testutil.arvados_node_mock()]*2)
    self.make_actor(testutil.arvados_node_mock())
    finished = threading.Event()
    self.setup_actor.subscribe(lambda _: finished.set())
    self.assertEqual(self.arvados_effect[-1],
                     self.setup_actor.arvados_node.get(self.TIMEOUT))
    # Use a unittest assertion rather than a bare ``assert`` so the check
    # still runs under ``python -O`` and reports a useful message.
    self.assertTrue(finished.wait(self.TIMEOUT),
                    "setup actor did not notify its subscriber in time")
    self.assert_node_properties_updated()
    self.assertEqual(2, self.api_client.nodes().update().execute.call_count)
    self.assertEqual(self.cloud_client.create_node(),
                     self.setup_actor.cloud_node.get(self.TIMEOUT))
76 def test_failed_arvados_calls_retried(self):
78 arverror.ApiError(httplib2.Response({'status': '500'}), ""),
79 testutil.arvados_node_mock(),
82 self.wait_for_assignment(self.setup_actor, 'arvados_node')
84 def test_failed_cloud_calls_retried(self):
86 self.cloud_client.create_node.side_effect = [
87 Exception("test cloud creation error"),
88 self.cloud_client.create_node.return_value,
91 self.wait_for_assignment(self.setup_actor, 'cloud_node')
93 def test_unknown_basehttperror_not_retried(self):
95 self.cloud_client.create_node.side_effect = [
96 BaseHTTPError(400, "Unknown"),
97 self.cloud_client.create_node.return_value,
100 finished = threading.Event()
101 self.setup_actor.subscribe(lambda _: finished.set())
102 assert(finished.wait(self.TIMEOUT))
103 self.assertEqual(0, self.cloud_client.post_create_node.call_count)
105 def test_known_basehttperror_retried(self):
107 self.cloud_client.create_node.side_effect = [
108 BaseHTTPError(400, "InstanceLimitExceeded"),
109 self.cloud_client.create_node.return_value,
112 self.wait_for_assignment(self.setup_actor, 'cloud_node')
113 self.assertEqual(1, self.cloud_client.post_create_node.call_count)
115 def test_failed_post_create_retried(self):
117 self.cloud_client.post_create_node.side_effect = [
118 Exception("test cloud post-create error"), None]
120 done = self.FUTURE_CLASS()
121 self.setup_actor.subscribe(done.set)
122 done.get(self.TIMEOUT)
123 self.assertEqual(2, self.cloud_client.post_create_node.call_count)
125 def test_stop_when_no_cloud_node(self):
127 arverror.ApiError(httplib2.Response({'status': '500'}), ""))
130 self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
132 self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
134 def test_no_stop_when_cloud_node(self):
136 self.wait_for_assignment(self.setup_actor, 'cloud_node')
138 self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
139 self.assertTrue(self.stop_proxy(self.setup_actor),
140 "actor was stopped by stop_if_no_cloud_node")
142 def test_subscribe(self):
144 arverror.ApiError(httplib2.Response({'status': '500'}), ""))
146 subscriber = mock.Mock(name='subscriber_mock')
147 self.setup_actor.subscribe(subscriber)
148 retry_resp = [testutil.arvados_node_mock()]
149 self.api_client.nodes().create().execute.side_effect = retry_resp
150 self.api_client.nodes().update().execute.side_effect = retry_resp
151 self.wait_for_assignment(self.setup_actor, 'cloud_node')
152 self.assertEqual(self.setup_actor.actor_ref.actor_urn,
153 subscriber.call_args[0][0].actor_ref.actor_urn)
155 def test_late_subscribe(self):
157 subscriber = mock.Mock(name='subscriber_mock')
158 self.wait_for_assignment(self.setup_actor, 'cloud_node')
159 self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
160 self.stop_proxy(self.setup_actor)
161 self.assertEqual(self.setup_actor.actor_ref.actor_urn,
162 subscriber.call_args[0][0].actor_ref.actor_urn)
165 class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
def make_mocks(self, cloud_node=None, arvados_node=None,
               shutdown_open=True, node_broken=False):
    """Prepare the mock collaborators used by the shutdown actor tests.

    cloud_node falls back to testutil.cloud_node_mock() when None, and
    arvados_node is stored unchanged.  shutdown_open is handed to the
    mock shutdown timer's _set_state() together with 300, and
    node_broken becomes the return value of cloud_client.broken().
    """
    self.timer = testutil.MockTimer()

    shutdown_timer = testutil.MockShutdownTimer()
    shutdown_timer._set_state(shutdown_open, 300)
    self.shutdowns = shutdown_timer

    cloud_client = mock.MagicMock(name='cloud_client')
    cloud_client.broken.return_value = node_broken
    self.cloud_client = cloud_client

    self.arvados_client = mock.MagicMock(name='arvados_client')
    self.updates = mock.MagicMock(name='update_mock')
    self.cloud_node = (testutil.cloud_node_mock() if cloud_node is None
                       else cloud_node)
    self.arvados_node = arvados_node
180 def make_actor(self, cancellable=True, start_time=None):
181 if not hasattr(self, 'timer'):
183 if start_time is None:
184 start_time = time.time()
185 monitor_actor = dispatch.ComputeNodeMonitorActor.start(
186 self.cloud_node, start_time, self.shutdowns,
187 testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
189 self.shutdown_actor = self.ACTOR_CLASS.start(
190 self.timer, self.cloud_client, self.arvados_client, monitor_actor,
192 self.monitor_actor = monitor_actor.proxy()
194 def check_success_flag(self, expected, allow_msg_count=1):
195 # allow_msg_count is the number of internal messages that may
196 # need to be handled for shutdown to finish.
197 for try_num in range(1 + allow_msg_count):
198 last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
199 if last_flag is expected:
202 self.fail("success flag {} is not {}".format(last_flag, expected))
def test_cancellable_shutdown(self, *mocks):
    # With a node whose crunch_worker_state is "busy" and a cancellable
    # actor, the success flag must be False and destroy_node untouched.
    busy_node = testutil.arvados_node_mock(crunch_worker_state="busy")
    self.make_mocks(shutdown_open=True, arvados_node=busy_node)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = True
    self.make_actor(cancellable=True)
    self.check_success_flag(False)
    self.assertFalse(destroy_node.called)
def test_uncancellable_shutdown(self, *mocks):
    # Same busy node as the cancellable case, but with cancellable=False
    # the success flag must become True and destroy_node must be called.
    busy_node = testutil.arvados_node_mock(crunch_worker_state="busy")
    self.make_mocks(shutdown_open=True, arvados_node=busy_node)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = True
    self.make_actor(cancellable=False)
    self.check_success_flag(True, 2)
    self.assertTrue(destroy_node.called)
218 def test_arvados_node_cleaned_after_shutdown(self, *mocks):
219 cloud_node = testutil.cloud_node_mock(62)
220 arv_node = testutil.arvados_node_mock(62)
221 self.make_mocks(cloud_node, arv_node)
223 self.check_success_flag(True, 3)
224 update_mock = self.arvados_client.nodes().update
225 self.assertTrue(update_mock.called)
226 update_kwargs = update_mock.call_args_list[0][1]
227 self.assertEqual(arv_node['uuid'], update_kwargs.get('uuid'))
228 self.assertIn('body', update_kwargs)
229 for clear_key in ['slot_number', 'hostname', 'ip_address',
230 'first_ping_at', 'last_ping_at']:
231 self.assertIn(clear_key, update_kwargs['body'])
232 self.assertIsNone(update_kwargs['body'][clear_key])
233 self.assertTrue(update_mock().execute.called)
def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
    # After an explicit cancel_shutdown() the success flag must be False
    # and the Arvados node record must not have been updated.
    self.make_mocks(testutil.cloud_node_mock(61),
                    testutil.arvados_node_mock(61),
                    shutdown_open=False)
    self.cloud_client.destroy_node.return_value = False
    self.make_actor(cancellable=True)
    self.shutdown_actor.cancel_shutdown("test")
    self.check_success_flag(False, 2)
    update_mock = self.arvados_client.nodes().update
    self.assertFalse(update_mock.called)
246 class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
248 ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
def test_easy_shutdown(self):
    # An actor created with start_time=0 should report success and have
    # called the cloud client's destroy_node.
    self.make_actor(start_time=0)
    self.check_success_flag(True)
    destroy_node = self.cloud_client.destroy_node
    self.assertTrue(destroy_node.called)
def test_shutdown_cancelled_when_destroy_node_fails(self):
    # destroy_node returning False must leave the success flag False,
    # after exactly one destroy attempt, with DESTROY_FAILED as the reason.
    self.make_mocks(node_broken=True)
    destroy_node = self.cloud_client.destroy_node
    destroy_node.return_value = False
    self.make_actor(start_time=0)
    self.check_success_flag(False, 2)
    self.assertEqual(1, destroy_node.call_count)
    expected_reason = self.ACTOR_CLASS.DESTROY_FAILED
    actual_reason = self.shutdown_actor.cancel_reason.get(self.TIMEOUT)
    self.assertEqual(expected_reason, actual_reason)
264 def test_late_subscribe(self):
266 subscriber = mock.Mock(name='subscriber_mock')
267 self.shutdown_actor.subscribe(subscriber).get(self.TIMEOUT)
268 self.stop_proxy(self.shutdown_actor)
269 self.assertTrue(subscriber.called)
270 self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
271 subscriber.call_args[0][0].actor_ref.actor_urn)
274 class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
276 ACTOR_CLASS = dispatch.ComputeNodeUpdateActor
def make_actor(self):
    """Start ACTOR_CLASS with a mock driver; keep its proxy on self.updater."""
    driver = mock.MagicMock(name='driver_mock')
    self.driver = driver
    actor_ref = self.ACTOR_CLASS.start(driver)
    self.updater = actor_ref.proxy()
282 def test_node_sync(self, *args):
284 cloud_node = testutil.cloud_node_mock()
285 arv_node = testutil.arvados_node_mock()
286 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
287 self.driver().sync_node.assert_called_with(cloud_node, arv_node)
290 def test_node_sync_error(self, *args):
292 cloud_node = testutil.cloud_node_mock()
293 arv_node = testutil.arvados_node_mock()
294 self.driver().sync_node.side_effect = (IOError, Exception, True)
295 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
296 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
297 self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
298 self.driver().sync_node.assert_called_with(cloud_node, arv_node)
300 class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def make_mocks(self, node_num):
    """Create the mock collaborators for a monitor actor test.

    node_num is passed to testutil.cloud_node_mock() to build the cloud
    node.  The mock shutdown timer is set to state (False, 300) and
    cloud_client.broken() is set to return False.
    """
    shutdown_timer = testutil.MockShutdownTimer()
    shutdown_timer._set_state(False, 300)
    self.shutdowns = shutdown_timer

    self.timer = mock.MagicMock(name='timer_mock')
    self.updates = mock.MagicMock(name='update_mock')
    self.cloud_mock = testutil.cloud_node_mock(node_num)
    self.subscriber = mock.Mock(name='subscriber_mock')

    cloud_client = mock.MagicMock(name='cloud_client')
    cloud_client.broken.return_value = False
    self.cloud_client = cloud_client
def make_actor(self, node_num=1, arv_node=None, start_time=None):
    """Start a ComputeNodeMonitorActor and subscribe self.subscriber to it.

    make_mocks(node_num) is called first when no ``cloud_mock`` attribute
    exists yet.  start_time defaults to the current time.  The actor's
    proxy is stored on self.node_actor.
    """
    if not hasattr(self, 'cloud_mock'):
        self.make_mocks(node_num)
    started_at = time.time() if start_time is None else start_time
    actor_ref = dispatch.ComputeNodeMonitorActor.start(
        self.cloud_mock, started_at, self.shutdowns,
        testutil.cloud_node_fqdn, self.timer, self.updates,
        self.cloud_client, arv_node, boot_fail_after=300)
    self.node_actor = actor_ref.proxy()
    # Wait for the subscription to be processed before returning.
    subscription = self.node_actor.subscribe(self.subscriber)
    subscription.get(self.TIMEOUT)
def node_state(self, *states):
    """Return the monitor actor's in_state(*states) answer, waiting up to self.TIMEOUT."""
    reply = self.node_actor.in_state(*states)
    return reply.get(self.TIMEOUT)
326 def test_in_state_when_unpaired(self):
328 self.assertTrue(self.node_state('unpaired'))
def test_in_state_when_pairing_stale(self):
    # An Arvados record built with age=90000 and no job must read as 'down'.
    stale_node = testutil.arvados_node_mock(job_uuid=None, age=90000)
    self.make_actor(arv_node=stale_node)
    is_down = self.node_state('down')
    self.assertTrue(is_down)
def test_in_state_when_no_state_available(self):
    # A paired node whose record has crunch_worker_state=None is expected
    # to be reported as 'idle'.
    self.make_actor(arv_node=testutil.arvados_node_mock(
        crunch_worker_state=None))
    # Fetch the state with a bounded wait (the old debug print blocked
    # without a timeout) and show it only if the assertion fails.
    state = self.node_actor.get_state().get(self.TIMEOUT)
    self.assertTrue(self.node_state('idle'),
                    "expected node to be idle, state is {!r}".format(state))
def test_in_state_when_no_state_available_old(self):
    # A record with crunch_worker_state=None and age=90000 is expected to
    # be reported as 'down'.
    self.make_actor(arv_node=testutil.arvados_node_mock(
        crunch_worker_state=None, age=90000))
    # Fetch the state with a bounded wait (the old debug print blocked
    # without a timeout) and show it only if the assertion fails.
    state = self.node_actor.get_state().get(self.TIMEOUT)
    self.assertTrue(self.node_state('down'),
                    "expected node to be down, state is {!r}".format(state))
def test_in_idle_state(self):
    # A node with no job must match 'idle', must not match 'busy', and
    # must match when both states are offered.
    idle_node = testutil.arvados_node_mock(job_uuid=None)
    self.make_actor(2, arv_node=idle_node)
    checks = (
        (('idle',), self.assertTrue),
        (('busy',), self.assertFalse),
        (('idle', 'busy'), self.assertTrue),
    )
    for states, check in checks:
        check(self.node_state(*states))
def test_in_busy_state(self):
    # A node with a job must not match 'idle', must match 'busy', and
    # must match when both states are offered.
    busy_node = testutil.arvados_node_mock(job_uuid=True)
    self.make_actor(3, arv_node=busy_node)
    checks = (
        (('idle',), self.assertFalse),
        (('busy',), self.assertTrue),
        (('idle', 'busy'), self.assertTrue),
    )
    for states, check in checks:
        check(self.node_state(*states))
359 def test_init_shutdown_scheduling(self):
361 self.assertTrue(self.timer.schedule.called)
362 self.assertEqual(300, self.timer.schedule.call_args[0][0])
364 def test_shutdown_window_close_scheduling(self):
366 self.shutdowns._set_state(False, 600)
367 self.timer.schedule.reset_mock()
368 self.node_actor.consider_shutdown().get(self.TIMEOUT)
369 self.stop_proxy(self.node_actor)
370 self.assertTrue(self.timer.schedule.called)
371 self.assertEqual(600, self.timer.schedule.call_args[0][0])
372 self.assertFalse(self.subscriber.called)
def test_shutdown_subscription(self):
    # Once the shutdown window is open, consider_shutdown() must notify
    # the subscriber, passing an object that refers to this same actor.
    self.make_actor(start_time=0)
    self.shutdowns._set_state(True, 600)
    self.node_actor.consider_shutdown().get(self.TIMEOUT)
    self.assertTrue(self.subscriber.called)
    own_urn = self.node_actor.actor_ref.actor_urn
    notified_with = self.subscriber.call_args[0][0]
    self.assertEqual(own_urn, notified_with.actor_ref.actor_urn)
382 def test_no_shutdown_booting(self):
384 self.shutdowns._set_state(True, 600)
385 self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
386 (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
def test_shutdown_without_arvados_node(self):
    # An unpaired node started at time 0 is expected to be eligible for
    # shutdown once the shutdown window is open.
    self.make_actor(start_time=0)
    self.shutdowns._set_state(True, 600)
    expected = (
        True,
        "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing(self):
    # A 'down' node whose last ping is dated 1970 is expected to be
    # eligible for shutdown while the window is open.
    arv_node = testutil.arvados_node_mock(
        10, job_uuid=None,
        crunch_worker_state="down",
        last_ping_at='1970-01-01T01:02:03.04050607Z')
    self.make_actor(10, arv_node)
    self.shutdowns._set_state(True, 600)
    expected = (
        True,
        "node state is ('down', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_running_broken(self):
    # A 'down' node that the cloud client reports as broken is expected
    # to be eligible for shutdown while the window is open.
    arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                          crunch_worker_state="down")
    self.make_actor(12, arv_node)
    self.shutdowns._set_state(True, 600)
    self.cloud_client.broken.return_value = True
    expected = (
        True,
        "node state is ('down', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing_broken(self):
    # A 'down' node with a 1970 last ping that the cloud client also
    # reports as broken is expected to be eligible for shutdown.
    arv_node = testutil.arvados_node_mock(
        11, job_uuid=None,
        crunch_worker_state="down",
        last_ping_at='1970-01-01T01:02:03.04050607Z')
    self.make_actor(11, arv_node)
    self.shutdowns._set_state(True, 600)
    self.cloud_client.broken.return_value = True
    expected = (
        True,
        "node state is ('down', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias, and the
    # arguments now follow the (expected, actual) order of sibling tests.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_window_closed(self):
    # The shutdown timer is left at its make_mocks() state, so an idle
    # node is expected to be ineligible with a 'closed' window.
    self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
    expected = (
        False,
        "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_running_job(self):
    # A node with a job is expected to be ineligible for shutdown even
    # while the shutdown window is open.
    self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
    self.shutdowns._set_state(True, 600)
    expected = (
        False,
        "node state is ('busy', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_when_node_state_unknown(self):
    # A node whose record has crunch_worker_state=None is expected to be
    # reported as 'idle' and eligible for shutdown.
    self.make_actor(5, testutil.arvados_node_mock(
        5, crunch_worker_state=None))
    self.shutdowns._set_state(True, 600)
    expected = (
        True,
        "node state is ('idle', 'open', 'boot wait', 'idle exceeded')")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_state_stale(self):
    # A record built with age=90000 is expected to be rejected as stale
    # rather than considered for shutdown.
    self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
    self.shutdowns._set_state(True, 600)
    expected = (False, "node state is stale")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected,
                     self.node_actor.shutdown_eligible().get(self.TIMEOUT))
445 def test_arvados_node_match(self):
447 arv_node = testutil.arvados_node_mock(
448 2, hostname='compute-two.zzzzz.arvadosapi.com')
449 self.cloud_client.node_id.return_value = '2'
450 pair_id = self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)
451 self.assertEqual(self.cloud_mock.id, pair_id)
452 self.stop_proxy(self.node_actor)
453 self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)
455 def test_arvados_node_mismatch(self):
457 arv_node = testutil.arvados_node_mock(1)
459 self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
461 def test_arvados_node_mismatch_first_ping_too_early(self):
463 arv_node = testutil.arvados_node_mock(
464 4, first_ping_at='1971-03-02T14:15:16.1717282Z')
466 self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT))
468 def test_update_cloud_node(self):
471 self.cloud_mock.id = '1'
472 self.node_actor.update_cloud_node(self.cloud_mock)
473 current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
474 self.assertEqual([testutil.ip_address_mock(2)],
475 current_cloud.private_ips)
477 def test_missing_cloud_node_update(self):
479 self.node_actor.update_cloud_node(None)
480 current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
481 self.assertEqual([testutil.ip_address_mock(1)],
482 current_cloud.private_ips)
484 def test_update_arvados_node(self):
486 job_uuid = 'zzzzz-jjjjj-updatejobnode00'
487 new_arvados = testutil.arvados_node_mock(3, job_uuid)
488 self.node_actor.update_arvados_node(new_arvados)
489 current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
490 self.assertEqual(job_uuid, current_arvados['job_uuid'])
def test_missing_arvados_node_update(self):
    # Updating with None must leave the previously paired Arvados record
    # in place, which is checked through its ip_address.
    original_node = testutil.arvados_node_mock(4)
    self.make_actor(4, original_node)
    self.node_actor.update_arvados_node(None)
    current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
    expected_ip = testutil.ip_address_mock(4)
    self.assertEqual(expected_ip, current_arvados['ip_address'])
499 def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
501 self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
503 arv_node = testutil.arvados_node_mock(5)
504 self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
505 self.assertEqual(1, self.updates.sync_node.call_count)
507 def test_update_arvados_node_skips_sync_when_fqdn_match(self):
509 arv_node = testutil.arvados_node_mock(6)
510 self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format(
513 self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
514 self.assertEqual(0, self.updates.sync_node.call_count)