5313: Node Manager has cloud-specific logic to get node FQDNs.
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch.py
1 #!/usr/bin/env python
2
3 from __future__ import absolute_import, print_function
4
5 import time
6 import unittest
7
8 import arvados.errors as arverror
9 import httplib2
10 import mock
11 import pykka
12
13 import arvnodeman.computenode.dispatch as dispatch
14 from . import testutil
15
class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
    """Tests for dispatch.ComputeNodeSetupActor using mocked clients."""

    def make_mocks(self, arvados_effect=None):
        """Build the timer and the mock Arvados and cloud clients.

        arvados_effect is installed as the side_effect of both the node
        create and the node update API calls.  Tests in this class pass
        either a list of results (where an exception entry is raised and
        a non-exception entry is returned), or a single exception
        instance, which mock raises on every call.  When omitted, it
        defaults to a one-element list holding a mock Arvados node record.
        """
        if arvados_effect is None:
            arvados_effect = [testutil.arvados_node_mock()]
        self.arvados_effect = arvados_effect
        self.timer = testutil.MockTimer()
        self.api_client = mock.MagicMock(name='api_client')
        self.api_client.nodes().create().execute.side_effect = arvados_effect
        self.api_client.nodes().update().execute.side_effect = arvados_effect
        self.cloud_client = mock.MagicMock(name='cloud_client')
        self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)

    def make_actor(self, arv_node=None):
        """Start the setup actor, building default mocks if none exist.

        arv_node is handed to the actor as its existing Arvados node
        record; None starts the actor without one.
        """
        if not hasattr(self, 'timer'):
            # Only supply an explicit API result when there is a node to
            # return.  Passing [None] would make the mocked API hand back
            # None, so tests comparing the actor's arvados_node against
            # arvados_effect[-1] would pass without checking anything.
            if arv_node is None:
                effect = None
            else:
                effect = [arv_node]
            self.make_mocks(arvados_effect=effect)
        self.setup_actor = dispatch.ComputeNodeSetupActor.start(
            self.timer, self.api_client, self.cloud_client,
            testutil.MockSize(1), arv_node).proxy()

    def test_creation_without_arvados_node(self):
        # With no node record supplied, the create API call must be used
        # and its result must end up as the actor's arvados_node.
        self.make_actor()
        self.assertEqual(self.arvados_effect[-1],
                         self.setup_actor.arvados_node.get(self.TIMEOUT))
        self.assertTrue(self.api_client.nodes().create().execute.called)
        self.assertEqual(self.cloud_client.create_node(),
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_creation_with_arvados_node(self):
        # With a node record supplied, the update API call must be used.
        self.make_actor(testutil.arvados_node_mock())
        self.assertEqual(self.arvados_effect[-1],
                         self.setup_actor.arvados_node.get(self.TIMEOUT))
        self.assertTrue(self.api_client.nodes().update().execute.called)
        self.assertEqual(self.cloud_client.create_node(),
                         self.setup_actor.cloud_node.get(self.TIMEOUT))

    def test_failed_arvados_calls_retried(self):
        # First API call raises a 500 error, the second one succeeds.
        self.make_mocks([
                arverror.ApiError(httplib2.Response({'status': '500'}), ""),
                testutil.arvados_node_mock(),
                ])
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'arvados_node')

    def test_failed_cloud_calls_retried(self):
        # First create_node call raises, the second returns the mock node.
        self.make_mocks()
        self.cloud_client.create_node.side_effect = [
            Exception("test cloud creation error"),
            self.cloud_client.create_node.return_value,
            ]
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')

    def test_failed_post_create_retried(self):
        # First post_create_node call raises, the second returns None.
        self.make_mocks()
        self.cloud_client.post_create_node.side_effect = [
            Exception("test cloud post-create error"), None]
        self.make_actor()
        done = self.FUTURE_CLASS()
        # The future is set when the actor notifies its subscribers.
        self.setup_actor.subscribe(done.set)
        done.get(self.TIMEOUT)
        self.assertEqual(2, self.cloud_client.post_create_node.call_count)

    def test_stop_when_no_cloud_node(self):
        # A bare exception side_effect makes every Arvados API call fail,
        # so the actor never gets as far as having a cloud node.
        self.make_mocks(
            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
        self.make_actor()
        self.setup_actor.stop_if_no_cloud_node()
        self.assertTrue(
            self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))

    def test_no_stop_when_cloud_node(self):
        self.make_actor()
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)
        # NOTE(review): presumably stop_proxy() is truthy only when the
        # actor was still running to be stopped -- confirm in testutil.
        self.assertTrue(self.stop_proxy(self.setup_actor),
                        "actor was stopped by stop_if_no_cloud_node")

    def test_subscribe(self):
        # Start with an always-failing API so the subscriber is
        # registered before setup can finish, then let the API succeed.
        self.make_mocks(
            arverror.ApiError(httplib2.Response({'status': '500'}), ""))
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.setup_actor.subscribe(subscriber)
        self.api_client.nodes().create().execute.side_effect = [
            testutil.arvados_node_mock()]
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        # Fail with a clear assertion rather than a TypeError on
        # call_args if the subscriber was never notified.
        self.assertTrue(subscriber.called)
        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
                         subscriber.call_args[0][0].actor_ref.actor_urn)

    def test_late_subscribe(self):
        # Subscribe only after the cloud node has been assigned.
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        self.wait_for_assignment(self.setup_actor, 'cloud_node')
        self.setup_actor.subscribe(subscriber).get(self.TIMEOUT)
        self.stop_proxy(self.setup_actor)
        # Fail with a clear assertion rather than a TypeError on
        # call_args if the subscriber was never notified.
        self.assertTrue(subscriber.called)
        self.assertEqual(self.setup_actor.actor_ref.actor_urn,
                         subscriber.call_args[0][0].actor_ref.actor_urn)
113
114
class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
    """Fixtures and a shared test for shutdown actor test cases.

    Classes using this mixin provide ACTOR_CLASS, the shutdown actor
    class that make_actor() starts.
    """

    def make_mocks(self, cloud_node=None, arvados_node=None,
                   shutdown_open=True):
        """Create the timers, mock cloud client and node fixtures.

        cloud_node defaults to a fresh testutil.cloud_node_mock().
        shutdown_open is passed to the mock shutdown timer's _set_state()
        together with the value 300.
        """
        self.timer = testutil.MockTimer()
        shutdown_timer = testutil.MockShutdownTimer()
        shutdown_timer._set_state(shutdown_open, 300)
        self.shutdowns = shutdown_timer
        self.cloud_client = mock.MagicMock(name='cloud_client')
        self.updates = mock.MagicMock(name='update_mock')
        if cloud_node is not None:
            self.cloud_node = cloud_node
        else:
            self.cloud_node = testutil.cloud_node_mock()
        self.arvados_node = arvados_node

    def make_actor(self, cancellable=True):
        """Start a monitor actor and a shutdown actor wired to it.

        Default mocks are built first when make_mocks() has not been
        called yet.  Proxies for both actors are stored on the test case.
        """
        if not hasattr(self, 'timer'):
            self.make_mocks()
        monitor_ref = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_node, time.time(), self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates,
            self.arvados_node)
        shutdown_ref = self.ACTOR_CLASS.start(
            self.timer, self.cloud_client, monitor_ref, cancellable)
        self.shutdown_actor = shutdown_ref.proxy()
        self.monitor_actor = monitor_ref.proxy()

    def check_success_flag(self, expected, allow_msg_count=1):
        """Fail unless the actor's success flag becomes `expected`.

        allow_msg_count is the number of internal messages that may
        need to be handled for shutdown to finish; the flag is read up
        to 1 + allow_msg_count times.
        """
        reads_left = 1 + allow_msg_count
        while reads_left > 0:
            last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
            if last_flag is expected:
                return
            reads_left -= 1
        self.fail("success flag {} is not {}".format(last_flag, expected))

    def test_uncancellable_shutdown(self, *mocks):
        # With the window closed and destroy_node failing, an
        # uncancellable shutdown must leave the flag unset (None).
        self.make_mocks(shutdown_open=False)
        destroy_node = self.cloud_client.destroy_node
        destroy_node.return_value = False
        self.make_actor(cancellable=False)
        self.check_success_flag(None, 0)
        # Once the window opens and destroy_node succeeds, it finishes.
        self.shutdowns._set_state(True, 600)
        destroy_node.return_value = True
        self.check_success_flag(True)
157
158
class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
                                       unittest.TestCase):
    """Runs the shutdown tests against dispatch.ComputeNodeShutdownActor."""

    ACTOR_CLASS = dispatch.ComputeNodeShutdownActor

    def test_easy_shutdown(self):
        # Default mocks: shutdown window open, destroy_node is a plain
        # MagicMock call.
        self.make_actor()
        self.check_success_flag(True)
        destroy_node = self.cloud_client.destroy_node
        self.assertTrue(destroy_node.called)

    def test_shutdown_cancelled_when_window_closes(self):
        # A cancellable shutdown with the window closed must report
        # failure without ever asking the cloud to destroy the node.
        self.make_mocks(shutdown_open=False)
        self.make_actor()
        self.check_success_flag(False, 2)
        destroy_node = self.cloud_client.destroy_node
        self.assertFalse(destroy_node.called)

    def test_shutdown_retries_when_cloud_fails(self):
        self.make_mocks()
        destroy_node = self.cloud_client.destroy_node
        destroy_node.return_value = False
        self.make_actor()
        # While destroy_node returns False the flag stays unset.
        pending_flag = self.shutdown_actor.success.get(self.TIMEOUT)
        self.assertIsNone(pending_flag)
        destroy_node.return_value = True
        self.check_success_flag(True)

    def test_late_subscribe(self):
        self.make_actor()
        subscriber = mock.Mock(name='subscriber_mock')
        subscription = self.shutdown_actor.subscribe(subscriber)
        subscription.get(self.TIMEOUT)
        self.stop_proxy(self.shutdown_actor)
        self.assertTrue(subscriber.called)
        # The subscriber's first positional argument must be a proxy
        # for this same actor.
        notified_with = subscriber.call_args[0][0]
        self.assertEqual(self.shutdown_actor.actor_ref.actor_urn,
                         notified_with.actor_ref.actor_urn)
190
191
class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
                                     unittest.TestCase):
    """Tests for dispatch.ComputeNodeUpdateActor with a mock driver."""

    def make_actor(self):
        """Start the update actor with a MagicMock driver factory."""
        self.driver = mock.MagicMock(name='driver_mock')
        actor_ref = dispatch.ComputeNodeUpdateActor.start(self.driver)
        self.updater = actor_ref.proxy()

    def test_node_sync(self):
        # sync_node on the actor must be forwarded, with the same
        # arguments, to the object the driver factory returns.
        self.make_actor()
        cloud_node = testutil.cloud_node_mock()
        arv_node = testutil.arvados_node_mock()
        sync_future = self.updater.sync_node(cloud_node, arv_node)
        sync_future.get(self.TIMEOUT)
        driver_instance = self.driver()
        driver_instance.sync_node.assert_called_with(cloud_node, arv_node)
204
205
class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                      unittest.TestCase):
    """Tests for dispatch.ComputeNodeMonitorActor."""

    def make_mocks(self, node_num):
        """Create the fixtures, with a mock cloud node numbered node_num.

        The mock shutdown timer's state is set to (False, 300).
        """
        shutdown_timer = testutil.MockShutdownTimer()
        shutdown_timer._set_state(False, 300)
        self.shutdowns = shutdown_timer
        self.timer = mock.MagicMock(name='timer_mock')
        self.updates = mock.MagicMock(name='update_mock')
        self.cloud_mock = testutil.cloud_node_mock(node_num)
        self.subscriber = mock.Mock(name='subscriber_mock')

    def make_actor(self, node_num=1, arv_node=None, start_time=None):
        """Start the monitor actor and subscribe self.subscriber to it.

        Mocks are built for node_num unless make_mocks() already ran.
        start_time defaults to the current time.
        """
        if not hasattr(self, 'cloud_mock'):
            self.make_mocks(node_num)
        if start_time is None:
            start_time = time.time()
        actor_ref = dispatch.ComputeNodeMonitorActor.start(
            self.cloud_mock, start_time, self.shutdowns,
            testutil.cloud_node_fqdn, self.timer, self.updates,
            arv_node)
        self.node_actor = actor_ref.proxy()
        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)

    def node_state(self, *states):
        """Return the actor's in_state(*states) result."""
        state_future = self.node_actor.in_state(*states)
        return state_future.get(self.TIMEOUT)

    def _shutdown_eligible(self):
        """Return the actor's shutdown_eligible() result."""
        return self.node_actor.shutdown_eligible().get(self.TIMEOUT)

    def _offer_pair(self, arv_node):
        """Return the actor's offer_arvados_pair(arv_node) result."""
        return self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)

    def test_in_state_when_unpaired(self):
        # No Arvados node record at all.
        self.make_actor()
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_state_when_pairing_stale(self):
        stale_node = testutil.arvados_node_mock(job_uuid=None, age=90000)
        self.make_actor(arv_node=stale_node)
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_state_when_no_state_available(self):
        stateless_node = testutil.arvados_node_mock(crunch_worker_state=None)
        self.make_actor(arv_node=stateless_node)
        self.assertIsNone(self.node_state('idle', 'busy'))

    def test_in_idle_state(self):
        idle_node = testutil.arvados_node_mock(job_uuid=None)
        self.make_actor(2, arv_node=idle_node)
        self.assertTrue(self.node_state('idle'))
        self.assertFalse(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_in_busy_state(self):
        busy_node = testutil.arvados_node_mock(job_uuid=True)
        self.make_actor(3, arv_node=busy_node)
        self.assertFalse(self.node_state('idle'))
        self.assertTrue(self.node_state('busy'))
        self.assertTrue(self.node_state('idle', 'busy'))

    def test_init_shutdown_scheduling(self):
        # make_mocks() sets the shutdown timer to (False, 300); that 300
        # must be the first positional argument of the schedule call.
        self.make_actor()
        schedule = self.timer.schedule
        self.assertTrue(schedule.called)
        self.assertEqual(300, schedule.call_args[0][0])

    def test_shutdown_window_close_scheduling(self):
        self.make_actor()
        self.shutdowns._set_state(False, 600)
        schedule = self.timer.schedule
        # Forget the scheduling done at startup so only the call made
        # by consider_shutdown() is examined.
        schedule.reset_mock()
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.stop_proxy(self.node_actor)
        self.assertTrue(schedule.called)
        self.assertEqual(600, schedule.call_args[0][0])
        self.assertFalse(self.subscriber.called)

    def test_shutdown_subscription(self):
        self.make_actor()
        self.shutdowns._set_state(True, 600)
        self.node_actor.consider_shutdown().get(self.TIMEOUT)
        self.assertTrue(self.subscriber.called)
        # The subscriber's first positional argument must be a proxy
        # for this same actor.
        notified_with = self.subscriber.call_args[0][0]
        self.assertEqual(self.node_actor.actor_ref.actor_urn,
                         notified_with.actor_ref.actor_urn)

    def test_shutdown_without_arvados_node(self):
        self.make_actor()
        self.shutdowns._set_state(True, 600)
        self.assertTrue(self._shutdown_eligible())

    def test_no_shutdown_without_arvados_node_and_old_cloud_node(self):
        # start_time=0 makes the cloud node look very old.
        self.make_actor(start_time=0)
        self.shutdowns._set_state(True, 600)
        self.assertFalse(self._shutdown_eligible())

    def test_no_shutdown_when_window_closed(self):
        # The shutdown timer is left in its default (False, 300) state.
        idle_node = testutil.arvados_node_mock(3, job_uuid=None)
        self.make_actor(3, idle_node)
        self.assertFalse(self._shutdown_eligible())

    def test_no_shutdown_when_node_running_job(self):
        busy_node = testutil.arvados_node_mock(4, job_uuid=True)
        self.make_actor(4, busy_node)
        self.shutdowns._set_state(True, 600)
        self.assertFalse(self._shutdown_eligible())

    def test_no_shutdown_when_node_state_unknown(self):
        stateless_node = testutil.arvados_node_mock(
            5, crunch_worker_state=None)
        self.make_actor(5, stateless_node)
        self.shutdowns._set_state(True, 600)
        self.assertFalse(self._shutdown_eligible())

    def test_no_shutdown_when_node_state_stale(self):
        stale_node = testutil.arvados_node_mock(6, age=90000)
        self.make_actor(6, stale_node)
        self.shutdowns._set_state(True, 600)
        self.assertFalse(self._shutdown_eligible())

    def test_arvados_node_match(self):
        self.make_actor(2)
        arv_node = testutil.arvados_node_mock(
            2, hostname='compute-two.zzzzz.arvadosapi.com')
        pair_id = self._offer_pair(arv_node)
        self.assertEqual(self.cloud_mock.id, pair_id)
        self.stop_proxy(self.node_actor)
        self.updates.sync_node.assert_called_with(self.cloud_mock, arv_node)

    def test_arvados_node_mismatch(self):
        # Cloud node 3 is offered the record for Arvados node 1.
        self.make_actor(3)
        arv_node = testutil.arvados_node_mock(1)
        self.assertIsNone(self._offer_pair(arv_node))

    def test_arvados_node_mismatch_first_ping_too_early(self):
        # Same node number, but the record's first ping is dated 1971.
        self.make_actor(4)
        arv_node = testutil.arvados_node_mock(
            4, first_ping_at='1971-03-02T14:15:16.1717282Z')
        self.assertIsNone(self._offer_pair(arv_node))

    def test_update_cloud_node(self):
        self.make_actor(1)
        # Rebuild the fixtures around a node-2 mock, then give that
        # mock the ID '1' before handing it to the running actor.
        self.make_mocks(2)
        self.cloud_mock.id = '1'
        self.node_actor.update_cloud_node(self.cloud_mock)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        expected_ips = [testutil.ip_address_mock(2)]
        self.assertEqual(expected_ips, current_cloud.private_ips)

    def test_missing_cloud_node_update(self):
        # Updating with None must leave the existing cloud node in place.
        self.make_actor(1)
        self.node_actor.update_cloud_node(None)
        current_cloud = self.node_actor.cloud_node.get(self.TIMEOUT)
        expected_ips = [testutil.ip_address_mock(1)]
        self.assertEqual(expected_ips, current_cloud.private_ips)

    def test_update_arvados_node(self):
        self.make_actor(3)
        job_uuid = 'zzzzz-jjjjj-updatejobnode00'
        new_arvados = testutil.arvados_node_mock(3, job_uuid)
        self.node_actor.update_arvados_node(new_arvados)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        self.assertEqual(job_uuid, current_arvados['job_uuid'])

    def test_missing_arvados_node_update(self):
        # Updating with None must leave the existing record in place.
        self.make_actor(4, testutil.arvados_node_mock(4))
        self.node_actor.update_arvados_node(None)
        current_arvados = self.node_actor.arvados_node.get(self.TIMEOUT)
        expected_ip = testutil.ip_address_mock(4)
        self.assertEqual(expected_ip, current_arvados['ip_address'])

    def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
        self.make_mocks(5)
        # NOTE(review): presumably testutil.cloud_node_fqdn reads
        # extra['testname'] -- confirm in testutil.
        self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
        self.make_actor()
        arv_node = testutil.arvados_node_mock(5)
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(1, self.updates.sync_node.call_count)

    def test_update_arvados_node_skips_sync_when_fqdn_match(self):
        self.make_mocks(6)
        arv_node = testutil.arvados_node_mock(6)
        # Give the cloud node the hostname.domain of the Arvados record.
        matching_fqdn = '{n[hostname]}.{n[domain]}'.format(n=arv_node)
        self.cloud_mock.extra['testname'] = matching_fqdn
        self.make_actor()
        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
        self.assertEqual(0, self.updates.sync_node.call_count)