X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/bf3f076d4e1559937dfc4ac27747612df0f8761b..8a41cc44ee196c9347785baa476a370abe77c75c:/services/nodemanager/tests/test_computenode_dispatch.py diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py index 227b5e5f34..aee3cbdac8 100644 --- a/services/nodemanager/tests/test_computenode_dispatch.py +++ b/services/nodemanager/tests/test_computenode_dispatch.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 from __future__ import absolute_import, print_function @@ -11,13 +14,24 @@ import mock import pykka import threading +from libcloud.common.exceptions import BaseHTTPError + import arvnodeman.computenode.dispatch as dispatch +import arvnodeman.status as status +from arvnodeman.computenode.driver import BaseComputeNodeDriver from . import testutil class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase): + ACTOR_CLASS = dispatch.ComputeNodeSetupActor + def make_mocks(self, arvados_effect=None): if arvados_effect is None: - arvados_effect = [testutil.arvados_node_mock()] + arvados_effect = [testutil.arvados_node_mock( + slot_number=None, + hostname=None, + first_ping_at=None, + last_ping_at=None, + )] self.arvados_effect = arvados_effect self.timer = testutil.MockTimer() self.api_client = mock.MagicMock(name='api_client') @@ -29,7 +43,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase): def make_actor(self, arv_node=None): if not hasattr(self, 'timer'): self.make_mocks(arvados_effect=[arv_node] if arv_node else None) - self.setup_actor = dispatch.ComputeNodeSetupActor.start( + self.setup_actor = self.ACTOR_CLASS.start( self.timer, self.api_client, self.cloud_client, testutil.MockSize(1), arv_node).proxy() @@ -50,6 +64,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase): self.assertEqual(self.arvados_effect[-1], self.setup_actor.arvados_node.get(self.TIMEOUT)) assert(finished.wait(self.TIMEOUT)) + self.api_client.nodes().create.called_with(body={}, assign_slot=True) self.assertEqual(1, self.api_client.nodes().create().execute.call_count) self.assertEqual(1, self.api_client.nodes().update().execute.call_count) self.assert_node_properties_updated() @@ -65,7 +80,8 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase): self.setup_actor.arvados_node.get(self.TIMEOUT)) assert(finished.wait(self.TIMEOUT)) self.assert_node_properties_updated() - self.assertEqual(2, self.api_client.nodes().update().execute.call_count) + self.api_client.nodes().create.called_with(body={}, assign_slot=True) + self.assertEqual(3, self.api_client.nodes().update().execute.call_count) self.assertEqual(self.cloud_client.create_node(), self.setup_actor.cloud_node.get(self.TIMEOUT)) @@ -86,6 +102,29 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase): self.make_actor() self.wait_for_assignment(self.setup_actor, 'cloud_node') + def test_basehttperror_retried(self): + self.make_mocks() + self.cloud_client.create_node.side_effect = [ + BaseHTTPError(500, "Try again"), + self.cloud_client.create_node.return_value, + ] + self.make_actor() + self.wait_for_assignment(self.setup_actor, 'cloud_node') + self.setup_actor.ping().get(self.TIMEOUT) + self.assertEqual(1, self.cloud_client.post_create_node.call_count) + + def test_instance_exceeded_not_retried(self): + self.make_mocks() + self.cloud_client.create_node.side_effect = [ + BaseHTTPError(400, "InstanceLimitExceeded"), + self.cloud_client.create_node.return_value, + ] + self.make_actor() + done = self.FUTURE_CLASS() + self.setup_actor.subscribe(done.set) + done.get(self.TIMEOUT) + self.assertEqual(0, self.cloud_client.post_create_node.call_count) + def test_failed_post_create_retried(self): self.make_mocks() self.cloud_client.post_create_node.side_effect = [ @@ -123,6 +162,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase): self.api_client.nodes().create().execute.side_effect = retry_resp self.api_client.nodes().update().execute.side_effect = retry_resp self.wait_for_assignment(self.setup_actor, 'cloud_node') + self.setup_actor.ping().get(self.TIMEOUT) self.assertEqual(self.setup_actor.actor_ref.actor_urn, subscriber.call_args[0][0].actor_ref.actor_urn) @@ -158,7 +198,7 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin): start_time = time.time() monitor_actor = dispatch.ComputeNodeMonitorActor.start( self.cloud_node, start_time, self.shutdowns, - testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client, + self.timer, self.updates, self.cloud_client, self.arvados_node) self.shutdown_actor = self.ACTOR_CLASS.start( self.timer, self.cloud_client, self.arvados_client, monitor_actor, @@ -168,23 +208,43 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin): def check_success_flag(self, expected, allow_msg_count=1): # allow_msg_count is the number of internal messages that may # need to be handled for shutdown to finish. - for try_num in range(1 + allow_msg_count): + for _ in range(1 + allow_msg_count): last_flag = self.shutdown_actor.success.get(self.TIMEOUT) if last_flag is expected: break else: self.fail("success flag {} is not {}".format(last_flag, expected)) - def test_uncancellable_shutdown(self, *mocks): - self.make_mocks(shutdown_open=False) - self.cloud_client.destroy_node.return_value = False + def test_boot_failure_counting(self, *mocks): + # A boot failure happens when a node transitions from unpaired to shutdown + status.tracker.update({'boot_failures': 0}) + self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="unpaired")) + self.cloud_client.destroy_node.return_value = True self.make_actor(cancellable=False) - self.check_success_flag(None, 0) - self.shutdowns._set_state(True, 600) + self.check_success_flag(True, 2) + self.assertTrue(self.cloud_client.destroy_node.called) + self.assertEqual(1, status.tracker.get('boot_failures')) + + def test_cancellable_shutdown(self, *mocks): + self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy")) self.cloud_client.destroy_node.return_value = True - self.check_success_flag(True) + self.make_actor(cancellable=True) + self.check_success_flag(False, 2) + self.assertFalse(self.cloud_client.destroy_node.called) + + def test_uncancellable_shutdown(self, *mocks): + status.tracker.update({'boot_failures': 0}) + self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy")) + self.cloud_client.destroy_node.return_value = True + self.make_actor(cancellable=False) + self.check_success_flag(True, 4) + self.assertTrue(self.cloud_client.destroy_node.called) + # A normal shutdown shouldn't be counted as boot failure + self.assertEqual(0, status.tracker.get('boot_failures')) def test_arvados_node_cleaned_after_shutdown(self, *mocks): + if len(mocks) == 1: + mocks[0].return_value = "drain\n" cloud_node = testutil.cloud_node_mock(62) arv_node = testutil.arvados_node_mock(62) self.make_mocks(cloud_node, arv_node) @@ -202,12 +262,15 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin): self.assertTrue(update_mock().execute.called) def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks): + if len(mocks) == 1: + mocks[0].return_value = "idle\n" cloud_node = testutil.cloud_node_mock(61) arv_node = testutil.arvados_node_mock(61) self.make_mocks(cloud_node, arv_node, shutdown_open=False) self.cloud_client.destroy_node.return_value = False self.make_actor(cancellable=True) self.shutdown_actor.cancel_shutdown("test") + self.shutdown_actor.ping().get(self.TIMEOUT) self.check_success_flag(False, 2) self.assertFalse(self.arvados_client.nodes().update.called) @@ -221,21 +284,13 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin, self.check_success_flag(True) self.assertTrue(self.cloud_client.destroy_node.called) - def test_shutdown_retries_when_cloud_fails(self): - self.make_mocks() - self.cloud_client.destroy_node.return_value = False - self.make_actor(start_time=0) - self.assertIsNone(self.shutdown_actor.success.get(self.TIMEOUT)) - self.cloud_client.destroy_node.return_value = True - self.check_success_flag(True) - - def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self): + def test_shutdown_cancelled_when_destroy_node_fails(self): self.make_mocks(node_broken=True) self.cloud_client.destroy_node.return_value = False self.make_actor(start_time=0) self.check_success_flag(False, 2) self.assertEqual(1, self.cloud_client.destroy_node.call_count) - self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN, + self.assertEqual(self.ACTOR_CLASS.DESTROY_FAILED, self.shutdown_actor.cancel_reason.get(self.TIMEOUT)) def test_late_subscribe(self): @@ -250,11 +305,14 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin, class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin, unittest.TestCase): + ACTOR_CLASS = dispatch.ComputeNodeUpdateActor + def make_actor(self): self.driver = mock.MagicMock(name='driver_mock') - self.updater = dispatch.ComputeNodeUpdateActor.start(self.driver).proxy() + self.timer = mock.MagicMock(name='timer_mock') + self.updater = self.ACTOR_CLASS.start(self.driver, self.timer).proxy() - def test_node_sync(self): + def test_node_sync(self, *args): self.make_actor() cloud_node = testutil.cloud_node_mock() arv_node = testutil.arvados_node_mock() @@ -262,7 +320,7 @@ class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin, self.driver().sync_node.assert_called_with(cloud_node, arv_node) @testutil.no_sleep - def test_node_sync_error(self): + def test_node_sync_error(self, *args): self.make_actor() cloud_node = testutil.cloud_node_mock() arv_node = testutil.arvados_node_mock() @@ -291,7 +349,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, start_time = time.time() self.node_actor = dispatch.ComputeNodeMonitorActor.start( self.cloud_mock, start_time, self.shutdowns, - testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client, + self.timer, self.updates, self.cloud_client, arv_node, boot_fail_after=300).proxy() self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT) @@ -310,26 +368,34 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, def test_in_state_when_no_state_available(self): self.make_actor(arv_node=testutil.arvados_node_mock( crunch_worker_state=None)) - print(self.node_actor.get_state().get()) self.assertTrue(self.node_state('idle')) def test_in_state_when_no_state_available_old(self): self.make_actor(arv_node=testutil.arvados_node_mock( crunch_worker_state=None, age=90000)) - print(self.node_actor.get_state().get()) self.assertTrue(self.node_state('down')) def test_in_idle_state(self): + idle_nodes_before = status.tracker._idle_nodes.keys() self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None)) self.assertTrue(self.node_state('idle')) self.assertFalse(self.node_state('busy')) self.assertTrue(self.node_state('idle', 'busy')) + idle_nodes_after = status.tracker._idle_nodes.keys() + new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before] + # There should be 1 additional idle node + self.assertEqual(1, len(new_idle_nodes)) def test_in_busy_state(self): + idle_nodes_before = status.tracker._idle_nodes.keys() self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True)) self.assertFalse(self.node_state('idle')) self.assertTrue(self.node_state('busy')) self.assertTrue(self.node_state('idle', 'busy')) + idle_nodes_after = status.tracker._idle_nodes.keys() + new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before] + # There shouldn't be any additional idle node + self.assertEqual(0, len(new_idle_nodes)) def test_init_shutdown_scheduling(self): self.make_actor() @@ -358,12 +424,21 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor() self.shutdowns._set_state(True, 600) self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), - (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')")) + (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')")) + + def test_shutdown_when_invalid_cloud_node_size(self): + self.make_mocks(1) + self.cloud_mock.size.id = 'invalid' + self.cloud_mock.extra['arvados_node_size'] = 'stale.type' + self.make_actor() + self.shutdowns._set_state(True, 600) + self.assertEquals((True, "node's size tag 'stale.type' not recognizable"), + self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_without_arvados_node(self): self.make_actor(start_time=0) self.shutdowns._set_state(True, 600) - self.assertEquals((True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"), + self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_missing(self): @@ -372,7 +447,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, last_ping_at='1970-01-01T01:02:03.04050607Z') self.make_actor(10, arv_node) self.shutdowns._set_state(True, 600) - self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"), + self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_running_broken(self): @@ -381,7 +456,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor(12, arv_node) self.shutdowns._set_state(True, 600) self.cloud_client.broken.return_value = True - self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"), + self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_missing_broken(self): @@ -391,7 +466,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor(11, arv_node) self.shutdowns._set_state(True, 600) self.cloud_client.broken.return_value = True - self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')")) + self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')")) def test_no_shutdown_when_window_closed(self): self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None)) @@ -401,7 +476,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, def test_no_shutdown_when_node_running_job(self): self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True)) self.shutdowns._set_state(True, 600) - self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"), + self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_when_node_state_unknown(self): @@ -411,6 +486,13 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + def test_shutdown_when_node_state_fail(self): + self.make_actor(5, testutil.arvados_node_mock( + 5, crunch_worker_state='fail')) + self.shutdowns._set_state(True, 600) + self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"), + self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + def test_no_shutdown_when_node_state_stale(self): self.make_actor(6, testutil.arvados_node_mock(6, age=90000)) self.shutdowns._set_state(True, 600) @@ -471,19 +553,10 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.assertEqual(testutil.ip_address_mock(4), current_arvados['ip_address']) - def test_update_arvados_node_syncs_when_fqdn_mismatch(self): + def test_update_arvados_node_calls_sync_node(self): self.make_mocks(5) self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com' self.make_actor() arv_node = testutil.arvados_node_mock(5) self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT) self.assertEqual(1, self.updates.sync_node.call_count) - - def test_update_arvados_node_skips_sync_when_fqdn_match(self): - self.make_mocks(6) - arv_node = testutil.arvados_node_mock(6) - self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format( - n=arv_node) - self.make_actor() - self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT) - self.assertEqual(0, self.updates.sync_node.call_count)