X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/3d6d097f42128bde90b7bc184057a84e99ea3e0a..426103b2d2f071ab0d57b3f9aaea58a3f8455c4f:/services/nodemanager/tests/test_computenode_dispatch.py diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py index 0a2deb8a9c..aee3cbdac8 100644 --- a/services/nodemanager/tests/test_computenode_dispatch.py +++ b/services/nodemanager/tests/test_computenode_dispatch.py @@ -17,6 +17,7 @@ import threading from libcloud.common.exceptions import BaseHTTPError import arvnodeman.computenode.dispatch as dispatch +import arvnodeman.status as status from arvnodeman.computenode.driver import BaseComputeNodeDriver from . import testutil @@ -207,13 +208,23 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin): def check_success_flag(self, expected, allow_msg_count=1): # allow_msg_count is the number of internal messages that may # need to be handled for shutdown to finish. - for try_num in range(1 + allow_msg_count): + for _ in range(1 + allow_msg_count): last_flag = self.shutdown_actor.success.get(self.TIMEOUT) if last_flag is expected: break else: self.fail("success flag {} is not {}".format(last_flag, expected)) + def test_boot_failure_counting(self, *mocks): + # A boot failure happens when a node transitions from unpaired to shutdown + status.tracker.update({'boot_failures': 0}) + self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="unpaired")) + self.cloud_client.destroy_node.return_value = True + self.make_actor(cancellable=False) + self.check_success_flag(True, 2) + self.assertTrue(self.cloud_client.destroy_node.called) + self.assertEqual(1, status.tracker.get('boot_failures')) + def test_cancellable_shutdown(self, *mocks): self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy")) self.cloud_client.destroy_node.return_value = True @@ -222,11 +233,14 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin): self.assertFalse(self.cloud_client.destroy_node.called) def test_uncancellable_shutdown(self, *mocks): + status.tracker.update({'boot_failures': 0}) self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy")) self.cloud_client.destroy_node.return_value = True self.make_actor(cancellable=False) self.check_success_flag(True, 4) self.assertTrue(self.cloud_client.destroy_node.called) + # A normal shutdown shouldn't be counted as boot failure + self.assertEqual(0, status.tracker.get('boot_failures')) def test_arvados_node_cleaned_after_shutdown(self, *mocks): if len(mocks) == 1: @@ -362,16 +376,26 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.assertTrue(self.node_state('down')) def test_in_idle_state(self): + idle_nodes_before = status.tracker._idle_nodes.keys() self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None)) self.assertTrue(self.node_state('idle')) self.assertFalse(self.node_state('busy')) self.assertTrue(self.node_state('idle', 'busy')) + idle_nodes_after = status.tracker._idle_nodes.keys() + new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before] + # There should be 1 additional idle node + self.assertEqual(1, len(new_idle_nodes)) def test_in_busy_state(self): + idle_nodes_before = status.tracker._idle_nodes.keys() self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True)) self.assertFalse(self.node_state('idle')) self.assertTrue(self.node_state('busy')) self.assertTrue(self.node_state('idle', 'busy')) + idle_nodes_after = status.tracker._idle_nodes.keys() + new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before] + # There shouldn't be any additional idle node + self.assertEqual(0, len(new_idle_nodes)) def test_init_shutdown_scheduling(self): self.make_actor() @@ -400,12 +424,21 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor() self.shutdowns._set_state(True, 600) self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), - (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')")) + (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')")) + + def test_shutdown_when_invalid_cloud_node_size(self): + self.make_mocks(1) + self.cloud_mock.size.id = 'invalid' + self.cloud_mock.extra['arvados_node_size'] = 'stale.type' + self.make_actor() + self.shutdowns._set_state(True, 600) + self.assertEquals((True, "node's size tag 'stale.type' not recognizable"), + self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_without_arvados_node(self): self.make_actor(start_time=0) self.shutdowns._set_state(True, 600) - self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'idle exceeded')"), + self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_missing(self): @@ -414,7 +447,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, last_ping_at='1970-01-01T01:02:03.04050607Z') self.make_actor(10, arv_node) self.shutdowns._set_state(True, 600) - self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"), + self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_running_broken(self): @@ -423,7 +456,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor(12, arv_node) self.shutdowns._set_state(True, 600) self.cloud_client.broken.return_value = True - self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"), + self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_missing_broken(self): @@ -433,7 +466,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor(11, arv_node) self.shutdowns._set_state(True, 600) self.cloud_client.broken.return_value = True - self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')")) + self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')")) def test_no_shutdown_when_window_closed(self): self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None)) @@ -443,7 +476,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, def test_no_shutdown_when_node_running_job(self): self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True)) self.shutdowns._set_state(True, 600) - self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"), + self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_shutdown_when_node_state_unknown(self): @@ -457,7 +490,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor(5, testutil.arvados_node_mock( 5, crunch_worker_state='fail')) self.shutdowns._set_state(True, 600) - self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'idle exceeded')"), + self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"), self.node_actor.shutdown_eligible().get(self.TIMEOUT)) def test_no_shutdown_when_node_state_stale(self):