Merge branch 'master' into 13804-no-shutdown-wanted-nodes
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch.py
index 0a2deb8a9cdd70ca7a72f1ef41b067bbe2f00ea4..aee3cbdac8928cb8237357b9250d595bba349ba9 100644 (file)
@@ -17,6 +17,7 @@ import threading
 from libcloud.common.exceptions import BaseHTTPError
 
 import arvnodeman.computenode.dispatch as dispatch
+import arvnodeman.status as status
 from arvnodeman.computenode.driver import BaseComputeNodeDriver
 from . import testutil
 
@@ -207,13 +208,23 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
     def check_success_flag(self, expected, allow_msg_count=1):
         # allow_msg_count is the number of internal messages that may
         # need to be handled for shutdown to finish.
-        for try_num in range(1 + allow_msg_count):
+        for _ in range(1 + allow_msg_count):
             last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
             if last_flag is expected:
                 break
         else:
             self.fail("success flag {} is not {}".format(last_flag, expected))
 
+    def test_boot_failure_counting(self, *mocks):
+        # A boot failure happens when a node transitions from unpaired to shutdown
+        status.tracker.update({'boot_failures': 0})
+        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="unpaired"))
+        self.cloud_client.destroy_node.return_value = True
+        self.make_actor(cancellable=False)
+        self.check_success_flag(True, 2)
+        self.assertTrue(self.cloud_client.destroy_node.called)
+        self.assertEqual(1, status.tracker.get('boot_failures'))
+
     def test_cancellable_shutdown(self, *mocks):
         self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
         self.cloud_client.destroy_node.return_value = True
@@ -222,11 +233,14 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         self.assertFalse(self.cloud_client.destroy_node.called)
 
     def test_uncancellable_shutdown(self, *mocks):
+        status.tracker.update({'boot_failures': 0})
         self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
         self.cloud_client.destroy_node.return_value = True
         self.make_actor(cancellable=False)
         self.check_success_flag(True, 4)
         self.assertTrue(self.cloud_client.destroy_node.called)
+        # A normal shutdown shouldn't be counted as boot failure
+        self.assertEqual(0, status.tracker.get('boot_failures'))
 
     def test_arvados_node_cleaned_after_shutdown(self, *mocks):
         if len(mocks) == 1:
@@ -362,16 +376,26 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertTrue(self.node_state('down'))
 
     def test_in_idle_state(self):
+        idle_nodes_before = status.tracker._idle_nodes.keys()
         self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
         self.assertTrue(self.node_state('idle'))
         self.assertFalse(self.node_state('busy'))
         self.assertTrue(self.node_state('idle', 'busy'))
+        idle_nodes_after = status.tracker._idle_nodes.keys()
+        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
+        # There should be 1 additional idle node
+        self.assertEqual(1, len(new_idle_nodes))
 
     def test_in_busy_state(self):
+        idle_nodes_before = status.tracker._idle_nodes.keys()
         self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True))
         self.assertFalse(self.node_state('idle'))
         self.assertTrue(self.node_state('busy'))
         self.assertTrue(self.node_state('idle', 'busy'))
+        idle_nodes_after = status.tracker._idle_nodes.keys()
+        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
+        # There shouldn't be any additional idle node
+        self.assertEqual(0, len(new_idle_nodes))
 
     def test_init_shutdown_scheduling(self):
         self.make_actor()
@@ -400,12 +424,21 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor()
         self.shutdowns._set_state(True, 600)
         self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
+                          (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
+
+    def test_shutdown_when_invalid_cloud_node_size(self):
+        self.make_mocks(1)
+        self.cloud_mock.size.id = 'invalid'
+        self.cloud_mock.extra['arvados_node_size'] = 'stale.type'
+        self.make_actor()
+        self.shutdowns._set_state(True, 600)
+        self.assertEquals((True, "node's size tag 'stale.type' not recognizable"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing(self):
@@ -414,7 +447,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_running_broken(self):
@@ -423,7 +456,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing_broken(self):
@@ -433,7 +466,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
@@ -443,7 +476,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_when_node_state_unknown(self):
@@ -457,7 +490,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state='fail'))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_state_stale(self):