Merge branch 'master' into 13804-no-shutdown-wanted-nodes
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch.py
index 227b5e5f3471ba4cf2e484461cd2c651f26a96e1..aee3cbdac8928cb8237357b9250d595bba349ba9 100644 (file)
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
@@ -11,13 +14,24 @@ import mock
 import pykka
 import threading
 
+from libcloud.common.exceptions import BaseHTTPError
+
 import arvnodeman.computenode.dispatch as dispatch
+import arvnodeman.status as status
+from arvnodeman.computenode.driver import BaseComputeNodeDriver
 from . import testutil
 
 class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
+    ACTOR_CLASS = dispatch.ComputeNodeSetupActor
+
     def make_mocks(self, arvados_effect=None):
         if arvados_effect is None:
-            arvados_effect = [testutil.arvados_node_mock()]
+            arvados_effect = [testutil.arvados_node_mock(
+                slot_number=None,
+                hostname=None,
+                first_ping_at=None,
+                last_ping_at=None,
+            )]
         self.arvados_effect = arvados_effect
         self.timer = testutil.MockTimer()
         self.api_client = mock.MagicMock(name='api_client')
@@ -29,7 +43,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
     def make_actor(self, arv_node=None):
         if not hasattr(self, 'timer'):
             self.make_mocks(arvados_effect=[arv_node] if arv_node else None)
-        self.setup_actor = dispatch.ComputeNodeSetupActor.start(
+        self.setup_actor = self.ACTOR_CLASS.start(
             self.timer, self.api_client, self.cloud_client,
             testutil.MockSize(1), arv_node).proxy()
 
@@ -50,6 +64,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.assertEqual(self.arvados_effect[-1],
                          self.setup_actor.arvados_node.get(self.TIMEOUT))
         assert(finished.wait(self.TIMEOUT))
+        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
         self.assertEqual(1, self.api_client.nodes().create().execute.call_count)
         self.assertEqual(1, self.api_client.nodes().update().execute.call_count)
         self.assert_node_properties_updated()
@@ -65,7 +80,8 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
                          self.setup_actor.arvados_node.get(self.TIMEOUT))
         assert(finished.wait(self.TIMEOUT))
         self.assert_node_properties_updated()
-        self.assertEqual(2, self.api_client.nodes().update().execute.call_count)
+        self.api_client.nodes().create.called_with(body={}, assign_slot=True)
+        self.assertEqual(3, self.api_client.nodes().update().execute.call_count)
         self.assertEqual(self.cloud_client.create_node(),
                          self.setup_actor.cloud_node.get(self.TIMEOUT))
 
@@ -86,6 +102,29 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.make_actor()
         self.wait_for_assignment(self.setup_actor, 'cloud_node')
 
+    def test_basehttperror_retried(self):
+        self.make_mocks()
+        self.cloud_client.create_node.side_effect = [
+            BaseHTTPError(500, "Try again"),
+            self.cloud_client.create_node.return_value,
+            ]
+        self.make_actor()
+        self.wait_for_assignment(self.setup_actor, 'cloud_node')
+        self.setup_actor.ping().get(self.TIMEOUT)
+        self.assertEqual(1, self.cloud_client.post_create_node.call_count)
+
+    def test_instance_exceeded_not_retried(self):
+        self.make_mocks()
+        self.cloud_client.create_node.side_effect = [
+            BaseHTTPError(400, "InstanceLimitExceeded"),
+            self.cloud_client.create_node.return_value,
+            ]
+        self.make_actor()
+        done = self.FUTURE_CLASS()
+        self.setup_actor.subscribe(done.set)
+        done.get(self.TIMEOUT)
+        self.assertEqual(0, self.cloud_client.post_create_node.call_count)
+
     def test_failed_post_create_retried(self):
         self.make_mocks()
         self.cloud_client.post_create_node.side_effect = [
@@ -123,6 +162,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.api_client.nodes().create().execute.side_effect = retry_resp
         self.api_client.nodes().update().execute.side_effect = retry_resp
         self.wait_for_assignment(self.setup_actor, 'cloud_node')
+        self.setup_actor.ping().get(self.TIMEOUT)
         self.assertEqual(self.setup_actor.actor_ref.actor_urn,
                          subscriber.call_args[0][0].actor_ref.actor_urn)
 
@@ -158,7 +198,7 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
             start_time = time.time()
         monitor_actor = dispatch.ComputeNodeMonitorActor.start(
             self.cloud_node, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+            self.timer, self.updates, self.cloud_client,
             self.arvados_node)
         self.shutdown_actor = self.ACTOR_CLASS.start(
             self.timer, self.cloud_client, self.arvados_client, monitor_actor,
@@ -168,23 +208,43 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
     def check_success_flag(self, expected, allow_msg_count=1):
         # allow_msg_count is the number of internal messages that may
         # need to be handled for shutdown to finish.
-        for try_num in range(1 + allow_msg_count):
+        for _ in range(1 + allow_msg_count):
             last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
             if last_flag is expected:
                 break
         else:
             self.fail("success flag {} is not {}".format(last_flag, expected))
 
-    def test_uncancellable_shutdown(self, *mocks):
-        self.make_mocks(shutdown_open=False)
-        self.cloud_client.destroy_node.return_value = False
+    def test_boot_failure_counting(self, *mocks):
+        # A boot failure happens when a node transitions from unpaired to shutdown
+        status.tracker.update({'boot_failures': 0})
+        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="unpaired"))
+        self.cloud_client.destroy_node.return_value = True
         self.make_actor(cancellable=False)
-        self.check_success_flag(None, 0)
-        self.shutdowns._set_state(True, 600)
+        self.check_success_flag(True, 2)
+        self.assertTrue(self.cloud_client.destroy_node.called)
+        self.assertEqual(1, status.tracker.get('boot_failures'))
+
+    def test_cancellable_shutdown(self, *mocks):
+        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
         self.cloud_client.destroy_node.return_value = True
-        self.check_success_flag(True)
+        self.make_actor(cancellable=True)
+        self.check_success_flag(False, 2)
+        self.assertFalse(self.cloud_client.destroy_node.called)
+
+    def test_uncancellable_shutdown(self, *mocks):
+        status.tracker.update({'boot_failures': 0})
+        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
+        self.cloud_client.destroy_node.return_value = True
+        self.make_actor(cancellable=False)
+        self.check_success_flag(True, 4)
+        self.assertTrue(self.cloud_client.destroy_node.called)
+        # A normal shutdown shouldn't be counted as boot failure
+        self.assertEqual(0, status.tracker.get('boot_failures'))
 
     def test_arvados_node_cleaned_after_shutdown(self, *mocks):
+        if len(mocks) == 1:
+            mocks[0].return_value = "drain\n"
         cloud_node = testutil.cloud_node_mock(62)
         arv_node = testutil.arvados_node_mock(62)
         self.make_mocks(cloud_node, arv_node)
@@ -202,12 +262,15 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         self.assertTrue(update_mock().execute.called)
 
     def test_arvados_node_not_cleaned_after_shutdown_cancelled(self, *mocks):
+        if len(mocks) == 1:
+            mocks[0].return_value = "idle\n"
         cloud_node = testutil.cloud_node_mock(61)
         arv_node = testutil.arvados_node_mock(61)
         self.make_mocks(cloud_node, arv_node, shutdown_open=False)
         self.cloud_client.destroy_node.return_value = False
         self.make_actor(cancellable=True)
         self.shutdown_actor.cancel_shutdown("test")
+        self.shutdown_actor.ping().get(self.TIMEOUT)
         self.check_success_flag(False, 2)
         self.assertFalse(self.arvados_client.nodes().update.called)
 
@@ -221,21 +284,13 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.check_success_flag(True)
         self.assertTrue(self.cloud_client.destroy_node.called)
 
-    def test_shutdown_retries_when_cloud_fails(self):
-        self.make_mocks()
-        self.cloud_client.destroy_node.return_value = False
-        self.make_actor(start_time=0)
-        self.assertIsNone(self.shutdown_actor.success.get(self.TIMEOUT))
-        self.cloud_client.destroy_node.return_value = True
-        self.check_success_flag(True)
-
-    def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self):
+    def test_shutdown_cancelled_when_destroy_node_fails(self):
         self.make_mocks(node_broken=True)
         self.cloud_client.destroy_node.return_value = False
         self.make_actor(start_time=0)
         self.check_success_flag(False, 2)
         self.assertEqual(1, self.cloud_client.destroy_node.call_count)
-        self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN,
+        self.assertEqual(self.ACTOR_CLASS.DESTROY_FAILED,
                          self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
 
     def test_late_subscribe(self):
@@ -250,11 +305,14 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
 
 class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
                                      unittest.TestCase):
+    ACTOR_CLASS = dispatch.ComputeNodeUpdateActor
+
     def make_actor(self):
         self.driver = mock.MagicMock(name='driver_mock')
-        self.updater = dispatch.ComputeNodeUpdateActor.start(self.driver).proxy()
+        self.timer = mock.MagicMock(name='timer_mock')
+        self.updater = self.ACTOR_CLASS.start(self.driver, self.timer).proxy()
 
-    def test_node_sync(self):
+    def test_node_sync(self, *args):
         self.make_actor()
         cloud_node = testutil.cloud_node_mock()
         arv_node = testutil.arvados_node_mock()
@@ -262,7 +320,7 @@ class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
         self.driver().sync_node.assert_called_with(cloud_node, arv_node)
 
     @testutil.no_sleep
-    def test_node_sync_error(self):
+    def test_node_sync_error(self, *args):
         self.make_actor()
         cloud_node = testutil.cloud_node_mock()
         arv_node = testutil.arvados_node_mock()
@@ -291,7 +349,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
             start_time = time.time()
         self.node_actor = dispatch.ComputeNodeMonitorActor.start(
             self.cloud_mock, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+            self.timer, self.updates, self.cloud_client,
             arv_node, boot_fail_after=300).proxy()
         self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
 
@@ -310,26 +368,34 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_in_state_when_no_state_available(self):
         self.make_actor(arv_node=testutil.arvados_node_mock(
                 crunch_worker_state=None))
-        print(self.node_actor.get_state().get())
         self.assertTrue(self.node_state('idle'))
 
     def test_in_state_when_no_state_available_old(self):
         self.make_actor(arv_node=testutil.arvados_node_mock(
                 crunch_worker_state=None, age=90000))
-        print(self.node_actor.get_state().get())
         self.assertTrue(self.node_state('down'))
 
     def test_in_idle_state(self):
+        idle_nodes_before = status.tracker._idle_nodes.keys()
         self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
         self.assertTrue(self.node_state('idle'))
         self.assertFalse(self.node_state('busy'))
         self.assertTrue(self.node_state('idle', 'busy'))
+        idle_nodes_after = status.tracker._idle_nodes.keys()
+        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
+        # There should be 1 additional idle node
+        self.assertEqual(1, len(new_idle_nodes))
 
     def test_in_busy_state(self):
+        idle_nodes_before = status.tracker._idle_nodes.keys()
         self.make_actor(3, arv_node=testutil.arvados_node_mock(job_uuid=True))
         self.assertFalse(self.node_state('idle'))
         self.assertTrue(self.node_state('busy'))
         self.assertTrue(self.node_state('idle', 'busy'))
+        idle_nodes_after = status.tracker._idle_nodes.keys()
+        new_idle_nodes = [n for n in idle_nodes_after if n not in idle_nodes_before]
+        # There shouldn't be any additional idle node
+        self.assertEqual(0, len(new_idle_nodes))
 
     def test_init_shutdown_scheduling(self):
         self.make_actor()
@@ -358,12 +424,21 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor()
         self.shutdowns._set_state(True, 600)
         self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
+                          (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
+
+    def test_shutdown_when_invalid_cloud_node_size(self):
+        self.make_mocks(1)
+        self.cloud_mock.size.id = 'invalid'
+        self.cloud_mock.extra['arvados_node_size'] = 'stale.type'
+        self.make_actor()
+        self.shutdowns._set_state(True, 600)
+        self.assertEquals((True, "node's size tag 'stale.type' not recognizable"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing(self):
@@ -372,7 +447,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_running_broken(self):
@@ -381,7 +456,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing_broken(self):
@@ -391,7 +466,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
@@ -401,7 +476,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_when_node_state_unknown(self):
@@ -411,6 +486,13 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
+    def test_shutdown_when_node_state_fail(self):
+        self.make_actor(5, testutil.arvados_node_mock(
+            5, crunch_worker_state='fail'))
+        self.shutdowns._set_state(True, 600)
+        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)
@@ -471,19 +553,10 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertEqual(testutil.ip_address_mock(4),
                          current_arvados['ip_address'])
 
-    def test_update_arvados_node_syncs_when_fqdn_mismatch(self):
+    def test_update_arvados_node_calls_sync_node(self):
         self.make_mocks(5)
         self.cloud_mock.extra['testname'] = 'cloudfqdn.zzzzz.arvadosapi.com'
         self.make_actor()
         arv_node = testutil.arvados_node_mock(5)
         self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
         self.assertEqual(1, self.updates.sync_node.call_count)
-
-    def test_update_arvados_node_skips_sync_when_fqdn_match(self):
-        self.make_mocks(6)
-        arv_node = testutil.arvados_node_mock(6)
-        self.cloud_mock.extra['testname'] ='{n[hostname]}.{n[domain]}'.format(
-            n=arv_node)
-        self.make_actor()
-        self.node_actor.update_arvados_node(arv_node).get(self.TIMEOUT)
-        self.assertEqual(0, self.updates.sync_node.call_count)