Merge branch '6520-nodemanager-crunchv2' refs #6520
[arvados.git] / services / nodemanager / tests / test_computenode_dispatch.py
index 95b1329fa603009dd65eb7420981c0d1cb1ed5b2..c718dad6b6440919361a1bc6179cbb6610db7c88 100644 (file)
@@ -11,7 +11,10 @@ import mock
 import pykka
 import threading
 
+from libcloud.common.exceptions import BaseHTTPError
+
 import arvnodeman.computenode.dispatch as dispatch
+from arvnodeman.computenode.driver import BaseComputeNodeDriver
 from . import testutil
 
 class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
@@ -25,6 +28,7 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.api_client.nodes().update().execute.side_effect = arvados_effect
         self.cloud_client = mock.MagicMock(name='cloud_client')
         self.cloud_client.create_node.return_value = testutil.cloud_node_mock(1)
+        self.cloud_client.is_cloud_exception = BaseComputeNodeDriver.is_cloud_exception
 
     def make_actor(self, arv_node=None):
         if not hasattr(self, 'timer'):
@@ -86,6 +90,28 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.make_actor()
         self.wait_for_assignment(self.setup_actor, 'cloud_node')
 
+    def test_unknown_basehttperror_not_retried(self):
+        self.make_mocks()
+        self.cloud_client.create_node.side_effect = [
+            BaseHTTPError(400, "Unknown"),
+            self.cloud_client.create_node.return_value,
+            ]
+        self.make_actor()
+        finished = threading.Event()
+        self.setup_actor.subscribe(lambda _: finished.set())
+        assert(finished.wait(self.TIMEOUT))
+        self.assertEqual(0, self.cloud_client.post_create_node.call_count)
+
+    def test_known_basehttperror_retried(self):
+        self.make_mocks()
+        self.cloud_client.create_node.side_effect = [
+            BaseHTTPError(400, "InstanceLimitExceeded"),
+            self.cloud_client.create_node.return_value,
+            ]
+        self.make_actor()
+        self.wait_for_assignment(self.setup_actor, 'cloud_node')
+        self.assertEqual(1, self.cloud_client.post_create_node.call_count)
+
     def test_failed_post_create_retried(self):
         self.make_mocks()
         self.cloud_client.post_create_node.side_effect = [
@@ -175,14 +201,19 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         else:
             self.fail("success flag {} is not {}".format(last_flag, expected))
 
+    def test_cancellable_shutdown(self, *mocks):
+        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
+        self.cloud_client.destroy_node.return_value = True
+        self.make_actor(cancellable=True)
+        self.check_success_flag(False)
+        self.assertFalse(self.cloud_client.destroy_node.called)
+
     def test_uncancellable_shutdown(self, *mocks):
-        self.make_mocks(shutdown_open=False)
-        self.cloud_client.destroy_node.return_value = False
-        self.make_actor(cancellable=False)
-        self.check_success_flag(None, 0)
-        self.shutdowns._set_state(True, 600)
+        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
         self.cloud_client.destroy_node.return_value = True
-        self.check_success_flag(True)
+        self.make_actor(cancellable=False)
+        self.check_success_flag(True, 2)
+        self.assertTrue(self.cloud_client.destroy_node.called)
 
     def test_arvados_node_cleaned_after_shutdown(self, *mocks):
         cloud_node = testutil.cloud_node_mock(62)
@@ -205,7 +236,9 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         cloud_node = testutil.cloud_node_mock(61)
         arv_node = testutil.arvados_node_mock(61)
         self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+        self.cloud_client.destroy_node.return_value = False
         self.make_actor(cancellable=True)
+        self.shutdown_actor.cancel_shutdown("test")
         self.check_success_flag(False, 2)
         self.assertFalse(self.arvados_client.nodes().update.called)
 
@@ -219,29 +252,13 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.check_success_flag(True)
         self.assertTrue(self.cloud_client.destroy_node.called)
 
-    def test_shutdown_cancelled_when_window_closes(self):
-        self.make_mocks(shutdown_open=False)
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.cloud_client.destroy_node.called)
-        self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
-                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
-    def test_shutdown_retries_when_cloud_fails(self):
-        self.make_mocks()
-        self.cloud_client.destroy_node.return_value = False
-        self.make_actor(start_time=0)
-        self.assertIsNone(self.shutdown_actor.success.get(self.TIMEOUT))
-        self.cloud_client.destroy_node.return_value = True
-        self.check_success_flag(True)
-
-    def test_shutdown_cancelled_when_cloud_fails_on_broken_node(self):
+    def test_shutdown_cancelled_when_destroy_node_fails(self):
         self.make_mocks(node_broken=True)
         self.cloud_client.destroy_node.return_value = False
         self.make_actor(start_time=0)
         self.check_success_flag(False, 2)
         self.assertEqual(1, self.cloud_client.destroy_node.call_count)
-        self.assertEqual(self.ACTOR_CLASS.NODE_BROKEN,
+        self.assertEqual(self.ACTOR_CLASS.DESTROY_FAILED,
                          self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
 
     def test_late_subscribe(self):
@@ -256,17 +273,29 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
 
 class ComputeNodeUpdateActorTestCase(testutil.ActorTestMixin,
                                      unittest.TestCase):
+    ACTOR_CLASS = dispatch.ComputeNodeUpdateActor
+
     def make_actor(self):
         self.driver = mock.MagicMock(name='driver_mock')
-        self.updater = dispatch.ComputeNodeUpdateActor.start(self.driver).proxy()
+        self.updater = self.ACTOR_CLASS.start(self.driver).proxy()
 
-    def test_node_sync(self):
+    def test_node_sync(self, *args):
         self.make_actor()
         cloud_node = testutil.cloud_node_mock()
         arv_node = testutil.arvados_node_mock()
         self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
         self.driver().sync_node.assert_called_with(cloud_node, arv_node)
 
+    @testutil.no_sleep
+    def test_node_sync_error(self, *args):
+        self.make_actor()
+        cloud_node = testutil.cloud_node_mock()
+        arv_node = testutil.arvados_node_mock()
+        self.driver().sync_node.side_effect = (IOError, Exception, True)
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.updater.sync_node(cloud_node, arv_node).get(self.TIMEOUT)
+        self.driver().sync_node.assert_called_with(cloud_node, arv_node)
 
 class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                       unittest.TestCase):
@@ -296,17 +325,24 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
 
     def test_in_state_when_unpaired(self):
         self.make_actor()
-        self.assertIsNone(self.node_state('idle', 'busy'))
+        self.assertTrue(self.node_state('unpaired'))
 
     def test_in_state_when_pairing_stale(self):
         self.make_actor(arv_node=testutil.arvados_node_mock(
                 job_uuid=None, age=90000))
-        self.assertIsNone(self.node_state('idle', 'busy'))
+        self.assertTrue(self.node_state('down'))
 
     def test_in_state_when_no_state_available(self):
         self.make_actor(arv_node=testutil.arvados_node_mock(
                 crunch_worker_state=None))
-        self.assertIsNone(self.node_state('idle', 'busy'))
+        print(self.node_actor.get_state().get())
+        self.assertTrue(self.node_state('idle'))
+
+    def test_in_state_when_no_state_available_old(self):
+        self.make_actor(arv_node=testutil.arvados_node_mock(
+                crunch_worker_state=None, age=90000))
+        print(self.node_actor.get_state().get())
+        self.assertTrue(self.node_state('down'))
 
     def test_in_idle_state(self):
         self.make_actor(2, arv_node=testutil.arvados_node_mock(job_uuid=None))
@@ -346,28 +382,32 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_booting(self):
         self.make_actor()
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertEquals((True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_missing(self):
+    def test_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                               crunch_worker_state="down",
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_running_broken(self):
+    def test_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                               crunch_worker_state="down")
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -376,27 +416,31 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
+        self.assertEquals((False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_when_node_state_unknown(self):
+    def test_shutdown_when_node_state_unknown(self):
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state=None))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals((False, "node state is stale"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_arvados_node_match(self):
         self.make_actor(2)