12085: Boot failures counting, with tests.
author Lucas Di Pentima <ldipentima@veritasgenetics.com>
Tue, 27 Mar 2018 13:58:37 +0000 (10:58 -0300)
committer Lucas Di Pentima <ldipentima@veritasgenetics.com>
Thu, 5 Apr 2018 14:17:02 +0000 (11:17 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima@veritasgenetics.com>

services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/tests/test_computenode_dispatch.py

index 37d7088b7a7c65bc8632e21269f465d6850d50b9..340668eff5857e8a49d2f0bcbf711314e3002671 100644 (file)
@@ -20,6 +20,7 @@ from .. import \
     arvados_node_missing, RetryMixin
 from ...clientactor import _notify_subscribers
 from ... import config
+from ... import status
 from .transitions import transitions
 
 QuotaExceeded = "QuotaExceeded"
@@ -272,6 +273,9 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
                 self.cancel_shutdown("No longer eligible for shut down because %s" % reason,
                                      try_resume=True)
                 return
+        # If boot failed, count the event
+        if self._monitor.get_state().get() == 'unpaired':
+            status.tracker.counter_add('boot_failures')
         self._destroy_node()
 
     def _destroy_node(self):
index 0a2deb8a9cdd70ca7a72f1ef41b067bbe2f00ea4..d93c940d3f18e7d1ca2340f6cf75eca2fb2c17c2 100644 (file)
@@ -17,6 +17,7 @@ import threading
 from libcloud.common.exceptions import BaseHTTPError
 
 import arvnodeman.computenode.dispatch as dispatch
+import arvnodeman.status as status
 from arvnodeman.computenode.driver import BaseComputeNodeDriver
 from . import testutil
 
@@ -207,13 +208,23 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
     def check_success_flag(self, expected, allow_msg_count=1):
         # allow_msg_count is the number of internal messages that may
         # need to be handled for shutdown to finish.
-        for try_num in range(1 + allow_msg_count):
+        for _ in range(1 + allow_msg_count):
             last_flag = self.shutdown_actor.success.get(self.TIMEOUT)
             if last_flag is expected:
                 break
         else:
             self.fail("success flag {} is not {}".format(last_flag, expected))
 
+    def test_boot_failure_counting(self, *mocks):
+        # A boot failure is counted when a node still in the 'unpaired' state is shut down
+        status.tracker.update({'boot_failures': 0})
+        self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="unpaired"))
+        self.cloud_client.destroy_node.return_value = True
+        self.make_actor(cancellable=False)
+        self.check_success_flag(True, 2)
+        self.assertTrue(self.cloud_client.destroy_node.called)
+        self.assertEqual(1, status.tracker.get('boot_failures'))
+
     def test_cancellable_shutdown(self, *mocks):
         self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
         self.cloud_client.destroy_node.return_value = True
@@ -222,11 +233,14 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         self.assertFalse(self.cloud_client.destroy_node.called)
 
     def test_uncancellable_shutdown(self, *mocks):
+        status.tracker.update({'boot_failures': 0})
         self.make_mocks(shutdown_open=True, arvados_node=testutil.arvados_node_mock(crunch_worker_state="busy"))
         self.cloud_client.destroy_node.return_value = True
         self.make_actor(cancellable=False)
         self.check_success_flag(True, 4)
         self.assertTrue(self.cloud_client.destroy_node.called)
+        # A normal shutdown must not be counted as a boot failure
+        self.assertEqual(0, status.tracker.get('boot_failures'))
 
     def test_arvados_node_cleaned_after_shutdown(self, *mocks):
         if len(mocks) == 1: