5714: Avoid Node Manager race conditions around stop_if_no_cloud_node. 5714-gce-setup-bugfixes-wip
authorBrett Smith <brett@curoverse.com>
Mon, 13 Apr 2015 20:48:16 +0000 (16:48 -0400)
committerBrett Smith <brett@curoverse.com>
Mon, 13 Apr 2015 20:48:16 +0000 (16:48 -0400)
Checking .is_alive() seems to always lead to race conditions.
Instead, have CloudNodeSetupActor.stop_if_no_cloud_node() return True
if it's going to stop, else False.  Have NodeManagerDaemonActor
respect this return value consistently.

services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/daemon.py
services/nodemanager/tests/test_computenode_dispatch.py
services/nodemanager/tests/test_daemon.py

index d0a8b0d542768b3b90aa7e5964cc7a1565cf3333..0fab1b0fec5f3e0fd0696460089028ab244cfb66 100644 (file)
@@ -139,8 +139,10 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
         self._finished()
 
     def stop_if_no_cloud_node(self):
-        if self.cloud_node is None:
-            self.stop()
+        if self.cloud_node is not None:
+            return False
+        self.stop()
+        return True
 
 
 class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
index 836b673e9267818d37bd6bd750465a8e33a2b2b4..ba52871d39bf79a856fa5e2dfb6d87685fa39b5d 100644 (file)
@@ -299,8 +299,7 @@ class NodeManagerDaemonActor(actor_class):
         if (nodes_excess < 1) or not self.booting:
             return None
         for key, node in self.booting.iteritems():
-            node.stop_if_no_cloud_node().get()
-            if not node.actor_ref.is_alive():
+            if node.stop_if_no_cloud_node().get():
                 del self.booting[key]
                 if nodes_excess > 1:
                     self._later.stop_booting_node()
@@ -345,12 +344,14 @@ class NodeManagerDaemonActor(actor_class):
     def shutdown(self):
         self._logger.info("Shutting down after signal.")
         self.poll_stale_after = -1  # Inhibit starting/stopping nodes
-        for bootnode in self.booting.itervalues():
-            bootnode.stop_if_no_cloud_node()
+        setup_stops = {key: node.stop_if_no_cloud_node()
+                       for key, node in self.booting.iteritems()}
+        self.booting = {key: self.booting[key]
+                        for key in setup_stops if not setup_stops[key].get()}
         self._later.await_shutdown()
 
     def await_shutdown(self):
-        if any(node.actor_ref.is_alive() for node in self.booting.itervalues()):
+        if self.booting:
             self._timer.schedule(time.time() + 1, self._later.await_shutdown)
         else:
             self.stop()
index b8cf0ee408130f8203ac1d53d332ffb28d9b5659..96a70c6c96c794ebb17d05bf7880defe89ca014c 100644 (file)
@@ -79,14 +79,16 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.make_mocks(
             arverror.ApiError(httplib2.Response({'status': '500'}), ""))
         self.make_actor()
-        self.setup_actor.stop_if_no_cloud_node()
+        self.assertTrue(
+            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
         self.assertTrue(
             self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
 
     def test_no_stop_when_cloud_node(self):
         self.make_actor()
         self.wait_for_assignment(self.setup_actor, 'cloud_node')
-        self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)
+        self.assertFalse(
+            self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
         self.assertTrue(self.stop_proxy(self.setup_actor),
                         "actor was stopped by stop_if_no_cloud_node")
 
index dc8fdc3f8496b9d90d43fdabca4b922120875a6f..228b5521151a2ac3bb96af81ac220690f8255974 100644 (file)
@@ -288,6 +288,24 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertTrue(self.last_setup.stop_if_no_cloud_node.called)
 
+    def test_all_booting_nodes_tried_to_shut_down(self):
+        size = testutil.MockSize(2)
+        self.make_daemon(want_sizes=[size])
+        self.daemon.max_nodes.get(self.TIMEOUT)
+        setup1 = self.last_setup
+        setup1.stop_if_no_cloud_node().get.return_value = False
+        setup1.stop_if_no_cloud_node.reset_mock()
+        self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
+        self.daemon.max_nodes.get(self.TIMEOUT)
+        self.assertIsNot(setup1, self.last_setup)
+        self.last_setup.stop_if_no_cloud_node().get.return_value = True
+        self.last_setup.stop_if_no_cloud_node.reset_mock()
+        self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
+        self.daemon.max_nodes.get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.last_setup.stop_if_no_cloud_node.call_count)
+        self.assertTrue(setup1.stop_if_no_cloud_node.called)
+
     def test_shutdown_declined_at_wishlist_capacity(self):
         cloud_node = testutil.cloud_node_mock(1)
         size = testutil.MockSize(1)
@@ -384,6 +402,8 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
     def test_clean_shutdown_waits_for_node_setup_finish(self):
         new_node = self.start_node_boot()
+        new_node.stop_if_no_cloud_node().get.return_value = False
+        new_node.stop_if_no_cloud_node.reset_mock()
         self.daemon.shutdown().get(self.TIMEOUT)
         self.assertTrue(new_node.stop_if_no_cloud_node.called)
         self.daemon.node_up(new_node).get(self.TIMEOUT)
@@ -393,9 +413,11 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
             self.daemon.actor_ref.actor_stopped.wait(self.TIMEOUT))
 
     def test_wishlist_ignored_after_shutdown(self):
-        size = testutil.MockSize(2)
-        self.make_daemon(want_sizes=[size])
+        new_node = self.start_node_boot()
+        new_node.stop_if_no_cloud_node().get.return_value = False
+        new_node.stop_if_no_cloud_node.reset_mock()
         self.daemon.shutdown().get(self.TIMEOUT)
+        size = testutil.MockSize(2)
         self.daemon.update_server_wishlist([size] * 2).get(self.TIMEOUT)
         self.timer.deliver()
         self.stop_proxy(self.daemon)