From 0c166be2c9004dd01a1a4b613cb1f0dddd413d53 Mon Sep 17 00:00:00 2001 From: Brett Smith Date: Mon, 13 Apr 2015 16:48:16 -0400 Subject: [PATCH] 5714: Avoid Node Manager race conditions around stop_if_no_cloud_node. Checking .is_alive() seems to always lead to race conditions. Instead, have CloudNodeSetupActor.stop_if_no_cloud_node() return True if it's going to stop, else False. Have NodeManagerDaemonActor respect this return value consistently. --- .../computenode/dispatch/__init__.py | 6 +++-- services/nodemanager/arvnodeman/daemon.py | 11 ++++---- .../tests/test_computenode_dispatch.py | 6 +++-- services/nodemanager/tests/test_daemon.py | 26 +++++++++++++++++-- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py index d0a8b0d542..0fab1b0fec 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py @@ -139,8 +139,10 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase): self._finished() def stop_if_no_cloud_node(self): - if self.cloud_node is None: - self.stop() + if self.cloud_node is not None: + return False + self.stop() + return True class ComputeNodeShutdownActor(ComputeNodeStateChangeBase): diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index 836b673e92..ba52871d39 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -299,8 +299,7 @@ class NodeManagerDaemonActor(actor_class): if (nodes_excess < 1) or not self.booting: return None for key, node in self.booting.iteritems(): - node.stop_if_no_cloud_node().get() - if not node.actor_ref.is_alive(): + if node.stop_if_no_cloud_node().get(): del self.booting[key] if nodes_excess > 1: self._later.stop_booting_node() @@ -345,12 +344,14 @@ class NodeManagerDaemonActor(actor_class): def shutdown(self): self._logger.info("Shutting down after signal.") self.poll_stale_after = -1 # Inhibit starting/stopping nodes - for bootnode in self.booting.itervalues(): - bootnode.stop_if_no_cloud_node() + setup_stops = {key: node.stop_if_no_cloud_node() + for key, node in self.booting.iteritems()} + self.booting = {key: self.booting[key] + for key in setup_stops if not setup_stops[key].get()} self._later.await_shutdown() def await_shutdown(self): - if any(node.actor_ref.is_alive() for node in self.booting.itervalues()): + if self.booting: self._timer.schedule(time.time() + 1, self._later.await_shutdown) else: self.stop() diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py index b8cf0ee408..96a70c6c96 100644 --- a/services/nodemanager/tests/test_computenode_dispatch.py +++ b/services/nodemanager/tests/test_computenode_dispatch.py @@ -79,14 +79,16 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase): self.make_mocks( arverror.ApiError(httplib2.Response({'status': '500'}), "")) self.make_actor() - self.setup_actor.stop_if_no_cloud_node() + self.assertTrue( + self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)) self.assertTrue( self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT)) def test_no_stop_when_cloud_node(self): self.make_actor() self.wait_for_assignment(self.setup_actor, 'cloud_node') - self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT) + self.assertFalse( + self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)) self.assertTrue(self.stop_proxy(self.setup_actor), "actor was stopped by stop_if_no_cloud_node") diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py index dc8fdc3f84..228b552115 100644 --- a/services/nodemanager/tests/test_daemon.py +++ b/services/nodemanager/tests/test_daemon.py @@ -288,6 +288,24 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin, self.stop_proxy(self.daemon) self.assertTrue(self.last_setup.stop_if_no_cloud_node.called) + def test_all_booting_nodes_tried_to_shut_down(self): + size = testutil.MockSize(2) + self.make_daemon(want_sizes=[size]) + self.daemon.max_nodes.get(self.TIMEOUT) + setup1 = self.last_setup + setup1.stop_if_no_cloud_node().get.return_value = False + setup1.stop_if_no_cloud_node.reset_mock() + self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT) + self.daemon.max_nodes.get(self.TIMEOUT) + self.assertIsNot(setup1, self.last_setup) + self.last_setup.stop_if_no_cloud_node().get.return_value = True + self.last_setup.stop_if_no_cloud_node.reset_mock() + self.daemon.update_server_wishlist([]).get(self.TIMEOUT) + self.daemon.max_nodes.get(self.TIMEOUT) + self.stop_proxy(self.daemon) + self.assertEqual(1, self.last_setup.stop_if_no_cloud_node.call_count) + self.assertTrue(setup1.stop_if_no_cloud_node.called) + def test_shutdown_declined_at_wishlist_capacity(self): cloud_node = testutil.cloud_node_mock(1) size = testutil.MockSize(1) @@ -384,6 +402,8 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin, def test_clean_shutdown_waits_for_node_setup_finish(self): new_node = self.start_node_boot() + new_node.stop_if_no_cloud_node().get.return_value = False + new_node.stop_if_no_cloud_node.reset_mock() self.daemon.shutdown().get(self.TIMEOUT) self.assertTrue(new_node.stop_if_no_cloud_node.called) self.daemon.node_up(new_node).get(self.TIMEOUT) @@ -393,9 +413,11 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin, self.daemon.actor_ref.actor_stopped.wait(self.TIMEOUT)) def test_wishlist_ignored_after_shutdown(self): - size = testutil.MockSize(2) - self.make_daemon(want_sizes=[size]) + new_node = self.start_node_boot() + new_node.stop_if_no_cloud_node().get.return_value = False + new_node.stop_if_no_cloud_node.reset_mock() self.daemon.shutdown().get(self.TIMEOUT) + size = testutil.MockSize(2) self.daemon.update_server_wishlist([size] * 2).get(self.TIMEOUT) self.timer.deliver() self.stop_proxy(self.daemon) -- 2.30.2