From 2e32ef1657b439c0398e66930c3d17437032fb1a Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 23 Mar 2017 14:05:51 -0400 Subject: [PATCH] 11324: Fix crash in NodeManagerDaemonActor when receiving a node_can_shutdown message for a node that has already been shut down. --- services/nodemanager/arvnodeman/daemon.py | 29 +++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index b4f17849f1..8890e83731 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -426,16 +426,25 @@ class NodeManagerDaemonActor(actor_class): @_check_poll_freshness def node_can_shutdown(self, node_actor): - if self._nodes_excess(node_actor.cloud_node.get().size) > 0: - self._begin_node_shutdown(node_actor, cancellable=True) - elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None: - # Node is unpaired, which means it probably exceeded its booting - # grace period without a ping, so shut it down so we can boot a new - # node in its place. - self._begin_node_shutdown(node_actor, cancellable=False) - elif node_actor.in_state('down').get(): - # Node is down and unlikely to come back. - self._begin_node_shutdown(node_actor, cancellable=False) + try: + if self._nodes_excess(node_actor.cloud_node.get().size) > 0: + self._begin_node_shutdown(node_actor, cancellable=True) + elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None: + # Node is unpaired, which means it probably exceeded its booting + # grace period without a ping, so shut it down so we can boot a new + # node in its place. + self._begin_node_shutdown(node_actor, cancellable=False) + elif node_actor.in_state('down').get(): + # Node is down and unlikely to come back. + self._begin_node_shutdown(node_actor, cancellable=False) + except pykka.ActorDeadError as e: + # The monitor actor sends shutdown suggestions every time the + # node's state is updated, and these go into the daemon actor's + # message queue. It's possible that the node has already been shut + # down (which shuts down the node monitor actor). In that case, + # this message is stale and we'll get ActorDeadError when we try to + # access node_actor. Log the error. + self._logger.debug("ActorDeadError in node_can_shutdown: %s", e) def node_finished_shutdown(self, shutdown_actor): try: -- 2.30.2