X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/55aafbb07904ca24390dd47ea960eae7cb2b909a..88a29cd091468feb98e5cd541c560f4d35bca716:/services/nodemanager/arvnodeman/daemon.py diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index e476e5e3e2..73b58bfe65 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -78,7 +78,10 @@ class _ArvadosNodeTracker(_BaseNodeTracker): item_key = staticmethod(lambda arvados_node: arvados_node['uuid']) def find_stale_node(self, stale_time): - for record in self.nodes.itervalues(): + # Try to select a stale node record that have an assigned slot first + for record in sorted(self.nodes.itervalues(), + key=lambda r: r.arvados_node['slot_number'], + reverse=True): node = record.arvados_node if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node), stale_time) and @@ -164,7 +167,6 @@ class NodeManagerDaemonActor(actor_class): cloud_node=cloud_node, cloud_node_start_time=start_time, shutdown_timer=shutdown_timer, - cloud_fqdn_func=self._cloud_driver.node_fqdn, update_actor=self._cloud_updater, timer_actor=self._timer, arvados_node=None, @@ -277,6 +279,7 @@ class NodeManagerDaemonActor(actor_class): "unpaired": 0, "busy": 0, "idle": 0, + "fail": 0, "down": 0, "shutdown": 0 } @@ -318,7 +321,7 @@ class NodeManagerDaemonActor(actor_class): counts["unpaired"], counts["idle"], busy_count, - counts["down"], + counts["down"]+counts["fail"], counts["shutdown"]) if over_max >= 0: @@ -479,7 +482,7 @@ class NodeManagerDaemonActor(actor_class): # grace period without a ping, so shut it down so we can boot a new # node in its place. self._begin_node_shutdown(node_actor, cancellable=False) - elif node_actor.in_state('down').get(): + elif node_actor.in_state('down', 'fail').get(): # Node is down and unlikely to come back. self._begin_node_shutdown(node_actor, cancellable=False) except pykka.ActorDeadError as e: @@ -498,8 +501,19 @@ class NodeManagerDaemonActor(actor_class): except pykka.ActorDeadError: return cloud_node_id = cloud_node.id - record = self.cloud_nodes[cloud_node_id] - shutdown_actor.stop() + + try: + shutdown_actor.stop() + except pykka.ActorDeadError: + pass + + try: + record = self.cloud_nodes[cloud_node_id] + except KeyError: + # Cloud node was already removed from the cloud node list + # supposedly while the destroy_node call was finishing its + # job. + return record.shutdown_actor = None if not success: