Merge branch 'master' into 11850-singlecontainer-max-requirements
[arvados.git] / services / nodemanager / arvnodeman / daemon.py
index e476e5e3e21c07205144deb6226e54c9762f8c58..73b58bfe65fc0cc871579a22500424d2ec2f87fc 100644 (file)
@@ -78,7 +78,10 @@ class _ArvadosNodeTracker(_BaseNodeTracker):
     item_key = staticmethod(lambda arvados_node: arvados_node['uuid'])
 
     def find_stale_node(self, stale_time):
-        for record in self.nodes.itervalues():
+        # Try to select a stale node record that has an assigned slot first
+        for record in sorted(self.nodes.itervalues(),
+                             key=lambda r: r.arvados_node['slot_number'],
+                             reverse=True):
             node = record.arvados_node
             if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node),
                                           stale_time) and
@@ -164,7 +167,6 @@ class NodeManagerDaemonActor(actor_class):
             cloud_node=cloud_node,
             cloud_node_start_time=start_time,
             shutdown_timer=shutdown_timer,
-            cloud_fqdn_func=self._cloud_driver.node_fqdn,
             update_actor=self._cloud_updater,
             timer_actor=self._timer,
             arvados_node=None,
@@ -277,6 +279,7 @@ class NodeManagerDaemonActor(actor_class):
             "unpaired": 0,
             "busy": 0,
             "idle": 0,
+            "fail": 0,
             "down": 0,
             "shutdown": 0
         }
@@ -318,7 +321,7 @@ class NodeManagerDaemonActor(actor_class):
                           counts["unpaired"],
                           counts["idle"],
                           busy_count,
-                          counts["down"],
+                          counts["down"]+counts["fail"],
                           counts["shutdown"])
 
         if over_max >= 0:
@@ -479,7 +482,7 @@ class NodeManagerDaemonActor(actor_class):
                 # grace period without a ping, so shut it down so we can boot a new
                 # node in its place.
                 self._begin_node_shutdown(node_actor, cancellable=False)
-            elif node_actor.in_state('down').get():
+            elif node_actor.in_state('down', 'fail').get():
                 # Node is down and unlikely to come back.
                 self._begin_node_shutdown(node_actor, cancellable=False)
         except pykka.ActorDeadError as e:
@@ -498,8 +501,19 @@ class NodeManagerDaemonActor(actor_class):
         except pykka.ActorDeadError:
             return
         cloud_node_id = cloud_node.id
-        record = self.cloud_nodes[cloud_node_id]
-        shutdown_actor.stop()
+
+        try:
+            shutdown_actor.stop()
+        except pykka.ActorDeadError:
+            pass
+
+        try:
+            record = self.cloud_nodes[cloud_node_id]
+        except KeyError:
+            # Cloud node was already removed from the cloud node list,
+            # presumably while the destroy_node call was still
+            # finishing.
+            return
         record.shutdown_actor = None
 
         if not success: