12073: Prioritize stale node records that have a slot_number when
[arvados.git] / services / nodemanager / arvnodeman / daemon.py
index 7e63c782ede1fecee931d088505aed549a21c9df..ca3029d9e1bc3c376b119cca367b3767f3a8bb45 100644 (file)
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
@@ -75,7 +78,10 @@ class _ArvadosNodeTracker(_BaseNodeTracker):
     item_key = staticmethod(lambda arvados_node: arvados_node['uuid'])
 
     def find_stale_node(self, stale_time):
-        for record in self.nodes.itervalues():
+        # Try to select a stale node record that have an assigned slot first
+        for record in sorted(self.nodes.itervalues(),
+                             key=lambda r: r.arvados_node['slot_number'],
+                             reverse=True):
             node = record.arvados_node
             if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node),
                                           stale_time) and
@@ -387,7 +393,7 @@ class NodeManagerDaemonActor(actor_class):
             arvados_client=self._new_arvados(),
             arvados_node=arvados_node,
             cloud_client=self._new_cloud(),
-            cloud_size=cloud_size).proxy()
+            cloud_size=self.server_calculator.find_size(cloud_size.id)).proxy()
         self.booting[new_setup.actor_ref.actor_urn] = new_setup
         self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
 
@@ -495,8 +501,19 @@ class NodeManagerDaemonActor(actor_class):
         except pykka.ActorDeadError:
             return
         cloud_node_id = cloud_node.id
-        record = self.cloud_nodes[cloud_node_id]
-        shutdown_actor.stop()
+
+        try:
+            shutdown_actor.stop()
+        except pykka.ActorDeadError:
+            pass
+
+        try:
+            record = self.cloud_nodes[cloud_node_id]
+        except KeyError:
+            # Cloud node was already removed from the cloud node list
+            # supposedly while the destroy_node call was finishing its
+            # job.
+            return
         record.shutdown_actor = None
 
         if not success: