4294: code review feedback
[arvados.git] / services / nodemanager / arvnodeman / daemon.py
index 5b7437f8d3a73529020dd4927f133d5cbc1fc08f..eaf10be324de16811e655e2b8427bed85ad6709d 100644 (file)
@@ -94,7 +94,7 @@ class NodeManagerDaemonActor(actor_class):
     def __init__(self, server_wishlist_actor, arvados_nodes_actor,
                  cloud_nodes_actor, cloud_update_actor, timer_actor,
                  arvados_factory, cloud_factory,
-                 shutdown_windows, max_nodes,
+                 shutdown_windows, min_nodes, max_nodes,
                  poll_stale_after=600, node_stale_after=7200,
                  node_setup_class=cnode.ComputeNodeSetupActor,
                  node_shutdown_class=cnode.ComputeNodeShutdownActor,
@@ -111,6 +111,7 @@ class NodeManagerDaemonActor(actor_class):
         self._logger = logging.getLogger('arvnodeman.daemon')
         self._later = self.actor_ref.proxy()
         self.shutdown_windows = shutdown_windows
+        self.min_nodes = min_nodes
         self.max_nodes = max_nodes
         self.poll_stale_after = poll_stale_after
         self.node_stale_after = node_stale_after
@@ -123,6 +124,7 @@ class NodeManagerDaemonActor(actor_class):
         self.cloud_nodes = _CloudNodeTracker()
         self.arvados_nodes = _ArvadosNodeTracker()
         self.booting = {}       # Actor IDs to ComputeNodeSetupActors
+        self.booted = {}        # Cloud node IDs to _ComputeNodeRecords
         self.shutdowns = {}     # Cloud node IDs to ComputeNodeShutdownActors
         self._logger.debug("Daemon initialized")
 
@@ -154,22 +156,24 @@ class NodeManagerDaemonActor(actor_class):
         self._cloud_nodes_actor.subscribe_to(cloud_node.id,
                                              actor.update_cloud_node)
         record = _ComputeNodeRecord(actor, cloud_node)
-        self.cloud_nodes.add(record)
         return record
 
     def update_cloud_nodes(self, nodelist):
         self._update_poll_time('cloud_nodes')
         for key, node in self.cloud_nodes.update_from(nodelist):
             self._logger.info("Registering new cloud node %s", key)
-            record = self._new_node(node)
+            if key in self.booted:
+                record = self.booted.pop(key)
+            else:
+                record = self._new_node(node)
+            self.cloud_nodes.add(record)
             for arv_rec in self.arvados_nodes.unpaired():
                 if record.actor.offer_arvados_pair(arv_rec.arvados_node).get():
                     self._pair_nodes(record, arv_rec.arvados_node)
                     break
         for key, record in self.cloud_nodes.orphans.iteritems():
             record.actor.stop()
-            if key in self.shutdowns:
-                self.shutdowns.pop(key).stop()
+            self.shutdowns.pop(key, None)
 
     def update_arvados_nodes(self, nodelist):
         self._update_poll_time('arvados_nodes')
@@ -184,19 +188,28 @@ class NodeManagerDaemonActor(actor_class):
                     self._pair_nodes(cloud_rec, arv_node)
                     break
 
-    def _node_count(self):
-        up = sum(len(nodelist) for nodelist in [self.cloud_nodes, self.booting])
+    def _nodes_up(self):
+        up = sum(len(nodelist) for nodelist in
+                 [self.cloud_nodes, self.booted, self.booting])
         return up - len(self.shutdowns)
 
+    def _nodes_busy(self):
+        return sum(1 for idle in
+                   pykka.get_all(rec.actor.in_state('idle') for rec in
+                                 self.cloud_nodes.nodes.itervalues())
+                   if idle is False)
+
     def _nodes_wanted(self):
-        return len(self.last_wishlist) - self._node_count()
+        return min(len(self.last_wishlist) + self._nodes_busy(),
+                   self.max_nodes) - self._nodes_up()
 
     def _nodes_excess(self):
-        return -self._nodes_wanted()
+        needed_nodes = self._nodes_busy() + len(self.last_wishlist)
+        return (self._nodes_up() - max(self.min_nodes, needed_nodes))
 
     def update_server_wishlist(self, wishlist):
         self._update_poll_time('server_wishlist')
-        self.last_wishlist = wishlist[:self.max_nodes]
+        self.last_wishlist = wishlist
         nodes_wanted = self._nodes_wanted()
         if nodes_wanted > 0:
             self._later.start_node()
@@ -253,6 +266,7 @@ class NodeManagerDaemonActor(actor_class):
         record = self.cloud_nodes.get(cloud_node.id)
         if record is None:
             record = self._new_node(cloud_node)
+            self.booted[cloud_node.id] = record
         self._pair_nodes(record, arvados_node)
 
     @_check_poll_freshness
@@ -279,6 +293,14 @@ class NodeManagerDaemonActor(actor_class):
                                              cloud_client=self._new_cloud(),
                                              cloud_node=cloud_node).proxy()
         self.shutdowns[cloud_node.id] = shutdown
+        shutdown.subscribe(self._later.node_finished_shutdown)
+
+    def node_finished_shutdown(self, shutdown_actor):
+        cloud_node_id = shutdown_actor.cloud_node.get().id
+        shutdown_actor.stop()
+        if cloud_node_id in self.booted:
+            self.booted.pop(cloud_node_id).actor.stop()
+            del self.shutdowns[cloud_node_id]
 
     def shutdown(self):
         self._logger.info("Shutting down after signal.")