4533: Merge branch 'master' into 4533-remote-reset
[arvados.git] / services / nodemanager / arvnodeman / daemon.py
index 6ea3cdf4291393edb18088d5ed26122c499fd3bf..9f22568faafbb1c45d9625816fb1c0027226882a 100644 (file)
@@ -9,6 +9,7 @@ import time
 import pykka
 
 from . import computenode as cnode
+from .computenode import dispatch
 from .config import actor_class
 
 class _ComputeNodeRecord(object):
@@ -94,11 +95,11 @@ class NodeManagerDaemonActor(actor_class):
     def __init__(self, server_wishlist_actor, arvados_nodes_actor,
                  cloud_nodes_actor, cloud_update_actor, timer_actor,
                  arvados_factory, cloud_factory,
-                 shutdown_windows, max_nodes,
+                 shutdown_windows, min_nodes, max_nodes,
                  poll_stale_after=600, node_stale_after=7200,
-                 node_setup_class=cnode.ComputeNodeSetupActor,
-                 node_shutdown_class=cnode.ComputeNodeShutdownActor,
-                 node_actor_class=cnode.ComputeNodeMonitorActor):
+                 node_setup_class=dispatch.ComputeNodeSetupActor,
+                 node_shutdown_class=dispatch.ComputeNodeShutdownActor,
+                 node_actor_class=dispatch.ComputeNodeMonitorActor):
         super(NodeManagerDaemonActor, self).__init__()
         self._node_setup = node_setup_class
         self._node_shutdown = node_shutdown_class
@@ -111,6 +112,7 @@ class NodeManagerDaemonActor(actor_class):
         self._logger = logging.getLogger('arvnodeman.daemon')
         self._later = self.actor_ref.proxy()
         self.shutdown_windows = shutdown_windows
+        self.min_nodes = min_nodes
         self.max_nodes = max_nodes
         self.poll_stale_after = poll_stale_after
         self.node_stale_after = node_stale_after
@@ -187,20 +189,36 @@ class NodeManagerDaemonActor(actor_class):
                     self._pair_nodes(cloud_rec, arv_node)
                     break
 
-    def _node_count(self):
-        up = sum(len(nodelist) for nodelist in
-                 [self.cloud_nodes, self.booted, self.booting])
-        return up - len(self.shutdowns)
+    def _nodes_up(self):
+        return sum(len(nodelist) for nodelist in
+                   [self.cloud_nodes, self.booted, self.booting])
+
+    def _nodes_busy(self):
+        return sum(1 for idle in
+                   pykka.get_all(rec.actor.in_state('idle') for rec in
+                                 self.cloud_nodes.nodes.itervalues())
+                   if idle is False)
 
     def _nodes_wanted(self):
-        return len(self.last_wishlist) - self._node_count()
+        up_count = self._nodes_up()
+        over_max = up_count - self.max_nodes
+        if over_max >= 0:
+            return -over_max
+        else:
+            up_count -= len(self.shutdowns) + self._nodes_busy()
+            return len(self.last_wishlist) - up_count
 
     def _nodes_excess(self):
-        return -self._nodes_wanted()
+        up_count = self._nodes_up() - len(self.shutdowns)
+        over_min = up_count - self.min_nodes
+        if over_min <= 0:
+            return over_min
+        else:
+            return up_count - self._nodes_busy() - len(self.last_wishlist)
 
     def update_server_wishlist(self, wishlist):
         self._update_poll_time('server_wishlist')
-        self.last_wishlist = wishlist[:self.max_nodes]
+        self.last_wishlist = wishlist
         nodes_wanted = self._nodes_wanted()
         if nodes_wanted > 0:
             self._later.start_node()
@@ -247,11 +265,12 @@ class NodeManagerDaemonActor(actor_class):
         if nodes_wanted > 1:
             self._later.start_node()
 
-    def _actor_nodes(self, node_actor):
-        return pykka.get_all([node_actor.cloud_node, node_actor.arvados_node])
+    def _get_actor_attrs(self, actor, *attr_names):
+        return pykka.get_all([getattr(actor, name) for name in attr_names])
 
     def node_up(self, setup_proxy):
-        cloud_node, arvados_node = self._actor_nodes(setup_proxy)
+        cloud_node, arvados_node = self._get_actor_attrs(
+            setup_proxy, 'cloud_node', 'arvados_node')
         del self.booting[setup_proxy.actor_ref.actor_urn]
         setup_proxy.stop()
         record = self.cloud_nodes.get(cloud_node.id)
@@ -277,19 +296,23 @@ class NodeManagerDaemonActor(actor_class):
     def node_can_shutdown(self, node_actor):
         if self._nodes_excess() < 1:
             return None
-        cloud_node, arvados_node = self._actor_nodes(node_actor)
-        if cloud_node.id in self.shutdowns:
+        cloud_node_id = node_actor.cloud_node.get().id
+        if cloud_node_id in self.shutdowns:
             return None
-        shutdown = self._node_shutdown.start(timer_actor=self._timer,
-                                             cloud_client=self._new_cloud(),
-                                             cloud_node=cloud_node).proxy()
-        self.shutdowns[cloud_node.id] = shutdown
+        shutdown = self._node_shutdown.start(
+            timer_actor=self._timer, cloud_client=self._new_cloud(),
+            node_monitor=node_actor.actor_ref).proxy()
+        self.shutdowns[cloud_node_id] = shutdown
         shutdown.subscribe(self._later.node_finished_shutdown)
 
     def node_finished_shutdown(self, shutdown_actor):
-        cloud_node_id = shutdown_actor.cloud_node.get().id
+        success, cloud_node = self._get_actor_attrs(shutdown_actor, 'success',
+                                                    'cloud_node')
         shutdown_actor.stop()
-        if cloud_node_id in self.booted:
+        cloud_node_id = cloud_node.id
+        if not success:
+            del self.shutdowns[cloud_node_id]
+        elif cloud_node_id in self.booted:
             self.booted.pop(cloud_node_id).actor.stop()
             del self.shutdowns[cloud_node_id]