Merge branch 'pr/28'
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 7ff736fb86dcd075e0c9c3777479a746e1767ea7..1d52073ce5ff7a362e0b8e9cc13b12c0f2d3b6a2 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -26,14 +26,16 @@ class _BaseNodeTracker(object):
         self.nodes = {}
         self.orphans = {}
 
-    def __getitem__(self, key):
-        return self.nodes[key]
-
-    def __len__(self):
-        return len(self.nodes)
+    # Proxy the methods listed below to self.nodes.
+    def _proxy_method(name):
+        method = getattr(dict, name)
+        @functools.wraps(method, ('__name__', '__doc__'))
+        def wrapper(self, *args, **kwargs):
+            return method(self.nodes, *args, **kwargs)
+        return wrapper
 
-    def get(self, key, default=None):
-        return self.nodes.get(key, default)
+    for _method_name in ['__contains__', '__getitem__', '__len__', 'get']:
+        locals()[_method_name] = _proxy_method(_method_name)
 
     def record_key(self, record):
         return self.item_key(getattr(record, self.RECORD_ATTR))
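
The hand-written dict pass-throughs are replaced by a small class-body helper that generates all the wrappers once, at class-definition time. A minimal standalone sketch of the same pattern (the class name and demo values are illustrative, not part of the Arvados code):

    import functools

    class TrackerSketch(object):
        """Standalone sketch of the dict-proxy pattern above (hypothetical class)."""
        def __init__(self):
            self.nodes = {}

        # Built once at class-definition time.  wraps() is limited to __name__ and
        # __doc__ because built-in dict methods lack attributes like __module__.
        def _proxy_method(name):
            method = getattr(dict, name)
            @functools.wraps(method, ('__name__', '__doc__'))
            def wrapper(self, *args, **kwargs):
                return method(self.nodes, *args, **kwargs)
            return wrapper

        # Relies on class-body locals() being writable (CPython behavior).
        for _method_name in ['__contains__', '__getitem__', '__len__', 'get']:
            locals()[_method_name] = _proxy_method(_method_name)

    tracker = TrackerSketch()
    tracker.nodes['compute1'] = 'record'
    assert 'compute1' in tracker and len(tracker) == 1
    assert tracker['compute1'] == 'record' and tracker.get('missing') is None
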
@@ -95,8 +97,10 @@ class NodeManagerDaemonActor(actor_class):
     def __init__(self, server_wishlist_actor, arvados_nodes_actor,
                  cloud_nodes_actor, cloud_update_actor, timer_actor,
                  arvados_factory, cloud_factory,
-                 shutdown_windows, min_nodes, max_nodes,
-                 poll_stale_after=600, node_stale_after=7200,
+                 shutdown_windows, min_size, min_nodes, max_nodes,
+                 poll_stale_after=600,
+                 boot_fail_after=1800,
+                 node_stale_after=7200,
                  node_setup_class=dispatch.ComputeNodeSetupActor,
                  node_shutdown_class=dispatch.ComputeNodeShutdownActor,
                  node_actor_class=dispatch.ComputeNodeMonitorActor):
@@ -112,9 +116,11 @@ class NodeManagerDaemonActor(actor_class):
         self._logger = logging.getLogger('arvnodeman.daemon')
         self._later = self.actor_ref.proxy()
         self.shutdown_windows = shutdown_windows
+        self.min_cloud_size = min_size
         self.min_nodes = min_nodes
         self.max_nodes = max_nodes
         self.poll_stale_after = poll_stale_after
+        self.boot_fail_after = boot_fail_after
         self.node_stale_after = node_stale_after
         self.last_polls = {}
         for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
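
The two new parameters tighten boot handling: min_size gives the fallback cloud size to boot when the wishlist does not dictate one, and boot_fail_after bounds how long an unpaired cloud node may run before it is shut down. A hedged sketch of how the arguments fit together (values are illustrative only; the real launcher derives them from its configuration and also supplies the surrounding actor references, elided here):

    # Illustrative values only, not taken from any real configuration.
    daemon_config = {
        'min_size': 'Standard_D3',   # hypothetical cloud size used when the wishlist is empty
        'min_nodes': 0,
        'max_nodes': 8,
        'poll_stale_after': 600,     # stop making start/stop decisions once polls go stale
        'boot_fail_after': 1800,     # give an unpaired node 30 minutes before shutting it down
        'node_stale_after': 7200,    # Arvados records older than this are treated as stale
    }
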
@@ -148,11 +154,14 @@ class NodeManagerDaemonActor(actor_class):
             cloud_node=cloud_node,
             cloud_node_start_time=start_time,
             shutdown_timer=shutdown_timer,
+            cloud_fqdn_func=self._cloud_driver.node_fqdn,
             update_actor=self._cloud_updater,
             timer_actor=self._timer,
             arvados_node=None,
             poll_stale_after=self.poll_stale_after,
-            node_stale_after=self.node_stale_after).proxy()
+            node_stale_after=self.node_stale_after,
+            cloud_client=self._cloud_driver,
+            boot_fail_after=self.boot_fail_after).proxy()
         actor.subscribe(self._later.node_can_shutdown)
         self._cloud_nodes_actor.subscribe_to(cloud_node.id,
                                              actor.update_cloud_node)
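
Monitor actors are now built with the driver's node_fqdn helper, the cloud client itself, and the boot_fail_after deadline, and the daemon subscribes to each one so it hears when a node becomes eligible for shutdown. A minimal pykka sketch of that subscribe/notify pattern (the class and method bodies are illustrative, not the Arvados actors):

    import pykka

    class MonitorSketch(pykka.ThreadingActor):
        """Minimal stand-in for the subscribe/notify pattern used by the node actors."""
        def __init__(self):
            super(MonitorSketch, self).__init__()
            self._subscribers = []

        def subscribe(self, callback):
            self._subscribers.append(callback)

        def state_changed(self):
            # The real monitor actor hands its own proxy to subscribers, which is
            # how node_can_shutdown receives its node_actor argument.
            for callback in self._subscribers:
                callback(self.actor_ref.proxy())

    monitor = MonitorSketch.start().proxy()
    monitor.subscribe(lambda node_actor: None).get()   # e.g. self._later.node_can_shutdown
    monitor.state_changed().get()
    pykka.ActorRegistry.stop_all()
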
@@ -173,8 +182,14 @@ class NodeManagerDaemonActor(actor_class):
                     self._pair_nodes(record, arv_rec.arvados_node)
                     break
         for key, record in self.cloud_nodes.orphans.iteritems():
+            if key in self.shutdowns:
+                try:
+                    self.shutdowns[key].stop().get()
+                except pykka.ActorDeadError:
+                    pass
+                del self.shutdowns[key]
             record.actor.stop()
-            self.shutdowns.pop(key, None)
+            record.cloud_node = None
 
     def update_arvados_nodes(self, nodelist):
         self._update_poll_time('arvados_nodes')
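
Orphaned cloud nodes now also tear down any in-flight shutdown actor for the same node, tolerating pykka.ActorDeadError because the shutdown may already have exited; the monitor actor is then stopped and the record's cloud_node reference cleared. A hedged sketch of the stop-and-tolerate pattern (the helper name is hypothetical):

    import pykka

    def stop_quietly(actor_proxy):
        """Stop an actor via its proxy, tolerating one that already exited
        (sketch of the pattern above; not an Arvados helper)."""
        try:
            actor_proxy.stop().get()
        except pykka.ActorDeadError:
            pass
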
@@ -190,23 +205,41 @@ class NodeManagerDaemonActor(actor_class):
                     break
 
     def _nodes_up(self):
-        up = sum(len(nodelist) for nodelist in
-                 [self.cloud_nodes, self.booted, self.booting])
-        return up - len(self.shutdowns)
+        return sum(len(nodelist) for nodelist in
+                   [self.cloud_nodes, self.booted, self.booting])
 
     def _nodes_busy(self):
-        return sum(1 for idle in
-                   pykka.get_all(rec.actor.in_state('idle') for rec in
+        return sum(1 for busy in
+                   pykka.get_all(rec.actor.in_state('busy') for rec in
                                  self.cloud_nodes.nodes.itervalues())
-                   if idle is False)
+                   if busy)
+
+    def _nodes_missing(self):
+        return sum(1 for arv_node in
+                   pykka.get_all(rec.actor.arvados_node for rec in
+                                 self.cloud_nodes.nodes.itervalues()
+                                 if rec.actor.cloud_node.get().id not in self.shutdowns)
+                   if arv_node and cnode.arvados_node_missing(arv_node, self.node_stale_after))
 
     def _nodes_wanted(self):
-        return min(len(self.last_wishlist) + self._nodes_busy(),
-                   self.max_nodes) - self._nodes_up()
+        up_count = self._nodes_up()
+        under_min = self.min_nodes - up_count
+        over_max = up_count - self.max_nodes
+        if over_max >= 0:
+            return -over_max
+        elif under_min > 0:
+            return under_min
+        else:
+            up_count -= len(self.shutdowns) + self._nodes_busy() + self._nodes_missing()
+            return len(self.last_wishlist) - up_count
 
     def _nodes_excess(self):
-        needed_nodes = self._nodes_busy() + len(self.last_wishlist)
-        return (self._nodes_up() - max(self.min_nodes, needed_nodes))
+        up_count = self._nodes_up() - len(self.shutdowns)
+        over_min = up_count - self.min_nodes
+        if over_min <= 0:
+            return over_min
+        else:
+            return up_count - self._nodes_busy() - len(self.last_wishlist)
 
     def update_server_wishlist(self, wishlist):
         self._update_poll_time('server_wishlist')
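
The counting logic changes meaning: _nodes_up no longer subtracts pending shutdowns, _nodes_busy counts nodes reported busy rather than merely not-idle, and _nodes_wanted/_nodes_excess apply the min/max bounds before comparing the wishlist against nodes that are neither shutting down, busy, nor missing. A pure-function sketch with illustrative numbers (these helpers are hypothetical, not the actor methods):

    def nodes_wanted(up, shutting_down, busy, missing, wishlist_len, min_nodes, max_nodes):
        """Pure-function sketch of the _nodes_wanted arithmetic above."""
        under_min = min_nodes - up
        over_max = up - max_nodes
        if over_max >= 0:
            return -over_max          # never exceed max_nodes
        elif under_min > 0:
            return under_min          # always reach min_nodes
        else:
            usable = up - (shutting_down + busy + missing)
            return wishlist_len - usable

    def nodes_excess(up, shutting_down, busy, wishlist_len, min_nodes):
        """Pure-function sketch of the _nodes_excess arithmetic above."""
        up_count = up - shutting_down
        over_min = up_count - min_nodes
        if over_min <= 0:
            return over_min           # never shut down below min_nodes
        return up_count - busy - wishlist_len

    # Example: 5 nodes up, 1 already shutting down, 2 busy, none missing,
    # a wishlist of 3, and bounds of min 1 / max 8.
    assert nodes_wanted(5, 1, 2, 0, 3, 1, 8) == 1   # 3 wanted minus 2 usable nodes
    assert nodes_excess(5, 1, 2, 3, 1) == -1        # busy + wishlist already covers the 4 live nodes
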
@@ -240,7 +273,10 @@ class NodeManagerDaemonActor(actor_class):
         if nodes_wanted < 1:
             return None
         arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
-        cloud_size = self.last_wishlist[nodes_wanted - 1]
+        try:
+            cloud_size = self.last_wishlist[self._nodes_up()]
+        except IndexError:
+            cloud_size = self.min_cloud_size
         self._logger.info("Want %s more nodes.  Booting a %s node.",
                           nodes_wanted, cloud_size.name)
         new_setup = self._node_setup.start(
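
Rather than indexing the wishlist by how many nodes are still wanted, the size to boot is taken from the wishlist slot for the next node to come up, with the new min_cloud_size as the fallback once the wishlist is exhausted. A tiny sketch of that selection (hypothetical helper and size names):

    def size_to_boot(wishlist, nodes_up, min_cloud_size):
        """Sketch of the size-selection fallback above (hypothetical helper)."""
        try:
            return wishlist[nodes_up]     # the next unmet wishlist entry
        except IndexError:
            return min_cloud_size         # wishlist exhausted: fall back to the minimum size

    assert size_to_boot(['big', 'small'], 1, 'tiny') == 'small'
    assert size_to_boot(['big', 'small'], 3, 'tiny') == 'tiny'
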
@@ -257,18 +293,19 @@ class NodeManagerDaemonActor(actor_class):
         if nodes_wanted > 1:
             self._later.start_node()
 
-    def _actor_nodes(self, node_actor):
-        return pykka.get_all([node_actor.cloud_node, node_actor.arvados_node])
+    def _get_actor_attrs(self, actor, *attr_names):
+        return pykka.get_all([getattr(actor, name) for name in attr_names])
 
     def node_up(self, setup_proxy):
-        cloud_node, arvados_node = self._actor_nodes(setup_proxy)
+        cloud_node = setup_proxy.cloud_node.get()
         del self.booting[setup_proxy.actor_ref.actor_urn]
         setup_proxy.stop()
         record = self.cloud_nodes.get(cloud_node.id)
         if record is None:
             record = self._new_node(cloud_node)
             self.booted[cloud_node.id] = record
-        self._pair_nodes(record, arvados_node)
+        self._timer.schedule(time.time() + self.boot_fail_after,
+                             self._later.shutdown_unpaired_node, cloud_node.id)
 
     @_check_poll_freshness
     def stop_booting_node(self):
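
node_up no longer pairs the node with an Arvados record on the spot; it schedules shutdown_unpaired_node to run boot_fail_after seconds later, and stop_booting_node now trusts the boolean returned by stop_if_no_cloud_node(). A small sketch of a delayed callback in the spirit of the timer call above (the real TimerActor takes an absolute deadline, this stand-in takes a relative delay; names are hypothetical):

    import threading

    def schedule_after(delay_seconds, callback, *args):
        """Minimal stand-in for the timer call above; note the real TimerActor
        is given time.time() + delay rather than a relative delay."""
        timer = threading.Timer(delay_seconds, callback, args=args)
        timer.daemon = True
        timer.start()
        return timer

    # Analogous to: self._timer.schedule(time.time() + self.boot_fail_after,
    #                                    self._later.shutdown_unpaired_node, cloud_node.id)
    schedule_after(1800, lambda node_id: None, 'cloud-node-id')
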
@@ -276,42 +313,60 @@ class NodeManagerDaemonActor(actor_class):
         if (nodes_excess < 1) or not self.booting:
             return None
         for key, node in self.booting.iteritems():
-            node.stop_if_no_cloud_node().get()
-            if not node.actor_ref.is_alive():
+            if node.stop_if_no_cloud_node().get():
                 del self.booting[key]
                 if nodes_excess > 1:
                     self._later.stop_booting_node()
                 break
 
+    def _begin_node_shutdown(self, node_actor, cancellable):
+        cloud_node_id = node_actor.cloud_node.get().id
+        if cloud_node_id in self.shutdowns:
+            return None
+        shutdown = self._node_shutdown.start(
+            timer_actor=self._timer, cloud_client=self._new_cloud(),
+            arvados_client=self._new_arvados(),
+            node_monitor=node_actor.actor_ref, cancellable=cancellable).proxy()
+        self.shutdowns[cloud_node_id] = shutdown
+        shutdown.subscribe(self._later.node_finished_shutdown)
+
     @_check_poll_freshness
     def node_can_shutdown(self, node_actor):
-        if self._nodes_excess() < 1:
-            return None
-        cloud_node, arvados_node = self._actor_nodes(node_actor)
-        if cloud_node.id in self.shutdowns:
+        if self._nodes_excess() > 0:
+            self._begin_node_shutdown(node_actor, cancellable=True)
+
+    def shutdown_unpaired_node(self, cloud_node_id):
+        for record_dict in [self.cloud_nodes, self.booted]:
+            if cloud_node_id in record_dict:
+                record = record_dict[cloud_node_id]
+                break
+        else:
             return None
-        shutdown = self._node_shutdown.start(timer_actor=self._timer,
-                                             cloud_client=self._new_cloud(),
-                                             cloud_node=cloud_node).proxy()
-        self.shutdowns[cloud_node.id] = shutdown
-        shutdown.subscribe(self._later.node_finished_shutdown)
+        if not record.actor.in_state('idle', 'busy').get():
+            self._begin_node_shutdown(record.actor, cancellable=False)
 
     def node_finished_shutdown(self, shutdown_actor):
-        cloud_node_id = shutdown_actor.cloud_node.get().id
+        success, cloud_node = self._get_actor_attrs(shutdown_actor, 'success',
+                                                    'cloud_node')
         shutdown_actor.stop()
-        if cloud_node_id in self.booted:
+        cloud_node_id = cloud_node.id
+        if not success:
+            del self.shutdowns[cloud_node_id]
+        elif cloud_node_id in self.booted:
             self.booted.pop(cloud_node_id).actor.stop()
             del self.shutdowns[cloud_node_id]
 
     def shutdown(self):
         self._logger.info("Shutting down after signal.")
         self.poll_stale_after = -1  # Inhibit starting/stopping nodes
-        for bootnode in self.booting.itervalues():
-            bootnode.stop_if_no_cloud_node()
+        setup_stops = {key: node.stop_if_no_cloud_node()
+                       for key, node in self.booting.iteritems()}
+        self.booting = {key: self.booting[key]
+                        for key in setup_stops if not setup_stops[key].get()}
         self._later.await_shutdown()
 
     def await_shutdown(self):
-        if any(node.actor_ref.is_alive() for node in self.booting.itervalues()):
+        if self.booting:
             self._timer.schedule(time.time() + 1, self._later.await_shutdown)
         else:
             self.stop()
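
On daemon shutdown, setup actors that have not yet created a cloud node are stopped immediately; the rest stay in self.booting until node_up clears them, and await_shutdown reschedules itself once a second until that dict drains. A blocking sketch of the same drain-then-stop idea (the real code reschedules through the timer actor instead of sleeping; names are hypothetical):

    import time

    def await_shutdown(pending, poll_interval=1, timeout=300):
        """Blocking sketch of the drain loop above; `pending` is any dict-like
        that empties as setup actors finish (hypothetical helper)."""
        deadline = time.time() + timeout
        while pending and time.time() < deadline:
            time.sleep(poll_interval)
        return not pending    # True once every booting node has been handled
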