X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/759878e0a72644c5e537c51da9806672cf92e458..refs/heads/5562-pycurl:/services/nodemanager/arvnodeman/daemon.py diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index 53af9339f0..ba52871d39 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -97,7 +97,7 @@ class NodeManagerDaemonActor(actor_class): def __init__(self, server_wishlist_actor, arvados_nodes_actor, cloud_nodes_actor, cloud_update_actor, timer_actor, arvados_factory, cloud_factory, - shutdown_windows, min_nodes, max_nodes, + shutdown_windows, min_size, min_nodes, max_nodes, poll_stale_after=600, boot_fail_after=1800, node_stale_after=7200, @@ -116,6 +116,7 @@ class NodeManagerDaemonActor(actor_class): self._logger = logging.getLogger('arvnodeman.daemon') self._later = self.actor_ref.proxy() self.shutdown_windows = shutdown_windows + self.min_cloud_size = min_size self.min_nodes = min_nodes self.max_nodes = max_nodes self.poll_stale_after = poll_stale_after @@ -153,6 +154,7 @@ class NodeManagerDaemonActor(actor_class): cloud_node=cloud_node, cloud_node_start_time=start_time, shutdown_timer=shutdown_timer, + cloud_fqdn_func=self._cloud_driver.node_fqdn, update_actor=self._cloud_updater, timer_actor=self._timer, arvados_node=None, @@ -207,9 +209,12 @@ class NodeManagerDaemonActor(actor_class): def _nodes_wanted(self): up_count = self._nodes_up() + under_min = self.min_nodes - up_count over_max = up_count - self.max_nodes if over_max >= 0: return -over_max + elif under_min > 0: + return under_min else: up_count -= len(self.shutdowns) + self._nodes_busy() return len(self.last_wishlist) - up_count @@ -254,7 +259,10 @@ class NodeManagerDaemonActor(actor_class): if nodes_wanted < 1: return None arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after) - cloud_size = self.last_wishlist[nodes_wanted - 1] + try: + cloud_size = self.last_wishlist[self._nodes_up()] + except IndexError: + cloud_size = self.min_cloud_size self._logger.info("Want %s more nodes. Booting a %s node.", nodes_wanted, cloud_size.name) new_setup = self._node_setup.start( @@ -291,8 +299,7 @@ class NodeManagerDaemonActor(actor_class): if (nodes_excess < 1) or not self.booting: return None for key, node in self.booting.iteritems(): - node.stop_if_no_cloud_node().get() - if not node.actor_ref.is_alive(): + if node.stop_if_no_cloud_node().get(): del self.booting[key] if nodes_excess > 1: self._later.stop_booting_node() @@ -337,12 +344,14 @@ class NodeManagerDaemonActor(actor_class): def shutdown(self): self._logger.info("Shutting down after signal.") self.poll_stale_after = -1 # Inhibit starting/stopping nodes - for bootnode in self.booting.itervalues(): - bootnode.stop_if_no_cloud_node() + setup_stops = {key: node.stop_if_no_cloud_node() + for key, node in self.booting.iteritems()} + self.booting = {key: self.booting[key] + for key in setup_stops if not setup_stops[key].get()} self._later.await_shutdown() def await_shutdown(self): - if any(node.actor_ref.is_alive() for node in self.booting.itervalues()): + if self.booting: self._timer.schedule(time.time() + 1, self._later.await_shutdown) else: self.stop()