X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/44e01cf266a3c062b2f0f5bb3426672024367d38..ca136c579fc014b6428c18cc3a74a45550539543:/services/nodemanager/arvnodeman/daemon.py?ds=sidebyside diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index 5b7437f8d3..eaf10be324 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -94,7 +94,7 @@ class NodeManagerDaemonActor(actor_class): def __init__(self, server_wishlist_actor, arvados_nodes_actor, cloud_nodes_actor, cloud_update_actor, timer_actor, arvados_factory, cloud_factory, - shutdown_windows, max_nodes, + shutdown_windows, min_nodes, max_nodes, poll_stale_after=600, node_stale_after=7200, node_setup_class=cnode.ComputeNodeSetupActor, node_shutdown_class=cnode.ComputeNodeShutdownActor, @@ -111,6 +111,7 @@ class NodeManagerDaemonActor(actor_class): self._logger = logging.getLogger('arvnodeman.daemon') self._later = self.actor_ref.proxy() self.shutdown_windows = shutdown_windows + self.min_nodes = min_nodes self.max_nodes = max_nodes self.poll_stale_after = poll_stale_after self.node_stale_after = node_stale_after @@ -123,6 +124,7 @@ class NodeManagerDaemonActor(actor_class): self.cloud_nodes = _CloudNodeTracker() self.arvados_nodes = _ArvadosNodeTracker() self.booting = {} # Actor IDs to ComputeNodeSetupActors + self.booted = {} # Cloud node IDs to _ComputeNodeRecords self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors self._logger.debug("Daemon initialized") @@ -154,22 +156,24 @@ class NodeManagerDaemonActor(actor_class): self._cloud_nodes_actor.subscribe_to(cloud_node.id, actor.update_cloud_node) record = _ComputeNodeRecord(actor, cloud_node) - self.cloud_nodes.add(record) return record def update_cloud_nodes(self, nodelist): self._update_poll_time('cloud_nodes') for key, node in self.cloud_nodes.update_from(nodelist): self._logger.info("Registering new cloud node %s", key) - record = self._new_node(node) + if key in self.booted: + record = self.booted.pop(key) + else: + record = self._new_node(node) + self.cloud_nodes.add(record) for arv_rec in self.arvados_nodes.unpaired(): if record.actor.offer_arvados_pair(arv_rec.arvados_node).get(): self._pair_nodes(record, arv_rec.arvados_node) break for key, record in self.cloud_nodes.orphans.iteritems(): record.actor.stop() - if key in self.shutdowns: - self.shutdowns.pop(key).stop() + self.shutdowns.pop(key, None) def update_arvados_nodes(self, nodelist): self._update_poll_time('arvados_nodes') @@ -184,19 +188,28 @@ class NodeManagerDaemonActor(actor_class): self._pair_nodes(cloud_rec, arv_node) break - def _node_count(self): - up = sum(len(nodelist) for nodelist in [self.cloud_nodes, self.booting]) + def _nodes_up(self): + up = sum(len(nodelist) for nodelist in + [self.cloud_nodes, self.booted, self.booting]) return up - len(self.shutdowns) + def _nodes_busy(self): + return sum(1 for idle in + pykka.get_all(rec.actor.in_state('idle') for rec in + self.cloud_nodes.nodes.itervalues()) + if idle is False) + def _nodes_wanted(self): - return len(self.last_wishlist) - self._node_count() + return min(len(self.last_wishlist) + self._nodes_busy(), + self.max_nodes) - self._nodes_up() def _nodes_excess(self): - return -self._nodes_wanted() + needed_nodes = self._nodes_busy() + len(self.last_wishlist) + return (self._nodes_up() - max(self.min_nodes, needed_nodes)) def update_server_wishlist(self, wishlist): self._update_poll_time('server_wishlist') - self.last_wishlist = wishlist[:self.max_nodes] + self.last_wishlist = wishlist nodes_wanted = self._nodes_wanted() if nodes_wanted > 0: self._later.start_node() @@ -253,6 +266,7 @@ class NodeManagerDaemonActor(actor_class): record = self.cloud_nodes.get(cloud_node.id) if record is None: record = self._new_node(cloud_node) + self.booted[cloud_node.id] = record self._pair_nodes(record, arvados_node) @_check_poll_freshness @@ -279,6 +293,14 @@ class NodeManagerDaemonActor(actor_class): cloud_client=self._new_cloud(), cloud_node=cloud_node).proxy() self.shutdowns[cloud_node.id] = shutdown + shutdown.subscribe(self._later.node_finished_shutdown) + + def node_finished_shutdown(self, shutdown_actor): + cloud_node_id = shutdown_actor.cloud_node.get().id + shutdown_actor.stop() + if cloud_node_id in self.booted: + self.booted.pop(cloud_node_id).actor.stop() + del self.shutdowns[cloud_node_id] def shutdown(self): self._logger.info("Shutting down after signal.")