X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0f644e242ef37c911ad3dc25aca8135c339de349..6fe8e52020d421797306e5c6536afbcee761510a:/services/nodemanager/arvnodeman/daemon.py diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index e476e5e3e2..6e85b85ab2 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -78,7 +78,10 @@ class _ArvadosNodeTracker(_BaseNodeTracker): item_key = staticmethod(lambda arvados_node: arvados_node['uuid']) def find_stale_node(self, stale_time): - for record in self.nodes.itervalues(): + # Try to select a stale node record that have an assigned slot first + for record in sorted(self.nodes.itervalues(), + key=lambda r: r.arvados_node['slot_number'], + reverse=True): node = record.arvados_node if (not cnode.timestamp_fresh(cnode.arvados_node_mtime(node), stale_time) and @@ -164,7 +167,6 @@ class NodeManagerDaemonActor(actor_class): cloud_node=cloud_node, cloud_node_start_time=start_time, shutdown_timer=shutdown_timer, - cloud_fqdn_func=self._cloud_driver.node_fqdn, update_actor=self._cloud_updater, timer_actor=self._timer, arvados_node=None, @@ -213,8 +215,11 @@ class NodeManagerDaemonActor(actor_class): if hasattr(record.cloud_node, "_nodemanager_recently_booted"): self.cloud_nodes.add(record) else: - # Node disappeared from the cloud node list. Stop the monitor - # actor if necessary and forget about the node. + # Node disappeared from the cloud node list. If it's paired, + # remove its idle time counter. + if record.arvados_node: + status.tracker.idle_out(record.arvados_node.get('hostname')) + # Stop the monitor actor if necessary and forget about the node. if record.actor: try: record.actor.stop() @@ -268,6 +273,7 @@ class NodeManagerDaemonActor(actor_class): updates.setdefault('nodes_'+s, 0) updates['nodes_'+s] += 1 updates['nodes_wish'] = len(self.last_wishlist) + updates['node_quota'] = self.node_quota status.tracker.update(updates) def _state_counts(self, size): @@ -277,6 +283,7 @@ class NodeManagerDaemonActor(actor_class): "unpaired": 0, "busy": 0, "idle": 0, + "fail": 0, "down": 0, "shutdown": 0 } @@ -311,14 +318,14 @@ class NodeManagerDaemonActor(actor_class): busy_count = counts["busy"] wishlist_count = self._size_wishlist(size) - self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.name, + self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.id, wishlist_count, up_count, counts["booting"], counts["unpaired"], counts["idle"], busy_count, - counts["down"], + counts["down"]+counts["fail"], counts["shutdown"]) if over_max >= 0: @@ -331,7 +338,7 @@ class NodeManagerDaemonActor(actor_class): can_boot = int((self.max_total_price - total_price) / size.price) if can_boot == 0: self._logger.info("Not booting %s (price %s) because with it would exceed max_total_price of %s (current total_price is %s)", - size.name, size.price, self.max_total_price, total_price) + size.id, size.price, self.max_total_price, total_price) return can_boot else: return wanted @@ -345,7 +352,8 @@ class NodeManagerDaemonActor(actor_class): def update_server_wishlist(self, wishlist): self._update_poll_time('server_wishlist') - self.last_wishlist = wishlist + requestable_nodes = self.node_quota - (self._nodes_booting(None) + len(self.cloud_nodes)) + self.last_wishlist = wishlist[:requestable_nodes] for size in reversed(self.server_calculator.cloud_sizes): try: nodes_wanted = self._nodes_wanted(size) @@ -353,7 +361,7 @@ class NodeManagerDaemonActor(actor_class): self._later.start_node(size) elif (nodes_wanted < 0) and self.booting: self._later.stop_booting_node(size) - except Exception as e: + except Exception: self._logger.exception("while calculating nodes wanted for size %s", getattr(size, "id", "(id not available)")) try: self._update_tracker() @@ -384,20 +392,20 @@ class NodeManagerDaemonActor(actor_class): return None arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after) self._logger.info("Want %i more %s nodes. Booting a node.", - nodes_wanted, cloud_size.name) + nodes_wanted, cloud_size.id) new_setup = self._node_setup.start( timer_actor=self._timer, arvados_client=self._new_arvados(), arvados_node=arvados_node, cloud_client=self._new_cloud(), - cloud_size=self.server_calculator.find_size(cloud_size.id)).proxy() - self.booting[new_setup.actor_ref.actor_urn] = new_setup - self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size + cloud_size=self.server_calculator.find_size(cloud_size.id)) + self.booting[new_setup.actor_urn] = new_setup.proxy() + self.sizes_booting[new_setup.actor_urn] = cloud_size if arvados_node is not None: self.arvados_nodes[arvados_node['uuid']].assignment_time = ( time.time()) - new_setup.subscribe(self._later.node_setup_finished) + new_setup.tell_proxy().subscribe(self._later.node_setup_finished) if nodes_wanted > 1: self._later.start_node(cloud_size) @@ -479,7 +487,7 @@ class NodeManagerDaemonActor(actor_class): # grace period without a ping, so shut it down so we can boot a new # node in its place. self._begin_node_shutdown(node_actor, cancellable=False) - elif node_actor.in_state('down').get(): + elif node_actor.in_state('down', 'fail').get(): # Node is down and unlikely to come back. self._begin_node_shutdown(node_actor, cancellable=False) except pykka.ActorDeadError as e: @@ -498,8 +506,19 @@ class NodeManagerDaemonActor(actor_class): except pykka.ActorDeadError: return cloud_node_id = cloud_node.id - record = self.cloud_nodes[cloud_node_id] - shutdown_actor.stop() + + try: + shutdown_actor.stop() + except pykka.ActorDeadError: + pass + + try: + record = self.cloud_nodes[cloud_node_id] + except KeyError: + # Cloud node was already removed from the cloud node list + # supposedly while the destroy_node call was finishing its + # job. + return record.shutdown_actor = None if not success: