X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/896d01ea7b3fed8b621ff930893d3ec806931dc9..44823f311ea0328c3a2aeefb7208a74031436d52:/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py index 48e8dcf458..70817627df 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py @@ -9,7 +9,8 @@ import time import libcloud.common.types as cloud_types import pykka -from .. import arvados_node_fqdn, arvados_node_mtime, timestamp_fresh +from .. import \ + arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh from ...clientactor import _notify_subscribers from ... import config @@ -252,13 +253,14 @@ class ComputeNodeMonitorActor(config.actor_class): for shutdown. """ def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer, - timer_actor, update_actor, arvados_node=None, + cloud_fqdn_func, timer_actor, update_actor, arvados_node=None, poll_stale_after=600, node_stale_after=3600): super(ComputeNodeMonitorActor, self).__init__() self._later = self.actor_ref.proxy() self._logger = logging.getLogger('arvnodeman.computenode') self._last_log = None self._shutdowns = shutdown_timer + self._cloud_node_fqdn = cloud_fqdn_func self._timer = timer_actor self._update = update_actor self.cloud_node = cloud_node @@ -288,7 +290,7 @@ class ComputeNodeMonitorActor(config.actor_class): if (self.arvados_node is None) or not timestamp_fresh( arvados_node_mtime(self.arvados_node), self.node_stale_after): return None - state = self.arvados_node['info'].get('slurm_state') + state = self.arvados_node['crunch_worker_state'] if not state: return None result = state in states @@ -322,9 +324,11 @@ class ComputeNodeMonitorActor(config.actor_class): self.last_shutdown_opening = next_opening def offer_arvados_pair(self, arvados_node): - if self.arvados_node is not None: + first_ping_s = arvados_node.get('first_ping_at') + if (self.arvados_node is not None) or (not first_ping_s): return None - elif arvados_node['ip_address'] in self.cloud_node.private_ips: + elif ((arvados_node['ip_address'] in self.cloud_node.private_ips) and + (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)): self._later.update_arvados_node(arvados_node) return self.cloud_node.id else: @@ -336,9 +340,17 @@ class ComputeNodeMonitorActor(config.actor_class): self._later.consider_shutdown() def update_arvados_node(self, arvados_node): + # If the cloud node's FQDN doesn't match what's in the Arvados node + # record, make them match. + # This method is a little unusual in the way it just fires off the + # request without checking the result or retrying errors. That's + # because this update happens every time we reload the Arvados node + # list: if a previous sync attempt failed, we'll see that the names + # are out of sync and just try again. ComputeNodeUpdateActor has + # the logic to throttle those effective retries when there's trouble. if arvados_node is not None: self.arvados_node = arvados_node - new_hostname = arvados_node_fqdn(self.arvados_node) - if new_hostname != self.cloud_node.name: + if (self._cloud_node_fqdn(self.cloud_node) != + arvados_node_fqdn(self.arvados_node)): self._update.sync_node(self.cloud_node, self.arvados_node) self._later.consider_shutdown()