X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/72e3566f2cdacd44f095183ebf88f7aab8b0d8dc..d0f91b34eedec9af266ec4877a3005dd627ad38e:/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 455719832d..1c828c13c3 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -10,7 +10,7 @@ import libcloud.common.types as cloud_types
 import pykka
 
 from .. import \
-    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh
+    arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, arvados_node_missing
 from ...clientactor import _notify_subscribers
 from ... import config
 
@@ -325,16 +325,21 @@ class ComputeNodeMonitorActor(config.actor_class):
     def shutdown_eligible(self):
         if not self._shutdowns.window_open():
             return False
-        elif self.arvados_node is None:
+        if self.arvados_node is None:
             # Node is unpaired.
             # If it hasn't pinged Arvados after boot_fail seconds, shut it down
             return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
-        elif self.arvados_node.get('status') == "missing" and self._cloud.broken(self.cloud_node):
+        missing = arvados_node_missing(self.arvados_node, self.node_stale_after)
+        if missing and self._cloud.broken(self.cloud_node):
             # Node is paired, but Arvados says it is missing and the cloud says the node
             # is in an error state, so shut it down.
             return True
-        else:
-            return self.in_state('idle')
+        if missing is None and self._cloud.broken(self.cloud_node):
+            self._logger.warning(
+                "cloud reports broken node, but paired node %s never pinged "
+                "(bug?) -- skipped check for node_stale_after",
+                self.arvados_node['uuid'])
+        return self.in_state('idle')
 
     def consider_shutdown(self):
         next_opening = self._shutdowns.next_opening()