From d2efca1759c9e104f67d1caf85c8e62ff06807bd Mon Sep 17 00:00:00 2001 From: Lucas Di Pentima Date: Mon, 11 Jun 2018 12:10:44 -0300 Subject: [PATCH] 7478: Assign invalid cloud sizes to nodes with stale arvados_node_size tags When retrieving the list of nodes from the cloud provider, if the tag refers to a nonexistent arvados cloud size, assign an invalid size and return 'down' state when asked so that it can be properly shut down. Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima --- .../computenode/dispatch/__init__.py | 5 +++++ services/nodemanager/arvnodeman/jobqueue.py | 22 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py index 9106ea67cc..dec8ef1434 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py @@ -370,6 +370,11 @@ class ComputeNodeMonitorActor(config.actor_class): def get_state(self): """Get node state, one of ['unpaired', 'busy', 'idle', 'down'].""" + # If this node's size is invalid (because it has a stale arvados_node_size + # tag), return 'down' so that it's properly shut down. + if self.cloud_node.size.id == 'invalid': + return 'down' + # If this node is not associated with an Arvados node, return # 'unpaired' if we're in the boot grace period, and 'down' if not, # so it isn't counted towards usable nodes. diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py index f3fdc8ee67..db578e01fe 100644 --- a/services/nodemanager/arvnodeman/jobqueue.py +++ b/services/nodemanager/arvnodeman/jobqueue.py @@ -24,6 +24,26 @@ class ServerCalculator(object): that would best satisfy the jobs, choosing the cheapest size that satisfies each job, and ignoring jobs that can't be satisfied. 
""" + class InvalidCloudSize(object): + """ + Dummy CloudSizeWrapper-like class, to be used when a cloud node doesn't + have a recognizable arvados_node_size tag. + """ + def __init__(self): + self.id = 'invalid' + self.name = 'invalid' + self.ram = 0 + self.disk = 0 + self.scratch = 0 + self.cores = 0 + self.bandwidth = 0 + self.price = 9999999 + self.preemptable = False + self.extra = {} + + def meets_constraints(self, **kwargs): + return False + class CloudSizeWrapper(object): def __init__(self, real_size, node_mem_scaling, **kwargs): @@ -119,7 +139,7 @@ class ServerCalculator(object): for s in self.cloud_sizes: if s.id == sizeid: return s - return None + return self.InvalidCloudSize() class JobQueueMonitorActor(clientactor.RemotePollLoopActor): -- 2.30.2