X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/241ef75ec8b6cf5dd14ce19fa068462adaeb0386..2d112118532f0e059bc7e72d85ee63083a98156f:/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py index 9ef54b3881..41919db07e 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py @@ -72,11 +72,16 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase): class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase): def shutdown_eligible(self): - if (self.arvados_node is not None and - self._get_slurm_state(self.arvados_node['hostname']) in self.SLURM_END_STATES): - return True - else: - return super(ComputeNodeMonitorActor, self).shutdown_eligible() + if self.arvados_node is not None: + state = self._get_slurm_state(self.arvados_node['hostname']) + # Automatically eligible for shutdown if it's down or failed, but + # not drain to avoid a race condition with resume_node(). + if state in self.SLURM_END_STATES: + if state in self.SLURM_DRAIN_STATES: + return "node is draining" + else: + return True + return super(ComputeNodeMonitorActor, self).shutdown_eligible() def resume_node(self): try: