8953: Drained SLURM nodes can be eligible for shutdown.
[arvados.git] / services / nodemanager / arvnodeman / computenode / dispatch / slurm.py
index 9ef54b3881c8c2483f48923b670bc9b6073cf607..6d979b6c5cbd08c3c6db27ffda4895e14234c11b 100644 (file)
@@ -72,11 +72,14 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
 class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
 
     def shutdown_eligible(self):
-        if (self.arvados_node is not None and
-            self._get_slurm_state(self.arvados_node['hostname']) in self.SLURM_END_STATES):
-            return True
-        else:
-            return super(ComputeNodeMonitorActor, self).shutdown_eligible()
+        if self.arvados_node is not None:
+            state = self._get_slurm_state(self.arvados_node['hostname'])
+            # A node that is down or failed is automatically eligible for
+            # shutdown, but one in a drain state is not, to avoid a race
+            # condition with resume_node().
+            if ((state in self.SLURM_END_STATES) and
+                  (state not in self.SLURM_DRAIN_STATES)):
+                return True
+        return super(ComputeNodeMonitorActor, self).shutdown_eligible()
 
     def resume_node(self):
         try: