Merge branch 'master' into 7658-websockets-reconnect-on-close
[arvados.git] / services / nodemanager / arvnodeman / computenode / dispatch / slurm.py
index 9ef54b3881c8c2483f48923b670bc9b6073cf607..41919db07e12efe7a262c4635e9a8432febdec2f 100644 (file)
@@ -72,11 +72,16 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
 class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
 
     def shutdown_eligible(self):
-        if (self.arvados_node is not None and
-            self._get_slurm_state(self.arvados_node['hostname']) in self.SLURM_END_STATES):
-            return True
-        else:
-            return super(ComputeNodeMonitorActor, self).shutdown_eligible()
+        if self.arvados_node is not None:
+            state = self._get_slurm_state(self.arvados_node['hostname'])
+            # Automatically eligible for shutdown if it's down or failed, but
+            # not while draining, to avoid a race condition with resume_node().
+            if state in self.SLURM_END_STATES:
+                if state in self.SLURM_DRAIN_STATES:
+                    return "node is draining"
+                else:
+                    return True
+        return super(ComputeNodeMonitorActor, self).shutdown_eligible()
 
     def resume_node(self):
         try: