6142: If self._set_node_state('RESUME') in cancel_shutdown() returns non-zero,
author: Peter Amstutz <peter.amstutz@curoverse.com>
Tue, 6 Oct 2015 20:42:51 +0000 (16:42 -0400)
committer: Peter Amstutz <peter.amstutz@curoverse.com>
Tue, 6 Oct 2015 20:42:51 +0000 (16:42 -0400)
check the node state and only retry if the node is in 'drain' or 'draining'.

services/nodemanager/arvnodeman/computenode/dispatch/slurm.py

index 225d856ba740cd207996d7a053e9169300b57561..bb397fa277d43d42e5b066d09198972c21e3371e 100644 (file)
@@ -13,6 +13,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     SLURM_END_STATES = frozenset(['down\n', 'down*\n',
                                   'drain\n', 'drain*\n',
                                   'fail\n', 'fail*\n'])
+    SLURM_DRAIN_STATES = frozenset(['drain\n', 'drng\n'])
 
     def on_start(self):
         arv_node = self._arvados_node()
@@ -30,10 +31,26 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
         cmd.extend(args)
         subprocess.check_output(cmd)
 
+    def _get_slurm_state(self):
+        return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
+
     @ShutdownActorBase._retry((subprocess.CalledProcessError,))
     def cancel_shutdown(self):
         if self._nodename:
-            self._set_node_state('RESUME')
+            try:
+                self._set_node_state('RESUME')
+            except subprocess.CalledProcessError:
+                slum_state = self._get_slurm_state()
+                if slum_state in self.SLURM_DRAIN_STATES:
+                    # We expect to be able to resume from "drain" or "drng",
+                    # so if scontrol exited non-zero, something actually failed;
+                    # re-raise the exception to signal the retry to kick in.
+                    raise
+                else:
+                    # Assume scontrol exited non-zero because the node is already in
+                    # 'idle' or 'alloc' (so it never started draining), in which case
+                    # we don't need to do anything else to resume it.
+                    pass
         return super(ComputeNodeShutdownActor, self).cancel_shutdown()
 
     @ShutdownActorBase._stop_if_window_closed
@@ -46,8 +63,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     @ShutdownActorBase._stop_if_window_closed
     @ShutdownActorBase._retry((subprocess.CalledProcessError,))
     def await_slurm_drain(self):
-        output = subprocess.check_output(
-            ['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
+        output = self._get_slurm_state()
         if output in self.SLURM_END_STATES:
             self._later.shutdown_node()
         else: