- def _set_node_state(self, state, *args):  # Set this node's state in SLURM by running `scontrol update`.
- cmd = ['scontrol', 'update', 'NodeName=' + self._nodename,  # the node acted on is self._nodename
- 'State=' + state]
- cmd.extend(args)  # extra positional args are appended to the command line verbatim
- subprocess.check_output(cmd)  # output is discarded; presumably stdlib subprocess, so a non-zero exit raises CalledProcessError -- confirm
-
- def _get_slurm_state(self):  # Run `sinfo` for this node only (`-n`), with no header and output format `%t`.
- return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])  # returned raw, not stripped or decoded here; NOTE(review): callers presumably handle a trailing newline -- verify
-
- # The following methods retry on OSError. This is intended to mitigate bug
- # #6321, where fork() of node manager raises "OSError: [Errno 12] Cannot
- # allocate memory". That error causes the untimely death of the shutdown
- # actor and tends to leave node manager in a wedged state where it won't
- # allocate new nodes or shut down gracefully. The underlying causes of the
- # excessive memory usage that leads to the "Cannot allocate memory" error
- # are still being investigated.
-
- @_retry((subprocess.CalledProcessError, OSError))
- def cancel_shutdown(self, reason):
+ @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+ def cancel_shutdown(self, reason, try_resume=True):