from __future__ import absolute_import, print_function

import subprocess
import time

from . import \
    ComputeNodeSetupActor, ComputeNodeUpdateActor, ComputeNodeMonitorActor
from . import ComputeNodeShutdownActor as ShutdownActorBase
from .. import RetryMixin
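
# This actor specializes the generic shutdown actor for SLURM-managed
# clusters: it drains the node through scontrol before the cloud node is
# destroyed, polling sinfo until SLURM reports a terminal state.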

class ComputeNodeShutdownActor(ShutdownActorBase):
    SLURM_END_STATES = frozenset(['down\n', 'down*\n',
                                  'drain\n', 'drain*\n',
                                  'fail\n', 'fail*\n'])
    SLURM_DRAIN_STATES = frozenset(['drain\n', 'drng\n'])

    def on_start(self):
        arv_node = self._arvados_node()
        if arv_node is None:
            self._nodename = None
            return super(ComputeNodeShutdownActor, self).on_start()
        else:
            self._nodename = arv_node['hostname']
            self._logger.info("Draining SLURM node %s", self._nodename)
            self._later.issue_slurm_drain()

    def _set_node_state(self, state, *args):
        cmd = ['scontrol', 'update', 'NodeName=' + self._nodename,
               'State=' + state]
        cmd.extend(args)
        subprocess.check_output(cmd)

    def _get_slurm_state(self):
        return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
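
    # For reference, the commands shelled out to above look like:
    #   scontrol update NodeName=<hostname> State=<state> [Reason=...]
    #   sinfo --noheader -o %t -n <hostname>
    # sinfo prints the node's state followed by a newline, which is why the
    # entries in SLURM_END_STATES and SLURM_DRAIN_STATES end in '\n'.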

    # The following methods retry on OSError.  This is intended to mitigate
    # bug #6321, where fork() in node manager raises "OSError: [Errno 12]
    # Cannot allocate memory", killing the shutdown actor prematurely and
    # tending to leave node manager in a wedged state where it won't allocate
    # new nodes or shut down gracefully.  The underlying causes of the
    # excessive memory usage behind the "Cannot allocate memory" error are
    # still being investigated.
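
    # RetryMixin._retry (imported from the parent package) re-runs the
    # decorated actor method when one of the listed exception types is
    # raised; the retry timing is determined by RetryMixin, not here.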

    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
    def cancel_shutdown(self, reason):
        if self._nodename:
            if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
                # Resume from "drng" or "drain"
                self._set_node_state('RESUME')
            else:
                # Node is in a state such as 'idle' or 'alloc' so don't
                # try to resume it because that will just raise an error.
                pass
        return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)

    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
    @ShutdownActorBase._stop_if_window_closed
    def issue_slurm_drain(self):
        self._set_node_state('DRAIN', 'Reason=Node Manager shutdown')
        self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
        self._later.await_slurm_drain()

    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
    @ShutdownActorBase._stop_if_window_closed
    def await_slurm_drain(self):
        output = self._get_slurm_state()
        if output in self.SLURM_END_STATES:
            self._later.shutdown_node()
        else:
            # Still draining; poll again in 10 seconds.
            self._timer.schedule(time.time() + 10,
                                 self._later.await_slurm_drain)