X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/31d76600cdb691251d0823cc6be601d958b4e1a4..0f644e242ef37c911ad3dc25aca8135c339de349:/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py index cbeabd1a8a..fa56578cff 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 from __future__ import absolute_import, print_function @@ -6,8 +9,9 @@ import subprocess import time from . import \ - ComputeNodeSetupActor, ComputeNodeUpdateActor, ComputeNodeMonitorActor + ComputeNodeSetupActor, ComputeNodeMonitorActor from . import ComputeNodeShutdownActor as ShutdownActorBase +from . import ComputeNodeUpdateActor as UpdateActorBase from .. 
class ComputeNodeUpdateActor(UpdateActorBase):
    """Update actor that also sets the node's weight in SLURM.

    Before delegating to the base class's ``sync_node``, this actor runs
    ``scontrol update`` to set the SLURM ``Weight`` of the node from the
    price of its cloud size.
    """

    def sync_node(self, cloud_node, arvados_node):
        """Set the node's SLURM weight, then run the base class sync.

        When ``arvados_node`` has a truthy ``"hostname"``, this runs
        ``scontrol update NodeName=<hostname> Weight=<w>``, where ``<w>``
        is ``int(cloud_node.size.price * 1000)``.  Setting the weight is
        best effort: any failure is logged and the base class sync still
        runs.

        Returns whatever the base class's ``sync_node`` returns.
        """
        hostname = arvados_node.get("hostname")
        if hostname:
            try:
                weight = int(cloud_node.size.price * 1000)
                subprocess.check_output(
                    ['scontrol', 'update',
                     'NodeName=' + hostname,
                     'Weight=%i' % weight])
            except Exception:
                # Catch Exception rather than using a bare ``except:`` so
                # that KeyboardInterrupt, SystemExit and GeneratorExit
                # still propagate; ordinary failures are only logged.
                self._logger.error("Unable to set slurm node weight.", exc_info=True)
        return super(ComputeNodeUpdateActor, self).sync_node(cloud_node, arvados_node)