X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/ea47e67af0e7528f0bcb23f3b34019b308eaa68a..8e31910034627dedd7259dd1e45a60768108c1e1:/services/nodemanager/arvnodeman/nodelist.py diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py index 7bc3a5ebd2..66af7c32d1 100644 --- a/services/nodemanager/arvnodeman/nodelist.py +++ b/services/nodemanager/arvnodeman/nodelist.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 from __future__ import absolute_import, print_function @@ -12,8 +15,9 @@ import arvados.util class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor): """Actor to poll the Arvados node list. - This actor regularly polls the list of Arvados node records, and - sends it to subscribers. + This actor regularly polls the list of Arvados node records, + augments it with the latest SLURM node info (`sinfo`), and sends + it to subscribers. """ def is_common_error(self, exception): @@ -26,28 +30,32 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor): nodelist = arvados.util.list_all(self._client.nodes().list) # node hostname, state - sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n %t"]) + sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n|%t|%f"]) nodestates = {} + nodefeatures = {} for out in sinfo_out.splitlines(): try: - nodename, state = out.split(" ", 2) - if state in ('alloc', 'alloc*', - 'comp', 'comp*', - 'mix', 'mix*', - 'drng', 'drng*'): - nodestates[nodename] = 'busy' - elif state == 'idle': - nodestates[nodename] = 'idle' - else: - nodestates[nodename] = 'down' + nodename, state, features = out.split("|", 3) except ValueError: - pass + continue + if state in ('alloc', 'alloc*', + 'comp', 'comp*', + 'mix', 'mix*', + 'drng', 'drng*'): + nodestates[nodename] = 'busy' + elif state in ('idle', 'fail'): + nodestates[nodename] = state + else: + nodestates[nodename] = 'down' + if features != "(null)": + nodefeatures[nodename] = features for n in nodelist: if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates: n["crunch_worker_state"] = nodestates[n["hostname"]] else: n["crunch_worker_state"] = 'down' + n["slurm_node_features"] = nodefeatures.get(n["hostname"], "") return nodelist @@ -64,7 +72,7 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor): self._calculator = server_calc def is_common_error(self, exception): - return self._client.is_cloud_exception(exception) + return isinstance(exception, config.CLOUD_ERRORS) def _item_key(self, node): return node.id @@ -72,8 +80,8 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor): def _send_request(self): nodes = self._client.list_nodes() for n in nodes: - # Replace with libcloud NodeSize object with compatible + # Replace the libcloud NodeSize object with compatible # CloudSizeWrapper object which merges the size info reported from # the cloud with size information from the configuration file. - n.size = self._calculator.find_size(n.size.id) + n.size = self._calculator.find_size(n.extra['arvados_node_size']) return nodes