projects
/
arvados.git
/ blobdiff
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Merge branch '13330-intermediates-test' of git.curoverse.com:arvados into 13330-cwl...
[arvados.git]
/
services
/
nodemanager
/
arvnodeman
/
nodelist.py
diff --git
a/services/nodemanager/arvnodeman/nodelist.py
b/services/nodemanager/arvnodeman/nodelist.py
index 37142fe83814d73fdec66d3fe0029998dfc941af..66af7c32d128ab3a51815a74443b885779052f6b 100644
(file)
--- a/
services/nodemanager/arvnodeman/nodelist.py
+++ b/
services/nodemanager/arvnodeman/nodelist.py
@@
-1,4
+1,7
@@
#!/usr/bin/env python
#!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
from __future__ import absolute_import, print_function
from __future__ import absolute_import, print_function
@@
-12,8
+15,9
@@
import arvados.util
class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
"""Actor to poll the Arvados node list.
class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
"""Actor to poll the Arvados node list.
- This actor regularly polls the list of Arvados node records, and
- sends it to subscribers.
+ This actor regularly polls the list of Arvados node records,
+ augments it with the latest SLURM node info (`sinfo`), and sends
+ it to subscribers.
"""
def is_common_error(self, exception):
"""
def is_common_error(self, exception):
@@
-26,22
+30,32
@@
class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
nodelist = arvados.util.list_all(self._client.nodes().list)
# node hostname, state
nodelist = arvados.util.list_all(self._client.nodes().list)
# node hostname, state
- sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n
%t
"])
+ sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n
|%t|%f
"])
nodestates = {}
nodestates = {}
+ nodefeatures = {}
for out in sinfo_out.splitlines():
for out in sinfo_out.splitlines():
- nodename, state = out.split(" ", 2)
- if state in ('alloc', 'comp'):
+ try:
+ nodename, state, features = out.split("|", 3)
+ except ValueError:
+ continue
+ if state in ('alloc', 'alloc*',
+ 'comp', 'comp*',
+ 'mix', 'mix*',
+ 'drng', 'drng*'):
nodestates[nodename] = 'busy'
nodestates[nodename] = 'busy'
- elif state
== 'idle'
:
- nodestates[nodename] =
'idle'
+ elif state
in ('idle', 'fail')
:
+ nodestates[nodename] =
state
else:
nodestates[nodename] = 'down'
else:
nodestates[nodename] = 'down'
+ if features != "(null)":
+ nodefeatures[nodename] = features
for n in nodelist:
if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
n["crunch_worker_state"] = nodestates[n["hostname"]]
else:
n["crunch_worker_state"] = 'down'
for n in nodelist:
if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
n["crunch_worker_state"] = nodestates[n["hostname"]]
else:
n["crunch_worker_state"] = 'down'
+ n["slurm_node_features"] = nodefeatures.get(n["hostname"], "")
return nodelist
return nodelist
@@
-58,7
+72,7
@@
class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
self._calculator = server_calc
def is_common_error(self, exception):
self._calculator = server_calc
def is_common_error(self, exception):
- return
self._client.is_cloud_exception(exception
)
+ return
isinstance(exception, config.CLOUD_ERRORS
)
def _item_key(self, node):
return node.id
def _item_key(self, node):
return node.id
@@
-66,8
+80,8
@@
class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
def _send_request(self):
nodes = self._client.list_nodes()
for n in nodes:
def _send_request(self):
nodes = self._client.list_nodes()
for n in nodes:
- # Replace
with
libcloud NodeSize object with compatible
+ # Replace
the
libcloud NodeSize object with compatible
# CloudSizeWrapper object which merges the size info reported from
# the cloud with size information from the configuration file.
# CloudSizeWrapper object which merges the size info reported from
# the cloud with size information from the configuration file.
- n.size = self._calculator.find_size(n.
size.id
)
+ n.size = self._calculator.find_size(n.
extra['arvados_node_size']
)
return nodes
return nodes