13937: Export stats as prometheus metrics. (WIP)
[arvados.git] / services / nodemanager / arvnodeman / nodelist.py
1 #!/usr/bin/env python
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 from __future__ import absolute_import, print_function
7
8 import subprocess32 as subprocess
9
10 from . import clientactor
11 from . import config
12
13 import arvados.util
14
class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
    """Actor to poll the Arvados node list.

    This actor regularly polls the list of Arvados node records,
    augments it with the latest SLURM node info (`sinfo`), and sends
    it to subscribers.
    """

    def is_common_error(self, exception):
        """Return True if `exception` is one of config.ARVADOS_ERRORS."""
        return isinstance(exception, config.ARVADOS_ERRORS)

    def _item_key(self, node):
        """Return the key identifying a node record: its 'uuid' field."""
        return node['uuid']

    def _send_request(self):
        """Fetch all Arvados node records and annotate them from `sinfo`.

        Each record gets two extra keys:

        * "crunch_worker_state": 'busy', 'idle', 'fail' or 'down'.  A
          record without a slot number or hostname, or whose hostname
          `sinfo` did not report, is 'down'.
        * "slurm_node_features": the features string `sinfo` reported
          for the record's hostname, or "" if there is none.

        Returns the list of annotated node records.  Raises
        subprocess.CalledProcessError if `sinfo` exits non-zero.
        """
        nodelist = arvados.util.list_all(self._client.nodes().list)

        # One line per node: hostname|state|features
        sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n|%t|%f"])
        nodestates = {}
        nodefeatures = {}
        for out in sinfo_out.splitlines():
            try:
                # maxsplit=2 yields at most three fields, so a "|"
                # inside the features column stays part of `features`
                # instead of making the unpack fail and the node be
                # skipped.
                nodename, state, features = out.split("|", 2)
            except ValueError:
                # Fewer than three fields: not a line we can interpret.
                continue
            if state in ('alloc', 'alloc*',
                         'comp',  'comp*',
                         'mix',   'mix*',
                         'drng',  'drng*'):
                nodestates[nodename] = 'busy'
            elif state in ('idle', 'fail'):
                nodestates[nodename] = state
            else:
                # Every other sinfo state is reported as 'down'.
                nodestates[nodename] = 'down'
            # sinfo prints "(null)" for a node with no features.
            if features != "(null)":
                nodefeatures[nodename] = features

        for n in nodelist:
            if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
                n["crunch_worker_state"] = nodestates[n["hostname"]]
            else:
                n["crunch_worker_state"] = 'down'
            n["slurm_node_features"] = nodefeatures.get(n["hostname"], "")

        return nodelist
61
class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
    """Polling actor for the cloud node list.

    On every poll this actor asks the cloud client for its list of
    compute nodes, swaps each node's size for the one found by the
    server calculator, and hands the list to subscribers.
    """

    def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
        """Set up the poll loop and remember the size calculator.

        `server_calc` is stored on the actor; every other argument is
        forwarded unchanged to the base class constructor.
        """
        base = super(CloudNodeListMonitorActor, self)
        base.__init__(client, timer_actor, *args, **kwargs)
        self._calculator = server_calc

    def is_common_error(self, exception):
        """Return True if `exception` is one of config.CLOUD_ERRORS."""
        known_errors = config.CLOUD_ERRORS
        return isinstance(exception, known_errors)

    def _item_key(self, node):
        """Return the key identifying a cloud node: its `id` attribute."""
        return node.id

    def _send_request(self):
        """List the cloud nodes and replace each node's `size`.

        Returns the list obtained from the cloud client, with every
        node's `size` attribute overwritten.
        """
        cloud_nodes = self._client.list_nodes()
        for cloud_node in cloud_nodes:
            # The libcloud NodeSize object is swapped for a compatible
            # CloudSizeWrapper object, which merges the size info
            # reported by the cloud with the size information from the
            # configuration file.
            size_id = cloud_node.extra['arvados_node_size']
            cloud_node.size = self._calculator.find_size(size_id)
        return cloud_nodes