15964: Remove qr1hi from a few more places. Delete unused includes.
[arvados.git] / services / nodemanager / arvnodeman / nodelist.py
index 83dd93f077bfb504a8f41b6f0addd93c717ac7da..0abb3b3a379cbbbec7e619fdcca081ec98a340ea 100644 (file)
@@ -1,17 +1,24 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
+import subprocess32 as subprocess
+
 from . import clientactor
 from . import config
 
+import arvados.util
+
 class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
     """Actor to poll the Arvados node list.
 
-    This actor regularly polls the list of Arvados node records, and
-    sends it to subscribers.
+    This actor regularly polls the list of Arvados node records,
+    augments it with the latest SLURM node info (`sinfo`), and sends
+    it to subscribers.
     """
-    LOGGER_NAME = 'arvnodeman.arvados_nodes'
 
     def is_common_error(self, exception):
         return isinstance(exception, config.ARVADOS_ERRORS)
@@ -20,8 +27,37 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
         return node['uuid']
 
     def _send_request(self):
-        return self._client.nodes().list(limit=10000).execute()['items']
+        nodelist = arvados.util.list_all(self._client.nodes().list)
+
+        # node hostname, state
+        sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n|%t|%f"])
+        nodestates = {}
+        nodefeatures = {}
+        for out in sinfo_out.splitlines():
+            try:
+                nodename, state, features = out.split("|", 3)
+            except ValueError:
+                continue
+            if state in ('alloc', 'alloc*',
+                         'comp',  'comp*',
+                         'mix',   'mix*',
+                         'drng',  'drng*'):
+                nodestates[nodename] = 'busy'
+            elif state in ('idle', 'fail'):
+                nodestates[nodename] = state
+            else:
+                nodestates[nodename] = 'down'
+            if features != "(null)":
+                nodefeatures[nodename] = features
 
+        for n in nodelist:
+            if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
+                n["crunch_worker_state"] = nodestates[n["hostname"]]
+            else:
+                n["crunch_worker_state"] = 'down'
+            n["slurm_node_features"] = nodefeatures.get(n["hostname"], "")
+
+        return nodelist
 
 class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
     """Actor to poll the cloud node list.
@@ -29,13 +65,23 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
     This actor regularly polls the cloud to get a list of running compute
     nodes, and sends it to subscribers.
     """
-    LOGGER_NAME = 'arvnodeman.cloud_nodes'
+
+    def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
+        super(CloudNodeListMonitorActor, self).__init__(
+            client, timer_actor, *args, **kwargs)
+        self._calculator = server_calc
 
     def is_common_error(self, exception):
-        return self._client.is_cloud_exception(exception)
+        return isinstance(exception, config.CLOUD_ERRORS)
 
     def _item_key(self, node):
         return node.id
 
     def _send_request(self):
-        return self._client.list_nodes()
+        nodes = self._client.list_nodes()
+        for n in nodes:
+            # Replace the libcloud NodeSize object with compatible
+            # CloudSizeWrapper object which merges the size info reported from
+            # the cloud with size information from the configuration file.
+            n.size = self._calculator.find_size(n.extra['arvados_node_size'])
+        return nodes