Merge branch 'master' into 13822-nm-delayed-daemon
[arvados.git] / services / nodemanager / arvnodeman / computenode / driver / azure.py
index b1494d02851f0f78b85ddc070a16211983e98b2f..719124d4000f724a271077d9f1614c50c6788f8d 100644 (file)
@@ -1,7 +1,11 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
+import pipes
 import time
 
 import libcloud.compute.base as cloud_base
@@ -36,17 +40,33 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             auth_kwargs, list_kwargs, create_kwargs,
             driver_class)
 
-    def arvados_create_kwargs(self, arvados_node):
-        cluster_id, _, node_id = arvados_node['uuid'].split('-')
-        name = 'compute-{}-{}'.format(node_id, cluster_id)
+    def create_cloud_name(self, arvados_node):
+        uuid_parts = arvados_node['uuid'].split('-', 2)
+        return 'compute-{parts[2]}-{parts[0]}'.format(parts=uuid_parts)
+
+    def arvados_create_kwargs(self, size, arvados_node):
         tags = {
+            # Set up tag indicating the Arvados assigned Cloud Size id.
+            'arvados_node_size': size.id,
             'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
             'arv-ping-url': self._make_ping_url(arvados_node)
         }
         tags.update(self.tags)
+
+        name = self.create_cloud_name(arvados_node)
+        customdata = """#!/bin/sh
+mkdir -p    /var/tmp/arv-node-data/meta-data
+echo %s > /var/tmp/arv-node-data/arv-ping-url
+echo %s > /var/tmp/arv-node-data/meta-data/instance-id
+echo %s > /var/tmp/arv-node-data/meta-data/instance-type
+""" % (pipes.quote(tags['arv-ping-url']),
+       pipes.quote(name),
+       pipes.quote(size.id))
+
         return {
             'name': name,
             'ex_tags': tags,
+            'ex_customdata': customdata
         }
 
     def sync_node(self, cloud_node, arvados_node):
@@ -60,26 +80,24 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
     def _init_image(self, urn):
         return "image", self.get_image(urn)
 
-    def post_create_node(self, cloud_node):
-        self.real.ex_run_command(cloud_node,
-                                 """bash -c '
-                                 mkdir -p /var/tmp/arv-node-data/meta-data
-                                 echo "%s" > /var/tmp/arv-node-data/arv-ping-url
-                                 echo "%s" > /var/tmp/arv-node-data/meta-data/instance-id
-                                 echo "%s" > /var/tmp/arv-node-data/meta-data/instance-type
-                                 echo "%s" > /var/tmp/arv-node-data/meta-data/local-ipv4
-                                 '""" % (cloud_node.extra["tags"]["arv-ping-url"],
-                                         cloud_node.id,
-                                         cloud_node.extra["properties"]["hardwareProfile"]["vmSize"],
-                                         cloud_node.private_ips[0]),
-                                 timestamp=int(time.time()))
-
     def list_nodes(self):
         # Azure only supports filtering node lists by resource group.
         # Do our own filtering based on tag.
-        return [node for node in
-                super(ComputeNodeDriver, self).list_nodes()
-                if node.extra["tags"].get("arvados-class") == self.tags["arvados-class"]]
+        nodes = [node for node in
+                super(ComputeNodeDriver, self).list_nodes(ex_fetch_nic=False, ex_fetch_power_state=False)
+                if node.extra.get("tags", {}).get("arvados-class") == self.tags["arvados-class"]]
+        for n in nodes:
+            # Need to populate Node.size
+            if not n.size:
+                n.size = self.sizes()[n.extra["properties"]["hardwareProfile"]["vmSize"]]
+            n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size')
+        return nodes
+
+    def broken(self, cloud_node):
+        """Return true if libcloud has indicated the node is in a "broken" state."""
+        # UNKNOWN means the node state is unrecognized, which in practice means some combination
+        # of failure that the Azure libcloud driver doesn't know how to interpret.
+        return (cloud_node.state in (cloud_types.NodeState.ERROR, cloud_types.NodeState.UNKNOWN))
 
     @classmethod
     def node_fqdn(cls, node):
@@ -88,3 +106,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
     @classmethod
     def node_start_time(cls, node):
         return arvados_timestamp(node.extra["tags"].get("booted_at"))
+
+    @classmethod
+    def node_id(cls, node):
+        return node.name