7286: Add comments clarifying arvados_node_missing() and broken(). Also bump
[arvados.git] / services / nodemanager / arvnodeman / computenode / driver / azure.py
index d075bdb17435d0859925e2aa48d1277e6ea9a18f..b1ec5e6abc95e923e1b6583d3c33e49a3247368d 100644 (file)
@@ -3,11 +3,11 @@
 from __future__ import absolute_import, print_function
 
 import time
-from operator import attrgetter
 
 import libcloud.compute.base as cloud_base
 import libcloud.compute.providers as cloud_provider
 import libcloud.compute.types as cloud_types
+from libcloud.common.exceptions import BaseHTTPError
 
 from . import BaseComputeNodeDriver
 from .. import arvados_node_fqdn, arvados_timestamp, ARVADOS_TIMEFMT
@@ -16,6 +16,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
 
     DEFAULT_DRIVER = cloud_provider.get_driver(cloud_types.Provider.AZURE_ARM)
     SEARCH_CACHE = {}
+    CLOUD_ERRORS = BaseComputeNodeDriver.CLOUD_ERRORS + (BaseHTTPError,)
 
     def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
                  driver_class=DEFAULT_DRIVER):
@@ -28,6 +29,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
         self.tags = {key[4:]: value
                      for key, value in create_kwargs.iteritems()
                      if key.startswith('tag_')}
+        # filter out tags from create_kwargs
+        create_kwargs = {key: value
+                         for key, value in create_kwargs.iteritems()
+                         if not key.startswith('tag_')}
         super(ComputeNodeDriver, self).__init__(
             auth_kwargs, list_kwargs, create_kwargs,
             driver_class)
@@ -37,8 +42,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
         name = 'compute-{}-{}'.format(node_id, cluster_id)
         tags = {
             'booted_at': time.strftime(ARVADOS_TIMEFMT, time.gmtime()),
-            'arv-ping-url': self._make_ping_url(arvados_node),
-            'hostname': arvados_node_fqdn(arvados_node)
+            'arv-ping-url': self._make_ping_url(arvados_node)
         }
         tags.update(self.tags)
         return {
@@ -47,15 +51,29 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
         }
 
     def sync_node(self, cloud_node, arvados_node):
-        pass
+        try:
+            self.real.ex_create_tags(cloud_node,
+                                     {'hostname': arvados_node_fqdn(arvados_node)})
+            return True
+        except BaseHTTPError as b:
+            return False
 
     def _init_image(self, urn):
         return "image", self.get_image(urn)
 
-    def _init_ssh_key(self, filename):
-        with open(filename) as ssh_file:
-            key = cloud_base.NodeAuthSSHKey(ssh_file.read())
-        return 'auth', key
+    def post_create_node(self, cloud_node):
+        self.real.ex_run_command(cloud_node,
+                                 """bash -c '
+                                 mkdir -p /var/tmp/arv-node-data/meta-data
+                                 echo "%s" > /var/tmp/arv-node-data/arv-ping-url
+                                 echo "%s" > /var/tmp/arv-node-data/meta-data/instance-id
+                                 echo "%s" > /var/tmp/arv-node-data/meta-data/instance-type
+                                 echo "%s" > /var/tmp/arv-node-data/meta-data/local-ipv4
+                                 '""" % (cloud_node.extra["tags"]["arv-ping-url"],
+                                         cloud_node.id,
+                                         cloud_node.extra["properties"]["hardwareProfile"]["vmSize"],
+                                         cloud_node.private_ips[0]),
+                                 timestamp=int(time.time()))
 
     def list_nodes(self):
         # Azure only supports filtering node lists by resource group.
@@ -64,6 +82,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
                 super(ComputeNodeDriver, self).list_nodes()
                 if node.extra["tags"].get("arvados-class") == self.tags["arvados-class"]]
 
+    def broken(self, cloud_node):
+        """Return true if libcloud has indicated the node is in a "broken" state."""
+        # UNKNOWN means the node state is unrecognized, which in practice means some combination
+        # of failure that the Azure libcloud driver doesn't know how to interpret.
+        return (cloud_node.state in (cloud_types.NodeState.ERROR, cloud_types.NodeState.UNKNOWN))
+
     @classmethod
     def node_fqdn(cls, node):
         return node.extra["tags"].get("hostname")