4670: Add a post-create hook to Node Manager for EC2 tagging.
authorBrett Smith <brett@curoverse.com>
Fri, 12 Dec 2014 21:16:39 +0000 (16:16 -0500)
committerBrett Smith <brett@curoverse.com>
Thu, 18 Dec 2014 21:19:50 +0000 (16:19 -0500)
The previous code was relying on the post-create tagging in libcloud's
EC2 driver.  Unfortunately, that's not working out too well for us: if
it fails, you get no indication of that, and it doesn't get retried.
This moves the work up into Node Manager, where failures can be logged
and retried appropriately.

The retry support may be sufficient to resolve #4670.  If it's not,
then the additional logging will help us track down the root cause.

services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/computenode/driver/__init__.py
services/nodemanager/arvnodeman/computenode/driver/ec2.py
services/nodemanager/tests/test_computenode_dispatch.py
services/nodemanager/tests/test_computenode_driver_ec2.py

index f50670e7ba118c4c00ce88cbaf45ef3e9509d9c4..48e8dcf45888de232ba9004c290023e6bf5999b7 100644 (file)
@@ -129,6 +129,12 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
         self.cloud_node = self._cloud.create_node(self.cloud_size,
                                                   self.arvados_node)
         self._logger.info("Cloud node %s created.", self.cloud_node.id)
+        self._later.post_create()
+
+    @ComputeNodeStateChangeBase._retry()
+    def post_create(self):
+        self._cloud.post_create_node(self.cloud_node)
+        self._logger.info("%s post-create work done.", self.cloud_node.id)
         self._finished()
 
     def stop_if_no_cloud_node(self):
index 99b419ec4907572b0916e77431dfba7b0eadfde7..3a0c2063b426ccd759ad4dd8323847d51f72bce4 100644 (file)
@@ -58,6 +58,12 @@ class BaseComputeNodeDriver(object):
         kwargs['size'] = size
         return self.real.create_node(**kwargs)
 
+    def post_create_node(self, cloud_node):
+        # ComputeNodeSetupActor calls this method after the cloud node is
+        # created.  Any setup tasks that need to happen afterward (e.g.,
+        # tagging) should be done in this method.
+        pass
+
     def sync_node(self, cloud_node, arvados_node):
         # When a compute node first pings the API server, the API server
         # will automatically assign some attributes on the corresponding
index c0992f7b9635e2c47edd8e67cf5b887ba5692718..255a948a6c3aa0ee2c17ae1581685d282c341813 100644 (file)
@@ -79,8 +79,7 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
         return 'auth', key
 
     def arvados_create_kwargs(self, arvados_node):
-        result = {'ex_metadata': self.tags.copy(),
-                  'name': arvados_node_fqdn(arvados_node)}
+        result = {'name': arvados_node_fqdn(arvados_node)}
         ping_secret = arvados_node['info'].get('ping_secret')
         if ping_secret is not None:
             ping_url = ('https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.
@@ -89,11 +88,12 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
             result['ex_userdata'] = ping_url
         return result
 
+    def post_create_node(self, cloud_node):
+        self.real.ex_create_tags(cloud_node, self.tags)
+
     def sync_node(self, cloud_node, arvados_node):
-        metadata = self.arvados_create_kwargs(arvados_node)
-        tags = metadata['ex_metadata']
-        tags['Name'] = metadata['name']
-        self.real.ex_create_tags(cloud_node, tags)
+        self.real.ex_create_tags(cloud_node,
+                                 {'Name': arvados_node_fqdn(arvados_node)})
 
     @classmethod
     def node_start_time(cls, node):
index c86dcfdca21d314f20b5e7586cd2988389fc32ae..a1dfde30e1c90eeaeeb04e2542bd53866d4eff5f 100644 (file)
@@ -14,7 +14,7 @@ import arvnodeman.computenode.dispatch as dispatch
 from . import testutil
 
 class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
-    def make_mocks(self, arvados_effect=None, cloud_effect=None):
+    def make_mocks(self, arvados_effect=None):
         if arvados_effect is None:
             arvados_effect = [testutil.arvados_node_mock()]
         self.arvados_effect = arvados_effect
@@ -48,14 +48,33 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
         self.assertEqual(self.cloud_client.create_node(),
                          self.setup_actor.cloud_node.get(self.TIMEOUT))
 
-    def test_failed_calls_retried(self):
+    def test_failed_arvados_calls_retried(self):
         self.make_mocks([
                 arverror.ApiError(httplib2.Response({'status': '500'}), ""),
                 testutil.arvados_node_mock(),
                 ])
         self.make_actor()
+        self.wait_for_assignment(self.setup_actor, 'arvados_node')
+
+    def test_failed_cloud_calls_retried(self):
+        self.make_mocks()
+        self.cloud_client.create_node.side_effect = [
+            Exception("test cloud creation error"),
+            self.cloud_client.create_node.return_value,
+            ]
+        self.make_actor()
         self.wait_for_assignment(self.setup_actor, 'cloud_node')
 
+    def test_failed_post_create_retried(self):
+        self.make_mocks()
+        self.cloud_client.post_create_node.side_effect = [
+            Exception("test cloud post-create error"), None]
+        self.make_actor()
+        done = self.FUTURE_CLASS()
+        self.setup_actor.subscribe(done.set)
+        done.get(self.TIMEOUT)
+        self.assertEqual(2, self.cloud_client.post_create_node.call_count)
+
     def test_stop_when_no_cloud_node(self):
         self.make_mocks(
             arverror.ApiError(httplib2.Response({'status': '500'}), ""))
index c765587993d4984dde8d12284bb4f62ffed77bb4..fae63a5663d82035b43d82288de82ade2788f99b 100644 (file)
@@ -57,30 +57,41 @@ class EC2ComputeNodeDriverTestCase(unittest.TestCase):
                       create_method.call_args[1].get('ex_userdata',
                                                      'arg missing'))
 
-    def test_tags_created_from_arvados_node(self):
+    def test_hostname_from_arvados_node(self):
         arv_node = testutil.arvados_node_mock(8)
-        cloud_node = testutil.cloud_node_mock(8)
-        driver = self.new_driver(list_kwargs={'tag:list': 'test'})
-        self.assertEqual({'ex_metadata': {'list': 'test'},
-                          'name': 'compute8.zzzzz.arvadosapi.com'},
-                         driver.arvados_create_kwargs(arv_node))
+        driver = self.new_driver()
+        self.assertEqual('compute8.zzzzz.arvadosapi.com',
+                         driver.arvados_create_kwargs(arv_node)['name'])
 
-    def test_tags_set_default_hostname_from_new_arvados_node(self):
+    def test_default_hostname_from_new_arvados_node(self):
         arv_node = testutil.arvados_node_mock(hostname=None)
         driver = self.new_driver()
-        actual = driver.arvados_create_kwargs(arv_node)
         self.assertEqual('dynamic.compute.zzzzz.arvadosapi.com',
-                         actual['name'])
+                         driver.arvados_create_kwargs(arv_node)['name'])
+
+    def check_node_tagged(self, cloud_node, expected_tags):
+        tag_mock = self.driver_mock().ex_create_tags
+        self.assertTrue(tag_mock.called)
+        self.assertIs(cloud_node, tag_mock.call_args[0][0])
+        self.assertEqual(expected_tags, tag_mock.call_args[0][1])
+
+    def test_post_create_node_tags_from_list_kwargs(self):
+        expect_tags = {'key1': 'test value 1', 'key2': 'test value 2'}
+        list_kwargs = {('tag_' + key): value
+                       for key, value in expect_tags.iteritems()}
+        list_kwargs['instance-state-name'] = 'running'
+        cloud_node = testutil.cloud_node_mock()
+        driver = self.new_driver(list_kwargs=list_kwargs)
+        driver.post_create_node(cloud_node)
+        self.check_node_tagged(cloud_node, expect_tags)
 
     def test_sync_node(self):
         arv_node = testutil.arvados_node_mock(1)
         cloud_node = testutil.cloud_node_mock(2)
         driver = self.new_driver()
         driver.sync_node(cloud_node, arv_node)
-        tag_mock = self.driver_mock().ex_create_tags
-        self.assertTrue(tag_mock.called)
-        self.assertEqual('compute1.zzzzz.arvadosapi.com',
-                         tag_mock.call_args[0][1].get('Name', 'no name'))
+        self.check_node_tagged(cloud_node,
+                               {'Name': 'compute1.zzzzz.arvadosapi.com'})
 
     def test_node_create_time(self):
         refsecs = int(time.time())