From 4a35e06b098bdd44a24fcaf77921aea5f371c84b Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 4 Apr 2017 11:21:57 -0400 Subject: [PATCH] 11413: Fix issues with node manager on GCE: * Always override Node.size with CloudSizeWrapper * Get updated node record before setting metadata to minimize 'Supplied fingerprint does not match current metadata fingerprint.' error. * Use ex_set_node_metadata() instead of issuing request directly. --- .../arvnodeman/computenode/dispatch/__init__.py | 10 ++++++++-- .../arvnodeman/computenode/driver/gce.py | 12 ++++++------ services/nodemanager/arvnodeman/daemon.py | 2 +- .../tests/test_computenode_driver_gce.py | 13 +++++-------- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py index 71f9083c01..9ee26e336d 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py @@ -121,8 +121,14 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase): self.cloud_size.name) self.cloud_node = self._cloud.create_node(self.cloud_size, self.arvados_node) - if not self.cloud_node.size: - self.cloud_node.size = self.cloud_size + + # The information included in the node size object we get from libcloud + # is inconsistent between cloud providers. Replace libcloud NodeSize + # object with compatible CloudSizeWrapper object which merges the size + # info reported from the cloud with size information from the + # configuration file. + self.cloud_node.size = self.cloud_size + self._logger.info("Cloud node %s created.", self.cloud_node.id) self._later.update_arvados_node_properties() diff --git a/services/nodemanager/arvnodeman/computenode/driver/gce.py b/services/nodemanager/arvnodeman/computenode/driver/gce.py index 1c6d214fe8..79e43cb52a 100644 --- a/services/nodemanager/arvnodeman/computenode/driver/gce.py +++ b/services/nodemanager/arvnodeman/computenode/driver/gce.py @@ -136,6 +136,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver): raise def sync_node(self, cloud_node, arvados_node): + # Update the cloud node record to ensure we have the correct metadata + # fingerprint. + cloud_node = self.real.ex_get_node(cloud_node.name, cloud_node.extra['zone']) + # We can't store the FQDN on the name attribute or anything like it, # because (a) names are static throughout the node's life (so FQDN # isn't available because we don't know it at node creation time) and @@ -147,12 +151,8 @@ class ComputeNodeDriver(BaseComputeNodeDriver): self._find_metadata(metadata_items, 'hostname')['value'] = hostname except KeyError: metadata_items.append({'key': 'hostname', 'value': hostname}) - response = self.real.connection.async_request( - '/zones/{}/instances/{}/setMetadata'.format( - cloud_node.extra['zone'].name, cloud_node.name), - method='POST', data=metadata_req) - if not response.success(): - raise Exception("setMetadata error: {}".format(response.error)) + + self.real.ex_set_node_metadata(cloud_node, metadata_items) @classmethod def node_fqdn(cls, node): diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index f23b2615e2..2287724828 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -336,7 +336,7 @@ class NodeManagerDaemonActor(actor_class): elif (nodes_wanted < 0) and self.booting: self._later.stop_booting_node(size) except Exception as e: - self._logger.exception("while calculating nodes wanted for size %s", size) + self._logger.exception("while calculating nodes wanted for size %s", size.id) def _check_poll_freshness(orig_func): """Decorator to inhibit a method when poll information is stale. diff --git a/services/nodemanager/tests/test_computenode_driver_gce.py b/services/nodemanager/tests/test_computenode_driver_gce.py index 84e061d867..d47dbdfa03 100644 --- a/services/nodemanager/tests/test_computenode_driver_gce.py +++ b/services/nodemanager/tests/test_computenode_driver_gce.py @@ -123,16 +123,15 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase): cloud_node = testutil.cloud_node_mock( 2, metadata=start_metadata.copy(), zone=testutil.cloud_object_mock('testzone')) + self.driver_mock().ex_get_node.return_value = cloud_node driver = self.new_driver() driver.sync_node(cloud_node, arv_node) - args, kwargs = self.driver_mock().connection.async_request.call_args - self.assertEqual('/zones/testzone/instances/2/setMetadata', args[0]) - for key in ['kind', 'fingerprint']: - self.assertEqual(start_metadata[key], kwargs['data'][key]) + args, kwargs = self.driver_mock().ex_set_node_metadata.call_args + self.assertEqual(cloud_node, args[0]) plain_metadata['hostname'] = 'compute1.zzzzz.arvadosapi.com' self.assertEqual( plain_metadata, - {item['key']: item['value'] for item in kwargs['data']['items']}) + {item['key']: item['value'] for item in args[1]}) def test_sync_node_updates_hostname_tag(self): self.check_sync_node_updates_hostname_tag( @@ -145,9 +144,7 @@ class GCEComputeNodeDriverTestCase(testutil.DriverTestMixin, unittest.TestCase): arv_node = testutil.arvados_node_mock(8) cloud_node = testutil.cloud_node_mock( 9, metadata={}, zone=testutil.cloud_object_mock('failzone')) - mock_response = self.driver_mock().connection.async_request() - mock_response.success.return_value = False - mock_response.error = 'sync error test' + mock_response = self.driver_mock().ex_set_node_metadata.side_effect = (Exception('sync error test'),) driver = self.new_driver() with self.assertRaises(Exception) as err_check: driver.sync_node(cloud_node, arv_node) -- 2.30.2