From 212b58706f2dd0f5fc925cf058e384ca66149d75 Mon Sep 17 00:00:00 2001 From: Brett Smith Date: Mon, 2 Mar 2015 11:27:59 -0500 Subject: [PATCH] 4751: Node Manager considers ping times for stricter node pairing. Because the pairing decision is currently based on IP address alone, Node Manager will occasionally pair a cloud node with the wrong Arvados node after an IP address is reused. Fix that by bringing the node's first_ping_at into consideration: if it's older than the cloud node, refuse to pair. --- .../arvnodeman/computenode/dispatch/__init__.py | 9 ++++++--- services/nodemanager/tests/test_computenode_dispatch.py | 7 +++++++ services/nodemanager/tests/test_daemon.py | 5 ++++- services/nodemanager/tests/testutil.py | 7 +++++-- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py index 1608b529fb..310e7887d5 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py @@ -9,7 +9,8 @@ import time import libcloud.common.types as cloud_types import pykka -from .. import arvados_node_fqdn, arvados_node_mtime, timestamp_fresh +from .. import \ + arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh from ...clientactor import _notify_subscribers from ... import config @@ -322,9 +323,11 @@ class ComputeNodeMonitorActor(config.actor_class): self.last_shutdown_opening = next_opening def offer_arvados_pair(self, arvados_node): - if self.arvados_node is not None: + first_ping_s = arvados_node.get('first_ping_at') + if (self.arvados_node is not None) or (not first_ping_s): return None - elif arvados_node['ip_address'] in self.cloud_node.private_ips: + elif ((arvados_node['ip_address'] in self.cloud_node.private_ips) and + (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)): self._later.update_arvados_node(arvados_node) return self.cloud_node.id else: diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py index 4a72f47884..8d69ea958c 100644 --- a/services/nodemanager/tests/test_computenode_dispatch.py +++ b/services/nodemanager/tests/test_computenode_dispatch.py @@ -319,6 +319,13 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.assertIsNone( self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)) + def test_arvados_node_mismatch_first_ping_too_early(self): + self.make_actor(4) + arv_node = testutil.arvados_node_mock( + 4, first_ping_at='1971-03-02T14:15:16.1717282Z') + self.assertIsNone( + self.node_actor.offer_arvados_pair(arv_node).get(self.TIMEOUT)) + def test_update_cloud_node(self): self.make_actor(1) self.make_mocks(2) diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py index bdba83ade1..dc8fdc3f84 100644 --- a/services/nodemanager/tests/test_daemon.py +++ b/services/nodemanager/tests/test_daemon.py @@ -94,8 +94,11 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin, self.check_monitors_arvados_nodes(arv_node) def test_arvados_node_un_and_re_paired(self): + # We need to create the Arvados node mock after spinning up the daemon + # to make sure it's new enough to pair with the cloud node. + self.make_daemon([testutil.cloud_node_mock(3)], arvados_nodes=None) arv_node = testutil.arvados_node_mock(3) - self.make_daemon([testutil.cloud_node_mock(3)], [arv_node]) + self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT) self.check_monitors_arvados_nodes(arv_node) self.daemon.update_cloud_nodes([]).get(self.TIMEOUT) self.assertEqual(0, self.alive_monitor_count()) diff --git a/services/nodemanager/tests/testutil.py b/services/nodemanager/tests/testutil.py index 1c53c68489..f0508e748c 100644 --- a/services/nodemanager/tests/testutil.py +++ b/services/nodemanager/tests/testutil.py @@ -13,14 +13,17 @@ from . import pykka_timeout no_sleep = mock.patch('time.sleep', lambda n: None) -def arvados_node_mock(node_num=99, job_uuid=None, age=0, **kwargs): +def arvados_node_mock(node_num=99, job_uuid=None, age=-1, **kwargs): mod_time = datetime.datetime.utcnow() - datetime.timedelta(seconds=age) + mod_time_s = mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ') if job_uuid is True: job_uuid = 'zzzzz-jjjjj-jobjobjobjobjob' crunch_worker_state = 'idle' if (job_uuid is None) else 'busy' node = {'uuid': 'zzzzz-yyyyy-{:015x}'.format(node_num), 'created_at': '2014-01-01T01:02:03.04050607Z', - 'modified_at': mod_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), + 'modified_at': mod_time_s, + 'first_ping_at': kwargs.pop('first_ping_at', mod_time_s), + 'last_ping_at': mod_time_s, 'slot_number': node_num, 'hostname': 'compute{}'.format(node_num), 'domain': 'zzzzz.arvadosapi.com', -- 2.30.2