From 0d6070d3b62fc367dbb02e8ead5fe25dce012156 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 5 Feb 2016 11:10:43 -0500 Subject: [PATCH] 7667: Combine polling logs into fewer lines for less noise. Adjust message when last_ping_at is unexpectedly none to be less severe (can happen in innocent circumstances). Report nodes in "booted" list as "booting" since they are unpaired. Fix tests. --- .../nodemanager/arvnodeman/clientactor.py | 5 +- .../computenode/dispatch/__init__.py | 6 +- services/nodemanager/arvnodeman/daemon.py | 60 +++++++++++-------- .../nodemanager/tests/test_clientactor.py | 4 +- .../tests/test_computenode_dispatch.py | 14 ++--- 5 files changed, 49 insertions(+), 40 deletions(-) diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py index e5534c53f7..9a9ce588d3 100644 --- a/services/nodemanager/arvnodeman/clientactor.py +++ b/services/nodemanager/arvnodeman/clientactor.py @@ -72,7 +72,6 @@ class RemotePollLoopActor(actor_class): raise NotImplementedError("subclasses must implement request method") def _got_response(self, response): - self._logger.debug("got response with %d items", len(response)) self.poll_wait = self.min_poll_wait _notify_subscribers(response, self.all_subscribers) if hasattr(self, '_item_key'): @@ -105,7 +104,9 @@ class RemotePollLoopActor(actor_class): else: self._got_response(response) next_poll = scheduled_start + self.poll_wait - self._logger.info("request took %s seconds", (time.time() - scheduled_start)) + self._logger.info("got response with %d items in %s seconds, next poll at %s", + len(response), (time.time() - scheduled_start), + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_poll))) end_time = time.time() if next_poll < end_time: # We've drifted too much; start fresh. next_poll = end_time + self.poll_wait diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py index 17d04638a6..b6c05979e7 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py @@ -374,9 +374,9 @@ class ComputeNodeMonitorActor(config.actor_class): # is in an error state, so shut it down. return True if missing is None and self._cloud.broken(self.cloud_node): - self._logger.warning( - "cloud reports broken node, but paired node %s never pinged " - "(bug?) -- skipped check for node_stale_after", + self._logger.info( + "Cloud node considered 'broken' but paired node %s last_ping_at is None, " + + "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).", self.arvados_node['uuid']) if self.in_state('idle'): return True diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index 5616376c62..47e77d1344 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -151,8 +151,8 @@ class NodeManagerDaemonActor(actor_class): self.last_polls[poll_key] = time.time() def _pair_nodes(self, node_record, arvados_node): - self._logger.info("Cloud node %s has associated with Arvados node %s", - node_record.cloud_node.id, arvados_node['uuid']) + self._logger.info("Cloud node %s is now paired with Arvados node %s", + node_record.cloud_node.name, arvados_node['uuid']) self._arvados_nodes_actor.subscribe_to( arvados_node['uuid'], node_record.actor.update_arvados_node) node_record.arvados_node = arvados_node @@ -217,14 +217,22 @@ class NodeManagerDaemonActor(actor_class): break def _nodes_booting(self, size): + s = sum(1 + for c in self.booting.itervalues() + if size is None or c.cloud_size.get().id == size.id) + s += sum(1 + for c in self.booted.itervalues() + if size is None or c.cloud_node.size.id == size.id) + return s + + def _nodes_unpaired(self, size): return sum(1 - for c in self.booting.itervalues() - if size is None or c.cloud_size.get().id == size.id) + for c in self.cloud_nodes.unpaired() + if size is None or c.cloud_node.size.id == size.id) def _nodes_booted(self, size): return sum(1 - for i in (self.booted, self.cloud_nodes.nodes) - for c in i.itervalues() + for c in self.cloud_nodes.nodes.itervalues() if size is None or c.cloud_node.size.id == size.id) def _nodes_up(self, size): @@ -278,21 +286,18 @@ class NodeManagerDaemonActor(actor_class): elif under_min > 0 and size.id == self.min_cloud_size.id: return under_min - try: - booting_count = self._nodes_booting(size) + sum(1 for _ in self.cloud_nodes.unpaired()) - shutdown_count = self._size_shutdowns(size) - busy_count = self._nodes_busy(size) - up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size)) - - self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name, - self._size_wishlist(size), - up_count + busy_count, - booting_count, - up_count - booting_count, - busy_count, - shutdown_count) - except Exception as e: - self._logger.exception("whoops") + booting_count = self._nodes_booting(size) + self._nodes_unpaired(size) + shutdown_count = self._size_shutdowns(size) + busy_count = self._nodes_busy(size) + up_count = self._nodes_up(size) - (shutdown_count + busy_count + self._nodes_missing(size)) + + self._logger.info("%s: wishlist %i, up %i (booting %i, idle %i, busy %i), shutting down %i", size.name, + self._size_wishlist(size), + up_count + busy_count, + booting_count, + up_count - booting_count, + busy_count, + shutdown_count) wanted = self._size_wishlist(size) - up_count if wanted > 0 and self.max_total_price and ((total_price + (size.price*wanted)) > self.max_total_price): @@ -314,11 +319,14 @@ class NodeManagerDaemonActor(actor_class): self._update_poll_time('server_wishlist') self.last_wishlist = wishlist for size in reversed(self.server_calculator.cloud_sizes): - nodes_wanted = self._nodes_wanted(size) - if nodes_wanted > 0: - self._later.start_node(size) - elif (nodes_wanted < 0) and self.booting: - self._later.stop_booting_node(size) + try: + nodes_wanted = self._nodes_wanted(size) + if nodes_wanted > 0: + self._later.start_node(size) + elif (nodes_wanted < 0) and self.booting: + self._later.stop_booting_node(size) + except Exception as e: + self._logger.exception("while calculating nodes wanted for size %s", size) def _check_poll_freshness(orig_func): """Decorator to inhibit a method when poll information is stale. diff --git a/services/nodemanager/tests/test_clientactor.py b/services/nodemanager/tests/test_clientactor.py index 57a0d32d06..cee9c85a22 100644 --- a/services/nodemanager/tests/test_clientactor.py +++ b/services/nodemanager/tests/test_clientactor.py @@ -47,7 +47,8 @@ class RemotePollLoopActorTestCase(testutil.RemotePollLoopActorTestMixin, def test_late_subscribers_get_responses(self): self.build_monitor(['pre_late_test', 'late_test']) - self.monitor.subscribe(lambda response: None).get(self.TIMEOUT) + mock_subscriber = mock.Mock(name='mock_subscriber') + self.monitor.subscribe(mock_subscriber).get(self.TIMEOUT) self.monitor.subscribe(self.subscriber) self.monitor.poll().get(self.TIMEOUT) self.stop_proxy(self.monitor) @@ -146,4 +147,3 @@ class RemotePollLoopActorWithKeysTestCase(testutil.RemotePollLoopActorTestMixin, if __name__ == '__main__': unittest.main() - diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py index ecf83c693a..040d8d5768 100644 --- a/services/nodemanager/tests/test_computenode_dispatch.py +++ b/services/nodemanager/tests/test_computenode_dispatch.py @@ -339,7 +339,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, def test_no_shutdown_booting(self): self.make_actor() self.shutdowns._set_state(True, 600) - self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting")) def test_shutdown_without_arvados_node(self): self.make_actor(start_time=0) @@ -352,7 +352,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, last_ping_at='1970-01-01T01:02:03.04050607Z') self.make_actor(10, arv_node) self.shutdowns._set_state(True, 600) - self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle.")) def test_no_shutdown_running_broken(self): arv_node = testutil.arvados_node_mock(12, job_uuid=None, @@ -360,7 +360,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, self.make_actor(12, arv_node) self.shutdowns._set_state(True, 600) self.cloud_client.broken.return_value = True - self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle.")) def test_shutdown_missing_broken(self): arv_node = testutil.arvados_node_mock(11, job_uuid=None, @@ -373,23 +373,23 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin, def test_no_shutdown_when_window_closed(self): self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None)) - self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open.")) def test_no_shutdown_when_node_running_job(self): self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True)) self.shutdowns._set_state(True, 600) - self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle.")) def test_no_shutdown_when_node_state_unknown(self): self.make_actor(5, testutil.arvados_node_mock( 5, crunch_worker_state=None)) self.shutdowns._set_state(True, 600) - self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle.")) def test_no_shutdown_when_node_state_stale(self): self.make_actor(6, testutil.arvados_node_mock(6, age=90000)) self.shutdowns._set_state(True, 600) - self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT)) + self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle.")) def test_arvados_node_match(self): self.make_actor(2) -- 2.30.2