From e9901a3f94eab93f9482b7cb3d7e2c1b50216f08 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 3 Feb 2016 17:51:46 -0500 Subject: [PATCH] Try to make logging identify the actor consistently --- .../nodemanager/arvnodeman/clientactor.py | 10 ++--- .../computenode/dispatch/__init__.py | 44 +++++++++++++------ .../arvnodeman/computenode/dispatch/slurm.py | 1 + services/nodemanager/arvnodeman/daemon.py | 4 +- services/nodemanager/arvnodeman/jobqueue.py | 3 +- services/nodemanager/arvnodeman/nodelist.py | 2 - 6 files changed, 40 insertions(+), 24 deletions(-) diff --git a/services/nodemanager/arvnodeman/clientactor.py b/services/nodemanager/arvnodeman/clientactor.py index 6319f4bbfc..5e5354a7da 100644 --- a/services/nodemanager/arvnodeman/clientactor.py +++ b/services/nodemanager/arvnodeman/clientactor.py @@ -38,10 +38,8 @@ class RemotePollLoopActor(actor_class): super(RemotePollLoopActor, self).__init__() self._client = client self._timer = timer_actor - self._logger = logging.getLogger(self.LOGGER_NAME) self._later = self.actor_ref.proxy() self._polling_started = False - self.log_prefix = "{} (at {})".format(self.__class__.__name__, id(self)) self.min_poll_wait = poll_wait self.max_poll_wait = max_poll_wait self.poll_wait = self.min_poll_wait @@ -50,6 +48,9 @@ class RemotePollLoopActor(actor_class): if hasattr(self, '_item_key'): self.subscribe_to = self._subscribe_to + def on_start(self): + self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:]))) + def _start_polling(self): if not self._polling_started: self._polling_started = True @@ -71,8 +72,7 @@ class RemotePollLoopActor(actor_class): raise NotImplementedError("subclasses must implement request method") def _got_response(self, response): - self._logger.debug("%s got response with %d items", - self.log_prefix, len(response)) + self._logger.debug("got response with %d items", len(response)) self.poll_wait = self.min_poll_wait _notify_subscribers(response, self.all_subscribers) if hasattr(self, '_item_key'): @@ -89,7 +89,7 @@ class RemotePollLoopActor(actor_class): return False def poll(self, scheduled_start=None): - self._logger.debug("%s sending poll", self.log_prefix) + self._logger.debug("sending request") start_time = time.time() if scheduled_start is None: scheduled_start = start_time diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py index 8c983c1ca4..45468c31d7 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py @@ -21,18 +21,24 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin): This base class takes care of retrying changes and notifying subscribers when the change is finished. """ - def __init__(self, logger_name, cloud_client, arvados_client, timer_actor, + def __init__(self, cloud_client, arvados_client, timer_actor, retry_wait, max_retry_wait): super(ComputeNodeStateChangeBase, self).__init__() + self._later = self.actor_ref.proxy() + self._arvados = arvados_client + self.subscribers = set() + + def _set_logger(self): + self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:])) + + def on_start(self): + self._set_logger() RetryMixin.__init__(self, retry_wait, max_retry_wait, - logging.getLogger(logger_name), + self._logger, cloud_client, timer_actor) - self._later = self.actor_ref.proxy() - self._arvados = arvados_client - self.subscribers = set() def _finished(self): _notify_subscribers(self._later, self.subscribers) @@ -74,7 +80,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase): cloud_size, arvados_node=None, retry_wait=1, max_retry_wait=180): super(ComputeNodeSetupActor, self).__init__( - 'arvnodeman.nodeup', cloud_client, arvados_client, timer_actor, + cloud_client, arvados_client, timer_actor, retry_wait, max_retry_wait) self.cloud_size = cloud_size self.arvados_node = None @@ -97,7 +103,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase): @RetryMixin._retry() def create_cloud_node(self): - self._logger.info("Creating cloud node with size %s.", + self._logger.info("Creating cloud node of size %s.", self.cloud_size.name) self.cloud_node = self._cloud.create_node(self.cloud_size, self.arvados_node) @@ -160,7 +166,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase): # eligible. Normal shutdowns based on job demand should be # cancellable; shutdowns based on node misbehavior should not. super(ComputeNodeShutdownActor, self).__init__( - 'arvnodeman.nodedown', cloud_client, arvados_client, timer_actor, + cloud_client, arvados_client, timer_actor, retry_wait, max_retry_wait) self._monitor = node_monitor.proxy() self.cloud_node = self._monitor.cloud_node.get() @@ -168,7 +174,11 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase): self.cancel_reason = None self.success = None + def _set_logger(self): + self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.id)) + def on_start(self): + super(ComputeNodeShutdownActor, self).on_start() self._later.shutdown_node() def _arvados_node(self): @@ -181,7 +191,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase): def cancel_shutdown(self, reason): self.cancel_reason = reason - self._logger.info("Cloud node %s shutdown cancelled: %s.", + self._logger.info("Shutdown cancelled: %s.", self.cloud_node.id, reason) self._finished(success_flag=False) @@ -199,13 +209,14 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase): @_stop_if_window_closed @RetryMixin._retry() def shutdown_node(self): + self._logger.info("Starting shutdown", self.cloud_node.id) if not self._cloud.destroy_node(self.cloud_node): if self._cloud.broken(self.cloud_node): self._later.cancel_shutdown(self.NODE_BROKEN) else: # Force a retry. raise cloud_types.LibcloudError("destroy_node failed") - self._logger.info("Cloud node %s shut down.", self.cloud_node.id) + self._logger.info("Shutdown success", self.cloud_node.id) arv_node = self._arvados_node() if arv_node is None: self._finished(success_flag=True) @@ -281,7 +292,6 @@ class ComputeNodeMonitorActor(config.actor_class): ): super(ComputeNodeMonitorActor, self).__init__() self._later = self.actor_ref.proxy() - self._logger = logging.getLogger('arvnodeman.computenode') self._last_log = None self._shutdowns = shutdown_timer self._cloud_node_fqdn = cloud_fqdn_func @@ -299,6 +309,12 @@ class ComputeNodeMonitorActor(config.actor_class): self.last_shutdown_opening = None self._later.consider_shutdown() + def _set_logger(self): + self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.name)) + + def on_start(self): + self._set_logger() + def subscribe(self, subscriber): self.subscribers.add(subscriber) @@ -346,13 +362,13 @@ class ComputeNodeMonitorActor(config.actor_class): def consider_shutdown(self): next_opening = self._shutdowns.next_opening() if self.shutdown_eligible(): - self._debug("Node %s suggesting shutdown.", self.cloud_node.id) + self._debug("Suggesting shutdown.", self.cloud_node.id) _notify_subscribers(self._later, self.subscribers) elif self._shutdowns.window_open(): - self._debug("Node %s shutdown window open but node busy.", + self._debug("Shutdown window open but node busy.", self.cloud_node.id) elif self.last_shutdown_opening != next_opening: - self._debug("Node %s shutdown window closed. Next at %s.", + self._debug("Shutdown window closed. Next at %s.", self.cloud_node.id, time.ctime(next_opening)) self._timer.schedule(next_opening, self._later.consider_shutdown) self.last_shutdown_opening = next_opening diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py index 43f61c7c2c..4d70436801 100644 --- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py +++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py @@ -22,6 +22,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase): self._nodename = None return super(ComputeNodeShutdownActor, self).on_start() else: + self._set_logger() self._nodename = arv_node['hostname'] self._logger.info("Draining SLURM node %s", self._nodename) self._later.issue_slurm_drain() diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py index 243d3bfaa4..42ee4db187 100644 --- a/services/nodemanager/arvnodeman/daemon.py +++ b/services/nodemanager/arvnodeman/daemon.py @@ -121,7 +121,6 @@ class NodeManagerDaemonActor(actor_class): self._new_arvados = arvados_factory self._new_cloud = cloud_factory self._cloud_driver = self._new_cloud() - self._logger = logging.getLogger('arvnodeman.daemon') self._later = self.actor_ref.proxy() self.shutdown_windows = shutdown_windows self.server_calculator = server_calculator @@ -145,6 +144,9 @@ class NodeManagerDaemonActor(actor_class): self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors self._logger.debug("Daemon initialized") + def on_start(self): + self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:]))) + def _update_poll_time(self, poll_key): self.last_polls[poll_key] = time.time() diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py index e0f0a5b1ec..87cf738311 100644 --- a/services/nodemanager/arvnodeman/jobqueue.py +++ b/services/nodemanager/arvnodeman/jobqueue.py @@ -102,7 +102,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor): """ CLIENT_ERRORS = ARVADOS_ERRORS - LOGGER_NAME = 'arvnodeman.jobqueue' def __init__(self, client, timer_actor, server_calc, *args, **kwargs): super(JobQueueMonitorActor, self).__init__( @@ -114,6 +113,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor): def _got_response(self, queue): server_list = self._calculator.servers_for_queue(queue) - self._logger.debug("Sending server wishlist: %s", + self._logger.debug("Calculated wishlist: %s", ', '.join(s.name for s in server_list) or "(empty)") return super(JobQueueMonitorActor, self)._got_response(server_list) diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py index 83dd93f077..26f49a34ec 100644 --- a/services/nodemanager/arvnodeman/nodelist.py +++ b/services/nodemanager/arvnodeman/nodelist.py @@ -11,7 +11,6 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor): This actor regularly polls the list of Arvados node records, and sends it to subscribers. """ - LOGGER_NAME = 'arvnodeman.arvados_nodes' def is_common_error(self, exception): return isinstance(exception, config.ARVADOS_ERRORS) @@ -29,7 +28,6 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor): This actor regularly polls the cloud to get a list of running compute nodes, and sends it to subscribers. """ - LOGGER_NAME = 'arvnodeman.cloud_nodes' def is_common_error(self, exception): return self._client.is_cloud_exception(exception) -- 2.39.5