Try to make logging identify the actor consistently
authorPeter Amstutz <peter.amstutz@curoverse.com>
Wed, 3 Feb 2016 22:51:46 +0000 (17:51 -0500)
committerPeter Amstutz <peter.amstutz@curoverse.com>
Wed, 3 Feb 2016 22:51:46 +0000 (17:51 -0500)
services/nodemanager/arvnodeman/clientactor.py
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
services/nodemanager/arvnodeman/daemon.py
services/nodemanager/arvnodeman/jobqueue.py
services/nodemanager/arvnodeman/nodelist.py

index 6319f4bbfc5cece52832842cfda05d9ae9253005..5e5354a7da427ceb9cfab38b5d65b7f6c120bff5 100644 (file)
@@ -38,10 +38,8 @@ class RemotePollLoopActor(actor_class):
         super(RemotePollLoopActor, self).__init__()
         self._client = client
         self._timer = timer_actor
         super(RemotePollLoopActor, self).__init__()
         self._client = client
         self._timer = timer_actor
-        self._logger = logging.getLogger(self.LOGGER_NAME)
         self._later = self.actor_ref.proxy()
         self._polling_started = False
         self._later = self.actor_ref.proxy()
         self._polling_started = False
-        self.log_prefix = "{} (at {})".format(self.__class__.__name__, id(self))
         self.min_poll_wait = poll_wait
         self.max_poll_wait = max_poll_wait
         self.poll_wait = self.min_poll_wait
         self.min_poll_wait = poll_wait
         self.max_poll_wait = max_poll_wait
         self.poll_wait = self.min_poll_wait
@@ -50,6 +48,9 @@ class RemotePollLoopActor(actor_class):
         if hasattr(self, '_item_key'):
             self.subscribe_to = self._subscribe_to
 
         if hasattr(self, '_item_key'):
             self.subscribe_to = self._subscribe_to
 
+    def on_start(self):
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+
     def _start_polling(self):
         if not self._polling_started:
             self._polling_started = True
     def _start_polling(self):
         if not self._polling_started:
             self._polling_started = True
@@ -71,8 +72,7 @@ class RemotePollLoopActor(actor_class):
         raise NotImplementedError("subclasses must implement request method")
 
     def _got_response(self, response):
         raise NotImplementedError("subclasses must implement request method")
 
     def _got_response(self, response):
-        self._logger.debug("%s got response with %d items",
-                           self.log_prefix, len(response))
+        self._logger.debug("got response with %d items", len(response))
         self.poll_wait = self.min_poll_wait
         _notify_subscribers(response, self.all_subscribers)
         if hasattr(self, '_item_key'):
         self.poll_wait = self.min_poll_wait
         _notify_subscribers(response, self.all_subscribers)
         if hasattr(self, '_item_key'):
@@ -89,7 +89,7 @@ class RemotePollLoopActor(actor_class):
         return False
 
     def poll(self, scheduled_start=None):
         return False
 
     def poll(self, scheduled_start=None):
-        self._logger.debug("%s sending poll", self.log_prefix)
+        self._logger.debug("sending request")
         start_time = time.time()
         if scheduled_start is None:
             scheduled_start = start_time
         start_time = time.time()
         if scheduled_start is None:
             scheduled_start = start_time
index 8c983c1ca4042726881ce0e9019da0e9c2df9b60..45468c31d79c54f34a5c4c5c60c94c1ba52d89b2 100644 (file)
@@ -21,18 +21,24 @@ class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
     This base class takes care of retrying changes and notifying
     subscribers when the change is finished.
     """
     This base class takes care of retrying changes and notifying
     subscribers when the change is finished.
     """
-    def __init__(self, logger_name, cloud_client, arvados_client, timer_actor,
+    def __init__(self, cloud_client, arvados_client, timer_actor,
                  retry_wait, max_retry_wait):
         super(ComputeNodeStateChangeBase, self).__init__()
                  retry_wait, max_retry_wait):
         super(ComputeNodeStateChangeBase, self).__init__()
+        self._later = self.actor_ref.proxy()
+        self._arvados = arvados_client
+        self.subscribers = set()
+
+    def _set_logger(self):
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))
+
+    def on_start(self):
+        self._set_logger()
         RetryMixin.__init__(self,
                             retry_wait,
                             max_retry_wait,
         RetryMixin.__init__(self,
                             retry_wait,
                             max_retry_wait,
-                            logging.getLogger(logger_name),
+                            self._logger,
                             cloud_client,
                             timer_actor)
                             cloud_client,
                             timer_actor)
-        self._later = self.actor_ref.proxy()
-        self._arvados = arvados_client
-        self.subscribers = set()
 
     def _finished(self):
         _notify_subscribers(self._later, self.subscribers)
 
     def _finished(self):
         _notify_subscribers(self._later, self.subscribers)
@@ -74,7 +80,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
                  cloud_size, arvados_node=None,
                  retry_wait=1, max_retry_wait=180):
         super(ComputeNodeSetupActor, self).__init__(
                  cloud_size, arvados_node=None,
                  retry_wait=1, max_retry_wait=180):
         super(ComputeNodeSetupActor, self).__init__(
-            'arvnodeman.nodeup', cloud_client, arvados_client, timer_actor,
+            cloud_client, arvados_client, timer_actor,
             retry_wait, max_retry_wait)
         self.cloud_size = cloud_size
         self.arvados_node = None
             retry_wait, max_retry_wait)
         self.cloud_size = cloud_size
         self.arvados_node = None
@@ -97,7 +103,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
 
     @RetryMixin._retry()
     def create_cloud_node(self):
 
     @RetryMixin._retry()
     def create_cloud_node(self):
-        self._logger.info("Creating cloud node with size %s.",
+        self._logger.info("Creating cloud node of size %s.",
                           self.cloud_size.name)
         self.cloud_node = self._cloud.create_node(self.cloud_size,
                                                   self.arvados_node)
                           self.cloud_size.name)
         self.cloud_node = self._cloud.create_node(self.cloud_size,
                                                   self.arvados_node)
@@ -160,7 +166,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         # eligible.  Normal shutdowns based on job demand should be
         # cancellable; shutdowns based on node misbehavior should not.
         super(ComputeNodeShutdownActor, self).__init__(
         # eligible.  Normal shutdowns based on job demand should be
         # cancellable; shutdowns based on node misbehavior should not.
         super(ComputeNodeShutdownActor, self).__init__(
-            'arvnodeman.nodedown', cloud_client, arvados_client, timer_actor,
+            cloud_client, arvados_client, timer_actor,
             retry_wait, max_retry_wait)
         self._monitor = node_monitor.proxy()
         self.cloud_node = self._monitor.cloud_node.get()
             retry_wait, max_retry_wait)
         self._monitor = node_monitor.proxy()
         self.cloud_node = self._monitor.cloud_node.get()
@@ -168,7 +174,11 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
         self.cancel_reason = None
         self.success = None
 
         self.cancel_reason = None
         self.success = None
 
+    def _set_logger(self):
+        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.id))
+
     def on_start(self):
     def on_start(self):
+        super(ComputeNodeShutdownActor, self).on_start()
         self._later.shutdown_node()
 
     def _arvados_node(self):
         self._later.shutdown_node()
 
     def _arvados_node(self):
@@ -181,7 +191,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
 
     def cancel_shutdown(self, reason):
         self.cancel_reason = reason
 
     def cancel_shutdown(self, reason):
         self.cancel_reason = reason
-        self._logger.info("Cloud node %s shutdown cancelled: %s.",
+        self._logger.info("Shutdown cancelled: %s.",
                           self.cloud_node.id, reason)
         self._finished(success_flag=False)
 
                           self.cloud_node.id, reason)
         self._finished(success_flag=False)
 
@@ -199,13 +209,14 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
     @_stop_if_window_closed
     @RetryMixin._retry()
     def shutdown_node(self):
     @_stop_if_window_closed
     @RetryMixin._retry()
     def shutdown_node(self):
+        self._logger.info("Starting shutdown", self.cloud_node.id)
         if not self._cloud.destroy_node(self.cloud_node):
             if self._cloud.broken(self.cloud_node):
                 self._later.cancel_shutdown(self.NODE_BROKEN)
             else:
                 # Force a retry.
                 raise cloud_types.LibcloudError("destroy_node failed")
         if not self._cloud.destroy_node(self.cloud_node):
             if self._cloud.broken(self.cloud_node):
                 self._later.cancel_shutdown(self.NODE_BROKEN)
             else:
                 # Force a retry.
                 raise cloud_types.LibcloudError("destroy_node failed")
-        self._logger.info("Cloud node %s shut down.", self.cloud_node.id)
+        self._logger.info("Shutdown success", self.cloud_node.id)
         arv_node = self._arvados_node()
         if arv_node is None:
             self._finished(success_flag=True)
         arv_node = self._arvados_node()
         if arv_node is None:
             self._finished(success_flag=True)
@@ -281,7 +292,6 @@ class ComputeNodeMonitorActor(config.actor_class):
     ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.proxy()
     ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.proxy()
-        self._logger = logging.getLogger('arvnodeman.computenode')
         self._last_log = None
         self._shutdowns = shutdown_timer
         self._cloud_node_fqdn = cloud_fqdn_func
         self._last_log = None
         self._shutdowns = shutdown_timer
         self._cloud_node_fqdn = cloud_fqdn_func
@@ -299,6 +309,12 @@ class ComputeNodeMonitorActor(config.actor_class):
         self.last_shutdown_opening = None
         self._later.consider_shutdown()
 
         self.last_shutdown_opening = None
         self._later.consider_shutdown()
 
+    def _set_logger(self):
+        self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.name))
+
+    def on_start(self):
+        self._set_logger()
+
     def subscribe(self, subscriber):
         self.subscribers.add(subscriber)
 
     def subscribe(self, subscriber):
         self.subscribers.add(subscriber)
 
@@ -346,13 +362,13 @@ class ComputeNodeMonitorActor(config.actor_class):
     def consider_shutdown(self):
         next_opening = self._shutdowns.next_opening()
         if self.shutdown_eligible():
     def consider_shutdown(self):
         next_opening = self._shutdowns.next_opening()
         if self.shutdown_eligible():
-            self._debug("Node %s suggesting shutdown.", self.cloud_node.id)
+            self._debug("Suggesting shutdown.", self.cloud_node.id)
             _notify_subscribers(self._later, self.subscribers)
         elif self._shutdowns.window_open():
             _notify_subscribers(self._later, self.subscribers)
         elif self._shutdowns.window_open():
-            self._debug("Node %s shutdown window open but node busy.",
+            self._debug("Shutdown window open but node busy.",
                         self.cloud_node.id)
         elif self.last_shutdown_opening != next_opening:
                         self.cloud_node.id)
         elif self.last_shutdown_opening != next_opening:
-            self._debug("Node %s shutdown window closed.  Next at %s.",
+            self._debug("Shutdown window closed.  Next at %s.",
                         self.cloud_node.id, time.ctime(next_opening))
             self._timer.schedule(next_opening, self._later.consider_shutdown)
             self.last_shutdown_opening = next_opening
                         self.cloud_node.id, time.ctime(next_opening))
             self._timer.schedule(next_opening, self._later.consider_shutdown)
             self.last_shutdown_opening = next_opening
index 43f61c7c2c38d8ca95aa7614a8782a77672a816b..4d70436801564e9a35675e95c18f33fddc125806 100644 (file)
@@ -22,6 +22,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
             self._nodename = None
             return super(ComputeNodeShutdownActor, self).on_start()
         else:
             self._nodename = None
             return super(ComputeNodeShutdownActor, self).on_start()
         else:
+            self._set_logger()
             self._nodename = arv_node['hostname']
             self._logger.info("Draining SLURM node %s", self._nodename)
             self._later.issue_slurm_drain()
             self._nodename = arv_node['hostname']
             self._logger.info("Draining SLURM node %s", self._nodename)
             self._later.issue_slurm_drain()
index 243d3bfaa4cd13fbf9a540affc20ae6e562979a9..42ee4db1871317088dbf2eeedf5b74b2174dcb2f 100644 (file)
@@ -121,7 +121,6 @@ class NodeManagerDaemonActor(actor_class):
         self._new_arvados = arvados_factory
         self._new_cloud = cloud_factory
         self._cloud_driver = self._new_cloud()
         self._new_arvados = arvados_factory
         self._new_cloud = cloud_factory
         self._cloud_driver = self._new_cloud()
-        self._logger = logging.getLogger('arvnodeman.daemon')
         self._later = self.actor_ref.proxy()
         self.shutdown_windows = shutdown_windows
         self.server_calculator = server_calculator
         self._later = self.actor_ref.proxy()
         self.shutdown_windows = shutdown_windows
         self.server_calculator = server_calculator
@@ -145,6 +144,9 @@ class NodeManagerDaemonActor(actor_class):
         self.shutdowns = {}     # Cloud node IDs to ComputeNodeShutdownActors
         self._logger.debug("Daemon initialized")
 
         self.shutdowns = {}     # Cloud node IDs to ComputeNodeShutdownActors
         self._logger.debug("Daemon initialized")
 
+    def on_start(self):
+        self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+
     def _update_poll_time(self, poll_key):
         self.last_polls[poll_key] = time.time()
 
     def _update_poll_time(self, poll_key):
         self.last_polls[poll_key] = time.time()
 
index e0f0a5b1ec26045745fe479f76b8614eb700875d..87cf738311730feed045d23c63af6481c0731e06 100644 (file)
@@ -102,7 +102,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
     """
 
     CLIENT_ERRORS = ARVADOS_ERRORS
     """
 
     CLIENT_ERRORS = ARVADOS_ERRORS
-    LOGGER_NAME = 'arvnodeman.jobqueue'
 
     def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
         super(JobQueueMonitorActor, self).__init__(
 
     def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
         super(JobQueueMonitorActor, self).__init__(
@@ -114,6 +113,6 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
 
     def _got_response(self, queue):
         server_list = self._calculator.servers_for_queue(queue)
 
     def _got_response(self, queue):
         server_list = self._calculator.servers_for_queue(queue)
-        self._logger.debug("Sending server wishlist: %s",
+        self._logger.debug("Calculated wishlist: %s",
                            ', '.join(s.name for s in server_list) or "(empty)")
         return super(JobQueueMonitorActor, self)._got_response(server_list)
                            ', '.join(s.name for s in server_list) or "(empty)")
         return super(JobQueueMonitorActor, self)._got_response(server_list)
index 83dd93f077bfb504a8f41b6f0addd93c717ac7da..26f49a34eca6937260d2721083971c1cb5a7fc84 100644 (file)
@@ -11,7 +11,6 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
     This actor regularly polls the list of Arvados node records, and
     sends it to subscribers.
     """
     This actor regularly polls the list of Arvados node records, and
     sends it to subscribers.
     """
-    LOGGER_NAME = 'arvnodeman.arvados_nodes'
 
     def is_common_error(self, exception):
         return isinstance(exception, config.ARVADOS_ERRORS)
 
     def is_common_error(self, exception):
         return isinstance(exception, config.ARVADOS_ERRORS)
@@ -29,7 +28,6 @@ class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
     This actor regularly polls the cloud to get a list of running compute
     nodes, and sends it to subscribers.
     """
     This actor regularly polls the cloud to get a list of running compute
     nodes, and sends it to subscribers.
     """
-    LOGGER_NAME = 'arvnodeman.cloud_nodes'
 
     def is_common_error(self, exception):
         return self._client.is_cloud_exception(exception)
 
     def is_common_error(self, exception):
         return self._client.is_cloud_exception(exception)