projects
/
arvados.git
/ commitdiff
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
73812c5
)
Try to make logging identify the actor consistently
author
Peter Amstutz <peter.amstutz@curoverse.com>
Wed, 3 Feb 2016 22:51:46 +0000
(17:51 -0500)
committer
Peter Amstutz <peter.amstutz@curoverse.com>
Wed, 3 Feb 2016 22:51:46 +0000
(17:51 -0500)
services/nodemanager/arvnodeman/clientactor.py
patch
|
blob
|
history
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
patch
|
blob
|
history
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
patch
|
blob
|
history
services/nodemanager/arvnodeman/daemon.py
patch
|
blob
|
history
services/nodemanager/arvnodeman/jobqueue.py
patch
|
blob
|
history
services/nodemanager/arvnodeman/nodelist.py
patch
|
blob
|
history
diff --git
a/services/nodemanager/arvnodeman/clientactor.py
b/services/nodemanager/arvnodeman/clientactor.py
index 6319f4bbfc5cece52832842cfda05d9ae9253005..5e5354a7da427ceb9cfab38b5d65b7f6c120bff5 100644
(file)
--- a/
services/nodemanager/arvnodeman/clientactor.py
+++ b/
services/nodemanager/arvnodeman/clientactor.py
@@
-38,10
+38,8
@@
class RemotePollLoopActor(actor_class):
super(RemotePollLoopActor, self).__init__()
self._client = client
self._timer = timer_actor
super(RemotePollLoopActor, self).__init__()
self._client = client
self._timer = timer_actor
- self._logger = logging.getLogger(self.LOGGER_NAME)
self._later = self.actor_ref.proxy()
self._polling_started = False
self._later = self.actor_ref.proxy()
self._polling_started = False
- self.log_prefix = "{} (at {})".format(self.__class__.__name__, id(self))
self.min_poll_wait = poll_wait
self.max_poll_wait = max_poll_wait
self.poll_wait = self.min_poll_wait
self.min_poll_wait = poll_wait
self.max_poll_wait = max_poll_wait
self.poll_wait = self.min_poll_wait
@@
-50,6
+48,9
@@
class RemotePollLoopActor(actor_class):
if hasattr(self, '_item_key'):
self.subscribe_to = self._subscribe_to
if hasattr(self, '_item_key'):
self.subscribe_to = self._subscribe_to
+ def on_start(self):
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+
def _start_polling(self):
if not self._polling_started:
self._polling_started = True
def _start_polling(self):
if not self._polling_started:
self._polling_started = True
@@
-71,8
+72,7
@@
class RemotePollLoopActor(actor_class):
raise NotImplementedError("subclasses must implement request method")
def _got_response(self, response):
raise NotImplementedError("subclasses must implement request method")
def _got_response(self, response):
- self._logger.debug("%s got response with %d items",
- self.log_prefix, len(response))
+ self._logger.debug("got response with %d items", len(response))
self.poll_wait = self.min_poll_wait
_notify_subscribers(response, self.all_subscribers)
if hasattr(self, '_item_key'):
self.poll_wait = self.min_poll_wait
_notify_subscribers(response, self.all_subscribers)
if hasattr(self, '_item_key'):
@@
-89,7
+89,7
@@
class RemotePollLoopActor(actor_class):
return False
def poll(self, scheduled_start=None):
return False
def poll(self, scheduled_start=None):
- self._logger.debug("
%s sending poll", self.log_prefix
)
+ self._logger.debug("
sending request"
)
start_time = time.time()
if scheduled_start is None:
scheduled_start = start_time
start_time = time.time()
if scheduled_start is None:
scheduled_start = start_time
diff --git
a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 8c983c1ca4042726881ce0e9019da0e9c2df9b60..45468c31d79c54f34a5c4c5c60c94c1ba52d89b2 100644
(file)
--- a/
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@
-21,18
+21,24
@@
class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
This base class takes care of retrying changes and notifying
subscribers when the change is finished.
"""
This base class takes care of retrying changes and notifying
subscribers when the change is finished.
"""
- def __init__(self,
logger_name,
cloud_client, arvados_client, timer_actor,
+ def __init__(self, cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait):
super(ComputeNodeStateChangeBase, self).__init__()
retry_wait, max_retry_wait):
super(ComputeNodeStateChangeBase, self).__init__()
+ self._later = self.actor_ref.proxy()
+ self._arvados = arvados_client
+ self.subscribers = set()
+
+ def _set_logger(self):
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[9:]))
+
+ def on_start(self):
+ self._set_logger()
RetryMixin.__init__(self,
retry_wait,
max_retry_wait,
RetryMixin.__init__(self,
retry_wait,
max_retry_wait,
-
logging.getLogger(logger_name)
,
+
self._logger
,
cloud_client,
timer_actor)
cloud_client,
timer_actor)
- self._later = self.actor_ref.proxy()
- self._arvados = arvados_client
- self.subscribers = set()
def _finished(self):
_notify_subscribers(self._later, self.subscribers)
def _finished(self):
_notify_subscribers(self._later, self.subscribers)
@@
-74,7
+80,7
@@
class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
cloud_size, arvados_node=None,
retry_wait=1, max_retry_wait=180):
super(ComputeNodeSetupActor, self).__init__(
cloud_size, arvados_node=None,
retry_wait=1, max_retry_wait=180):
super(ComputeNodeSetupActor, self).__init__(
-
'arvnodeman.nodeup',
cloud_client, arvados_client, timer_actor,
+ cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
self.cloud_size = cloud_size
self.arvados_node = None
retry_wait, max_retry_wait)
self.cloud_size = cloud_size
self.arvados_node = None
@@
-97,7
+103,7
@@
class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
@RetryMixin._retry()
def create_cloud_node(self):
@RetryMixin._retry()
def create_cloud_node(self):
- self._logger.info("Creating cloud node
with
size %s.",
+ self._logger.info("Creating cloud node
of
size %s.",
self.cloud_size.name)
self.cloud_node = self._cloud.create_node(self.cloud_size,
self.arvados_node)
self.cloud_size.name)
self.cloud_node = self._cloud.create_node(self.cloud_size,
self.arvados_node)
@@
-160,7
+166,7
@@
class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
# eligible. Normal shutdowns based on job demand should be
# cancellable; shutdowns based on node misbehavior should not.
super(ComputeNodeShutdownActor, self).__init__(
# eligible. Normal shutdowns based on job demand should be
# cancellable; shutdowns based on node misbehavior should not.
super(ComputeNodeShutdownActor, self).__init__(
-
'arvnodeman.nodedown',
cloud_client, arvados_client, timer_actor,
+ cloud_client, arvados_client, timer_actor,
retry_wait, max_retry_wait)
self._monitor = node_monitor.proxy()
self.cloud_node = self._monitor.cloud_node.get()
retry_wait, max_retry_wait)
self._monitor = node_monitor.proxy()
self.cloud_node = self._monitor.cloud_node.get()
@@
-168,7
+174,11
@@
class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
self.cancel_reason = None
self.success = None
self.cancel_reason = None
self.success = None
+ def _set_logger(self):
+ self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.id))
+
def on_start(self):
def on_start(self):
+ super(ComputeNodeShutdownActor, self).on_start()
self._later.shutdown_node()
def _arvados_node(self):
self._later.shutdown_node()
def _arvados_node(self):
@@
-181,7
+191,7
@@
class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
def cancel_shutdown(self, reason):
self.cancel_reason = reason
def cancel_shutdown(self, reason):
self.cancel_reason = reason
- self._logger.info("
Cloud node %s s
hutdown cancelled: %s.",
+ self._logger.info("
S
hutdown cancelled: %s.",
self.cloud_node.id, reason)
self._finished(success_flag=False)
self.cloud_node.id, reason)
self._finished(success_flag=False)
@@
-199,13
+209,14
@@
class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
@_stop_if_window_closed
@RetryMixin._retry()
def shutdown_node(self):
@_stop_if_window_closed
@RetryMixin._retry()
def shutdown_node(self):
+ self._logger.info("Starting shutdown", self.cloud_node.id)
if not self._cloud.destroy_node(self.cloud_node):
if self._cloud.broken(self.cloud_node):
self._later.cancel_shutdown(self.NODE_BROKEN)
else:
# Force a retry.
raise cloud_types.LibcloudError("destroy_node failed")
if not self._cloud.destroy_node(self.cloud_node):
if self._cloud.broken(self.cloud_node):
self._later.cancel_shutdown(self.NODE_BROKEN)
else:
# Force a retry.
raise cloud_types.LibcloudError("destroy_node failed")
- self._logger.info("
Cloud node %s shut down.
", self.cloud_node.id)
+ self._logger.info("
Shutdown success
", self.cloud_node.id)
arv_node = self._arvados_node()
if arv_node is None:
self._finished(success_flag=True)
arv_node = self._arvados_node()
if arv_node is None:
self._finished(success_flag=True)
@@
-281,7
+292,6
@@
class ComputeNodeMonitorActor(config.actor_class):
):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.proxy()
):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.proxy()
- self._logger = logging.getLogger('arvnodeman.computenode')
self._last_log = None
self._shutdowns = shutdown_timer
self._cloud_node_fqdn = cloud_fqdn_func
self._last_log = None
self._shutdowns = shutdown_timer
self._cloud_node_fqdn = cloud_fqdn_func
@@
-299,6
+309,12
@@
class ComputeNodeMonitorActor(config.actor_class):
self.last_shutdown_opening = None
self._later.consider_shutdown()
self.last_shutdown_opening = None
self._later.consider_shutdown()
+ def _set_logger(self):
+ self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[9:], self.cloud_node.name))
+
+ def on_start(self):
+ self._set_logger()
+
def subscribe(self, subscriber):
self.subscribers.add(subscriber)
def subscribe(self, subscriber):
self.subscribers.add(subscriber)
@@
-346,13
+362,13
@@
class ComputeNodeMonitorActor(config.actor_class):
def consider_shutdown(self):
next_opening = self._shutdowns.next_opening()
if self.shutdown_eligible():
def consider_shutdown(self):
next_opening = self._shutdowns.next_opening()
if self.shutdown_eligible():
- self._debug("
Node %s s
uggesting shutdown.", self.cloud_node.id)
+ self._debug("
S
uggesting shutdown.", self.cloud_node.id)
_notify_subscribers(self._later, self.subscribers)
elif self._shutdowns.window_open():
_notify_subscribers(self._later, self.subscribers)
elif self._shutdowns.window_open():
- self._debug("
Node %s s
hutdown window open but node busy.",
+ self._debug("
S
hutdown window open but node busy.",
self.cloud_node.id)
elif self.last_shutdown_opening != next_opening:
self.cloud_node.id)
elif self.last_shutdown_opening != next_opening:
- self._debug("
Node %s s
hutdown window closed. Next at %s.",
+ self._debug("
S
hutdown window closed. Next at %s.",
self.cloud_node.id, time.ctime(next_opening))
self._timer.schedule(next_opening, self._later.consider_shutdown)
self.last_shutdown_opening = next_opening
self.cloud_node.id, time.ctime(next_opening))
self._timer.schedule(next_opening, self._later.consider_shutdown)
self.last_shutdown_opening = next_opening
diff --git
a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 43f61c7c2c38d8ca95aa7614a8782a77672a816b..4d70436801564e9a35675e95c18f33fddc125806 100644
(file)
--- a/
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/
services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@
-22,6
+22,7
@@
class ComputeNodeShutdownActor(ShutdownActorBase):
self._nodename = None
return super(ComputeNodeShutdownActor, self).on_start()
else:
self._nodename = None
return super(ComputeNodeShutdownActor, self).on_start()
else:
+ self._set_logger()
self._nodename = arv_node['hostname']
self._logger.info("Draining SLURM node %s", self._nodename)
self._later.issue_slurm_drain()
self._nodename = arv_node['hostname']
self._logger.info("Draining SLURM node %s", self._nodename)
self._later.issue_slurm_drain()
diff --git
a/services/nodemanager/arvnodeman/daemon.py
b/services/nodemanager/arvnodeman/daemon.py
index 243d3bfaa4cd13fbf9a540affc20ae6e562979a9..42ee4db1871317088dbf2eeedf5b74b2174dcb2f 100644
(file)
--- a/
services/nodemanager/arvnodeman/daemon.py
+++ b/
services/nodemanager/arvnodeman/daemon.py
@@
-121,7
+121,6
@@
class NodeManagerDaemonActor(actor_class):
self._new_arvados = arvados_factory
self._new_cloud = cloud_factory
self._cloud_driver = self._new_cloud()
self._new_arvados = arvados_factory
self._new_cloud = cloud_factory
self._cloud_driver = self._new_cloud()
- self._logger = logging.getLogger('arvnodeman.daemon')
self._later = self.actor_ref.proxy()
self.shutdown_windows = shutdown_windows
self.server_calculator = server_calculator
self._later = self.actor_ref.proxy()
self.shutdown_windows = shutdown_windows
self.server_calculator = server_calculator
@@
-145,6
+144,9
@@
class NodeManagerDaemonActor(actor_class):
self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors
self._logger.debug("Daemon initialized")
self.shutdowns = {} # Cloud node IDs to ComputeNodeShutdownActors
self._logger.debug("Daemon initialized")
+ def on_start(self):
+ self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, id(self.actor_urn[9:])))
+
def _update_poll_time(self, poll_key):
self.last_polls[poll_key] = time.time()
def _update_poll_time(self, poll_key):
self.last_polls[poll_key] = time.time()
diff --git
a/services/nodemanager/arvnodeman/jobqueue.py
b/services/nodemanager/arvnodeman/jobqueue.py
index e0f0a5b1ec26045745fe479f76b8614eb700875d..87cf738311730feed045d23c63af6481c0731e06 100644
(file)
--- a/
services/nodemanager/arvnodeman/jobqueue.py
+++ b/
services/nodemanager/arvnodeman/jobqueue.py
@@
-102,7
+102,6
@@
class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
"""
CLIENT_ERRORS = ARVADOS_ERRORS
"""
CLIENT_ERRORS = ARVADOS_ERRORS
- LOGGER_NAME = 'arvnodeman.jobqueue'
def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
super(JobQueueMonitorActor, self).__init__(
def __init__(self, client, timer_actor, server_calc, *args, **kwargs):
super(JobQueueMonitorActor, self).__init__(
@@
-114,6
+113,6
@@
class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
def _got_response(self, queue):
server_list = self._calculator.servers_for_queue(queue)
def _got_response(self, queue):
server_list = self._calculator.servers_for_queue(queue)
- self._logger.debug("
Sending server
wishlist: %s",
+ self._logger.debug("
Calculated
wishlist: %s",
', '.join(s.name for s in server_list) or "(empty)")
return super(JobQueueMonitorActor, self)._got_response(server_list)
', '.join(s.name for s in server_list) or "(empty)")
return super(JobQueueMonitorActor, self)._got_response(server_list)
diff --git
a/services/nodemanager/arvnodeman/nodelist.py
b/services/nodemanager/arvnodeman/nodelist.py
index 83dd93f077bfb504a8f41b6f0addd93c717ac7da..26f49a34eca6937260d2721083971c1cb5a7fc84 100644
(file)
--- a/
services/nodemanager/arvnodeman/nodelist.py
+++ b/
services/nodemanager/arvnodeman/nodelist.py
@@
-11,7
+11,6
@@
class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
This actor regularly polls the list of Arvados node records, and
sends it to subscribers.
"""
This actor regularly polls the list of Arvados node records, and
sends it to subscribers.
"""
- LOGGER_NAME = 'arvnodeman.arvados_nodes'
def is_common_error(self, exception):
return isinstance(exception, config.ARVADOS_ERRORS)
def is_common_error(self, exception):
return isinstance(exception, config.ARVADOS_ERRORS)
@@
-29,7
+28,6
@@
class CloudNodeListMonitorActor(clientactor.RemotePollLoopActor):
This actor regularly polls the cloud to get a list of running compute
nodes, and sends it to subscribers.
"""
This actor regularly polls the cloud to get a list of running compute
nodes, and sends it to subscribers.
"""
- LOGGER_NAME = 'arvnodeman.cloud_nodes'
def is_common_error(self, exception):
return self._client.is_cloud_exception(exception)
def is_common_error(self, exception):
return self._client.is_cloud_exception(exception)