#!/usr/bin/env python from __future__ import absolute_import, print_function import functools import logging import time import libcloud.common.types as cloud_types import pykka from .. import \ arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, \ arvados_node_missing, RetryMixin from ...clientactor import _notify_subscribers from ... import config class ComputeNodeStateChangeBase(config.actor_class, RetryMixin): """Base class for actors that change a compute node's state. This base class takes care of retrying changes and notifying subscribers when the change is finished. """ def __init__(self, cloud_client, arvados_client, timer_actor, retry_wait, max_retry_wait): super(ComputeNodeStateChangeBase, self).__init__() RetryMixin.__init__(self, retry_wait, max_retry_wait, None, cloud_client, timer_actor) self._later = self.actor_ref.proxy() self._arvados = arvados_client self.subscribers = set() def _set_logger(self): self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:])) def on_start(self): self._set_logger() def _finished(self): _notify_subscribers(self._later, self.subscribers) self.subscribers = None self._logger.info("finished") def subscribe(self, subscriber): if self.subscribers is None: try: subscriber(self._later) except pykka.ActorDeadError: pass else: self.subscribers.add(subscriber) def _clean_arvados_node(self, arvados_node, explanation): return self._arvados.nodes().update( uuid=arvados_node['uuid'], body={'hostname': None, 'ip_address': None, 'slot_number': None, 'first_ping_at': None, 'last_ping_at': None, 'properties': {}, 'info': {'ec2_instance_id': None, 'last_action': explanation}}, ).execute() @staticmethod def _finish_on_exception(orig_func): @functools.wraps(orig_func) def finish_wrapper(self, *args, **kwargs): try: return orig_func(self, *args, **kwargs) except Exception as error: self._logger.error("Actor error %s", error) self._finished() return finish_wrapper class ComputeNodeSetupActor(ComputeNodeStateChangeBase): """Actor to create and set up a cloud compute node. This actor prepares an Arvados node record for a new compute node (either creating one or cleaning one passed in), then boots the actual compute node. It notifies subscribers when the cloud node is successfully created (the last step in the process for Node Manager to handle). """ def __init__(self, timer_actor, arvados_client, cloud_client, cloud_size, arvados_node=None, retry_wait=1, max_retry_wait=180): super(ComputeNodeSetupActor, self).__init__( cloud_client, arvados_client, timer_actor, retry_wait, max_retry_wait) self.cloud_size = cloud_size self.arvados_node = None self.cloud_node = None if arvados_node is None: self._later.create_arvados_node() else: self._later.prepare_arvados_node(arvados_node) @ComputeNodeStateChangeBase._finish_on_exception @RetryMixin._retry(config.ARVADOS_ERRORS) def create_arvados_node(self): self.arvados_node = self._arvados.nodes().create(body={}).execute() self._later.create_cloud_node() @ComputeNodeStateChangeBase._finish_on_exception @RetryMixin._retry(config.ARVADOS_ERRORS) def prepare_arvados_node(self, node): self.arvados_node = self._clean_arvados_node( node, "Prepared by Node Manager") self._later.create_cloud_node() @ComputeNodeStateChangeBase._finish_on_exception @RetryMixin._retry() def create_cloud_node(self): self._logger.info("Sending create_node request for node size %s.", self.cloud_size.name) self.cloud_node = self._cloud.create_node(self.cloud_size, self.arvados_node) if not self.cloud_node.size: self.cloud_node.size = self.cloud_size self._logger.info("Cloud node %s created.", self.cloud_node.id) self._later.update_arvados_node_properties() @ComputeNodeStateChangeBase._finish_on_exception @RetryMixin._retry(config.ARVADOS_ERRORS) def update_arvados_node_properties(self): """Tell Arvados some details about the cloud node. Currently we only include size/price from our request, which we already knew before create_cloud_node(), but doing it here gives us an opportunity to provide more detail from self.cloud_node, too. """ self.arvados_node['properties']['cloud_node'] = { # Note this 'size' is the node size we asked the cloud # driver to create -- not necessarily equal to the size # reported by the cloud driver for the node that was # created. 'size': self.cloud_size.id, 'price': self.cloud_size.price, } self.arvados_node = self._arvados.nodes().update( uuid=self.arvados_node['uuid'], body={'properties': self.arvados_node['properties']}, ).execute() self._logger.info("%s updated properties.", self.arvados_node['uuid']) self._later.post_create() @RetryMixin._retry() def post_create(self): self._cloud.post_create_node(self.cloud_node) self._logger.info("%s post-create work done.", self.cloud_node.id) self._finished() def stop_if_no_cloud_node(self): if self.cloud_node is not None: return False self.stop() return True class ComputeNodeShutdownActor(ComputeNodeStateChangeBase): """Actor to shut down a compute node. This actor simply destroys a cloud node, retrying as needed. """ # Reasons for a shutdown to be cancelled. WINDOW_CLOSED = "shutdown window closed" NODE_BROKEN = "cloud failed to shut down broken node" def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor, cancellable=True, retry_wait=1, max_retry_wait=180): # If a ShutdownActor is cancellable, it will ask the # ComputeNodeMonitorActor if it's still eligible before taking each # action, and stop the shutdown process if the node is no longer # eligible. Normal shutdowns based on job demand should be # cancellable; shutdowns based on node misbehavior should not. super(ComputeNodeShutdownActor, self).__init__( cloud_client, arvados_client, timer_actor, retry_wait, max_retry_wait) self._monitor = node_monitor.proxy() self.cloud_node = self._monitor.cloud_node.get() self.cancellable = cancellable self.cancel_reason = None self.success = None def _set_logger(self): self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name)) def on_start(self): super(ComputeNodeShutdownActor, self).on_start() self._later.shutdown_node() def _arvados_node(self): return self._monitor.arvados_node.get() def _finished(self, success_flag=None): if success_flag is not None: self.success = success_flag return super(ComputeNodeShutdownActor, self)._finished() def cancel_shutdown(self, reason): self.cancel_reason = reason self._logger.info("Shutdown cancelled: %s.", reason) self._finished(success_flag=False) def _stop_if_window_closed(orig_func): @functools.wraps(orig_func) def stop_wrapper(self, *args, **kwargs): if (self.cancellable and (self._monitor.shutdown_eligible().get() is not True)): self._later.cancel_shutdown(self.WINDOW_CLOSED) return None else: return orig_func(self, *args, **kwargs) return stop_wrapper @ComputeNodeStateChangeBase._finish_on_exception @_stop_if_window_closed @RetryMixin._retry() def shutdown_node(self): self._logger.info("Starting shutdown") if not self._cloud.destroy_node(self.cloud_node): if self._cloud.broken(self.cloud_node): self._later.cancel_shutdown(self.NODE_BROKEN) else: # Force a retry. raise cloud_types.LibcloudError("destroy_node failed") self._logger.info("Shutdown success") arv_node = self._arvados_node() if arv_node is None: self._finished(success_flag=True) else: self._later.clean_arvados_node(arv_node) @ComputeNodeStateChangeBase._finish_on_exception @RetryMixin._retry(config.ARVADOS_ERRORS) def clean_arvados_node(self, arvados_node): self._clean_arvados_node(arvados_node, "Shut down by Node Manager") self._finished(success_flag=True) # Make the decorator available to subclasses. _stop_if_window_closed = staticmethod(_stop_if_window_closed) class ComputeNodeUpdateActor(config.actor_class): """Actor to dispatch one-off cloud management requests. This actor receives requests for small cloud updates, and dispatches them to a real driver. ComputeNodeMonitorActors use this to perform maintenance tasks on themselves. Having a dedicated actor for this gives us the opportunity to control the flow of requests; e.g., by backing off when errors occur. This actor is most like a "traditional" Pykka actor: there's no subscribing, but instead methods return real driver results. If you're interested in those results, you should get them from the Future that the proxy method returns. Be prepared to handle exceptions from the cloud driver when you do. """ def __init__(self, cloud_factory, max_retry_wait=180): super(ComputeNodeUpdateActor, self).__init__() self._cloud = cloud_factory() self.max_retry_wait = max_retry_wait self.error_streak = 0 self.next_request_time = time.time() def _throttle_errors(orig_func): @functools.wraps(orig_func) def throttle_wrapper(self, *args, **kwargs): throttle_time = self.next_request_time - time.time() if throttle_time > 0: time.sleep(throttle_time) self.next_request_time = time.time() try: result = orig_func(self, *args, **kwargs) except Exception as error: self.error_streak += 1 self.next_request_time += min(2 ** self.error_streak, self.max_retry_wait) raise else: self.error_streak = 0 return result return throttle_wrapper @_throttle_errors def sync_node(self, cloud_node, arvados_node): return self._cloud.sync_node(cloud_node, arvados_node) class ComputeNodeMonitorActor(config.actor_class): """Actor to manage a running compute node. This actor gets updates about a compute node's cloud and Arvados records. It uses this information to notify subscribers when the node is eligible for shutdown. """ def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer, cloud_fqdn_func, timer_actor, update_actor, cloud_client, arvados_node=None, poll_stale_after=600, node_stale_after=3600, boot_fail_after=1800 ): super(ComputeNodeMonitorActor, self).__init__() self._later = self.actor_ref.proxy() self._last_log = None self._shutdowns = shutdown_timer self._cloud_node_fqdn = cloud_fqdn_func self._timer = timer_actor self._update = update_actor self._cloud = cloud_client self.cloud_node = cloud_node self.cloud_node_start_time = cloud_node_start_time self.poll_stale_after = poll_stale_after self.node_stale_after = node_stale_after self.boot_fail_after = boot_fail_after self.subscribers = set() self.arvados_node = None self._later.update_arvados_node(arvados_node) self.last_shutdown_opening = None self._later.consider_shutdown() def _set_logger(self): self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name)) def on_start(self): self._set_logger() self._timer.schedule(self.cloud_node_start_time + self.boot_fail_after, self._later.consider_shutdown) def subscribe(self, subscriber): self.subscribers.add(subscriber) def _debug(self, msg, *args): if msg == self._last_log: return self._last_log = msg self._logger.debug(msg, *args) def in_state(self, *states): # Return a boolean to say whether or not our Arvados node record is in # one of the given states. If state information is not # available--because this node has no Arvados record, the record is # stale, or the record has no state information--return None. if (self.arvados_node is None) or not timestamp_fresh( arvados_node_mtime(self.arvados_node), self.node_stale_after): return None state = self.arvados_node['crunch_worker_state'] if not state: return None result = state in states if state == 'idle': result = result and not self.arvados_node['job_uuid'] return result def shutdown_eligible(self): """Return True if eligible for shutdown, or a string explaining why the node is not eligible for shutdown.""" if not self._shutdowns.window_open(): return "shutdown window is not open." if self.arvados_node is None: # Node is unpaired. # If it hasn't pinged Arvados after boot_fail seconds, shut it down if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after): return "node is still booting, will be considered a failed boot at %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.cloud_node_start_time + self.boot_fail_after)) else: return True missing = arvados_node_missing(self.arvados_node, self.node_stale_after) if missing and self._cloud.broken(self.cloud_node): # Node is paired, but Arvados says it is missing and the cloud says the node # is in an error state, so shut it down. return True if missing is None and self._cloud.broken(self.cloud_node): self._logger.info( "Cloud node considered 'broken' but paired node %s last_ping_at is None, " + "cannot check node_stale_after (node may be shut down and we just haven't gotten the message yet).", self.arvados_node['uuid']) if self.in_state('idle'): return True else: return "node is not idle." def consider_shutdown(self): try: next_opening = self._shutdowns.next_opening() eligible = self.shutdown_eligible() if eligible is True: self._debug("Suggesting shutdown.") _notify_subscribers(self._later, self.subscribers) elif self._shutdowns.window_open(): self._debug("Cannot shut down because %s", eligible) elif self.last_shutdown_opening != next_opening: self._debug("Shutdown window closed. Next at %s.", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening))) self._timer.schedule(next_opening, self._later.consider_shutdown) self.last_shutdown_opening = next_opening except Exception: self._logger.exception("Unexpected exception") def offer_arvados_pair(self, arvados_node): first_ping_s = arvados_node.get('first_ping_at') if (self.arvados_node is not None) or (not first_ping_s): return None elif ((arvados_node['ip_address'] in self.cloud_node.private_ips) and (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)): self._later.update_arvados_node(arvados_node) return self.cloud_node.id else: return None def update_cloud_node(self, cloud_node): if cloud_node is not None: self.cloud_node = cloud_node self._later.consider_shutdown() def update_arvados_node(self, arvados_node): # If the cloud node's FQDN doesn't match what's in the Arvados node # record, make them match. # This method is a little unusual in the way it just fires off the # request without checking the result or retrying errors. That's # because this update happens every time we reload the Arvados node # list: if a previous sync attempt failed, we'll see that the names # are out of sync and just try again. ComputeNodeUpdateActor has # the logic to throttle those effective retries when there's trouble. if arvados_node is not None: self.arvados_node = arvados_node if (self._cloud_node_fqdn(self.cloud_node) != arvados_node_fqdn(self.arvados_node)): self._update.sync_node(self.cloud_node, self.arvados_node) self._later.consider_shutdown()