services/nodemanager/arvnodeman/computenode/dispatch/__init__.py

   1 #!/usr/bin/env python
   2 # Copyright (C) The Arvados Authors. All rights reserved.
   3 #
   4 # SPDX-License-Identifier: AGPL-3.0
   5
   6 from __future__ import absolute_import, print_function
   7
   8 import functools
   9 import logging
  10 import time
  11 import re
  12
  13 import libcloud.common.types as cloud_types
  14 from libcloud.common.exceptions import BaseHTTPError
  15
  16 import pykka
  17
  18 from .. import \
  19     arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, \
  20     arvados_node_missing, RetryMixin
  21 from ...clientactor import _notify_subscribers
  22 from ... import config
  23 from .transitions import transitions
  24
# Marker value stored in ComputeNodeSetupActor.error when creating the
# cloud node fails with what looks like a quota/limit error.
QuotaExceeded = "QuotaExceeded"
  26
  27 class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
  28     """Base class for actors that change a compute node's state.
  29
  30     This base class takes care of retrying changes and notifying
  31     subscribers when the change is finished.
  32     """
  33     def __init__(self, cloud_client, arvados_client, timer_actor,
  34                  retry_wait, max_retry_wait):
  35         super(ComputeNodeStateChangeBase, self).__init__()
  36         RetryMixin.__init__(self, retry_wait, max_retry_wait,
  37                             None, cloud_client, timer_actor)
  38         self._later = self.actor_ref.tell_proxy()
  39         self._arvados = arvados_client
  40         self.subscribers = set()
  41
  42     def _set_logger(self):
  43         self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
  44
  45     def on_start(self):
  46         self._set_logger()
  47
  48     def _finished(self):
  49         if self.subscribers is None:
  50             raise Exception("Actor tried to finish twice")
  51         _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
  52         self.subscribers = None
  53         self._logger.info("finished")
  54
  55     def subscribe(self, subscriber):
  56         if self.subscribers is None:
  57             try:
  58                 subscriber(self.actor_ref.proxy())
  59             except pykka.ActorDeadError:
  60                 pass
  61         else:
  62             self.subscribers.add(subscriber)
  63
  64     def _clean_arvados_node(self, arvados_node, explanation):
  65         return self._arvados.nodes().update(
  66             uuid=arvados_node['uuid'],
  67             body={'hostname': None,
  68                   'ip_address': None,
  69                   'slot_number': None,
  70                   'first_ping_at': None,
  71                   'last_ping_at': None,
  72                   'properties': {},
  73                   'info': {'ec2_instance_id': None,
  74                            'last_action': explanation}},
  75             ).execute()
  76
  77     @staticmethod
  78     def _finish_on_exception(orig_func):
  79         @functools.wraps(orig_func)
  80         def finish_wrapper(self, *args, **kwargs):
  81             try:
  82                 return orig_func(self, *args, **kwargs)
  83             except Exception as error:
  84                 self._logger.error("Actor error %s", error)
  85                 self._finished()
  86         return finish_wrapper
  87
  88
  89 class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
  90     """Actor to create and set up a cloud compute node.
  91
  92     This actor prepares an Arvados node record for a new compute node
  93     (either creating one or cleaning one passed in), then boots the
  94     actual compute node.  It notifies subscribers when the cloud node
  95     is successfully created (the last step in the process for Node
  96     Manager to handle).
  97     """
  98     def __init__(self, timer_actor, arvados_client, cloud_client,
  99                  cloud_size, arvados_node=None,
 100                  retry_wait=1, max_retry_wait=180):
 101         super(ComputeNodeSetupActor, self).__init__(
 102             cloud_client, arvados_client, timer_actor,
 103             retry_wait, max_retry_wait)
 104         self.cloud_size = cloud_size
 105         self.arvados_node = None
 106         self.cloud_node = None
 107         self.error = None
 108         if arvados_node is None:
 109             self._later.create_arvados_node()
 110         else:
 111             self._later.prepare_arvados_node(arvados_node)
 112
 113     @ComputeNodeStateChangeBase._finish_on_exception
 114     @RetryMixin._retry(config.ARVADOS_ERRORS)
 115     def create_arvados_node(self):
 116         self.arvados_node = self._arvados.nodes().create(body={}).execute()
 117         self._later.create_cloud_node()
 118
 119     @ComputeNodeStateChangeBase._finish_on_exception
 120     @RetryMixin._retry(config.ARVADOS_ERRORS)
 121     def prepare_arvados_node(self, node):
 122         self.arvados_node = self._clean_arvados_node(
 123             node, "Prepared by Node Manager")
 124         self._later.create_cloud_node()
 125
 126     @ComputeNodeStateChangeBase._finish_on_exception
 127     @RetryMixin._retry()
 128     def create_cloud_node(self):
 129         self._logger.info("Sending create_node request for node size %s.",
 130                           self.cloud_size.name)
 131         try:
 132             self.cloud_node = self._cloud.create_node(self.cloud_size,
 133                                                       self.arvados_node)
 134         except BaseHTTPError as e:
 135             if e.code == 429 or "RequestLimitExceeded" in e.message:
 136                 # Don't consider API rate limits to be quota errors.
 137                 # re-raise so the Retry logic applies.
 138                 raise
 139
 140             # The set of possible error codes / messages isn't documented for
 141             # all clouds, so use a keyword heuristic to determine if the
 142             # failure is likely due to a quota.
 143             if re.search(r'(exceed|quota|limit)', e.message, re.I):
 144                 self.error = QuotaExceeded
 145                 self._logger.warning("Quota exceeded: %s", e)
 146                 self._finished()
 147                 return
 148             else:
 149                 # Something else happened, re-raise so the Retry logic applies.
 150                 raise
 151         except Exception as e:
 152             raise
 153
 154         # The information included in the node size object we get from libcloud
 155         # is inconsistent between cloud drivers.  Replace libcloud NodeSize
 156         # object with compatible CloudSizeWrapper object which merges the size
 157         # info reported from the cloud with size information from the
 158         # configuration file.
 159         self.cloud_node.size = self.cloud_size
 160
 161         self._logger.info("Cloud node %s created.", self.cloud_node.id)
 162         self._later.update_arvados_node_properties()
 163
 164     @ComputeNodeStateChangeBase._finish_on_exception
 165     @RetryMixin._retry(config.ARVADOS_ERRORS)
 166     def update_arvados_node_properties(self):
 167         """Tell Arvados some details about the cloud node.
 168
 169         Currently we only include size/price from our request, which
 170         we already knew before create_cloud_node(), but doing it here
 171         gives us an opportunity to provide more detail from
 172         self.cloud_node, too.
 173         """
 174         self.arvados_node['properties']['cloud_node'] = {
 175             # Note this 'size' is the node size we asked the cloud
 176             # driver to create -- not necessarily equal to the size
 177             # reported by the cloud driver for the node that was
 178             # created.
 179             'size': self.cloud_size.id,
 180             'price': self.cloud_size.price,
 181         }
 182         self.arvados_node = self._arvados.nodes().update(
 183             uuid=self.arvados_node['uuid'],
 184             body={'properties': self.arvados_node['properties']},
 185         ).execute()
 186         self._logger.info("%s updated properties.", self.arvados_node['uuid'])
 187         self._later.post_create()
 188
 189     @RetryMixin._retry()
 190     def post_create(self):
 191         self._cloud.post_create_node(self.cloud_node)
 192         self._logger.info("%s post-create work done.", self.cloud_node.id)
 193         self._finished()
 194
 195     def stop_if_no_cloud_node(self):
 196         if self.cloud_node is not None:
 197             return False
 198         self.stop()
 199         return True
 200
 201
 202 class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
 203     """Actor to shut down a compute node.
 204
 205     This actor simply destroys a cloud node, retrying as needed.
 206     """
 207     # Reasons for a shutdown to be cancelled.
 208     WINDOW_CLOSED = "shutdown window closed"
 209     DESTROY_FAILED = "destroy_node failed"
 210
 211     def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor,
 212                  cancellable=True, retry_wait=1, max_retry_wait=180):
 213         # If a ShutdownActor is cancellable, it will ask the
 214         # ComputeNodeMonitorActor if it's still eligible before taking each
 215         # action, and stop the shutdown process if the node is no longer
 216         # eligible.  Normal shutdowns based on job demand should be
 217         # cancellable; shutdowns based on node misbehavior should not.
 218         super(ComputeNodeShutdownActor, self).__init__(
 219             cloud_client, arvados_client, timer_actor,
 220             retry_wait, max_retry_wait)
 221         self._monitor = node_monitor.proxy()
 222         self.cloud_node = self._monitor.cloud_node.get()
 223         self.cancellable = cancellable
 224         self.cancel_reason = None
 225         self.success = None
 226
 227     def _set_logger(self):
 228         self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
 229
 230     def on_start(self):
 231         super(ComputeNodeShutdownActor, self).on_start()
 232         self._later.shutdown_node()
 233
 234     def _arvados_node(self):
 235         return self._monitor.arvados_node.get()
 236
 237     def _finished(self, success_flag=None):
 238         if success_flag is not None:
 239             self.success = success_flag
 240         return super(ComputeNodeShutdownActor, self)._finished()
 241
 242     def cancel_shutdown(self, reason, **kwargs):
 243         self.cancel_reason = reason
 244         self._logger.info("Shutdown cancelled: %s.", reason)
 245         self._finished(success_flag=False)
 246
 247     def _cancel_on_exception(orig_func):
 248         @functools.wraps(orig_func)
 249         def finish_wrapper(self, *args, **kwargs):
 250             try:
 251                 return orig_func(self, *args, **kwargs)
 252             except Exception as error:
 253                 self._logger.error("Actor error %s", error)
 254                 self._logger.debug("", exc_info=True)
 255                 self._later.cancel_shutdown("Unhandled exception %s" % error, try_resume=False)
 256         return finish_wrapper
 257
 258     @_cancel_on_exception
 259     def shutdown_node(self):
 260         if self.cancellable:
 261             self._logger.info("Checking that node is still eligible for shutdown")
 262             eligible, reason = self._monitor.shutdown_eligible().get()
 263             if not eligible:
 264                 self.cancel_shutdown("No longer eligible for shut down because %s" % reason,
 265                                      try_resume=True)
 266                 return
 267         self._destroy_node()
 268
 269     def _destroy_node(self):
 270         self._logger.info("Starting shutdown")
 271         arv_node = self._arvados_node()
 272         if self._cloud.destroy_node(self.cloud_node):
 273             self._logger.info("Shutdown success")
 274             if arv_node:
 275                 self._later.clean_arvados_node(arv_node)
 276             else:
 277                 self._finished(success_flag=True)
 278         else:
 279             self.cancel_shutdown(self.DESTROY_FAILED, try_resume=False)
 280
 281     @ComputeNodeStateChangeBase._finish_on_exception
 282     @RetryMixin._retry(config.ARVADOS_ERRORS)
 283     def clean_arvados_node(self, arvados_node):
 284         self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
 285         self._finished(success_flag=True)
 286
 287
 288 class ComputeNodeUpdateActor(config.actor_class, RetryMixin):
 289     """Actor to dispatch one-off cloud management requests.
 290
 291     This actor receives requests for small cloud updates, and
 292     dispatches them to a real driver.  ComputeNodeMonitorActors use
 293     this to perform maintenance tasks on themselves.  Having a
 294     dedicated actor for this gives us the opportunity to control the
 295     flow of requests; e.g., by backing off when errors occur.
 296     """
 297     def __init__(self, cloud_factory, timer_actor, max_retry_wait=180):
 298         super(ComputeNodeUpdateActor, self).__init__()
 299         RetryMixin.__init__(self, 1, max_retry_wait,
 300                             None, cloud_factory(), timer_actor)
 301         self._cloud = cloud_factory()
 302         self._later = self.actor_ref.tell_proxy()
 303
 304     def _set_logger(self):
 305         self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
 306
 307     def on_start(self):
 308         self._set_logger()
 309
 310     @RetryMixin._retry()
 311     def sync_node(self, cloud_node, arvados_node):
 312         return self._cloud.sync_node(cloud_node, arvados_node)
 313
 314
 315 class ComputeNodeMonitorActor(config.actor_class):
 316     """Actor to manage a running compute node.
 317
 318     This actor gets updates about a compute node's cloud and Arvados records.
 319     It uses this information to notify subscribers when the node is eligible
 320     for shutdown.
 321     """
 322     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
 323                  cloud_fqdn_func, timer_actor, update_actor, cloud_client,
 324                  arvados_node=None, poll_stale_after=600, node_stale_after=3600,
 325                  boot_fail_after=1800
 326     ):
 327         super(ComputeNodeMonitorActor, self).__init__()
 328         self._later = self.actor_ref.tell_proxy()
 329         self._shutdowns = shutdown_timer
 330         self._cloud_node_fqdn = cloud_fqdn_func
 331         self._timer = timer_actor
 332         self._update = update_actor
 333         self._cloud = cloud_client
 334         self.cloud_node = cloud_node
 335         self.cloud_node_start_time = cloud_node_start_time
 336         self.poll_stale_after = poll_stale_after
 337         self.node_stale_after = node_stale_after
 338         self.boot_fail_after = boot_fail_after
 339         self.subscribers = set()
 340         self.arvados_node = None
 341         self._later.update_arvados_node(arvados_node)
 342         self.last_shutdown_opening = None
 343         self._later.consider_shutdown()
 344
 345     def _set_logger(self):
 346         self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
 347
 348     def on_start(self):
 349         self._set_logger()
 350         self._timer.schedule(self.cloud_node_start_time + self.boot_fail_after, self._later.consider_shutdown)
 351
 352     def subscribe(self, subscriber):
 353         self.subscribers.add(subscriber)
 354
 355     def _debug(self, msg, *args):
 356         self._logger.debug(msg, *args)
 357
 358     def get_state(self):
 359         """Get node state, one of ['unpaired', 'busy', 'idle', 'down']."""
 360
 361         # If this node is not associated with an Arvados node, return 'unpaired'.
 362         if self.arvados_node is None:
 363             return 'unpaired'
 364
 365         state = self.arvados_node['crunch_worker_state']
 366
 367         # If state information is not available because it is missing or the
 368         # record is stale, return 'down'.
 369         if not state or not timestamp_fresh(arvados_node_mtime(self.arvados_node),
 370                                             self.node_stale_after):
 371             state = 'down'
 372
 373         # There's a window between when a node pings for the first time and the
 374         # value of 'slurm_state' is synchronized by crunch-dispatch.  In this
 375         # window, the node will still report as 'down'.  Check that
 376         # first_ping_at is truthy and consider the node 'idle' during the
 377         # initial boot grace period.
 378         if (state == 'down' and
 379             self.arvados_node['first_ping_at'] and
 380             timestamp_fresh(self.cloud_node_start_time,
 381                             self.boot_fail_after) and
 382             not self._cloud.broken(self.cloud_node)):
 383             state = 'idle'
 384
 385         # "missing" means last_ping_at is stale, this should be
 386         # considered "down"
 387         if arvados_node_missing(self.arvados_node, self.node_stale_after):
 388             state = 'down'
 389
 390         # Turns out using 'job_uuid' this way is a bad idea.  The node record
 391         # is assigned the job_uuid before the job is locked (which removes it
 392         # from the queue) which means the job will be double-counted as both in
 393         # the wishlist and but also keeping a node busy.  This end result is
 394         # excess nodes being booted.
 395         #if state == 'idle' and self.arvados_node['job_uuid']:
 396         #    state = 'busy'
 397
 398         return state
 399
 400     def in_state(self, *states):
 401         return self.get_state() in states
 402
 403     def shutdown_eligible(self):
 404         """Determine if node is candidate for shut down.
 405
 406         Returns a tuple of (boolean, string) where the first value is whether
 407         the node is candidate for shut down, and the second value is the
 408         reason for the decision.
 409         """
 410
 411         # Collect states and then consult state transition table whether we
 412         # should shut down.  Possible states are:
 413         # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
 414         # window = ["open", "closed"]
 415         # boot_grace = ["boot wait", "boot exceeded"]
 416         # idle_grace = ["not idle", "idle wait", "idle exceeded"]
 417
 418         if self.arvados_node and not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
 419             return (False, "node state is stale")
 420
 421         crunch_worker_state = self.get_state()
 422
 423         window = "open" if self._shutdowns.window_open() else "closed"
 424
 425         if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
 426             boot_grace = "boot wait"
 427         else:
 428             boot_grace = "boot exceeded"
 429
 430         # API server side not implemented yet.
 431         idle_grace = 'idle exceeded'
 432
 433         node_state = (crunch_worker_state, window, boot_grace, idle_grace)
 434         t = transitions[node_state]
 435         if t is not None:
 436             # yes, shutdown eligible
 437             return (True, "node state is %s" % (node_state,))
 438         else:
 439             # no, return a reason
 440             return (False, "node state is %s" % (node_state,))
 441
 442     def consider_shutdown(self):
 443         try:
 444             eligible, reason = self.shutdown_eligible()
 445             next_opening = self._shutdowns.next_opening()
 446             if eligible:
 447                 self._debug("Suggesting shutdown because %s", reason)
 448                 _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
 449             else:
 450                 self._debug("Not eligible for shut down because %s", reason)
 451
 452                 if self.last_shutdown_opening != next_opening:
 453                     self._debug("Shutdown window closed.  Next at %s.",
 454                                 time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
 455                     self._timer.schedule(next_opening, self._later.consider_shutdown)
 456                     self.last_shutdown_opening = next_opening
 457         except Exception:
 458             self._logger.exception("Unexpected exception")
 459
 460     def offer_arvados_pair(self, arvados_node):
 461         first_ping_s = arvados_node.get('first_ping_at')
 462         if (self.arvados_node is not None) or (not first_ping_s):
 463             return None
 464         elif ((arvados_node['info'].get('ec2_instance_id') == self._cloud.node_id(self.cloud_node)) and
 465               (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)):
 466             self._later.update_arvados_node(arvados_node)
 467             return self.cloud_node.id
 468         else:
 469             return None
 470
 471     def update_cloud_node(self, cloud_node):
 472         if cloud_node is not None:
 473             self.cloud_node = cloud_node
 474             self._later.consider_shutdown()
 475
 476     def update_arvados_node(self, arvados_node):
 477         # If the cloud node's FQDN doesn't match what's in the Arvados node
 478         # record, make them match.
 479         # This method is a little unusual in the way it just fires off the
 480         # request without checking the result or retrying errors.  That's
 481         # because this update happens every time we reload the Arvados node
 482         # list: if a previous sync attempt failed, we'll see that the names
 483         # are out of sync and just try again.  ComputeNodeUpdateActor has
 484         # the logic to throttle those effective retries when there's trouble.
 485         if arvados_node is not None:
 486             self.arvados_node = arvados_node
 487             if (self._cloud_node_fqdn(self.cloud_node) !=
 488                   arvados_node_fqdn(self.arvados_node)):
 489                 self._update.sync_node(self.cloud_node, self.arvados_node)
 490             self._later.consider_shutdown()