services/nodemanager/arvnodeman/computenode/dispatch/__init__.py

   1 #!/usr/bin/env python
   2 # Copyright (C) The Arvados Authors. All rights reserved.
   3 #
   4 # SPDX-License-Identifier: AGPL-3.0
   5
   6 from __future__ import absolute_import, print_function
   7
   8 import functools
   9 import logging
  10 import time
  11 import re
  12
  13 import libcloud.common.types as cloud_types
  14 from libcloud.common.exceptions import BaseHTTPError
  15
  16 import pykka
  17
  18 from .. import \
  19     arvados_node_fqdn, arvados_node_mtime, arvados_timestamp, timestamp_fresh, \
  20     arvados_node_missing, RetryMixin
  21 from ...clientactor import _notify_subscribers
  22 from ... import config
  23 from ... import status
  24 from .transitions import transitions
  25
  26 QuotaExceeded = "QuotaExceeded"  # sentinel stored in ComputeNodeSetupActor.error when node creation looks quota-limited
  27
  28 class ComputeNodeStateChangeBase(config.actor_class, RetryMixin):
  29     """Base class for actors that change a compute node's state.
  30
  31     This base class takes care of retrying changes and notifying
  32     subscribers when the change is finished.
  33     """
  34     def __init__(self, cloud_client, arvados_client, timer_actor,
  35                  retry_wait, max_retry_wait):
  36         super(ComputeNodeStateChangeBase, self).__init__()
  37         RetryMixin.__init__(self, retry_wait, max_retry_wait,
  38                             None, cloud_client, timer_actor)
  39         self._later = self.actor_ref.tell_proxy()
  40         self._arvados = arvados_client
  41         self.subscribers = set()
  42
  43     def _set_logger(self):
  44         self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
  45
  46     def on_start(self):
  47         self._set_logger()
  48
  49     def _finished(self):
  50         if self.subscribers is None:
  51             raise Exception("Actor tried to finish twice")
  52         _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
  53         self.subscribers = None
  54         self._logger.info("finished")
  55
  56     def subscribe(self, subscriber):
  57         if self.subscribers is None:
  58             try:
  59                 subscriber(self.actor_ref.proxy())
  60             except pykka.ActorDeadError:
  61                 pass
  62         else:
  63             self.subscribers.add(subscriber)
  64
  65     def _clean_arvados_node(self, arvados_node, explanation):
  66         return self._arvados.nodes().update(
  67             uuid=arvados_node['uuid'],
  68             body={'hostname': None,
  69                   'ip_address': None,
  70                   'slot_number': None,
  71                   'first_ping_at': None,
  72                   'last_ping_at': None,
  73                   'properties': {},
  74                   'info': {'ec2_instance_id': None,
  75                            'last_action': explanation}},
  76             ).execute()
  77
  78     @staticmethod
  79     def _finish_on_exception(orig_func):
  80         @functools.wraps(orig_func)
  81         def finish_wrapper(self, *args, **kwargs):
  82             try:
  83                 return orig_func(self, *args, **kwargs)
  84             except Exception as error:
  85                 self._logger.error("Actor error %s", error)
  86                 self._finished()
  87         return finish_wrapper
  88
  89
  90 class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
  91     """Actor to create and set up a cloud compute node.
  92
  93     This actor prepares an Arvados node record for a new compute node
  94     (either creating one or cleaning one passed in), then boots the
  95     actual compute node.  It notifies subscribers when the cloud node
  96     is successfully created (the last step in the process for Node
  97     Manager to handle).
  98     """
  99     def __init__(self, timer_actor, arvados_client, cloud_client,
 100                  cloud_size, arvados_node=None,
 101                  retry_wait=1, max_retry_wait=180):
 102         super(ComputeNodeSetupActor, self).__init__(
 103             cloud_client, arvados_client, timer_actor,
 104             retry_wait, max_retry_wait)
 105         self.cloud_size = cloud_size
 106         self.arvados_node = None
 107         self.cloud_node = None
 108         self.error = None
 109         if arvados_node is None:
 110             self._later.create_arvados_node()
 111         else:
 112             self._later.prepare_arvados_node(arvados_node)
 113
 114     @ComputeNodeStateChangeBase._finish_on_exception
 115     @RetryMixin._retry(config.ARVADOS_ERRORS)
 116     def create_arvados_node(self):
 117         self.arvados_node = self._arvados.nodes().create(
 118             body={}, assign_slot=True).execute()
 119         self._later.create_cloud_node()
 120
 121     @ComputeNodeStateChangeBase._finish_on_exception
 122     @RetryMixin._retry(config.ARVADOS_ERRORS)
 123     def prepare_arvados_node(self, node):
 124         self._clean_arvados_node(node, "Prepared by Node Manager")
 125         self.arvados_node = self._arvados.nodes().update(
 126             uuid=node['uuid'], body={}, assign_slot=True).execute()
 127         self._later.create_cloud_node()
 128
 129     @ComputeNodeStateChangeBase._finish_on_exception
 130     @RetryMixin._retry()
 131     def create_cloud_node(self):
 132         self._logger.info("Sending create_node request for node size %s.",
 133                           self.cloud_size.name)
 134         try:
 135             self.cloud_node = self._cloud.create_node(self.cloud_size,
 136                                                       self.arvados_node)
 137         except BaseHTTPError as e:
 138             if e.code == 429 or "RequestLimitExceeded" in e.message:
 139                 # Don't consider API rate limits to be quota errors.
 140                 # re-raise so the Retry logic applies.
 141                 raise
 142
 143             # The set of possible error codes / messages isn't documented for
 144             # all clouds, so use a keyword heuristic to determine if the
 145             # failure is likely due to a quota.
 146             if re.search(r'(exceed|quota|limit)', e.message, re.I):
 147                 self.error = QuotaExceeded
 148                 self._logger.warning("Quota exceeded: %s", e)
 149                 self._finished()
 150                 return
 151             else:
 152                 # Something else happened, re-raise so the Retry logic applies.
 153                 raise
 154         except Exception as e:
 155             raise
 156
 157         # The information included in the node size object we get from libcloud
 158         # is inconsistent between cloud drivers.  Replace libcloud NodeSize
 159         # object with compatible CloudSizeWrapper object which merges the size
 160         # info reported from the cloud with size information from the
 161         # configuration file.
 162         self.cloud_node.size = self.cloud_size
 163
 164         self._logger.info("Cloud node %s created.", self.cloud_node.id)
 165         self._later.update_arvados_node_properties()
 166
 167     @ComputeNodeStateChangeBase._finish_on_exception
 168     @RetryMixin._retry(config.ARVADOS_ERRORS)
 169     def update_arvados_node_properties(self):
 170         """Tell Arvados some details about the cloud node.
 171
 172         Currently we only include size/price from our request, which
 173         we already knew before create_cloud_node(), but doing it here
 174         gives us an opportunity to provide more detail from
 175         self.cloud_node, too.
 176         """
 177         self.arvados_node['properties']['cloud_node'] = {
 178             # Note this 'size' is the node size we asked the cloud
 179             # driver to create -- not necessarily equal to the size
 180             # reported by the cloud driver for the node that was
 181             # created.
 182             'size': self.cloud_size.id,
 183             'price': self.cloud_size.price,
 184         }
 185         self.arvados_node = self._arvados.nodes().update(
 186             uuid=self.arvados_node['uuid'],
 187             body={'properties': self.arvados_node['properties']},
 188         ).execute()
 189         self._logger.info("%s updated properties.", self.arvados_node['uuid'])
 190         self._later.post_create()
 191
 192     @RetryMixin._retry()
 193     def post_create(self):
 194         self._cloud.post_create_node(self.cloud_node)
 195         self._logger.info("%s post-create work done.", self.cloud_node.id)
 196         self._finished()
 197
 198     def stop_if_no_cloud_node(self):
 199         if self.cloud_node is not None:
 200             return False
 201         self.stop()
 202         return True
 203
 204
 205 class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
 206     """Actor to shut down a compute node.
 207
 208     This actor simply destroys a cloud node, retrying as needed.
 209     """
 210     # Reasons for a shutdown to be cancelled.
 211     WINDOW_CLOSED = "shutdown window closed"
 212     DESTROY_FAILED = "destroy_node failed"
 213
 214     def __init__(self, timer_actor, cloud_client, arvados_client, node_monitor,
 215                  cancellable=True, retry_wait=1, max_retry_wait=180):
 216         # If a ShutdownActor is cancellable, it will ask the
 217         # ComputeNodeMonitorActor if it's still eligible before taking each
 218         # action, and stop the shutdown process if the node is no longer
 219         # eligible.  Normal shutdowns based on job demand should be
 220         # cancellable; shutdowns based on node misbehavior should not.
 221         super(ComputeNodeShutdownActor, self).__init__(
 222             cloud_client, arvados_client, timer_actor,
 223             retry_wait, max_retry_wait)
 224         self._monitor = node_monitor.proxy()
 225         self.cloud_node = self._monitor.cloud_node.get()
 226         self.cancellable = cancellable
 227         self.cancel_reason = None
 228         self.success = None
 229
 230     def _set_logger(self):
 231         self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
 232
 233     def on_start(self):
 234         super(ComputeNodeShutdownActor, self).on_start()
 235         self._later.shutdown_node()
 236
 237     def _arvados_node(self):
 238         return self._monitor.arvados_node.get()
 239
 240     def _finished(self, success_flag=None):
 241         if success_flag is not None:
 242             self.success = success_flag
 243         return super(ComputeNodeShutdownActor, self)._finished()
 244
 245     def cancel_shutdown(self, reason, **kwargs):
 246         if self.cancel_reason is not None:
 247             # already cancelled
 248             return
 249         self.cancel_reason = reason
 250         self._logger.info("Shutdown cancelled: %s.", reason)
 251         self._finished(success_flag=False)
 252
 253     def _cancel_on_exception(orig_func):
 254         @functools.wraps(orig_func)
 255         def finish_wrapper(self, *args, **kwargs):
 256             try:
 257                 return orig_func(self, *args, **kwargs)
 258             except Exception as error:
 259                 self._logger.error("Actor error %s", error)
 260                 self._logger.debug("", exc_info=True)
 261                 self._later.cancel_shutdown("Unhandled exception %s" % error, try_resume=False)
 262         return finish_wrapper
 263
 264     @_cancel_on_exception
 265     def shutdown_node(self):
 266         if self.cancel_reason is not None:
 267             # already cancelled
 268             return
 269         if self.cancellable:
 270             self._logger.info("Checking that node is still eligible for shutdown")
 271             eligible, reason = self._monitor.shutdown_eligible().get()
 272             if not eligible:
 273                 self.cancel_shutdown("No longer eligible for shut down because %s" % reason,
 274                                      try_resume=True)
 275                 return
 276         # If boot failed, count the event
 277         if self._monitor.get_state().get() == 'unpaired':
 278             status.tracker.counter_add('boot_failures')
 279         self._destroy_node()
 280
 281     def _destroy_node(self):
 282         self._logger.info("Starting shutdown")
 283         arv_node = self._arvados_node()
 284         if self._cloud.destroy_node(self.cloud_node):
 285             self._logger.info("Shutdown success")
 286             if arv_node:
 287                 self._later.clean_arvados_node(arv_node)
 288             else:
 289                 self._finished(success_flag=True)
 290         else:
 291             self.cancel_shutdown(self.DESTROY_FAILED, try_resume=False)
 292
 293     @ComputeNodeStateChangeBase._finish_on_exception
 294     @RetryMixin._retry(config.ARVADOS_ERRORS)
 295     def clean_arvados_node(self, arvados_node):
 296         self._clean_arvados_node(arvados_node, "Shut down by Node Manager")
 297         self._finished(success_flag=True)
 298
 299
 300 class ComputeNodeUpdateActor(config.actor_class, RetryMixin):
 301     """Actor to dispatch one-off cloud management requests.
 302
 303     This actor receives requests for small cloud updates, and
 304     dispatches them to a real driver.  ComputeNodeMonitorActors use
 305     this to perform maintenance tasks on themselves.  Having a
 306     dedicated actor for this gives us the opportunity to control the
 307     flow of requests; e.g., by backing off when errors occur.
 308     """
 309     def __init__(self, cloud_factory, timer_actor, max_retry_wait=180):
 310         super(ComputeNodeUpdateActor, self).__init__()
 311         RetryMixin.__init__(self, 1, max_retry_wait,
 312                             None, cloud_factory(), timer_actor)
 313         self._cloud = cloud_factory()
 314         self._later = self.actor_ref.tell_proxy()
 315
 316     def _set_logger(self):
 317         self._logger = logging.getLogger("%s.%s" % (self.__class__.__name__, self.actor_urn[33:]))
 318
 319     def on_start(self):
 320         self._set_logger()
 321
 322     @RetryMixin._retry()
 323     def sync_node(self, cloud_node, arvados_node):
 324         if self._cloud.node_fqdn(cloud_node) != arvados_node_fqdn(arvados_node):
 325             return self._cloud.sync_node(cloud_node, arvados_node)
 326
 327
 328 class ComputeNodeMonitorActor(config.actor_class):
 329     """Actor to manage a running compute node.
 330
 331     This actor gets updates about a compute node's cloud and Arvados records.
 332     It uses this information to notify subscribers when the node is eligible
 333     for shutdown.
 334     """
 335     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
 336                  timer_actor, update_actor, cloud_client,
 337                  arvados_node=None, poll_stale_after=600, node_stale_after=3600,
 338                  boot_fail_after=1800
 339     ):
 340         super(ComputeNodeMonitorActor, self).__init__()
 341         self._later = self.actor_ref.tell_proxy()
 342         self._shutdowns = shutdown_timer
 343         self._timer = timer_actor
 344         self._update = update_actor
 345         self._cloud = cloud_client
 346         self.cloud_node = cloud_node
 347         self.cloud_node_start_time = cloud_node_start_time
 348         self.poll_stale_after = poll_stale_after
 349         self.node_stale_after = node_stale_after
 350         self.boot_fail_after = boot_fail_after
 351         self.subscribers = set()
 352         self.arvados_node = None
 353         self._later.update_arvados_node(arvados_node)
 354         self.last_shutdown_opening = None
 355         self._later.consider_shutdown()
 356
 357     def _set_logger(self):
 358         self._logger = logging.getLogger("%s.%s.%s" % (self.__class__.__name__, self.actor_urn[33:], self.cloud_node.name))
 359
 360     def on_start(self):
 361         self._set_logger()
 362         self._timer.schedule(self.cloud_node_start_time + self.boot_fail_after, self._later.consider_shutdown)
 363
 364     def subscribe(self, subscriber):
 365         self.subscribers.add(subscriber)
 366
 367     def _debug(self, msg, *args):
 368         self._logger.debug(msg, *args)
 369
 370     def get_state(self):
 371         """Get node state, one of ['unpaired', 'busy', 'idle', 'down']."""
 372
 373         # If this node is not associated with an Arvados node, return
 374         # 'unpaired' if we're in the boot grace period, and 'down' if not,
 375         # so it isn't counted towards usable nodes.
 376         if self.arvados_node is None:
 377             if timestamp_fresh(self.cloud_node_start_time,
 378                                self.boot_fail_after):
 379                 return 'unpaired'
 380             else:
 381                 return 'down'
 382
 383         state = self.arvados_node['crunch_worker_state']
 384
 385         # If state information is not available because it is missing or the
 386         # record is stale, return 'down'.
 387         if not state or not timestamp_fresh(arvados_node_mtime(self.arvados_node),
 388                                             self.node_stale_after):
 389             state = 'down'
 390
 391         # There's a window between when a node pings for the first time and the
 392         # value of 'slurm_state' is synchronized by crunch-dispatch.  In this
 393         # window, the node will still report as 'down'.  Check that
 394         # first_ping_at is truthy and consider the node 'idle' during the
 395         # initial boot grace period.
 396         if (state == 'down' and
 397             self.arvados_node['first_ping_at'] and
 398             timestamp_fresh(self.cloud_node_start_time,
 399                             self.boot_fail_after) and
 400             not self._cloud.broken(self.cloud_node)):
 401             state = 'idle'
 402
 403         # "missing" means last_ping_at is stale, this should be
 404         # considered "down"
 405         if arvados_node_missing(self.arvados_node, self.node_stale_after):
 406             state = 'down'
 407
 408         # Turns out using 'job_uuid' this way is a bad idea.  The node record
 409         # is assigned the job_uuid before the job is locked (which removes it
 410         # from the queue) which means the job will be double-counted as both in
 411         # the wishlist and but also keeping a node busy.  This end result is
 412         # excess nodes being booted.
 413         #if state == 'idle' and self.arvados_node['job_uuid']:
 414         #    state = 'busy'
 415
 416         # Update idle node times tracker
 417         if state == 'idle':
 418             status.tracker.idle_in(self.arvados_node['hostname'])
 419         else:
 420             status.tracker.idle_out(self.arvados_node['hostname'])
 421
 422         return state
 423
 424     def in_state(self, *states):
 425         return self.get_state() in states
 426
 427     def shutdown_eligible(self):
 428         """Determine if node is candidate for shut down.
 429
 430         Returns a tuple of (boolean, string) where the first value is whether
 431         the node is candidate for shut down, and the second value is the
 432         reason for the decision.
 433         """
 434
 435         # Collect states and then consult state transition table whether we
 436         # should shut down.  Possible states are:
 437         # crunch_worker_state = ['unpaired', 'busy', 'idle', 'down']
 438         # window = ["open", "closed"]
 439         # boot_grace = ["boot wait", "boot exceeded"]
 440         # idle_grace = ["not idle", "idle wait", "idle exceeded"]
 441
 442         if self.arvados_node and not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
 443             return (False, "node state is stale")
 444
 445         crunch_worker_state = self.get_state()
 446
 447         window = "open" if self._shutdowns.window_open() else "closed"
 448
 449         if timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after):
 450             boot_grace = "boot wait"
 451         else:
 452             boot_grace = "boot exceeded"
 453
 454         # API server side not implemented yet.
 455         idle_grace = 'idle exceeded'
 456
 457         node_state = (crunch_worker_state, window, boot_grace, idle_grace)
 458         t = transitions[node_state]
 459         if t is not None:
 460             # yes, shutdown eligible
 461             return (True, "node state is %s" % (node_state,))
 462         else:
 463             # no, return a reason
 464             return (False, "node state is %s" % (node_state,))
 465
 466     def consider_shutdown(self):
 467         try:
 468             eligible, reason = self.shutdown_eligible()
 469             next_opening = self._shutdowns.next_opening()
 470             if eligible:
 471                 self._debug("Suggesting shutdown because %s", reason)
 472                 _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
 473             else:
 474                 self._debug("Not eligible for shut down because %s", reason)
 475
 476                 if self.last_shutdown_opening != next_opening:
 477                     self._debug("Shutdown window closed.  Next at %s.",
 478                                 time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(next_opening)))
 479                     self._timer.schedule(next_opening, self._later.consider_shutdown)
 480                     self.last_shutdown_opening = next_opening
 481         except Exception:
 482             self._logger.exception("Unexpected exception")
 483
 484     def offer_arvados_pair(self, arvados_node):
 485         first_ping_s = arvados_node.get('first_ping_at')
 486         if (self.arvados_node is not None) or (not first_ping_s):
 487             return None
 488         elif ((arvados_node['info'].get('ec2_instance_id') == self._cloud.node_id(self.cloud_node)) and
 489               (arvados_timestamp(first_ping_s) >= self.cloud_node_start_time)):
 490             self._later.update_arvados_node(arvados_node)
 491             return self.cloud_node.id
 492         else:
 493             return None
 494
 495     def update_cloud_node(self, cloud_node):
 496         if cloud_node is not None:
 497             self.cloud_node = cloud_node
 498             self._later.consider_shutdown()
 499
 500     def update_arvados_node(self, arvados_node):
 501         """Called when the latest Arvados node record is retrieved.
 502
 503         Calls the updater's sync_node() method.
 504
 505         """
 506         # This method is a little unusual in the way it just fires off the
 507         # request without checking the result or retrying errors.  That's
 508         # because this update happens every time we reload the Arvados node
 509         # list: if a previous sync attempt failed, we'll see that the names
 510         # are out of sync and just try again.  ComputeNodeUpdateActor has
 511         # the logic to throttle those effective retries when there's trouble.
 512         if arvados_node is not None:
 513             self.arvados_node = arvados_node
 514             self._update.sync_node(self.cloud_node, self.arvados_node)
 515             self._later.consider_shutdown()