Merge branch '8784-dir-listings'
[arvados.git] / services / nodemanager / arvnodeman / computenode / __init__.py
index bc8ada532d3d92ad2aa18d8a8d674f05edd3763d..8a4e5f312b505bfa22cc03cb09b4be6198a04bf0 100644 (file)
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
 
 from __future__ import absolute_import, print_function
 
@@ -8,6 +11,9 @@ import itertools
 import re
 import time
 
+from ..config import CLOUD_ERRORS
+from libcloud.common.exceptions import BaseHTTPError
+
 ARVADOS_TIMEFMT = '%Y-%m-%dT%H:%M:%SZ'
 ARVADOS_TIMESUBSEC_RE = re.compile(r'(\.\d+)Z$')
 
@@ -44,7 +50,7 @@ def arvados_node_missing(arvados_node, fresh_time):
     else:
         return not timestamp_fresh(arvados_timestamp(arvados_node["last_ping_at"]), fresh_time)
 
-def _retry(errors=()):
+class RetryMixin(object):
     """Retry decorator for an method that makes remote requests.
 
     Use this function to decorate a method, and pass in a tuple of exceptions to
@@ -55,22 +61,66 @@ def _retry(errors=()):
     is a timer actor.)
 
     """
-
-    def decorator(orig_func):
-        @functools.wraps(orig_func)
-        def retry_wrapper(self, *args, **kwargs):
-            start_time = time.time()
-            while True:
-                try:
-                    ret = orig_func(self, *args, **kwargs)
-                except Exception as error:
-                    if not (isinstance(error, errors) or
-                            self._cloud.is_cloud_exception(error)):
+    def __init__(self, retry_wait, max_retry_wait,
+                 logger, cloud, timer=None):
+        self.min_retry_wait = retry_wait
+        self.max_retry_wait = max_retry_wait
+        self.retry_wait = retry_wait
+        self._logger = logger
+        self._cloud = cloud
+        self._timer = timer
+
+    @staticmethod
+    def _retry(errors=()):
+        def decorator(orig_func):
+            @functools.wraps(orig_func)
+            def retry_wrapper(self, *args, **kwargs):
+                while True:
+                    should_retry = False
+                    try:
+                        ret = orig_func(self, *args, **kwargs)
+                    except BaseHTTPError as error:
+                        if error.headers and error.headers.get("retry-after"):
+                            try:
+                                self.retry_wait = int(error.headers["retry-after"])
+                                if self.retry_wait < 0 or self.retry_wait > self.max_retry_wait:
+                                    self.retry_wait = self.max_retry_wait
+                                should_retry = True
+                            except ValueError:
+                                pass
+                        if error.code == 429 or error.code >= 500:
+                            should_retry = True
+                    except CLOUD_ERRORS as error:
+                        should_retry = True
+                    except errors as error:
+                        should_retry = True
+                    except Exception as error:
+                        # As a libcloud workaround for drivers that don't use
+                        # typed exceptions, consider bare Exception() objects
+                        # retryable.
+                        should_retry = type(error) is Exception
+                    else:
+                        # No exception: reset the backoff and return the result.
+                        self.retry_wait = self.min_retry_wait
+                        return ret
+
+                    # Only got here if an exception was caught.  Now determine what to do about it.
+                    if not should_retry:
+                        self.retry_wait = self.min_retry_wait
+                        self._logger.warning(
+                            "Re-raising error (no retry): %s",
+                            error, exc_info=error)
                         raise
+
                     self._logger.warning(
-                        "Client error: %s - waiting %s seconds",
-                        error, self.retry_wait)
+                        "Client error: %s - %s %s seconds",
+                        error,
+                        "scheduling retry in" if self._timer else "sleeping",
+                        self.retry_wait,
+                        exc_info=error)
+
                     if self._timer:
+                        start_time = time.time()
                         # reschedule to be called again
                         self._timer.schedule(start_time + self.retry_wait,
                                              getattr(self._later,
@@ -79,16 +129,15 @@ def _retry(errors=()):
                     else:
                         # sleep on it.
                         time.sleep(self.retry_wait)
+
                     self.retry_wait = min(self.retry_wait * 2,
                                           self.max_retry_wait)
                     if self._timer:
                         # expect to be called again by timer so don't loop
                         return
-                else:
-                    self.retry_wait = self.min_retry_wait
-                    return ret
-        return retry_wrapper
-    return decorator
+
+            return retry_wrapper
+        return decorator
 
 class ShutdownTimer(object):
     """Keep track of a cloud node's shutdown windows.