Merge branch 'master' into 13822-nm-delayed-daemon
[arvados.git] / services / nodemanager / arvnodeman / computenode / __init__.py
index 8a4e5f312b505bfa22cc03cb09b4be6198a04bf0..b124c66540aab804db3ad555b048c9ef5097e8d0 100644 (file)
@@ -12,7 +12,8 @@ import re
 import time
 
 from ..config import CLOUD_ERRORS
-from libcloud.common.exceptions import BaseHTTPError
+from ..status import tracker
+from libcloud.common.exceptions import BaseHTTPError, RateLimitReachedError
 
 ARVADOS_TIMEFMT = '%Y-%m-%dT%H:%M:%SZ'
 ARVADOS_TIMESUBSEC_RE = re.compile(r'(\.\d+)Z$')
@@ -32,7 +33,7 @@ def arvados_timestamp(timestr):
         subsecs = float(subsec_match.group(1))
         timestr = timestr[:subsec_match.start()] + 'Z'
     return calendar.timegm(time.strptime(timestr + 'UTC',
-                                         ARVADOS_TIMEFMT + '%Z'))
+                                         ARVADOS_TIMEFMT + '%Z')) + subsecs
 
 def timestamp_fresh(timestamp, fresh_time):
     return (time.time() - timestamp) < fresh_time
@@ -61,10 +62,9 @@ class RetryMixin(object):
     is a timer actor.)
 
     """
-    def __init__(self, retry_wait, max_retry_wait,
-                 logger, cloud, timer=None):
-        self.min_retry_wait = retry_wait
-        self.max_retry_wait = max_retry_wait
+    def __init__(self, retry_wait, max_retry_wait, logger, cloud, timer=None):
+        self.min_retry_wait = max(1, retry_wait)
+        self.max_retry_wait = max(self.min_retry_wait, max_retry_wait)
         self.retry_wait = retry_wait
         self._logger = logger
         self._cloud = cloud
@@ -79,18 +79,30 @@ class RetryMixin(object):
                     should_retry = False
                     try:
                         ret = orig_func(self, *args, **kwargs)
+                    except RateLimitReachedError as error:
+                        # If retry-after is zero, continue with exponential
+                        # backoff.
+                        if error.retry_after != 0:
+                            self.retry_wait = error.retry_after
+                        should_retry = True
                     except BaseHTTPError as error:
                         if error.headers and error.headers.get("retry-after"):
                             try:
-                                self.retry_wait = int(error.headers["retry-after"])
-                                if self.retry_wait < 0 or self.retry_wait > self.max_retry_wait:
-                                    self.retry_wait = self.max_retry_wait
+                                retry_after = int(error.headers["retry-after"])
+                                # If retry-after is zero, continue with
+                                # exponential backoff.
+                                if retry_after != 0:
+                                    self.retry_wait = retry_after
                                 should_retry = True
                             except ValueError:
-                                pass
+                                self._logger.warning(
+                                    "Unrecognizable Retry-After header: %r",
+                                    error.headers["retry-after"],
+                                    exc_info=error)
                         if error.code == 429 or error.code >= 500:
                             should_retry = True
                     except CLOUD_ERRORS as error:
+                        tracker.counter_add('cloud_errors')
                         should_retry = True
                     except errors as error:
                         should_retry = True
@@ -98,9 +110,11 @@ class RetryMixin(object):
                         # As a libcloud workaround for drivers that don't use
                         # typed exceptions, consider bare Exception() objects
                         # retryable.
-                        should_retry = type(error) is Exception
+                        if type(error) is Exception:
+                            tracker.counter_add('cloud_errors')
+                            should_retry = True
                     else:
-                        # No exception,
+                        # No exception
                         self.retry_wait = self.min_retry_wait
                         return ret
 
@@ -112,6 +126,12 @@ class RetryMixin(object):
                             error, exc_info=error)
                         raise
 
+                    # Retry wait out of bounds?
+                    if self.retry_wait < self.min_retry_wait:
+                        self.retry_wait = self.min_retry_wait
+                    elif self.retry_wait > self.max_retry_wait:
+                        self.retry_wait = self.max_retry_wait
+
                     self._logger.warning(
                         "Client error: %s - %s %s seconds",
                         error,