1 """Utilities to retry operations.
3 The core of this module is `RetryLoop`, a utility class to retry operations
4 that might fail. It can distinguish between temporary and permanent failures;
5 provide exponential backoff; and save a series of results.
7 It also provides utility functions for common operations with `RetryLoop`:
9 * `check_http_response_success` can be used as a `RetryLoop` `success_check`
10 for HTTP response codes from the Arvados API server.
11 * `retry_method` can decorate methods to provide a default `num_retries`
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
import functools
import time

from collections import deque
from typing import Callable, Generic, Optional, TypeVar

import arvados.errors
# HTTP status codes that `check_http_response_success` reports as success:
# every 2xx code.
_HTTP_SUCCESSES = set(range(200, 300))
# HTTP status codes that `check_http_response_success` reports as temporary
# failures, i.e. requests that may be retried.
_HTTP_CAN_RETRY = {408, 409, 423, 500, 502, 503, 504}

# Type variable for callables; `retry_method` uses it so the decorated
# callable keeps its own type.
CT = TypeVar('CT', bound=Callable)
# Type variable for the kind of result a `RetryLoop` saves.
T = TypeVar('T')
class RetryLoop(Generic[T]):
    """Coordinate limited retries of code.

    `RetryLoop` coordinates a loop that runs until it records a
    successful result or tries too many times, whichever comes first.
    Typical use looks like:

        loop = RetryLoop(num_retries=2)
        for tries_left in loop:
            try:
                result = do_something()
            except TemporaryError as error:
                log("error: {} ({} tries left)".format(error, tries_left))
            else:
                loop.save_result(result)
        if loop.success():
            return loop.last_result()

    Arguments:

    * num_retries: int --- The maximum number of times to retry the loop if
      it doesn't succeed.  This means the loop body could run at most
      `num_retries + 1` times.

    * success_check: Callable[[T], bool | None] --- This is a function that
      will be called each time the loop saves a result.  The function should
      return `True` if the result indicates the code succeeded, `False` if
      it represents a permanent failure, and `None` if it represents a
      temporary failure.  If no function is provided, the loop will end
      after any result is saved.

    * backoff_start: float --- The base wait time in seconds.  The wait is
      multiplied by `backoff_growth` before each pause is scheduled, so the
      loop's second iteration starts no sooner than
      `backoff_start * backoff_growth` seconds (capped at `max_wait`) after
      the first one began.  Default 0, which disables all waiting.

    * backoff_growth: float --- The wait time multiplier after each
      iteration.  Default 2 (i.e., double the wait time each time).

    * save_results: int --- Specify a number to store that many saved
      results from the loop.  These are available through the `results`
      attribute, oldest first.  Default 1.

    * max_wait: float --- Maximum number of seconds to wait between
      retries.  Default 60.
    """
    def __init__(
            self,
            num_retries: int,
            success_check: Callable[[T], Optional[bool]]=lambda r: True,
            backoff_start: float=0,
            backoff_growth: float=2,
            save_results: int=1,
            # NOTE(review): the default for max_wait was not visible in the
            # reviewed text; 60 seconds is assumed --- confirm.
            max_wait: float=60,
    ) -> None:
        """Set up the loop's retry budget, backoff schedule, and result store."""
        self.tries_left = num_retries + 1
        self.check_result = success_check
        self.backoff_wait = backoff_start
        self.backoff_growth = backoff_growth
        self.max_wait = max_wait
        # Zero means "no earliest start time yet", so the first iteration
        # never waits.
        self.next_start_time = 0
        # A bounded deque silently discards the oldest result once
        # `save_results` entries are stored.
        self.results = deque(maxlen=save_results)
        self._attempts = 0
        # None until iteration starts, then True, then False once it stops.
        self._running = None
        # None while undecided; otherwise the last `success_check` verdict.
        self._success = None

    def __iter__(self) -> 'RetryLoop':
        """Return an iterator of retries."""
        return self

    def running(self) -> Optional[bool]:
        """Return whether this loop is running.

        Returns `None` if the loop has never run, `True` if it is still running,
        or `False` if it has stopped—whether that's because it has saved a
        successful result, a permanent failure, or has run out of retries.
        """
        return self._running and (self._success is None)

    def __next__(self) -> int:
        """Record a loop attempt.

        If the loop is still running, waits out any remaining backoff time,
        decrements the number of tries left and returns it.  Otherwise,
        raises `StopIteration`.
        """
        if self._running is None:
            self._running = True
        if (self.tries_left < 1) or not self.running():
            self._running = False
            raise StopIteration
        # Sleep until the start time scheduled by the previous attempt.
        wait_time = max(0, self.next_start_time - time.time())
        time.sleep(wait_time)
        # Grow the wait for the following attempt, never beyond max_wait.
        self.backoff_wait *= self.backoff_growth
        if self.backoff_wait > self.max_wait:
            self.backoff_wait = self.max_wait
        self.next_start_time = time.time() + self.backoff_wait
        self.tries_left -= 1
        return self.tries_left

    def save_result(self, result: T) -> None:
        """Record a loop result.

        Save the given result, and end the loop if it indicates
        success or permanent failure.  See documentation for the `__init__`
        `success_check` argument to learn how that's indicated.

        Raises `arvados.errors.AssertionError` if called while the loop is
        not running, i.e. before it has started or after it has finished.

        Arguments:

        * result: T --- The result from this loop attempt to check and save.
        """
        if not self.running():
            raise arvados.errors.AssertionError(
                "recorded a loop result after the loop finished")
        self.results.append(result)
        self._success = self.check_result(result)
        self._attempts += 1

    def success(self) -> Optional[bool]:
        """Return the loop's end state.

        Returns `True` if the loop recorded a successful result, `False` if it
        recorded permanent failure, or else `None`.
        """
        return self._success

    def last_result(self) -> T:
        """Return the most recent result the loop saved.

        Raises `arvados.errors.AssertionError` if called before any result has
        been saved.
        """
        try:
            return self.results[-1]
        except IndexError:
            # The IndexError is an implementation detail of the deque, so
            # keep it out of the traceback callers see.
            raise arvados.errors.AssertionError(
                "queried loop results before any were recorded") from None

    def attempts(self) -> int:
        """Return the number of results that have been saved.

        This count includes all kinds of results: success, permanent failure,
        and temporary failure.
        """
        return self._attempts

    def attempts_str(self) -> str:
        """Return a human-friendly string counting saved results.

        This method returns '1 attempt' or 'N attempts', where the number
        in the string is the number of saved results.
        """
        if self._attempts == 1:
            return '1 attempt'
        else:
            return '{} attempts'.format(self._attempts)
def check_http_response_success(status_code: int) -> Optional[bool]:
    """Convert a numeric HTTP status code to a loop control flag.

    This method takes a numeric HTTP status code and returns `True` if
    the code indicates success, `None` if it indicates temporary
    failure, and `False` otherwise.  You can use this as the
    `success_check` for a `RetryLoop` that queries the Arvados API server.
    Specifically:

    * Any 2xx result returns `True`.

    * A select few status codes, or any malformed responses, return `None`.
      A status that is outside the range 100-599, or that cannot be
      compared with integers at all (such as `None`), counts as malformed.

    * Everything else returns `False`.  Note that this includes 1xx and
      3xx status codes.  They don't indicate success, and you can't
      retry those requests verbatim.

    Arguments:

    * status_code: int --- A numeric HTTP response code
    """
    try:
        if status_code in _HTTP_SUCCESSES:
            return True
        if status_code in _HTTP_CAN_RETRY:
            return None
        if 100 <= status_code < 600:
            return False
    except TypeError:
        # An unhashable or non-numeric status is a malformed response,
        # which is reported as a temporary failure like any other.
        return None
    return None     # Get well soon, server.
def retry_method(orig_func: CT) -> CT:
    """Provide a default value for a method's num_retries argument.

    This is a decorator for instance and class methods that accept a
    `num_retries` keyword argument, with a `None` default.  When the method
    is called without a value for `num_retries`, this decorator will set it
    from the `num_retries` attribute of the underlying instance or class.
    Passing `num_retries=None` explicitly is treated the same as omitting it.

    Arguments:

    * orig_func: Callable --- A class or instance method that accepts a
      `num_retries` keyword argument
    """
    @functools.wraps(orig_func)
    def num_retries_setter(self, *args, **kwargs):
        # Only keyword arguments are inspected, so callers must pass
        # num_retries by keyword for their value to be noticed.
        if kwargs.get('num_retries') is None:
            kwargs['num_retries'] = self.num_retries
        return orig_func(self, *args, **kwargs)
    return num_retries_setter