From: Peter Amstutz Date: Tue, 26 Apr 2016 13:51:53 +0000 (-0400) Subject: 8931: Use RetryLoop around websocket reconnect. Create a new _EventClient X-Git-Tag: 1.1.0~967^2~1 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/c1276bd9f83a7826f10e1752ac793d8a1cd3c47f 8931: Use RetryLoop around websocket reconnect. Create a new _EventClient object on each loop iteration. Handle unexpected exceptions in PollClient retry loop. --- diff --git a/sdk/python/arvados/events.py b/sdk/python/arvados/events.py index 54f3019f36..4985aaf1b7 100644 --- a/sdk/python/arvados/events.py +++ b/sdk/python/arvados/events.py @@ -116,14 +116,18 @@ class EventClient(object): def on_closed(self): if self.is_closed == False: _logger.warn("Unexpected close. Reconnecting.") - self.ec = _EventClient(self.url, self.filters, self.on_event, self.last_log_id, self.on_closed) - while True: + for tries_left in RetryLoop(num_retries=25, backoff_start=.1, max_wait=15): try: + self.ec = _EventClient(self.url, self.filters, self.on_event, self.last_log_id, self.on_closed) self.ec.connect() break except Exception as e: - _logger.warn("Error '%s' during websocket reconnect. Will retry after 5s.", e, exc_info=e) - time.sleep(5) + _logger.warn("Error '%s' during websocket reconnect.", e) + if tries_left == 0: + _logger.exception("EventClient thread could not contact websocket server.") + self.is_closed = True + thread.interrupt_main() + return class PollClient(threading.Thread): @@ -178,6 +182,9 @@ class PollClient(threading.Thread): break except errors.ApiError as error: pass + else: + tries_left = 0 + break if tries_left == 0: _logger.exception("PollClient thread could not contact API server.") with self._closing_lock: diff --git a/sdk/python/arvados/retry.py b/sdk/python/arvados/retry.py index dccd9c875a..5ba4f4ea41 100644 --- a/sdk/python/arvados/retry.py +++ b/sdk/python/arvados/retry.py @@ -51,7 +51,7 @@ class RetryLoop(object): * save_results: Specify a number to save the last N results that the loop recorded. These records are available through the results attribute, oldest first. Default 1. - * max_wait: Maximum time to wait between retries. + * max_wait: Maximum number of seconds to wait between retries. """ self.tries_left = num_retries + 1 self.check_result = success_check