8931: Add retry and exception handling around PollClient initial log query a well.
[arvados.git] / sdk / python / arvados / events.py
index 94b8a9d06cfaec73b718b8514adcd3ba08ab2991..698601cb5d071a01d96cd8e5855cc1fdd1376bd8 100644 (file)
@@ -14,8 +14,8 @@ from ws4py.client.threadedclient import WebSocketClient
 _logger = logging.getLogger('arvados.events')
 
 
-class EventClient(WebSocketClient):
-    def __init__(self, url, filters, on_event, last_log_id):
+class _EventClient(WebSocketClient):
+    def __init__(self, url, filters, on_event, last_log_id, on_closed):
         ssl_options = {'ca_certs': arvados.util.ca_certs_path()}
         if config.flag_is_true('ARVADOS_API_HOST_INSECURE'):
             ssl_options['cert_reqs'] = ssl.CERT_NONE
@@ -26,19 +26,23 @@ class EventClient(WebSocketClient):
         # IPv4 addresses (common with "localhost"), only one of them
         # will be attempted -- and it might not be the right one. See
         # ws4py's WebSocketBaseClient.__init__.
-        super(EventClient, self).__init__(url, ssl_options=ssl_options)
+        super(_EventClient, self).__init__(url, ssl_options=ssl_options)
+
         self.filters = filters
         self.on_event = on_event
         self.last_log_id = last_log_id
         self._closing_lock = threading.RLock()
         self._closing = False
         self._closed = threading.Event()
+        self.on_closed = on_closed
 
     def opened(self):
-        self.subscribe(self.filters, self.last_log_id)
+        for f in self.filters:
+            self.subscribe(f, self.last_log_id)
 
     def closed(self, code, reason=None):
         self._closed.set()
+        self.on_closed()
 
     def received_message(self, m):
         with self._closing_lock:
@@ -51,21 +55,72 @@ class EventClient(WebSocketClient):
         :timeout: is the number of seconds to wait for ws4py to
         indicate that the connection has closed.
         """
-        super(EventClient, self).close(code, reason)
+        super(_EventClient, self).close(code, reason)
         with self._closing_lock:
             # make sure we don't process any more messages.
             self._closing = True
         # wait for ws4py to tell us the connection is closed.
         self._closed.wait(timeout=timeout)
 
-    def subscribe(self, filters, last_log_id=None):
-        m = {"method": "subscribe", "filters": filters}
+    def subscribe(self, f, last_log_id=None):
+        m = {"method": "subscribe", "filters": f}
         if last_log_id is not None:
             m["last_log_id"] = last_log_id
         self.send(json.dumps(m))
 
-    def unsubscribe(self, filters):
-        self.send(json.dumps({"method": "unsubscribe", "filters": filters}))
+    def unsubscribe(self, f):
+        self.send(json.dumps({"method": "unsubscribe", "filters": f}))
+
+
+class EventClient(object):
+    def __init__(self, url, filters, on_event_cb, last_log_id):
+        self.url = url
+        if filters:
+            self.filters = [filters]
+        else:
+            self.filters = [[]]
+        self.on_event_cb = on_event_cb
+        self.last_log_id = last_log_id
+        self.is_closed = False
+        self.ec = _EventClient(url, self.filters, self.on_event, last_log_id, self.on_closed)
+
+    def connect(self):
+        self.ec.connect()
+
+    def close_connection(self):
+        self.ec.close_connection()
+
+    def subscribe(self, f, last_log_id=None):
+        self.filters.append(f)
+        self.ec.subscribe(f, last_log_id)
+
+    def unsubscribe(self, f):
+        del self.filters[self.filters.index(f)]
+        self.ec.unsubscribe(f)
+
+    def close(self, code=1000, reason='', timeout=0):
+        self.is_closed = True
+        self.ec.close(code, reason, timeout)
+
+    def on_event(self, m):
+        if m.get('id') != None:
+            self.last_log_id = m.get('id')
+        try:
+            self.on_event_cb(m)
+        except Exception as e:
+            _logger.warn("Unexpected exception from event callback.", exc_info=e)
+
+    def on_closed(self):
+        if self.is_closed == False:
+            _logger.warn("Unexpected close. Reconnecting.")
+            self.ec = _EventClient(self.url, self.filters, self.on_event, self.last_log_id, self.on_closed)
+            while True:
+              try:
+                  self.ec.connect()
+                  break
+              except Exception as e:
+                  _logger.warn("Error '%s' during websocket reconnect. Will retry after 5s.", e, exc_info=e)
+                  time.sleep(5)
 
 
 class PollClient(threading.Thread):
@@ -89,7 +144,15 @@ class PollClient(threading.Thread):
             self.id = self.last_log_id
         else:
             for f in self.filters:
-                items = self.api.logs().list(limit=1, order="id desc", filters=f).execute()['items']
+                try:
+                    items = self.api.logs().list(limit=1, order="id desc", filters=f).execute(num_retries=1000000)['items']
+                except Exception as e:
+                    # Some apparently non-retryable error happened, so log the
+                    # error and shut down gracefully.
+                    _logger.error("Got exception from log query: %s", e)
+                    with self._closing_lock:
+                        self._closing.set()
+                    return
                 if items:
                     if items[0]['id'] > self.id:
                         self.id = items[0]['id']
@@ -100,14 +163,29 @@ class PollClient(threading.Thread):
             max_id = self.id
             moreitems = False
             for f in self.filters:
-                items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute()
+                try:
+                    # If we get a transient error, we really really need to
+                    # just keep trying over and over with the same query or
+                    # we'll potentially drop events which would break the event
+                    # stream contract.
+                    items = self.api.logs().list(order="id asc", filters=f+[["id", ">", str(self.id)]]).execute(num_retries=1000000)
+                except Exception as e:
+                    # Some apparently non-retryable error happened, so log the
+                    # error and shut down gracefully.
+                    _logger.error("Got exception from log query: %s", e)
+                    with self._closing_lock:
+                        self._closing.set()
+                    return
                 for i in items["items"]:
                     if i['id'] > max_id:
                         max_id = i['id']
                     with self._closing_lock:
                         if self._closing.is_set():
                             return
-                        self.on_event(i)
+                        try:
+                            self.on_event(i)
+                        except Exception as e:
+                            _logger.warn("Unexpected exception from event callback.", exc_info=e)
                 if items["items_available"] > len(items["items"]):
                     moreitems = True
             self.id = max_id
@@ -143,12 +221,12 @@ class PollClient(threading.Thread):
             # to do so raises the same exception."
             pass
 
-    def subscribe(self, filters):
+    def subscribe(self, f):
         self.on_event({'status': 200})
-        self.filters.append(filters)
+        self.filters.append(f)
 
-    def unsubscribe(self, filters):
-        del self.filters[self.filters.index(filters)]
+    def unsubscribe(self, f):
+        del self.filters[self.filters.index(f)]
 
 
 def _subscribe_websocket(api, filters, on_event, last_log_id=None):
@@ -190,7 +268,10 @@ def subscribe(api, filters, on_event, poll_fallback=15, last_log_id=None):
         return _subscribe_websocket(api, filters, on_event, last_log_id)
 
     try:
-        return _subscribe_websocket(api, filters, on_event, last_log_id)
+        if not config.flag_is_true('ARVADOS_DISABLE_WEBSOCKETS'):
+            return _subscribe_websocket(api, filters, on_event, last_log_id)
+        else:
+            _logger.info("Using polling because ARVADOS_DISABLE_WEBSOCKETS is true")
     except Exception as e:
         _logger.warn("Falling back to polling after websocket error: %s" % e)
     p = PollClient(api, filters, on_event, poll_fallback, last_log_id)