require 'faye/websocket'
require 'record_filters'
require 'load_param'
require 'set'
require 'thread'
# Patch in user, last_log_id and filters fields into the Faye::Websocket class.
# NOTE(review): the `class WebSocket` opener was missing in the mangled patch
# text; it is restored here, as implied by the two closing `end`s and the
# comment above naming the Faye::Websocket class.
module Faye
  class WebSocket
    # Authenticated user associated with this connection.
    attr_accessor :user
    # Last log id that was a candidate to be sent out on this connection.
    attr_accessor :last_log_id
    # Subscription filters active on this connection.
    attr_accessor :filters
    # Set of log ids already delivered, used to avoid sending duplicates.
    attr_accessor :sent_ids
    # Per-connection event queue, drained by the connection's worker thread.
    attr_accessor :queue
    # Mutex serializing writes of complete frames to the underlying socket.
    attr_accessor :frame_mtx
  end
end
+
# Serialize frame writes on the underlying websocket driver so concurrent
# threads cannot interleave partial frames on the wire.
module WebSocket
  class Driver

    class Server
      # Keep a handle on the real frame writer so the serialized wrapper
      # below can delegate to it.
      alias_method :_write, :write

      def write(data)
        # Most of the sending activity will be from the thread set up in
        # on_connect. However, there is also some automatic activity in the
        # form of ping/pong messages, so ensure that the write method used to
        # send one complete message to the underlying socket can only be
        # called by one thread at a time.
        #
        # NOTE(review): assumes this driver object exposes a frame_mtx
        # accessor holding a Mutex — confirm against the websocket patch
        # elsewhere in this file.
        self.frame_mtx.synchronize do
          _write(data)
        end
      end
    end
  end
end
@channel = EventMachine::Channel.new
@mtx = Mutex.new
@bgthread = false
+ @connection_count = 0
+ end
+
+ def send_message(ws, obj)
+ ws.send(Oj.dump(obj, mode: :compat))
end
  # Push out any pending events to the connection +ws+
  #
  # It queries the database for log rows that are either
  # a) greater than ws.last_log_id, which is the last log id which was a candidate to be sent out
  # b) if ws.last_log_id is nil, then it queries the row notify_id
  #
  # Regular Arvados permissions are applied using readable_by() and filters using record_filters().
def push_events ws, notify_id
begin
- if !notify_id.nil? and !ws.last_log_id.nil? and notify_id <= ws.last_log_id
- # This notify is for a row we've handled already.
- return
- end
-
# Must have at least one filter set up to receive events
if ws.filters.length > 0
- # Start with log rows readable by user, sorted in ascending order
- logs = Log.readable_by(ws.user).order("id asc")
+ # Start with log rows readable by user
+ logs = Log.readable_by(ws.user)
cond_id = nil
cond_out = []
param_out = []
- if !ws.last_log_id.nil?
- # Client is only interested in log rows that are newer than the
- # last log row seen by the client.
+ if not ws.last_log_id.nil?
+ # We are catching up from some starting point.
cond_id = "logs.id > ?"
param_out << ws.last_log_id
- elsif !notify_id.nil?
- # No last log id, so look at rows starting with notify id
- cond_id = "logs.id >= ?"
+ elsif not notify_id.nil?
+ # Get next row being notified.
+ cond_id = "logs.id = ?"
param_out << notify_id
else
# No log id to start from, nothing to do, return
logs = logs.where(cond_id, *param_out)
end
- # Execute query and actually send the matching log rows
- count = 0
- limit = 20
-
- logs.limit(limit).each do |l|
- ws.send(l.as_api_response.to_json)
- ws.last_log_id = l.id
- count += 1
- end
-
- if count == limit
- # Number of rows returned was capped by limit(), we need to schedule
- # another query to get more logs (will start from last_log_id
- # reported by current query)
- EventMachine::schedule do
- push_events ws, nil
+ # Execute query and actually send the matching log rows. Load
+ # the full log records only when we're ready to send them,
+ # though: otherwise, (1) postgres has to build the whole
+ # result set and return it to us before we can send the first
+ # event, and (2) we store lots of records in memory while
+ # waiting to spool them out to the client. Both of these are
+ # troublesome when log records are large (e.g., a collection
+ # update contains both old and new manifest_text).
+ #
+ # Note: find_each implies order('id asc'), which is what we
+ # want.
+ logs.select('logs.id').find_each do |l|
+ if not ws.sent_ids.include?(l.id)
+ # only send if not a duplicate
+ send_message(ws, Log.find(l.id).as_api_response)
+ end
+ if not ws.last_log_id.nil?
+ # record ids only when sending "catchup" messages, not notifies
+ ws.sent_ids << l.id
end
- elsif !notify_id.nil? and (ws.last_log_id.nil? or notify_id > ws.last_log_id)
- # Number of rows returned was less than cap, but the notify id is
- # higher than the last id visible to the client, so update last_log_id
- ws.last_log_id = notify_id
end
- elsif !notify_id.nil?
- # No filters set up, so just record the sequence number
- ws.last_log_id = notify_id
+ ws.last_log_id = nil
end
+ rescue ArgumentError => e
+ # There was some kind of user error.
+ Rails.logger.warn "Error publishing event: #{$!}"
+ send_message(ws, {status: 500, message: $!})
+ ws.close
rescue => e
Rails.logger.warn "Error publishing event: #{$!}"
Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- ws.send ({status: 500, message: 'error'}.to_json)
+ send_message(ws, {status: 500, message: $!})
ws.close
# These exceptions typically indicate serious server trouble:
# out of memory issues, database connection problems, etc. Go ahead and
begin
begin
# Parse event data as JSON
- p = (Oj.load event.data).symbolize_keys
+ p = (Oj.strict_load event.data).symbolize_keys
filter = Filter.new(p)
rescue Oj::Error => e
- ws.send ({status: 400, message: "malformed request"}.to_json)
+ send_message(ws, {status: 400, message: "malformed request"})
return
end
# Set or reset the last_log_id. The event bus only reports events
# for rows that come after last_log_id.
ws.last_log_id = p[:last_log_id].to_i
+ # Reset sent_ids for consistency
+ # (always re-deliver all matching messages following last_log_id)
+ ws.sent_ids = Set.new
end
- if ws.filters.length < MAX_FILTERS
+ if ws.filters.length < Rails.configuration.websocket_max_filters
# Add a filter. This gets the :filters field which is the same
# format as used for regular index queries.
ws.filters << filter
- ws.send ({status: 200, message: 'subscribe ok', filter: p}.to_json)
+ send_message(ws, {status: 200, message: 'subscribe ok', filter: p})
# Send any pending events
push_events ws, nil
else
- ws.send ({status: 403, message: "maximum of #{MAX_FILTERS} filters allowed per connection"}.to_json)
+ send_message(ws, {status: 403, message: "maximum of #{Rails.configuration.websocket_max_filters} filters allowed per connection"})
end
elsif p[:method] == 'unsubscribe'
len = ws.filters.length
ws.filters.select! { |f| not ((f.filters == p[:filters]) or (f.filters.empty? and p[:filters].nil?)) }
if ws.filters.length < len
- ws.send ({status: 200, message: 'unsubscribe ok'}.to_json)
+ send_message(ws, {status: 200, message: 'unsubscribe ok'})
else
- ws.send ({status: 404, message: 'filter not found'}.to_json)
+ send_message(ws, {status: 404, message: 'filter not found'})
end
else
- ws.send ({status: 400, message: "missing or unrecognized method"}.to_json)
+ send_message(ws, {status: 400, message: "missing or unrecognized method"})
end
rescue => e
Rails.logger.warn "Error handling message: #{$!}"
Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- ws.send ({status: 500, message: 'error'}.to_json)
+ send_message(ws, {status: 500, message: 'error'})
ws.close
end
end
- # Constant maximum number of filters, to avoid silly huge database queries.
- MAX_FILTERS = 16
+ def overloaded?
+ @mtx.synchronize do
+ @connection_count >= Rails.configuration.websocket_max_connections
+ end
+ end
# Called by RackSocket when a new websocket connection has been established.
def on_connect ws
-
# Disconnect if no valid API token.
# current_user is included from CurrentApiClient
if not current_user
- ws.send ({status: 401, message: "Valid API token required"}.to_json)
- ws.close
+ send_message(ws, {status: 401, message: "Valid API token required"})
+ # Wait for the handshake to complete before closing the
+ # socket. Otherwise, nginx responds with HTTP 502 Bad gateway,
+ # and the client never sees our real error message.
+ ws.on :open do |event|
+ ws.close
+ end
return
end
ws.user = current_user
ws.filters = []
ws.last_log_id = nil
+ ws.sent_ids = Set.new
+ ws.queue = Queue.new
+ ws.frame_mtx = Mutex.new
+
+ @mtx.synchronize do
+ @connection_count += 1
+ end
- # Subscribe to internal postgres notifications through @channel. This will
- # call push_events when a notification comes through.
+ # Subscribe to internal postgres notifications through @channel and
+ # forward them to the thread associated with the connection.
sub = @channel.subscribe do |msg|
- push_events ws, msg
+ if ws.queue.length > Rails.configuration.websocket_max_notify_backlog
+ send_message(ws, {status: 500, message: 'Notify backlog too long'})
+ ws.close
+ @channel.unsubscribe sub
+ ws.queue.clear
+ else
+ ws.queue << [:notify, msg]
+ end
end
# Set up callback for inbound message dispatch.
ws.on :message do |event|
- handle_message ws, event
+ ws.queue << [:message, event]
end
# Set up socket close callback
ws.on :close do |event|
@channel.unsubscribe sub
- ws = nil
+ ws.queue.clear
+ ws.queue << [:close, nil]
+ end
+
+ # Spin off a new thread to handle sending events to the client. We need a
+ # separate thread per connection so that a slow client doesn't interfere
+ # with other clients.
+ #
+ # We don't want the loop in the request thread because on a TERM signal,
+ # Puma waits for outstanding requests to complete, and long-lived websocket
+ # connections may not complete in a timely manner.
+ Thread.new do
+ # Loop and react to socket events.
+ begin
+ loop do
+ eventType, msg = ws.queue.pop
+ if eventType == :message
+ handle_message ws, msg
+ elsif eventType == :notify
+ push_events ws, msg
+ elsif eventType == :close
+ break
+ end
+ end
+ ensure
+ @mtx.synchronize do
+ @connection_count -= 1
+ end
+ ActiveRecord::Base.connection.close
+ end
end
# Start up thread to monitor the Postgres database, if none exists already.
end
end
- # Since EventMachine is an asynchronous event based dispatcher, #on_connect
- # does not block but instead returns immediately after having set up the
- # websocket and notification channel callbacks.
end
end