X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/12b5d95d9d4f062539c3d7d7ac8dd5421dd6b6b0..b73985d8a0c9173aec57f6a81fe540b2813a5bff:/services/api/lib/eventbus.rb diff --git a/services/api/lib/eventbus.rb b/services/api/lib/eventbus.rb index 9bf95f5735..16bb030941 100644 --- a/services/api/lib/eventbus.rb +++ b/services/api/lib/eventbus.rb @@ -7,6 +7,8 @@ require 'oj' require 'faye/websocket' require 'record_filters' require 'load_param' +require 'set' +require 'thread' # Patch in user, last_log_id and filters fields into the Faye::Websocket class. module Faye @@ -14,6 +16,29 @@ module Faye attr_accessor :user attr_accessor :last_log_id attr_accessor :filters + attr_accessor :sent_ids + attr_accessor :queue + attr_accessor :frame_mtx + end +end + +module WebSocket + class Driver + + class Server + alias_method :_write, :write + + def write(data) + # Most of the sending activity will be from the thread set up in + # on_connect. However, there is also some automatic activity in the + # form of ping/pong messages, so ensure that the write method used to + # send one complete message to the underlying socket can only be + # called by one thread at a time. + self.frame_mtx.synchronize do + _write(data) + end + end + end end end @@ -50,6 +75,7 @@ class EventBus @channel = EventMachine::Channel.new @mtx = Mutex.new @bgthread = false + @connection_count = 0 end # Push out any pending events to the connection +ws+ @@ -60,35 +86,27 @@ class EventBus # # It queries the database for log rows that are either # a) greater than ws.last_log_id, which is the last log id which was a candidate to be sent out - # b) if ws.last_log_id is nil, then it queries rows starting with notify_id + # b) if ws.last_log_id is nil, then it queries the row notify_id # - # Regular Arvados permissions are applied using readable_by() and filters using record_filters() - # To avoid clogging up the database, queries are limited to batches of 100. It will schedule a new - # push_events call if there are more log rows to send. + # Regular Arvados permissions are applied using readable_by() and filters using record_filters(). def push_events ws, notify_id begin - if !notify_id.nil? and !ws.last_log_id.nil? and notify_id <= ws.last_log_id - # This notify is for a row we've handled already. - return - end - # Must have at least one filter set up to receive events if ws.filters.length > 0 - # Start with log rows readable by user, sorted in ascending order - logs = Log.readable_by(ws.user).order("id asc") + # Start with log rows readable by user + logs = Log.readable_by(ws.user) cond_id = nil cond_out = [] param_out = [] - if !ws.last_log_id.nil? - # Client is only interested in log rows that are newer than the - # last log row seen by the client. + if not ws.last_log_id.nil? + # We are catching up from some starting point. cond_id = "logs.id > ?" param_out << ws.last_log_id - elsif !notify_id.nil? - # No last log id, so look at rows starting with notify id - cond_id = "logs.id >= ?" + elsif not notify_id.nil? + # Get next row being notified. + cond_id = "logs.id = ?" param_out << notify_id else # No log id to start from, nothing to do, return @@ -114,31 +132,28 @@ class EventBus logs = logs.where(cond_id, *param_out) end - # Execute query and actually send the matching log rows - count = 0 - limit = 10 - - logs.limit(limit).each do |l| - ws.send(l.as_api_response.to_json) - ws.last_log_id = l.id - count += 1 - end - - if count == limit - # Number of rows returned was capped by limit(), we need to schedule - # another query to get more logs (will start from last_log_id - # reported by current query) - EventMachine::next_tick do - push_events ws, nil + # Execute query and actually send the matching log rows. Load + # the full log records only when we're ready to send them, + # though: otherwise, (1) postgres has to build the whole + # result set and return it to us before we can send the first + # event, and (2) we store lots of records in memory while + # waiting to spool them out to the client. Both of these are + # troublesome when log records are large (e.g., a collection + # update contains both old and new manifest_text). + # + # Note: find_each implies order('id asc'), which is what we + # want. + logs.select('logs.id').find_each do |l| + if not ws.sent_ids.include?(l.id) + # only send if not a duplicate + ws.send(Log.find(l.id).as_api_response.to_json) + end + if not ws.last_log_id.nil? + # record ids only when sending "catchup" messages, not notifies + ws.sent_ids << l.id end - elsif !notify_id.nil? and (ws.last_log_id.nil? or notify_id > ws.last_log_id) - # Number of rows returned was less than cap, but the notify id is - # higher than the last id visible to the client, so update last_log_id - ws.last_log_id = notify_id end - elsif !notify_id.nil? - # No filters set up, so just record the sequence number - ws.last_log_id = notify_id + ws.last_log_id = nil end rescue ArgumentError => e # There was some kind of user error. @@ -176,9 +191,12 @@ class EventBus # Set or reset the last_log_id. The event bus only reports events # for rows that come after last_log_id. ws.last_log_id = p[:last_log_id].to_i + # Reset sent_ids for consistency + # (always re-deliver all matching messages following last_log_id) + ws.sent_ids = Set.new end - if ws.filters.length < MAX_FILTERS + if ws.filters.length < Rails.configuration.websocket_max_filters # Add a filter. This gets the :filters field which is the same # format as used for regular index queries. ws.filters << filter @@ -187,7 +205,7 @@ class EventBus # Send any pending events push_events ws, nil else - ws.send ({status: 403, message: "maximum of #{MAX_FILTERS} filters allowed per connection"}.to_json) + ws.send ({status: 403, message: "maximum of #{Rails.configuration.websocket_max_filters} filters allowed per connection"}.to_json) end elsif p[:method] == 'unsubscribe' @@ -212,12 +230,14 @@ class EventBus end end - # Constant maximum number of filters, to avoid silly huge database queries. - MAX_FILTERS = 16 + def overloaded? + @mtx.synchronize do + @connection_count >= Rails.configuration.websocket_max_connections + end + end # Called by RackSocket when a new websocket connection has been established. def on_connect ws - # Disconnect if no valid API token. # current_user is included from CurrentApiClient if not current_user @@ -230,22 +250,64 @@ class EventBus ws.user = current_user ws.filters = [] ws.last_log_id = nil + ws.sent_ids = Set.new + ws.queue = Queue.new + ws.frame_mtx = Mutex.new + + @mtx.synchronize do + @connection_count += 1 + end - # Subscribe to internal postgres notifications through @channel. This will - # call push_events when a notification comes through. + # Subscribe to internal postgres notifications through @channel and + # forward them to the thread associated with the connection. sub = @channel.subscribe do |msg| - push_events ws, msg + if ws.queue.length > Rails.configuration.websocket_max_notify_backlog + ws.send ({status: 500, message: 'Notify backlog too long'}.to_json) + ws.close + @channel.unsubscribe sub + ws.queue.clear + else + ws.queue << [:notify, msg] + end end # Set up callback for inbound message dispatch. ws.on :message do |event| - handle_message ws, event + ws.queue << [:message, event] end # Set up socket close callback ws.on :close do |event| @channel.unsubscribe sub - ws = nil + ws.queue.clear + ws.queue << [:close, nil] + end + + # Spin off a new thread to handle sending events to the client. We need a + # separate thread per connection so that a slow client doesn't interfere + # with other clients. + # + # We don't want the loop in the request thread because on a TERM signal, + # Puma waits for outstanding requests to complete, and long-lived websocket + # connections may not complete in a timely manner. + Thread.new do + # Loop and react to socket events. + begin + loop do + eventType, msg = ws.queue.pop + if eventType == :message + handle_message ws, msg + elsif eventType == :notify + push_events ws, msg + elsif eventType == :close + break + end + end + ensure + @mtx.synchronize do + @connection_count -= 1 + end + end end # Start up thread to monitor the Postgres database, if none exists already. @@ -281,8 +343,5 @@ class EventBus end end - # Since EventMachine is an asynchronous event based dispatcher, #on_connect - # does not block but instead returns immediately after having set up the - # websocket and notification channel callbacks. end end