X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/7c12fc4d989dd2b4c47a174280a4f9526ecb0798..79e53c0eed77396cb37f60b48be0c60fe7e0ab89:/services/api/lib/eventbus.rb diff --git a/services/api/lib/eventbus.rb b/services/api/lib/eventbus.rb index ac023e5c03..11b178d985 100644 --- a/services/api/lib/eventbus.rb +++ b/services/api/lib/eventbus.rb @@ -1,37 +1,67 @@ +# If any threads raise an unhandled exception, make them all die. +# We trust a supervisor like runit to restart the server in this case. +Thread.abort_on_exception = true + require 'eventmachine' -require 'oj' require 'faye/websocket' -require 'record_filters' require 'load_param' +require 'oj' +require 'record_filters' +require 'safe_json' +require 'set' +require 'thread' +# Patch in user, last_log_id and filters fields into the Faye::Websocket class. module Faye class WebSocket attr_accessor :user attr_accessor :last_log_id attr_accessor :filters + attr_accessor :sent_ids + attr_accessor :queue + attr_accessor :frame_mtx end end +module WebSocket + class Driver + + class Server + alias_method :_write, :write + + def write(data) + # Most of the sending activity will be from the thread set up in + # on_connect. However, there is also some automatic activity in the + # form of ping/pong messages, so ensure that the write method used to + # send one complete message to the underlying socket can only be + # called by one thread at a time. + self.frame_mtx.synchronize do + _write(data) + end + end + end + end +end + +# Store the filters supplied by the user that will be applied to the logs table +# to determine which events to return to the listener. class Filter include LoadParam attr_accessor :filters - def initialize p, fid + def initialize p @params = p - @filter_id = fid load_filters_param end def params @params end - - def filter_id - @filter_id - end end +# Manages websocket connections, accepts subscription messages and publishes +# log table events. class EventBus include CurrentApiClient include RecordFilters @@ -41,134 +71,257 @@ class EventBus Log end - # used in RecordFilters - def table_name - model_class.table_name - end - + # Initialize EventBus. Takes no parameters. def initialize @channel = EventMachine::Channel.new @mtx = Mutex.new @bgthread = false - @filter_id_counter = 0 + @connection_count = 0 end - def alloc_filter_id - (@filter_id_counter += 1) + def send_message(ws, obj) + ws.send(SafeJSON.dump(obj)) end - def push_events ws, msg = nil - begin - # Must have at least one filter set up to receive events - if ws.filters.length > 0 - # Start with log rows readable by user, sorted in ascending order - logs = Log.readable_by(ws.user).order("id asc") - - if ws.last_log_id - # Only interested in log rows that are new - logs = logs.where("logs.id > ?", ws.last_log_id) - elsif msg - # No last log id, so only look at the most recently changed row - logs = logs.where("logs.id = ?", msg.to_i) - else - return - end + # Push out any pending events to the connection +ws+ + # +notify_id+ the id of the most recent row in the log table, may be nil + # + # This accepts a websocket and a notify_id (this is the row id from Postgres + # LISTEN/NOTIFY, it may be nil if called from somewhere else) + # + # It queries the database for log rows that are either + # a) greater than ws.last_log_id, which is the last log id which was a candidate to be sent out + # b) if ws.last_log_id is nil, then it queries the row notify_id + # + # Regular Arvados permissions are applied using readable_by() and filters using record_filters(). + def push_events ws, notify_id + begin + # Must have at least one filter set up to receive events + if ws.filters.length > 0 + # Start with log rows readable by user + logs = Log.readable_by(ws.user) + + cond_id = nil + cond_out = [] + param_out = [] - # Now process filters provided by client - cond_out = [] - param_out = [] - ws.filters.each do |filter| - ft = record_filters filter.filters - cond_out += ft[:cond_out] + if not ws.last_log_id.nil? + # We are catching up from some starting point. + cond_id = "logs.id > ?" + param_out << ws.last_log_id + elsif not notify_id.nil? + # Get next row being notified. + cond_id = "logs.id = ?" + param_out << notify_id + else + # No log id to start from, nothing to do, return + return + end + + # Now build filters provided by client + ws.filters.each do |filter| + ft = record_filters filter.filters, Log + if ft[:cond_out].any? + # Join the clauses within a single subscription filter with AND + # so it is consistent with regular queries + cond_out << "(#{ft[:cond_out].join ') AND ('})" param_out += ft[:param_out] end + end - # Add filters to query - if cond_out.any? - logs = logs.where(cond_out.join(' OR '), *param_out) - end + # Add filters to query + if cond_out.any? + # Join subscriptions with OR + logs = logs.where(cond_id + " AND ((#{cond_out.join ') OR ('}))", *param_out) + else + logs = logs.where(cond_id, *param_out) + end - # Finally execute query and send matching rows - logs.each do |l| - ws.send(l.as_api_response.to_json) - ws.last_log_id = l.id + # Execute query and actually send the matching log rows. Load + # the full log records only when we're ready to send them, + # though: otherwise, (1) postgres has to build the whole + # result set and return it to us before we can send the first + # event, and (2) we store lots of records in memory while + # waiting to spool them out to the client. Both of these are + # troublesome when log records are large (e.g., a collection + # update contains both old and new manifest_text). + # + # Note: find_each implies order('id asc'), which is what we + # want. + logs.select('logs.id').find_each do |l| + if not ws.sent_ids.include?(l.id) + # only send if not a duplicate + send_message(ws, Log.find(l.id).as_api_response) + end + if not ws.last_log_id.nil? + # record ids only when sending "catchup" messages, not notifies + ws.sent_ids << l.id end - elsif msg - # No filters set up, so just record the sequence number - ws.last_log_id = msg.to_i end - rescue Exception => e - puts "Error publishing event: #{$!}" - puts "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - ws.send ({status: 500, message: 'error'}.to_json) - ws.close + ws.last_log_id = nil end + rescue ArgumentError => e + # There was some kind of user error. + Rails.logger.warn "Error publishing event: #{$!}" + send_message(ws, {status: 500, message: $!}) + ws.close + rescue => e + Rails.logger.warn "Error publishing event: #{$!}" + Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + send_message(ws, {status: 500, message: $!}) + ws.close + # These exceptions typically indicate serious server trouble: + # out of memory issues, database connection problems, etc. Go ahead and + # crash; we expect that a supervisor service like runit will restart us. + raise + end end - MAX_FILTERS = 16 + # Handle inbound subscribe or unsubscribe message. + def handle_message ws, event + begin + begin + # Parse event data as JSON + p = SafeJSON.load(event.data).symbolize_keys + filter = Filter.new(p) + rescue Oj::Error => e + send_message(ws, {status: 400, message: "malformed request"}) + return + end + + if p[:method] == 'subscribe' + # Handle subscribe event + if p[:last_log_id] + # Set or reset the last_log_id. The event bus only reports events + # for rows that come after last_log_id. + ws.last_log_id = p[:last_log_id].to_i + # Reset sent_ids for consistency + # (always re-deliver all matching messages following last_log_id) + ws.sent_ids = Set.new + end + + if ws.filters.length < Rails.configuration.websocket_max_filters + # Add a filter. This gets the :filters field which is the same + # format as used for regular index queries. + ws.filters << filter + send_message(ws, {status: 200, message: 'subscribe ok', filter: p}) + + # Send any pending events + push_events ws, nil + else + send_message(ws, {status: 403, message: "maximum of #{Rails.configuration.websocket_max_filters} filters allowed per connection"}) + end + + elsif p[:method] == 'unsubscribe' + # Handle unsubscribe event + + len = ws.filters.length + ws.filters.select! { |f| not ((f.filters == p[:filters]) or (f.filters.empty? and p[:filters].nil?)) } + if ws.filters.length < len + send_message(ws, {status: 200, message: 'unsubscribe ok'}) + else + send_message(ws, {status: 404, message: 'filter not found'}) + end + + else + send_message(ws, {status: 400, message: "missing or unrecognized method"}) + end + rescue => e + Rails.logger.warn "Error handling message: #{$!}" + Rails.logger.warn "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + send_message(ws, {status: 500, message: 'error'}) + ws.close + end + end + + def overloaded? + @mtx.synchronize do + @connection_count >= Rails.configuration.websocket_max_connections + end + end + + # Called by RackSocket when a new websocket connection has been established. def on_connect ws + # Disconnect if no valid API token. + # current_user is included from CurrentApiClient if not current_user - ws.send ({status: 401, message: "Valid API token required"}.to_json) - ws.close + send_message(ws, {status: 401, message: "Valid API token required"}) + # Wait for the handshake to complete before closing the + # socket. Otherwise, nginx responds with HTTP 502 Bad gateway, + # and the client never sees our real error message. + ws.on :open do |event| + ws.close + end return end + # Initialize our custom fields on the websocket connection object. ws.user = current_user ws.filters = [] ws.last_log_id = nil + ws.sent_ids = Set.new + ws.queue = Queue.new + ws.frame_mtx = Mutex.new - sub = @channel.subscribe do |msg| - push_events ws, msg + @mtx.synchronize do + @connection_count += 1 end - ws.on :message do |event| - begin - p = (Oj.load event.data).symbolize_keys - if p[:method] == 'subscribe' - if p[:last_log_id] - ws.last_log_id = p[:last_log_id].to_i - end - - if ws.filters.length < MAX_FILTERS - filter_id = alloc_filter_id - ws.filters.push Filter.new(p, filter_id) - ws.send ({status: 200, message: 'subscribe ok', filter_id: filter_id}.to_json) - push_events ws - else - ws.send ({status: 403, message: "maximum of #{MAX_FILTERS} filters allowed per connection"}.to_json) - end - elsif p[:method] == 'unsubscribe' - if filter_id = p[:filter_id] - filter_id = filter_id.to_i - len = ws.filters.length - ws.filters = ws.filters.select { |f| f.filter_id != filter_id } - if ws.filters.length < len - ws.send ({status: 200, message: 'unsubscribe ok', filter_id: filter_id}.to_json) - else - ws.send ({status: 404, message: 'filter_id not found', filter_id: filter_id}.to_json) - end - else - ws.send ({status: 400, message: 'must provide filter_id'}.to_json) - end - else - ws.send ({status: 400, message: "missing or unrecognized method"}.to_json) - end - rescue Oj::Error => e - ws.send ({status: 400, message: "malformed request"}.to_json) - rescue Exception => e - puts "Error handling message: #{$!}" - puts "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - ws.send ({status: 500, message: 'error'}.to_json) + # Subscribe to internal postgres notifications through @channel and + # forward them to the thread associated with the connection. + sub = @channel.subscribe do |msg| + if ws.queue.length > Rails.configuration.websocket_max_notify_backlog + send_message(ws, {status: 500, message: 'Notify backlog too long'}) ws.close + @channel.unsubscribe sub + ws.queue.clear + else + ws.queue << [:notify, msg] end end + # Set up callback for inbound message dispatch. + ws.on :message do |event| + ws.queue << [:message, event] + end + + # Set up socket close callback ws.on :close do |event| @channel.unsubscribe sub - ws = nil + ws.queue.clear + ws.queue << [:close, nil] + end + + # Spin off a new thread to handle sending events to the client. We need a + # separate thread per connection so that a slow client doesn't interfere + # with other clients. + # + # We don't want the loop in the request thread because on a TERM signal, + # Puma waits for outstanding requests to complete, and long-lived websocket + # connections may not complete in a timely manner. + Thread.new do + # Loop and react to socket events. + begin + loop do + eventType, msg = ws.queue.pop + if eventType == :message + handle_message ws, msg + elsif eventType == :notify + push_events ws, msg + elsif eventType == :close + break + end + end + ensure + @mtx.synchronize do + @connection_count -= 1 + end + ActiveRecord::Base.connection.close + end end + # Start up thread to monitor the Postgres database, if none exists already. @mtx.synchronize do unless @bgthread @bgthread = true @@ -179,8 +332,14 @@ class EventBus begin conn.async_exec "LISTEN logs" while true + # wait_for_notify will block until there is a change + # notification from Postgres about the logs table, then push + # the notification into the EventMachine channel. Each + # websocket connection subscribes to the other end of the + # channel and calls #push_events to actually dispatch the + # events to the client. conn.wait_for_notify do |channel, pid, payload| - @channel.push payload + @channel.push payload.to_i end end ensure @@ -190,8 +349,10 @@ class EventBus conn.async_exec "UNLISTEN *" end end + @bgthread = false end end end + end end