X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/f99ffd6844ae9b88cfd17c8fe6e9fcc85a55d79c..03aadd3864de6d5687e1e6c71815fafc4ec030af:/services/api/script/crunch-dispatch.rb diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb index dff68d867e..693dfbdb7f 100755 --- a/services/api/script/crunch-dispatch.rb +++ b/services/api/script/crunch-dispatch.rb @@ -11,10 +11,6 @@ $signal = {} $signal[:term] = true end end -Signal.trap('HUP') do - $stderr.puts "Received HUP signal" - $signal[:hup] = true -end if ENV["CRUNCH_DISPATCH_LOCKFILE"] lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE" @@ -42,6 +38,30 @@ class Dispatcher def refresh_todo @todo = Job.queue + @todo_pipelines = PipelineInstance.queue + end + + def sinfo + @@slurm_version ||= Gem::Version.new(`sinfo --version`.match(/\b[\d\.]+\b/)[0]) + if Gem::Version.new('2.3') <= @@slurm_version + `sinfo --noheader -o '%n:%t'`.strip + else + # Expand rows with hostname ranges (like "foo[1-3,5,9-12]:idle") + # into multiple rows with one hostname each. + `sinfo --noheader -o '%N:%t'`.split("\n").collect do |line| + tokens = line.split ":" + if (re = tokens[0].match /^(.*?)\[([-,\d]+)\]$/) + re[2].split(",").collect do |range| + range = range.split("-").collect(&:to_i) + (range[0]..range[-1]).collect do |n| + [re[1] + n.to_s, tokens[1..-1]].join ":" + end + end + else + tokens.join ":" + end + end.flatten.join "\n" + end end def update_node_status @@ -50,8 +70,7 @@ class Dispatcher @node_state ||= {} node_seen = {} begin - `sinfo --noheader -o '%n:%t'`. - split("\n"). + sinfo.split("\n"). each do |line| re = line.match /(\S+?):+(idle|alloc|down)/ next if !re @@ -126,16 +145,17 @@ class Dispatcher api_client_id: 0) job_auth.save - cmd_args << (ENV['CRUNCH_JOB_BIN'] || `which crunch-job`.strip) + crunch_job_bin = (ENV['CRUNCH_JOB_BIN'] || `which arv-crunch-job`.strip) + if crunch_job_bin == '' + raise "No CRUNCH_JOB_BIN env var, and crunch-job not in path." + end + + cmd_args << crunch_job_bin cmd_args << '--job-api-token' cmd_args << job_auth.api_token cmd_args << '--job' cmd_args << job.uuid - if cmd_args[0] == '' - raise "No CRUNCH_JOB_BIN env var, and crunch-job not in path." - end - commit = Commit.where(sha1: job.script_version).first if commit cmd_args << '--git-dir' @@ -295,6 +315,17 @@ class Dispatcher @running.delete job_done.uuid end + def update_pipelines + @todo_pipelines.each do |p| + pipe_auth = ApiClientAuthorization. + new(user: User.where('uuid=?', p.modified_by_user_uuid).first, + api_client_id: 0) + pipe_auth.save + + puts `export ARVADOS_API_TOKEN=#{pipe_auth.api_token} && arv-run-pipeline-instance --run-here --no-wait --instance #{p.uuid}` + end + end + def run act_as_system_user @running ||= {} @@ -314,33 +345,14 @@ class Dispatcher end end else - if File.exists?(Rails.configuration.crunch_dispatch_hup_trigger) - begin - File.unlink(Rails.configuration.crunch_dispatch_hup_trigger) - $signal[:hup] = true - rescue Errno::ENOENT - $stderr.puts "Weird, hup_trigger file was deleted by someone else." - rescue Errno::EPERM - if not $warned[:hup_trigger_perm] - $warned[:hup_trigger_perm] = true - $stderr.puts "Install problem: I see the hup_trigger file but cannot delete it." - end - end - end - if $signal[:hup] - # Pass HUP through to all crunch-job processes. - @running.each do |uuid, j| - begin - Process.kill 'HUP', j[:wait_thr].pid - rescue Errno::ESRCH - # Process ended but hasn't been reaped. Nothing to do. - end - end - $signal.delete :hup - end refresh_todo unless did_recently(:refresh_todo, 1.0) update_node_status - start_jobs unless @todo.empty? or did_recently(:start_jobs, 1.0) + unless @todo.empty? or did_recently(:start_jobs, 1.0) or $signal[:term] + start_jobs + end + unless @todo_pipelines.empty? or did_recently(:update_pipelines, 5.0) + update_pipelines + end end reap_children select(@running.values.collect { |j| [j[:stdout], j[:stderr]] }.flatten, @@ -348,6 +360,8 @@ class Dispatcher end end + + protected def did_recently(thing, min_interval) @@ -361,4 +375,7 @@ class Dispatcher end end +# This is how crunch-job child procs know where the "refresh" trigger file is +ENV["CRUNCH_REFRESH_TRIGGER"] = Rails.configuration.crunch_refresh_trigger + Dispatcher.new.run