X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/5e0ad23d97f3c818e8c2b8f2ad127413779cf146..d0bf7a1ff103285e54433d3bcb67c2138b534542:/services/api/script/crunch-dispatch.rb diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb index 12f4e4feec..d9db69f650 100755 --- a/services/api/script/crunch-dispatch.rb +++ b/services/api/script/crunch-dispatch.rb @@ -2,6 +2,7 @@ include Process +$warned = {} $signal = {} %w{TERM INT}.each do |sig| signame = sig @@ -26,6 +27,7 @@ require File.dirname(__FILE__) + '/../config/environment' require 'open3' $redis ||= Redis.new +LOG_BUFFER_SIZE = 2**20 class Dispatcher include ApplicationHelper @@ -36,6 +38,30 @@ class Dispatcher def refresh_todo @todo = Job.queue + @todo_pipelines = PipelineInstance.queue + end + + def sinfo + @@slurm_version ||= Gem::Version.new(`sinfo --version`.match(/\b[\d\.]+\b/)[0]) + if Gem::Version.new('2.3') <= @@slurm_version + `sinfo --noheader -o '%n:%t'`.strip + else + # Expand rows with hostname ranges (like "foo[1-3,5,9-12]:idle") + # into multiple rows with one hostname each. + `sinfo --noheader -o '%N:%t'`.split("\n").collect do |line| + tokens = line.split ":" + if (re = tokens[0].match /^(.*?)\[([-,\d]+)\]$/) + re[2].split(",").collect do |range| + range = range.split("-").collect(&:to_i) + (range[0]..range[-1]).collect do |n| + [re[1] + n.to_s, tokens[1..-1]].join ":" + end + end + else + tokens.join ":" + end + end.flatten.join "\n" + end end def update_node_status @@ -44,8 +70,7 @@ class Dispatcher @node_state ||= {} node_seen = {} begin - `sinfo --noheader -o '%n:%t'`. - split("\n"). + sinfo.split("\n"). each do |line| re = line.match /(\S+?):+(idle|alloc|down)/ next if !re @@ -120,32 +145,38 @@ class Dispatcher api_client_id: 0) job_auth.save - cmd_args << (ENV['CRUNCH_JOB_BIN'] || `which crunch-job`.strip) - cmd_args << '--job-api-token' - cmd_args << job_auth.api_token - cmd_args << '--job' - cmd_args << job.uuid - - if cmd_args[0] == '' + crunch_job_bin = (ENV['CRUNCH_JOB_BIN'] || `which arv-crunch-job`.strip) + if crunch_job_bin == '' raise "No CRUNCH_JOB_BIN env var, and crunch-job not in path." end - commit = Commit.where(sha1: job.script_version).first - if commit - cmd_args << '--git-dir' - if File.exists?(File. - join(Rails.configuration.git_repositories_dir, - commit.repository_name + '.git')) - cmd_args << File. - join(Rails.configuration.git_repositories_dir, - commit.repository_name + '.git') - else - cmd_args << File. - join(Rails.configuration.git_repositories_dir, - commit.repository_name, '.git') - end + require 'shellwords' + + arvados_internal = Rails.configuration.git_internal_dir + if not File.exists? arvados_internal + $stderr.puts `mkdir -p #{arvados_internal.shellescape} && cd #{arvados_internal.shellescape} && git init --bare` + end + + src_repo = File.join(Rails.configuration.git_repositories_dir, job.repository + '.git') + src_repo = File.join(Rails.configuration.git_repositories_dir, job.repository, '.git') unless File.exists? src_repo + + unless src_repo + $stderr.puts "dispatch: #{File.join Rails.configuration.git_repositories_dir, job.repository} doesn't exist" + sleep 1 + untake(job) + next end + $stderr.puts `cd #{arvados_internal.shellescape} && git fetch --no-tags #{src_repo.shellescape} && git tag #{job.uuid.shellescape} #{job.script_version.shellescape}` + + cmd_args << crunch_job_bin + cmd_args << '--job-api-token' + cmd_args << job_auth.api_token + cmd_args << '--job' + cmd_args << job.uuid + cmd_args << '--git-dir' + cmd_args << arvados_internal + $stderr.puts "dispatch: #{cmd_args.join ' '}" begin @@ -156,8 +187,14 @@ class Dispatcher untake(job) next end - $stderr.puts "dispatch: job #{job.uuid} start" - $stderr.puts "dispatch: child #{t.pid} start" + + $stderr.puts "dispatch: job #{job.uuid}" + start_banner = "dispatch: child #{t.pid} start #{Time.now.ctime.to_s}" + $stderr.puts start_banner + $redis.set job.uuid, start_banner + "\n" + $redis.publish job.uuid, start_banner + $redis.publish job.owner_uuid, start_banner + @running[job.uuid] = { stdin: i, stdout: o, @@ -212,7 +249,16 @@ class Dispatcher lines.each do |line| $stderr.print "#{job_uuid} ! " unless line.index(job_uuid) $stderr.puts line - $redis.publish job_uuid, "#{Time.now.ctime.to_s} #{line.strip}" + pub_msg = "#{Time.now.ctime.to_s} #{line.strip}" + $redis.publish job.owner_uuid, pub_msg + $redis.publish job_uuid, pub_msg + $redis.append job_uuid, pub_msg + "\n" + if LOG_BUFFER_SIZE < $redis.strlen(job_uuid) + $redis.set(job_uuid, + $redis + .getrange(job_uuid, (LOG_BUFFER_SIZE >> 1), -1) + .sub(/^.*?\n/, '')) + end end end end @@ -257,7 +303,6 @@ class Dispatcher job_done = j_done[:job] $stderr.puts "dispatch: child #{pid_done} exit" $stderr.puts "dispatch: job #{job_done.uuid} end" - $redis.publish job_done.uuid, "end" # Ensure every last drop of stdout and stderr is consumed read_pipes @@ -268,15 +313,41 @@ class Dispatcher # Wait the thread j_done[:wait_thr].value + jobrecord = Job.find_by_uuid(job_done.uuid) + jobrecord.running = false + jobrecord.finished_at ||= Time.now + # Don't set 'jobrecord.success = false' because if the job failed to run due to an + # issue with crunch-job or slurm, we want the job to stay in the queue. + jobrecord.save! + # Invalidate the per-job auth token j_done[:job_auth].update_attributes expires_at: Time.now + $redis.publish job_done.uuid, "end" + @running.delete job_done.uuid end + def update_pipelines + expire_tokens = @pipe_auth_tokens.dup + @todo_pipelines.each do |p| + pipe_auth = (@pipe_auth_tokens[p.uuid] ||= ApiClientAuthorization. + create(user: User.where('uuid=?', p.modified_by_user_uuid).first, + api_client_id: 0)) + puts `export ARVADOS_API_TOKEN=#{pipe_auth.api_token} && arv-run-pipeline-instance --run-here --no-wait --instance #{p.uuid}` + expire_tokens.delete p.uuid + end + + expire_tokens.each do |k, v| + v.update_attributes expires_at: Time.now + @pipe_auth_tokens.delete k + end + end + def run act_as_system_user @running ||= {} + @pipe_auth_tokens ||= { } $stderr.puts "dispatch: ready" while !$signal[:term] or @running.size > 0 read_pipes @@ -295,7 +366,12 @@ class Dispatcher else refresh_todo unless did_recently(:refresh_todo, 1.0) update_node_status - start_jobs unless @todo.empty? or did_recently(:start_jobs, 1.0) + unless @todo.empty? or did_recently(:start_jobs, 1.0) or $signal[:term] + start_jobs + end + unless (@todo_pipelines.empty? and @pipe_auth_tokens.empty?) or did_recently(:update_pipelines, 5.0) + update_pipelines + end end reap_children select(@running.values.collect { |j| [j[:stdout], j[:stderr]] }.flatten, @@ -316,4 +392,7 @@ class Dispatcher end end +# This is how crunch-job child procs know where the "refresh" trigger file is +ENV["CRUNCH_REFRESH_TRIGGER"] = Rails.configuration.crunch_refresh_trigger + Dispatcher.new.run