X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/3687df5d70eb3112914b81c002c6fa10af3a1e8a..e46ae4015db8f166359515f2e9823ac1a2c7cab8:/services/api/script/crunch-dispatch.rb diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb index a0ba9b1fde..ecc7f5d98e 100755 --- a/services/api/script/crunch-dispatch.rb +++ b/services/api/script/crunch-dispatch.rb @@ -2,6 +2,7 @@ include Process +$warned = {} $signal = {} %w{TERM INT}.each do |sig| signame = sig @@ -39,14 +40,36 @@ class Dispatcher @todo = Job.queue end + def sinfo + @@slurm_version ||= Gem::Version.new(`sinfo --version`.match(/\b[\d\.]+\b/)[0]) + if Gem::Version.new('2.3') <= @@slurm_version + `sinfo --noheader -o '%n:%t'`.strip + else + # Expand rows with hostname ranges (like "foo[1-3,5,9-12]:idle") + # into multiple rows with one hostname each. + `sinfo --noheader -o '%N:%t'`.split("\n").collect do |line| + tokens = line.split ":" + if (re = tokens[0].match /^(.*?)\[([-,\d]+)\]$/) + re[2].split(",").collect do |range| + range = range.split("-").collect(&:to_i) + (range[0]..range[-1]).collect do |n| + [re[1] + n.to_s, tokens[1..-1]].join ":" + end + end + else + tokens.join ":" + end + end.flatten.join "\n" + end + end + def update_node_status if Server::Application.config.crunch_job_wrapper.to_s.match /^slurm/ @nodes_in_state = {idle: 0, alloc: 0, down: 0} @node_state ||= {} node_seen = {} begin - `sinfo --noheader -o '%n:%t'`. - split("\n"). + sinfo.split("\n"). each do |line| re = line.match /(\S+?):+(idle|alloc|down)/ next if !re @@ -121,16 +144,17 @@ class Dispatcher api_client_id: 0) job_auth.save - cmd_args << (ENV['CRUNCH_JOB_BIN'] || `which crunch-job`.strip) + crunch_job_bin = (ENV['CRUNCH_JOB_BIN'] || `which arv-crunch-job`.strip) + if crunch_job_bin == '' + raise "No CRUNCH_JOB_BIN env var, and crunch-job not in path." + end + + cmd_args << crunch_job_bin cmd_args << '--job-api-token' cmd_args << job_auth.api_token cmd_args << '--job' cmd_args << job.uuid - if cmd_args[0] == '' - raise "No CRUNCH_JOB_BIN env var, and crunch-job not in path." - end - commit = Commit.where(sha1: job.script_version).first if commit cmd_args << '--git-dir' @@ -163,6 +187,7 @@ class Dispatcher $stderr.puts start_banner $redis.set job.uuid, start_banner + "\n" $redis.publish job.uuid, start_banner + $redis.publish job.owner_uuid, start_banner @running[job.uuid] = { stdin: i, @@ -218,8 +243,10 @@ class Dispatcher lines.each do |line| $stderr.print "#{job_uuid} ! " unless line.index(job_uuid) $stderr.puts line - $redis.publish job_uuid, "#{Time.now.ctime.to_s} #{line.strip}" - $redis.append job_uuid, "#{Time.now.ctime.to_s} #{line}" + pub_msg = "#{Time.now.ctime.to_s} #{line.strip}" + $redis.publish job.owner_uuid, pub_msg + $redis.publish job_uuid, pub_msg + $redis.append job_uuid, pub_msg + "\n" if LOG_BUFFER_SIZE < $redis.strlen(job_uuid) $redis.set(job_uuid, $redis @@ -308,7 +335,9 @@ class Dispatcher else refresh_todo unless did_recently(:refresh_todo, 1.0) update_node_status - start_jobs unless @todo.empty? or did_recently(:start_jobs, 1.0) + unless @todo.empty? or did_recently(:start_jobs, 1.0) or $signal[:term] + start_jobs + end end reap_children select(@running.values.collect { |j| [j[:stdout], j[:stderr]] }.flatten, @@ -329,4 +358,7 @@ class Dispatcher end end +# This is how crunch-job child procs know where the "refresh" trigger file is +ENV["CRUNCH_REFRESH_TRIGGER"] = Rails.configuration.crunch_refresh_trigger + Dispatcher.new.run