X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/381f3ce2529a027cc0eb1c402b94135711658f6b..5be63dcb589e10fbfc11fdd85dcb382708852baa:/sdk/cli/bin/arv-run-pipeline-instance diff --git a/sdk/cli/bin/arv-run-pipeline-instance b/sdk/cli/bin/arv-run-pipeline-instance index 7578abc7b0..4810768ded 100755 --- a/sdk/cli/bin/arv-run-pipeline-instance +++ b/sdk/cli/bin/arv-run-pipeline-instance @@ -28,11 +28,6 @@ # [--no-wait] Make only as much progress as possible without entering # a sleep/poll loop. # -# [--no-reuse-finished] Do not reuse existing outputs to satisfy -# pipeline components. Always submit a new job -# or use an existing job which has not yet -# finished. -# # [--no-reuse] Do not reuse existing jobs to satisfy pipeline # components. Submit a new job for every component. # @@ -153,10 +148,6 @@ p = Trollop::Parser.new do "Do not wait for jobs to finish. Just look up status, submit new jobs if needed, and exit.", :short => :none, :type => :boolean) - opt(:no_reuse_finished, - "Do not reuse existing outputs to satisfy pipeline components. Always submit a new job or use an existing job which has not yet finished.", - :short => :none, - :type => :boolean) opt(:no_reuse, "Do not reuse existing jobs to satisfy pipeline components. Submit a new job for every component.", :short => :none, @@ -196,7 +187,9 @@ if $options[:instance] abort "#{$0}: syntax error: --instance cannot be combined with --template or --submit." end elsif not $options[:template] - abort "#{$0}: syntax error: you must supply a --template or --instance." + puts "error: you must supply a --template or --instance." + p.educate + abort end if $options[:run_here] == $options[:submit] @@ -235,10 +228,10 @@ class PipelineInstance :parameters => { :uuid => uuid }, - :body => { - :api_token => ENV['ARVADOS_API_TOKEN'] - }, - :authenticated => false) + :authenticated => false, + :headers => { + authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN'] + }) j = JSON.parse result.body, :symbolize_names => true unless j.is_a? Hash and j[:uuid] debuglog "Failed to get pipeline_instance: #{j[:errors] rescue nil}", 0 @@ -251,10 +244,12 @@ class PipelineInstance def self.create(attributes) result = $client.execute(:api_method => $arvados.pipeline_instances.create, :body => { - :api_token => ENV['ARVADOS_API_TOKEN'], :pipeline_instance => attributes }, - :authenticated => false) + :authenticated => false, + :headers => { + authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN'] + }) j = JSON.parse result.body, :symbolize_names => true unless j.is_a? Hash and j[:uuid] abort "Failed to create pipeline_instance: #{j[:errors] rescue nil} #{j.inspect}" @@ -268,10 +263,12 @@ class PipelineInstance :uuid => @pi[:uuid] }, :body => { - :api_token => ENV['ARVADOS_API_TOKEN'], :pipeline_instance => @attributes_to_update.to_json }, - :authenticated => false) + :authenticated => false, + :headers => { + authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN'] + }) j = JSON.parse result.body, :symbolize_names => true unless j.is_a? Hash and j[:uuid] debuglog "Failed to save pipeline_instance: #{j[:errors] rescue nil}", 0 @@ -300,20 +297,24 @@ class JobCache @cache ||= {} result = $client.execute(:api_method => $arvados.jobs.get, :parameters => { - :api_token => ENV['ARVADOS_API_TOKEN'], :uuid => uuid }, - :authenticated => false) + :authenticated => false, + :headers => { + authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN'] + }) @cache[uuid] = JSON.parse result.body, :symbolize_names => true end def self.where(conditions) result = $client.execute(:api_method => $arvados.jobs.list, :parameters => { - :api_token => ENV['ARVADOS_API_TOKEN'], :limit => 10000, :where => conditions.to_json }, - :authenticated => false) + :authenticated => false, + :headers => { + authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN'] + }) list = JSON.parse result.body, :symbolize_names => true if list and list[:items].is_a? Array list[:items] @@ -321,19 +322,21 @@ class JobCache [] end end - def self.create(attributes) + def self.create(job, create_params) @cache ||= {} result = $client.execute(:api_method => $arvados.jobs.create, - :parameters => { - :api_token => ENV['ARVADOS_API_TOKEN'], - :job => attributes.to_json - }, - :authenticated => false) + :body => { + :job => job.to_json + }.merge(create_params), + :authenticated => false, + :headers => { + authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN'] + }) j = JSON.parse result.body, :symbolize_names => true if j.is_a? Hash and j[:uuid] @cache[j[:uuid]] = j else - debuglog "create job: #{j[:errors] rescue nil}", 0 + debuglog "create job: #{j[:errors] rescue nil} with attributes #{job}", 0 nil end end @@ -357,10 +360,12 @@ class WhRunPipelineInstance else result = $client.execute(:api_method => $arvados.pipeline_templates.get, :parameters => { - :api_token => ENV['ARVADOS_API_TOKEN'], :uuid => template }, - :authenticated => false) + :authenticated => false, + :headers => { + authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN'] + }) @template = JSON.parse result.body, :symbolize_names => true if !@template[:uuid] abort "#{$0}: fatal: failed to retrieve pipeline template #{template} #{@template[:errors].inspect rescue nil}" @@ -422,101 +427,79 @@ class WhRunPipelineInstance end def setup_instance - @instance ||= PipelineInstance. - create(:components => @components, + if $options[:submit] + @instance ||= PipelineInstance. + create(:components => @components, + :pipeline_template_uuid => @template[:uuid], + :state => 'New') + else + @instance ||= PipelineInstance. + create(:components => @components, :pipeline_template_uuid => @template[:uuid], - :active => true) + :state => 'RunningOnClient') + end self end def run moretodo = true + interrupted = false + + job_creation_failed = 0 while moretodo moretodo = false @components.each do |cname, c| job = nil + owner_uuid = @instance[:owner_uuid] + # Is the job satisfying this component already known to be + # finished? (Already meaning "before we query API server about + # the job's current state") c_already_finished = (c[:job] && c[:job][:uuid] && !c[:job][:success].nil?) if !c[:job] and - c[:script_parameters].select { |pname, p| p.is_a? Hash }.empty? - # Job is fully specified (all parameter values are present) but - # no particular job has been found. - - debuglog "component #{cname} ready to satisfy." - - c.delete :wait - second_place_job = nil # satisfies component, but not finished yet - - (@options[:no_reuse] ? [] : JobCache. - where(script: c[:script], - script_parameters: c[:script_parameters], - script_version_descends_from: c[:script_version]) - ).each do |candidate_job| - candidate_params_downcase = Hash[candidate_job[:script_parameters]. - map { |k,v| [k.downcase,v] }] - c_params_downcase = Hash[c[:script_parameters]. - map { |k,v| [k.downcase,v] }] - - debuglog "component #{cname} considering job #{candidate_job[:uuid]} version #{candidate_job[:script_version]} parameters #{candidate_params_downcase.inspect}", 3 - - unless candidate_params_downcase == c_params_downcase - next - end - - if c[:script_version] != - candidate_job[:script_version][0,c[:script_version].length] - debuglog "component #{cname} would be satisfied by job #{candidate_job[:uuid]} if script_version matched.", 2 - next - end - - unless candidate_job[:success] || candidate_job[:running] || - (!candidate_job[:started_at] && !candidate_job[:cancelled_at]) - debuglog "component #{cname} would be satisfied by job #{candidate_job[:uuid]} if it were running or successful.", 2 - next - end - - if candidate_job[:success] - unless @options[:no_reuse_finished] - job = candidate_job - $stderr.puts "using #{job[:uuid]} (finished at #{job[:finished_at]}) for component #{cname}" - c[:job] = job - end - else - second_place_job ||= candidate_job - end - break - end - if not c[:job] and second_place_job - job = second_place_job - $stderr.puts "using #{job[:uuid]} (running since #{job[:started_at]}) for component #{cname}" + c[:script_parameters].select { |pname, p| p.is_a? Hash and p[:output_of]}.empty? + # No job yet associated with this component and is component inputs + # are fully specified (any output_of script_parameters are resolved + # to real value) + job = JobCache.create({ + :script => c[:script], + :script_parameters => c[:script_parameters], + :script_version => c[:script_version], + :repository => c[:repository], + :nondeterministic => c[:nondeterministic], + :output_is_persistent => c[:output_is_persistent] || false, + :owner_uuid => owner_uuid, + # TODO: Delete the following three attributes when + # supporting pre-20140418 API servers is no longer + # important. New API servers take these as flags that + # control behavior of create, rather than job attributes. + :minimum_script_version => c[:minimum_script_version], + :exclude_script_versions => c[:exclude_minimum_script_versions], + :no_reuse => @options[:no_reuse] || c[:nondeterministic], + }, { + # This is the right place to put these attributes when + # dealing with new API servers. + :minimum_script_version => c[:minimum_script_version], + :exclude_script_versions => c[:exclude_minimum_script_versions], + :find_or_create => !(@options[:no_reuse] || c[:nondeterministic]), + }) + if job + debuglog "component #{cname} new job #{job[:uuid]}" c[:job] = job + else + debuglog "component #{cname} new job failed", 0 + job_creation_failed += 1 end - if not c[:job] - debuglog "component #{cname} not satisfied by any existing job." - if !@options[:dry_run] - debuglog "component #{cname} new job." - job = JobCache.create(:script => c[:script], - :script_parameters => c[:script_parameters], - :runtime_constraints => c[:runtime_constraints] || {}, - :script_version => c[:script_version] || 'master', - :output_is_persistent => c[:output_is_persistent] || false) - if job - debuglog "component #{cname} new job #{job[:uuid]}" - c[:job] = job - else - debuglog "component #{cname} new job failed" - end - end - end - else - c[:wait] = true end + if c[:job] and c[:job][:uuid] if (c[:job][:running] or not (c[:job][:finished_at] or c[:job][:cancelled_at])) + # Job is running so update copy of job record c[:job] = JobCache.get(c[:job][:uuid]) end + if c[:job][:success] # Populate script_parameters of other components waiting for # this job @@ -530,10 +513,12 @@ class WhRunPipelineInstance end end unless c_already_finished + # This is my first time discovering that the job + # succeeded. (At the top of this loop, I was still + # waiting for it to finish.) if c[:output_is_persistent] - # This is my first time discovering that the job - # succeeded. I need to make sure a resources/wants - # link is in place to protect the output from garbage + # I need to make sure a resources/wants link is in + # place to protect the output from garbage # collection. (Normally Crunch does this for me, but # here I might be reusing the output of someone else's # job and I need to make sure it's understood that the @@ -558,7 +543,8 @@ class WhRunPipelineInstance tail_kind: 'arvados#user', tail_uuid: @my_user_uuid, head_kind: 'arvados#collection', - head_uuid: wanted + head_uuid: wanted, + owner_uuid: owner_uuid } debuglog "added link, uuid #{newlink[:uuid]}" end @@ -566,6 +552,7 @@ class WhRunPipelineInstance end elsif c[:job][:running] || (!c[:job][:started_at] && !c[:job][:cancelled_at]) + # Job is still running moretodo = true elsif c[:job][:cancelled_at] debuglog "component #{cname} job #{c[:job][:uuid]} cancelled." @@ -573,19 +560,24 @@ class WhRunPipelineInstance end end @instance[:components] = @components - @instance[:active] = moretodo report_status if @options[:no_wait] moretodo = false end + # If job creation fails, just give up on this pipeline instance. + if job_creation_failed > 0 + moretodo = false + end + if moretodo begin sleep 10 rescue Interrupt debuglog "interrupt", 0 - abort + interrupted = true + break end end end @@ -595,7 +587,7 @@ class WhRunPipelineInstance failed = 0 @components.each do |cname, c| if c[:job] - if c[:job][:finished_at] + if c[:job][:finished_at] or c[:job][:cancelled_at] or (c[:job][:running] == false and c[:job][:success] == false) ended += 1 if c[:job][:success] == true succeeded += 1 @@ -605,18 +597,40 @@ class WhRunPipelineInstance end end end - - if ended == @components.length or failed > 0 - @instance[:active] = false - @instance[:success] = (succeeded == @components.length) + + success = (succeeded == @components.length) + + # A job create call failed. Just give up. + if job_creation_failed > 0 + debuglog "job creation failed - giving up on this pipeline instance", 0 + success = false + failed += 1 + end + + if interrupted + if success + @instance[:state] = 'Complete' + else + @instance[:state] = 'Paused' + end + else + if ended == @components.length or failed > 0 + @instance[:state] = success ? 'Complete' : 'Failed' + end end + debuglog "pipeline instance state is #{@instance[:state]}" + + # set components_summary + components_summary = {"todo" => @components.length - ended, "done" => succeeded, "failed" => failed} + @instance[:components_summary] = components_summary + @instance.save end def cleanup - if @instance - @instance[:active] = false + if @instance and @instance[:state] == 'RunningOnClient' + @instance[:state] = 'Paused' @instance.save end end