3550: Merge branch 'master' into 3550-local-pipeline
[arvados.git] / sdk / cli / bin / arv-run-pipeline-instance
index e0e002fc7f50ad8cafc8ebb447676646ab46926a..472c20bd73a283feb3149c44b6b3f534d3ed50bb 100755 (executable)
@@ -42,6 +42,8 @@
 # [--status-json path] Print JSON status report to a file or
 #                      fifo. Default: /dev/null
 #
+# [--description] Description for the pipeline instance.
+#
 # == Parameters
 #
 # [param_name=param_value]
@@ -59,8 +61,6 @@
 class WhRunPipelineInstance
 end
 
-$application_version = 1.0
-
 if RUBY_VERSION < '1.9.3' then
   abort <<-EOS
 #{$0.gsub(/^\.\//,'')} requires Ruby version 1.9.3 or higher.
@@ -74,6 +74,7 @@ $arvados_api_token = ENV['ARVADOS_API_TOKEN'] or
   abort "#{$0}: fatal: ARVADOS_API_TOKEN environment variable not set."
 
 begin
+  require 'arvados'
   require 'rubygems'
   require 'json'
   require 'pp'
@@ -84,7 +85,7 @@ rescue LoadError => l
   abort <<-EOS
 #{$0}: fatal: #{l.message}
 Some runtime dependencies may be missing.
-Try: gem install pp google-api-client json trollop
+Try: gem install arvados pp google-api-client json trollop
   EOS
 end
 
@@ -108,21 +109,6 @@ if $arvados_api_host.match /local/
   suppress_warnings { OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE }
 end
 
-class Google::APIClient
-  def discovery_document(api, version)
-    api = api.to_s
-    return @discovery_documents["#{api}:#{version}"] ||=
-      begin
-        response = self.execute!(
-                                 :http_method => :get,
-                                 :uri => self.discovery_uri(api, version),
-                                 :authenticated => false
-                                 )
-        response.body.class == String ? JSON.parse(response.body) : response.body
-      end
-  end
-end
-
 
 # Parse command line options (the kind that control the behavior of
 # this program, that is, not the pipeline component parameters).
@@ -167,13 +153,25 @@ p = Trollop::Parser.new do
       :short => :none,
       :type => :string)
   opt(:submit,
-      "Do not try to satisfy any components. Just create a pipeline instance and output its UUID.",
+      "Submit the pipeline instance to the server, and exit. Let the Crunch dispatch service satisfy the components by finding/running jobs.",
+      :short => :none,
+      :type => :boolean)
+  opt(:run_pipeline_here,
+      "Manage the pipeline instance in-process. Submit jobs to Crunch as needed. Do not exit until the pipeline finishes (or fails).",
+      :short => :none,
+      :type => :boolean)
+  opt(:run_jobs_here,
+      "Run jobs in the local terminal session instead of submitting them to Crunch. Implies --run-pipeline-here. Note: this results in a significantly different job execution environment, and some Crunch features are not supported. It can be necessary to modify a pipeline in order to make it run this way.",
       :short => :none,
       :type => :boolean)
   opt(:run_here,
-      "Manage the pipeline in process.",
+      "Synonym for --run-jobs-here.",
       :short => :none,
       :type => :boolean)
+  opt(:description,
+      "Description for the pipeline instance.",
+      :short => :none,
+      :type => :string)
   stop_on [:'--']
 end
 $options = Trollop::with_standard_exception_handling p do
@@ -181,16 +179,21 @@ $options = Trollop::with_standard_exception_handling p do
 end
 $debuglevel = $options[:debug_level] || ($options[:debug] && 1) || 0
 
+$options[:run_jobs_here] ||= $options[:run_here] # old flag name
+$options[:run_pipeline_here] ||= $options[:run_jobs_here] # B requires A
+
 if $options[:instance]
   if $options[:template] or $options[:submit]
     abort "#{$0}: syntax error: --instance cannot be combined with --template or --submit."
   end
 elsif not $options[:template]
-  abort "#{$0}: syntax error: you must supply a --template or --instance."
+  puts "error: you must supply a --template or --instance."
+  p.educate
+  abort
 end
 
-if $options[:run_here] == $options[:submit]
-  abort "#{$0}: syntax error: you must supply either --run-here or --submit."
+if $options[:run_pipeline_here] == $options[:submit]
+  abort "#{$0}: error: you must supply --run-pipeline-here, --run-jobs-here, or --submit."
 end
 
 # Suppress SSL certificate checks if ARVADOS_API_HOST_INSECURE
@@ -211,12 +214,9 @@ end
 
 # Set up the API client.
 
-$client ||= Google::APIClient.
-  new(:host => $arvados_api_host,
-      :application_name => File.split($0).last,
-      :application_version => $application_version.to_s)
-$arvados = $client.discovered_api('arvados', $arvados_api_version)
-
+$arv = Arvados.new api_version: 'v1'
+$client = $arv.client
+$arvados = $arv.arvados_api
 
 class PipelineInstance
   def self.find(uuid)
@@ -224,10 +224,10 @@ class PipelineInstance
                              :parameters => {
                                :uuid => uuid
                              },
-                             :body => {
-                               :api_token => ENV['ARVADOS_API_TOKEN']
-                             },
-                             :authenticated => false)
+                             :authenticated => false,
+                             :headers => {
+                               authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                             })
     j = JSON.parse result.body, :symbolize_names => true
     unless j.is_a? Hash and j[:uuid]
       debuglog "Failed to get pipeline_instance: #{j[:errors] rescue nil}", 0
@@ -239,14 +239,16 @@ class PipelineInstance
   end
   def self.create(attributes)
     result = $client.execute(:api_method => $arvados.pipeline_instances.create,
-                             :body => {
-                               :api_token => ENV['ARVADOS_API_TOKEN'],
+                             :body_object => {
                                :pipeline_instance => attributes
                              },
-                             :authenticated => false)
+                             :authenticated => false,
+                             :headers => {
+                               authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                             })
     j = JSON.parse result.body, :symbolize_names => true
     unless j.is_a? Hash and j[:uuid]
-      abort "Failed to create pipeline_instance: #{j[:errors] rescue nil} #{j.inspect}"
+      abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nFailed to create pipeline_instance: #{j[:errors] rescue nil} #{j.inspect}"
     end
     debuglog "Created pipeline instance: #{j[:uuid]}"
     self.new(j)
@@ -256,11 +258,13 @@ class PipelineInstance
                              :parameters => {
                                :uuid => @pi[:uuid]
                              },
-                             :body => {
-                               :api_token => ENV['ARVADOS_API_TOKEN'],
-                               :pipeline_instance => @attributes_to_update.to_json
+                             :body_object => {
+                               :pipeline_instance => @attributes_to_update
                              },
-                             :authenticated => false)
+                             :authenticated => false,
+                             :headers => {
+                               authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                             })
     j = JSON.parse result.body, :symbolize_names => true
     unless j.is_a? Hash and j[:uuid]
       debuglog "Failed to save pipeline_instance: #{j[:errors] rescue nil}", 0
@@ -277,6 +281,16 @@ class PipelineInstance
   def [](x)
     @pi[x]
   end
+
+  def log_stderr(msg)
+    $arv.log.create log: {
+      event_type: 'stderr',
+      object_uuid: self[:uuid],
+      owner_uuid: self[:owner_uuid],
+      properties: {"text" => msg},
+    }
+  end
+
   protected
   def initialize(j)
     @attributes_to_update = {}
@@ -289,20 +303,24 @@ class JobCache
     @cache ||= {}
     result = $client.execute(:api_method => $arvados.jobs.get,
                              :parameters => {
-                               :api_token => ENV['ARVADOS_API_TOKEN'],
                                :uuid => uuid
                              },
-                             :authenticated => false)
+                             :authenticated => false,
+                             :headers => {
+                               authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                             })
     @cache[uuid] = JSON.parse result.body, :symbolize_names => true
   end
   def self.where(conditions)
     result = $client.execute(:api_method => $arvados.jobs.list,
                              :parameters => {
-                               :api_token => ENV['ARVADOS_API_TOKEN'],
                                :limit => 10000,
                                :where => conditions.to_json
                              },
-                             :authenticated => false)
+                             :authenticated => false,
+                             :headers => {
+                               authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                             })
     list = JSON.parse result.body, :symbolize_names => true
     if list and list[:items].is_a? Array
       list[:items]
@@ -310,22 +328,39 @@ class JobCache
       []
     end
   end
-  def self.create(attributes)
+  def self.create(pipeline, component, job, create_params)
     @cache ||= {}
+
+    body = {job: no_nil_values(job)}.merge(no_nil_values(create_params))
+
     result = $client.execute(:api_method => $arvados.jobs.create,
-                             :parameters => {
-                               :api_token => ENV['ARVADOS_API_TOKEN'],
-                               :job => attributes.to_json
-                             },
-                             :authenticated => false)
+                             :body_object => body,
+                             :authenticated => false,
+                             :headers => {
+                               authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                             })
     j = JSON.parse result.body, :symbolize_names => true
     if j.is_a? Hash and j[:uuid]
       @cache[j[:uuid]] = j
     else
-      debuglog "create job: #{j[:errors] rescue nil} with attribute #{attributes}", 0
+      debuglog "create job: #{j[:errors] rescue nil} with attributes #{body}", 0
+
+      msg = ""
+      j[:errors].each do |err|
+        msg += "Error creating job for component #{component}: #{err}\n"
+      end
+      msg += "Job submission was: #{body.to_json}"
+
+      pipeline.log_stderr(msg)
       nil
     end
   end
+
+  protected
+
+  def self.no_nil_values(hash)
+    hash.reject { |key, value| value.nil? }
+  end
 end
 
 class WhRunPipelineInstance
@@ -339,17 +374,15 @@ class WhRunPipelineInstance
     if template.match /[^-0-9a-z]/
       # Doesn't look like a uuid -- use it as a filename.
       @template = JSON.parse File.read(template), :symbolize_names => true
-      if !@template[:components]
-        abort ("#{$0}: Template loaded from #{template} " +
-               "does not have a \"components\" key")
-      end
     else
       result = $client.execute(:api_method => $arvados.pipeline_templates.get,
                                :parameters => {
-                                 :api_token => ENV['ARVADOS_API_TOKEN'],
                                  :uuid => template
                                },
-                               :authenticated => false)
+                               :authenticated => false,
+                               :headers => {
+                                 authorization: 'OAuth2 '+ENV['ARVADOS_API_TOKEN']
+                               })
       @template = JSON.parse result.body, :symbolize_names => true
       if !@template[:uuid]
         abort "#{$0}: fatal: failed to retrieve pipeline template #{template} #{@template[:errors].inspect rescue nil}"
@@ -375,12 +408,29 @@ class WhRunPipelineInstance
         param = params_args.shift.sub /^--/, ''
         params[param] = params_args.shift
       else
-        abort "Syntax error: I do not know what to do with arg \"#{params_args[0]}\""
+        abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: I do not know what to do with arg \"#{params_args[0]}\""
       end
     end
 
+    if not @template[:components].is_a?(Hash)
+      abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: Template missing \"components\" hash"
+    end
     @components = @template[:components].dup
 
+    bad_components = @components.each_pair.select do |cname, cspec|
+      not cspec.is_a?(Hash)
+    end
+    if bad_components.any?
+      abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: Components not specified with hashes: #{bad_components.map(&:first).join(', ')}"
+    end
+
+    bad_components = @components.each_pair.select do |cname, cspec|
+      not cspec[:script_parameters].is_a?(Hash)
+    end
+    if bad_components.any?
+      abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nSyntax error: Components missing \"script_parameters\" hashes: #{bad_components.map(&:first).join(', ')}"
+    end
+
     errors = []
     @components.each do |componentname, component|
       component[:script_parameters].each do |parametername, parameter|
@@ -404,45 +454,131 @@ class WhRunPipelineInstance
       end
     end
     if !errors.empty?
-      abort "Errors:\n#{errors.collect { |c,p,e| "#{c}::#{p} - #{e}\n" }.join ""}"
+      abort "\n#{Time.now} -- pipeline_template #{@template[:uuid]}\nErrors:\n#{errors.collect { |c,p,e| "#{c}::#{p} - #{e}\n" }.join ""}"
     end
     debuglog "options=" + @options.pretty_inspect
     self
   end
 
   def setup_instance
-    @instance ||= PipelineInstance.
-      create(:components => @components,
-             :pipeline_template_uuid => @template[:uuid],
-             :active => true)
+    if @instance
+      @instance[:properties][:run_options] ||= {}
+      if @options[:no_reuse]
+        # override properties of existing instance
+        @instance[:properties][:run_options][:enable_job_reuse] = false
+      else
+        # Default to "enable reuse" if not specified. (This code path
+        # can go away when old clients go away.)
+        if @instance[:properties][:run_options][:enable_job_reuse].nil?
+          @instance[:properties][:run_options][:enable_job_reuse] = true
+        end
+      end
+    else
+      description = $options[:description]
+      description = ("Created at #{Time.now.localtime}" + (@template[:name].andand.size.andand>0 ? " using the pipeline template *#{@template[:name]}*" : "")) if !description
+      @instance = PipelineInstance.
+        create(components: @components,
+               properties: {
+                 run_options: {
+                   enable_job_reuse: !@options[:no_reuse]
+                 }
+               },
+               pipeline_template_uuid: @template[:uuid],
+               description: description,
+               state: ($options[:submit] ? 'RunningOnServer' : 'RunningOnClient'))
+    end
     self
   end
 
   def run
     moretodo = true
+    interrupted = false
+
+    if @instance[:started_at].nil?
+      @instance[:started_at] = Time.now
+    end
+
+    job_creation_failed = 0
     while moretodo
       moretodo = false
       @components.each do |cname, c|
         job = nil
-
+        owner_uuid = @instance[:owner_uuid]
+        # Is the job satisfying this component already known to be
+        # finished? (Already meaning "before we query API server about
+        # the job's current state")
+        c_already_finished = (c[:job] &&
+                              c[:job][:uuid] &&
+                              !c[:job][:success].nil?)
         if !c[:job] and
             c[:script_parameters].select { |pname, p| p.is_a? Hash and p[:output_of]}.empty?
           # No job yet associated with this component and is component inputs
           # are fully specified (any output_of script_parameters are resolved
           # to real value)
-          job = JobCache.create({:script => c[:script],
-                            :script_parameters => c[:script_parameters],
-                            :script_version => c[:script_version],
-                            :repository => c[:repository],
-                            :minimum_script_version => c[:minimum_script_version],
-                            :exclude_script_versions => c[:exclude_minimum_script_versions],
-                            :nondeterministic => c[:nondeterministic],
-                            :no_reuse => @options[:no_reuse]})
+          my_submit_id = "instance #{@instance[:uuid]} rand #{rand(2**64).to_s(36)}"
+          job = JobCache.create(@instance, cname, {
+            :script => c[:script],
+            :script_parameters => c[:script_parameters],
+            :script_version => c[:script_version],
+            :repository => c[:repository],
+            :nondeterministic => c[:nondeterministic],
+            :runtime_constraints => c[:runtime_constraints],
+            :owner_uuid => owner_uuid,
+            :is_locked_by_uuid => (@options[:run_jobs_here] ? owner_uuid : nil),
+            :submit_id => my_submit_id,
+          }, {
+            # This is the right place to put these attributes when
+            # dealing with new API servers.
+            :minimum_script_version => c[:minimum_script_version],
+            :exclude_script_versions => c[:exclude_minimum_script_versions],
+            :find_or_create => (@instance[:properties][:run_options].andand[:enable_job_reuse] &&
+                                !c[:nondeterministic]),
+            :filters => c[:filters]
+          })
           if job
             debuglog "component #{cname} new job #{job[:uuid]}"
             c[:job] = job
+            c[:run_in_process] = (@options[:run_jobs_here] and
+                                  job[:submit_id] == my_submit_id)
           else
-            debuglog "component #{cname} new job failed"
+            debuglog "component #{cname} new job failed", 0
+            job_creation_failed += 1
+          end
+        end
+
+        if c[:job] and c[:run_in_process] and c[:job][:success].nil?
+          report_status
+          begin
+            require 'open3'
+            Open3.popen3("arv-crunch-job", "--force-unlock",
+                         "--job", c[:job][:uuid]) do |stdin, stdout, stderr, wait_thr|
+              debuglog "arv-crunch-job pid #{wait_thr.pid} started", 0
+              stdin.close
+              while true
+                rready, wready, = IO.select([stdout, stderr], [])
+                break if !rready[0]
+                begin
+                  buf = rready[0].read_nonblock(2**20)
+                rescue EOFError
+                  break
+                end
+                (rready[0] == stdout ? $stdout : $stderr).write(buf)
+              end
+              stdout.close
+              stderr.close
+              debuglog "arv-crunch-job pid #{wait_thr.pid} exit #{wait_thr.value.to_i}", 0
+            end
+            if not $arv.job.get(uuid: c[:job][:uuid])[:finished_at]
+              raise Exception.new("arv-crunch-job did not set finished_at.")
+            end
+          rescue Exception => e
+            debuglog "Interrupted (#{e}). Failing job.", 0
+            $arv.job.update(uuid: c[:job][:uuid],
+                            job: {
+                              finished_at: Time.now,
+                              running: false,
+                              success: false
+                            })
           end
         end
 
@@ -450,7 +586,7 @@ class WhRunPipelineInstance
           if (c[:job][:running] or
               not (c[:job][:finished_at] or c[:job][:cancelled_at]))
             # Job is running so update copy of job record
-            c[:job] = JobCache.get(c[:job][:uuid])            
+            c[:job] = JobCache.get(c[:job][:uuid])
           end
 
           if c[:job][:success]
@@ -465,6 +601,59 @@ class WhRunPipelineInstance
                 end
               end
             end
+            unless c_already_finished
+              # This is my first time discovering that the job
+              # succeeded. (At the top of this loop, I was still
+              # waiting for it to finish.)
+
+              if @instance[:name].andand.length.andand > 0
+                pipeline_name = @instance[:name]
+              elsif @template.andand[:name].andand.length.andand > 0
+                pipeline_name = @template[:name]
+              else
+                pipeline_name = @instance[:uuid]
+              end
+              if c[:output_name] != false
+                # Create a collection located in the same project as the pipeline with the contents of the output.
+                portable_data_hash = c[:job][:output]
+                collections = $arv.collection.list(limit: 1,
+                                                   filters: [['portable_data_hash', '=', portable_data_hash]],
+                                                   select: ["portable_data_hash", "manifest_text"]
+                                                   )[:items]
+                if collections.any?
+                  name = c[:output_name] || "Output #{portable_data_hash[0..7]} of #{cname} of #{pipeline_name}"
+
+                  # check if there is a name collision.
+                  name_collisions = $arv.collection.list(filters: [["owner_uuid", "=", owner_uuid],
+                                                                   ["name", "=", name]])[:items]
+
+                  newcollection_actual = nil
+                  if name_collisions.any? and name_collisions.first[:portable_data_hash] == portable_data_hash
+                    # There is already a collection with the same name and the
+                    # same contents, so just point to that.
+                    newcollection_actual = name_collisions.first
+                  end
+
+                  if newcollection_actual.nil?
+                    # Did not find a collection with the same name (or the
+                    # collection has a different portable data hash) so create
+                    # a new collection with ensure_unique_name: true.
+                    newcollection = {
+                      owner_uuid: owner_uuid,
+                      name: name,
+                      portable_data_hash: collections.first[:portable_data_hash],
+                      manifest_text: collections.first[:manifest_text]
+                    }
+                    debuglog "Creating collection #{newcollection}", 0
+                    newcollection_actual = $arv.collection.create collection: newcollection, ensure_unique_name: true
+                  end
+
+                  c[:output_uuid] = newcollection_actual[:uuid]
+                else
+                  debuglog "Could not find a collection with portable data hash #{portable_data_hash}", 0
+                end
+              end
+            end
           elsif c[:job][:running] ||
               (!c[:job][:started_at] && !c[:job][:cancelled_at])
             # Job is still running
@@ -475,19 +664,24 @@ class WhRunPipelineInstance
         end
       end
       @instance[:components] = @components
-      @instance[:active] = moretodo
       report_status
 
       if @options[:no_wait]
         moretodo = false
       end
 
+      # If job creation fails, just give up on this pipeline instance.
+      if job_creation_failed > 0
+        moretodo = false
+      end
+
       if moretodo
         begin
           sleep 10
         rescue Interrupt
           debuglog "interrupt", 0
-          abort
+          interrupted = true
+          break
         end
       end
     end
@@ -497,28 +691,54 @@ class WhRunPipelineInstance
     failed = 0
     @components.each do |cname, c|
       if c[:job]
-        if c[:job][:finished_at]
+        if c[:job][:finished_at] or c[:job][:cancelled_at] or (c[:job][:running] == false and c[:job][:success] == false)
           ended += 1
           if c[:job][:success] == true
             succeeded += 1
-          elsif c[:job][:success] == false
+          elsif c[:job][:success] == false or c[:job][:cancelled_at]
             failed += 1
           end
         end
       end
     end
-    
-    if ended == @components.length or failed > 0
-      @instance[:active] = false
-      @instance[:success] = (succeeded == @components.length)
+
+    success = (succeeded == @components.length)
+
+    # A job create call failed. Just give up.
+    if job_creation_failed > 0
+      debuglog "job creation failed - giving up on this pipeline instance", 0
+      success = false
+      failed += 1
+    end
+
+    if interrupted
+     if success
+        @instance[:state] = 'Complete'
+     else
+        @instance[:state] = 'Paused'
+      end
+    else
+      if ended == @components.length or failed > 0
+        @instance[:state] = success ? 'Complete' : 'Failed'
+      end
     end
 
+    if @instance[:finished_at].nil? and ['Complete', 'Failed'].include? @instance[:state]
+      @instance[:finished_at] = Time.now
+    end
+
+    debuglog "pipeline instance state is #{@instance[:state]}"
+
+    # set components_summary
+    components_summary = {"todo" => @components.length - ended, "done" => succeeded, "failed" => failed}
+    @instance[:components_summary] = components_summary
+
     @instance.save
   end
 
   def cleanup
-    if @instance
-      @instance[:active] = false
+    if @instance and @instance[:state] == 'RunningOnClient'
+      @instance[:state] = 'Paused'
       @instance.save
     end
   end
@@ -556,6 +776,8 @@ class WhRunPipelineInstance
                       "failed #{c[:job][:finished_at]}"
                     elsif c[:job][:started_at]
                       "started #{c[:job][:started_at]}"
+                    elsif c[:job][:is_locked_by_uuid]
+                      "starting #{c[:job][:started_at]}"
                     else
                       "queued #{c[:job][:created_at]}"
                     end
@@ -564,6 +786,19 @@ class WhRunPipelineInstance
       end
     end
   end
+
+  def abort(msg)
+    if @instance
+      if ["New", "Ready", "RunningOnClient",
+          "RunningOnServer"].include?(@instance[:state])
+        @instance[:state] = "Failed"
+        @instance[:finished_at] = Time.now
+        @instance.save
+      end
+      @instance.log_stderr(msg)
+    end
+    Kernel::abort(msg)
+  end
 end
 
 runner = WhRunPipelineInstance.new($options)