Merge branch '8784-dir-listings'
[arvados.git] / services / api / lib / crunch_dispatch.rb
index 48b0eb5983a750aad99e195cd2f08a7b4c01c92e..230f03e5777a56aec28b57bda56dcea4e82b065f 100644 (file)
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
 require 'open3'
 require 'shellwords'
 
@@ -95,7 +99,7 @@ class CrunchDispatch
       # hasn't been able to communicate with it recently.
       state.sub!(/^idle\*/, "down")
       state.sub!(/\W+$/, "")
-      state = "down" unless %w(idle alloc down).include?(state)
+      state = "down" unless %w(idle alloc comp mix drng down).include?(state)
       slurm_nodes[hostname] = {state: state, job: nil}
     end
     each_slurm_line("squeue", "%j") do |hostname, job_uuid|
@@ -169,7 +173,25 @@ class CrunchDispatch
       end
       usable_nodes << node
       if usable_nodes.count >= min_node_count
-        return usable_nodes.map { |n| n.hostname }
+        hostnames = usable_nodes.map(&:hostname)
+        log_nodes = usable_nodes.map do |n|
+          "#{n.hostname} #{n.uuid} #{n.properties.to_json}"
+        end
+        log_job = "#{job.uuid} #{job.runtime_constraints}"
+        log_text = "dispatching job #{log_job} to #{log_nodes.join(", ")}"
+        $stderr.puts log_text
+        begin
+          act_as_system_user do
+            Log.new(object_uuid: job.uuid,
+                    event_type: 'dispatch',
+                    owner_uuid: system_user_uuid,
+                    summary: "dispatching to #{hostnames.join(", ")}",
+                    properties: {'text' => log_text}).save!
+          end
+        rescue => e
+          $stderr.puts "dispatch: log.create failed: #{e}"
+        end
+        return hostnames
       end
     end
     nil
@@ -204,8 +226,8 @@ class CrunchDispatch
               owner_uuid: job.owner_uuid,
               summary: message,
               properties: {"text" => message}).save!
-    rescue
-      $stderr.puts "dispatch: log.create failed"
+    rescue => e
+      $stderr.puts "dispatch: log.create failed: #{e}"
     end
 
     if not skip_lock and not have_job_lock?(job)
@@ -666,17 +688,20 @@ class CrunchDispatch
     jobrecord = Job.find_by_uuid(job_done.uuid)
 
     if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
+      $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure")
       # Only this crunch-dispatch process can retry the job:
       # it's already locked, and there's no way to put it back in the
       # Queued state.  Put it in our internal todo list unless the job
       # has failed this way excessively.
       @job_retry_counts[jobrecord.uuid] += 1
       exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+      do_what_next = "give up now"
       if exit_tempfail
         @todo_job_retries[jobrecord.uuid] = jobrecord
-      else
-        $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+        do_what_next = "re-attempt"
       end
+      $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " +
+                   "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}")
     end
 
     if !exit_tempfail
@@ -814,6 +839,9 @@ class CrunchDispatch
         unless (@todo_pipelines.empty? and @pipe_auth_tokens.empty?) or did_recently(:update_pipelines, 5.0)
           update_pipelines
         end
+        unless did_recently('check_orphaned_slurm_jobs', 60)
+          check_orphaned_slurm_jobs
+        end
       end
       reap_children
       select(@running.values.collect { |j| [j[:stdout], j[:stderr]] }.flatten,
@@ -850,22 +878,14 @@ class CrunchDispatch
       end
       Rails.logger.info "fail_jobs: threshold is #{threshold}"
 
-      if Rails.configuration.crunch_job_wrapper == :slurm_immediate
-        # [["slurm_job_id", "slurm_job_name"], ...]
-        squeue = File.popen(['squeue', '-h', '-o', '%i %j']).readlines.map do |line|
-          line.strip.split(' ', 2)
-        end
-      else
-        squeue = []
-      end
-
+      squeue = squeue_jobs
       Job.where('state = ? and started_at < ?', Job::Running, threshold).
         each do |job|
         Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
-        squeue.each do |slurm_id, slurm_name|
+        squeue.each do |slurm_name|
           if slurm_name == job.uuid
-            Rails.logger.info "fail_jobs: scancel #{slurm_id} for #{job.uuid}"
-            scancel slurm_id
+            Rails.logger.info "fail_jobs: scancel #{job.uuid}"
+            scancel slurm_name
           end
         end
         fail_job(job, "cleaned up stale job: started before #{threshold}",
@@ -874,6 +894,37 @@ class CrunchDispatch
     end
   end
 
+  def check_orphaned_slurm_jobs
+    act_as_system_user do
+      squeue_uuids = squeue_jobs.select{|uuid| uuid.match(/^[0-9a-z]{5}-8i9sb-[0-9a-z]{15}$/)}.
+                                  select{|uuid| !@running.has_key?(uuid)}
+
+      return if squeue_uuids.size == 0
+
+      scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state in (?) or modified_at>?)',
+                                               squeue_uuids,
+                                               ['Running', 'Queued'],
+                                               (Time.now - 60)).
+                                         collect(&:uuid)
+      scancel_uuids.each do |uuid|
+        Rails.logger.info "orphaned job: scancel #{uuid}"
+        scancel uuid
+      end
+    end
+  end
+
+  def sudo_preface
+    return [] if not Server::Application.config.crunch_job_user
+    ["sudo", "-E", "-u",
+     Server::Application.config.crunch_job_user,
+     "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
+     "PATH=#{ENV['PATH']}",
+     "PERLLIB=#{ENV['PERLLIB']}",
+     "PYTHONPATH=#{ENV['PYTHONPATH']}",
+     "RUBYLIB=#{ENV['RUBYLIB']}",
+     "GEM_PATH=#{ENV['GEM_PATH']}"]
+  end
+
   protected
 
   def have_job_lock?(job)
@@ -916,23 +967,27 @@ class CrunchDispatch
     end
   end
 
-  def scancel slurm_id
-    cmd = sudo_preface + ['scancel', slurm_id]
-    puts File.popen(cmd).read
-    if not $?.success?
-      Rails.logger.error "scancel #{slurm_id.shellescape}: $?"
+  # An array of job_uuids in squeue
+  def squeue_jobs
+    if Rails.configuration.crunch_job_wrapper == :slurm_immediate
+      p = IO.popen(['squeue', '-a', '-h', '-o', '%j'])
+      begin
+        p.readlines.map {|line| line.strip}
+      ensure
+        p.close
+      end
+    else
+      []
     end
   end
 
-  def sudo_preface
-    return [] if not Server::Application.config.crunch_job_user
-    ["sudo", "-E", "-u",
-     Server::Application.config.crunch_job_user,
-     "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
-     "PATH=#{ENV['PATH']}",
-     "PERLLIB=#{ENV['PERLLIB']}",
-     "PYTHONPATH=#{ENV['PYTHONPATH']}",
-     "RUBYLIB=#{ENV['RUBYLIB']}",
-     "GEM_PATH=#{ENV['GEM_PATH']}"]
+  def scancel slurm_name
+    cmd = sudo_preface + ['scancel', '-n', slurm_name]
+    IO.popen(cmd) do |scancel_pipe|
+      puts scancel_pipe.read
+    end
+    if not $?.success?
+      Rails.logger.error "scancel #{slurm_name.shellescape}: $?"
+    end
   end
 end