Merge branch '11590-log-reuse'
[arvados.git] / services / api / lib / crunch_dispatch.rb
index c95db312f4337bb95efccf72e66e01e3bd8cf4e7..2ae99f01c59b7bef384e4e48011357a37f3363d3 100644 (file)
@@ -684,17 +684,20 @@ class CrunchDispatch
     jobrecord = Job.find_by_uuid(job_done.uuid)
 
     if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
+      $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure")
       # Only this crunch-dispatch process can retry the job:
       # it's already locked, and there's no way to put it back in the
       # Queued state.  Put it in our internal todo list unless the job
       # has failed this way excessively.
       @job_retry_counts[jobrecord.uuid] += 1
       exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+      do_what_next = "give up now"
       if exit_tempfail
         @todo_job_retries[jobrecord.uuid] = jobrecord
-      else
-        $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+        do_what_next = "re-attempt"
       end
+      $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " +
+                   "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}")
     end
 
     if !exit_tempfail
@@ -964,9 +967,11 @@ class CrunchDispatch
   def squeue_jobs
     if Rails.configuration.crunch_job_wrapper == :slurm_immediate
       p = IO.popen(['squeue', '-a', '-h', '-o', '%j'])
-      l = p.readlines.map {|line| line.strip}
-      p.close
-      l
+      begin
+        p.readlines.map {|line| line.strip}
+      ensure
+        p.close
+      end
     else
       []
     end