include ApplicationHelper
# crunch-job exit status 75: crunch-job lost a race to another crunch-job
# process for the same job. The dispatcher leaves the job record alone in
# this case instead of marking it "Failed".
EXIT_TEMPFAIL = 75
# crunch-job exit status 93: every node allocated to the job failed. The job
# is still locked, so only this dispatcher process can retry it; it is put on
# the dispatcher's internal retry list.
+ EXIT_RETRY_UNLOCKED = 93
# Maximum number of EXIT_RETRY_UNLOCKED exits tolerated per job. The retry
# counter is compared with `<=`, so a job is given up on once it has exited
# this way more than this many times.
+ RETRY_UNLOCKED_LIMIT = 3
# Set up the dispatcher's in-memory state: the path of the crunch-job
# executable and the (initially empty) bookkeeping collections.
def initialize
# Prefer the CRUNCH_JOB_BIN environment variable; otherwise ask the shell
# where arv-crunch-job is (output of `which`, surrounding whitespace removed).
@crunch_job_bin = (ENV['CRUNCH_JOB_BIN'] || `which arv-crunch-job`.strip)
@pipe_auth_tokens = {}
@running = {}
@todo = []
# Job records waiting to be retried by this process, keyed by job uuid.
# A job found in here is dispatched with --force-unlock.
+ @todo_job_retries = {}
# Number of retry-unlocked exits seen per job uuid. Hash.new(0) makes an
# unseen uuid read as 0, so the counter can be bumped with `+= 1` directly.
+ @job_retry_counts = Hash.new(0)
@todo_pipelines = []
end
def refresh_todo
if $options[:jobs]
- @todo = Job.queue.select(&:repository)
+ @todo = @todo_job_retries.values + Job.queue.select(&:repository)
end
if $options[:pipelines]
@todo_pipelines = PipelineInstance.queue
'--job', job.uuid,
'--git-dir', @arvados_internal]
+ if @todo_job_retries.include?(job.uuid)
+ cmd_args << "--force-unlock"
+ end
+
$stderr.puts "dispatch: #{cmd_args.join ' '}"
begin
log_throttle_bytes_skipped: 0,
}
i.close
+ @todo_job_retries.delete(job.uuid)
update_node_status
end
end
# Wait the thread (returns a Process::Status)
exit_status = j_done[:wait_thr].value.exitstatus
+ exit_tempfail = exit_status == EXIT_TEMPFAIL
$stderr.puts "dispatch: child #{pid_done} exit #{exit_status}"
$stderr.puts "dispatch: job #{job_done.uuid} end"
jobrecord = Job.find_by_uuid(job_done.uuid)
- if exit_status != EXIT_TEMPFAIL and jobrecord.state == "Running"
- # crunch-job did not return exit code 75 (see below) and left the job in
- # the "Running" state, which means there was an unhandled error. Fail
- # the job.
- jobrecord.state = "Failed"
- if not jobrecord.save
- $stderr.puts "dispatch: jobrecord.save failed"
+
+ if exit_status == EXIT_RETRY_UNLOCKED
+ # The job failed because all of the nodes allocated to it
+ # failed. Only this crunch-dispatch process can retry the job:
+ # it's already locked, and there's no way to put it back in the
+ # Queued state. Put it in our internal todo list unless the job
+ # has failed this way excessively.
+ @job_retry_counts[jobrecord.uuid] += 1
+ exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+ if exit_tempfail
+ @todo_job_retries[jobrecord.uuid] = jobrecord
+ else
+ $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+ end
+ end
+
+ if !exit_tempfail
+ @job_retry_counts.delete(jobrecord.uuid)
+ if jobrecord.state == "Running"
+ # Apparently there was an unhandled error. Fail the job.
+ jobrecord.state = "Failed"
+ if not jobrecord.save
+ $stderr.puts "dispatch: jobrecord.save failed"
+ end
end
else
# Don't fail the job if crunch-job didn't even get as far as
# queue. If crunch-job exited after losing a race to another
# crunch-job process, it exits 75 and we should leave the job
# record alone so the winner of the race can do its thing.
+ # If crunch-job exited after all of its allocated nodes failed,
+ # it exits 93, and we want to retry it later (see the
+ # EXIT_RETRY_UNLOCKED `if` block).
#
# There is still an unhandled race condition: If our crunch-job
# process is about to lose a race with another crunch-job
# Invalidate the per-job auth token, unless the job is still queued and we
# might want to try it again.
- if jobrecord.state != "Queued"
+ if jobrecord.state != "Queued" and !@todo_job_retries.include?(jobrecord.uuid)
j_done[:job_auth].update_attributes expires_at: Time.now
end