# update our database (and cache) when a node's state changes
if @node_state[re[1]] != re[2]
@node_state[re[1]] = re[2]
- node = Node.where('hostname=?', re[1]).first
+ node = Node.where('hostname=?', re[1]).order(:last_ping_at).last
if node
$stderr.puts "dispatch: update #{re[1]} state to #{re[2]}"
node.info['slurm_state'] = re[2]
next
end
- $stderr.puts `cd #{arvados_internal.shellescape} && git fetch --no-tags #{src_repo.shellescape} && git tag #{job.uuid.shellescape} #{job.script_version.shellescape}`
+ $stderr.puts `cd #{arvados_internal.shellescape} && git fetch-pack --all #{src_repo.shellescape} && git tag #{job.uuid.shellescape} #{job.script_version.shellescape}`
cmd_args << crunch_job_bin
cmd_args << '--job-api-token'
$stderr.puts j_done[:stderr_buf] + "\n"
end
- # Wait the thread
- j_done[:wait_thr].value
+ # Wait the thread (returns a Process::Status)
+ exit_status = j_done[:wait_thr].value
jobrecord = Job.find_by_uuid(job_done.uuid)
- if jobrecord.started_at
+ if exit_status.to_i != 75 and jobrecord.started_at
# Clean up state fields in case crunch-job exited without
# putting the job in a suitable "finished" state.
jobrecord.running = false
# Don't fail the job if crunch-job didn't even get as far as
# starting it. If the job failed to run due to an infrastructure
# issue with crunch-job or slurm, we want the job to stay in the
- # queue.
+ # queue. If crunch-job exited after losing a race to another
+ # crunch-job process, it exits 75 and we should leave the job
+ # record alone so the winner of the race do its thing.
+ #
+ # There is still an unhandled race condition: If our crunch-job
+ # process is about to lose a race with another crunch-job
+ # process, but crashes before getting to its "exit 75" (for
+ # example, "cannot fork" or "cannot reach API server") then we
+ # will assume incorrectly that it's our process's fault
+ # jobrecord.started_at is non-nil, and mark the job as failed
+ # even though the winner of the race is probably still doing
+ # fine.
end
# Invalidate the per-job auth token