- if exit_status != EXIT_TEMPFAIL and jobrecord.state == "Running"
- # crunch-job did not return exit code 75 (see below) and left the job in
- # the "Running" state, which means there was an unhandled error. Fail
- # the job.
- jobrecord.state = "Failed"
- if not jobrecord.save
- $stderr.puts "dispatch: jobrecord.save failed"
+
+ if exit_status == EXIT_RETRY_UNLOCKED
+ # The job failed because all of the nodes allocated to it
+ # failed. Only this crunch-dispatch process can retry the job:
+ # it's already locked, and there's no way to put it back in the
+ # Queued state. Put it in our internal todo list unless the job
+ # has failed this way excessively.
+ @job_retry_counts[jobrecord.uuid] += 1
+ exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+ if exit_tempfail
+ @todo_job_retries[jobrecord.uuid] = jobrecord
+ else
+ $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+ end
+ end
+
+ if !exit_tempfail
+ @job_retry_counts.delete(jobrecord.uuid)
+ if jobrecord.state == "Running"
+ # Apparently there was an unhandled error. That could potentially
+ # include "all allocated nodes failed" when we don't want to retry
+ # because the job has already been retried RETRY_UNLOCKED_LIMIT
+ # times. Fail the job.
+ jobrecord.state = "Failed"
+ if not jobrecord.save
+ $stderr.puts "dispatch: jobrecord.save failed"
+ end