From b3d286cda65b90e4dd0aaef88f085f45ea855ed5 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 29 Mar 2017 11:49:49 -0400 Subject: [PATCH] 11235: Log a message when a job is interrupted by node failure. ...and say what's going to happen as a result, even if that is not "giving up" yet. --- services/api/lib/crunch_dispatch.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb index bea1657de2..2ae99f01c5 100644 --- a/services/api/lib/crunch_dispatch.rb +++ b/services/api/lib/crunch_dispatch.rb @@ -684,17 +684,20 @@ class CrunchDispatch jobrecord = Job.find_by_uuid(job_done.uuid) if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid) + $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure") # Only this crunch-dispatch process can retry the job: # it's already locked, and there's no way to put it back in the # Queued state. Put it in our internal todo list unless the job # has failed this way excessively. @job_retry_counts[jobrecord.uuid] += 1 exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT + do_what_next = "give up now" if exit_tempfail @todo_job_retries[jobrecord.uuid] = jobrecord - else - $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up") + do_what_next = "re-attempt" end + $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " + + "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}") end if !exit_tempfail -- 2.30.2