From 7ea5d6161e4eab67795fbe2758d98822a9a03f13 Mon Sep 17 00:00:00 2001
From: Tom Clegg
Date: Mon, 11 Aug 2014 13:25:48 -0400
Subject: [PATCH] 3570: Do not fail the job when crunch-job loses a locking race during startup.

When crunch-job finds at startup that the job is already locked, already
has a non-null 'success' flag, is already flagged 'running', already has
a 'started_at' time, or cannot be claimed by updating
'is_locked_by_uuid', it now logs the reason and exits 111 instead of
calling croak().

crunch-dispatch skips its clean-up of the job's state fields when
crunch-job exits with status 111. The exit code is read with
Process::Status#exitstatus; Process::Status#to_i returns the raw,
platform-dependent wait status rather than the exit code, so comparing
it with 111 would not detect a normal "exit 111".

The "srun() failed on build script" path no longer exits 111; it logs
and dies instead.
---
Review notes (ignored by git-am, not part of the commit message):

* In the second crunch-job hunk the unchanged context line compares
  $Job->{'is_locked_by_uuid'} with $User->{'uuid'} using numeric "==".
  If these values are non-numeric UUID strings (to be confirmed), the
  comparison should be "eq". Left untouched here so the hunk still
  matches the pre-image; worth a follow-up change.
* "$Job->{'success'} ne undef" in the first hunk compares against undef
  as a string; "defined $Job->{'success'}" looks like the intent
  (to be confirmed). Also left untouched.

 sdk/cli/bin/crunch-job                 | 23 +++++++++++++++++------
 services/api/script/crunch-dispatch.rb |  6 +++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 06b3da99a9..d5edf0beed 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -150,17 +150,25 @@ if ($job_has_uuid)
 {
   $Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
   if (!$force_unlock) {
+    # If some other crunch-job process has grabbed this job (or we see
+    # other evidence that the job is already underway) we exit 111 so
+    # crunch-dispatch (our parent process) doesn't mark the job as
+    # failed.
     if ($Job->{'is_locked_by_uuid'}) {
-      croak("Job is locked: " . $Job->{'is_locked_by_uuid'});
+      Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'} . ", exiting 111");
+      exit(111);
     }
     if ($Job->{'success'} ne undef) {
-      croak("Job 'success' flag (" . $Job->{'success'} . ") is not null");
+      Log(undef, "Job 'success' flag (" . $Job->{'success'} . ") is not null");
+      exit(111);
     }
     if ($Job->{'running'}) {
-      croak("Job 'running' flag is already set");
+      Log(undef, "Job 'running' flag is already set");
+      exit(111);
     }
     if ($Job->{'started_at'}) {
-      croak("Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
+      Log(undef, "Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
+      exit(111);
     }
   }
 }
@@ -273,7 +281,8 @@ if ($job_has_uuid)
   # Claim this job, and make sure nobody else does
   unless ($Job->update_attributes('is_locked_by_uuid' => $User->{'uuid'}) &&
           $Job->{'is_locked_by_uuid'} == $User->{'uuid'}) {
-    croak("Error while updating / locking job");
+    Log(undef, "Error while updating / locking job, exiting 111");
+    exit(111);
   }
   $Job->update_attributes('started_at' => scalar gmtime,
                           'running' => 1,
@@ -688,7 +697,9 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
 
     my @execargs = ('bash', '-c', $command);
     srun (\@srunargs, \@execargs, undef, $build_script_to_send);
-    exit (111);
+    # exec() failed, we assume nothing happened.
+    Log(undef, "srun() failed on build script");
+    die;
   }
   close("writer");
   if (!defined $childpid)
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 5a990f0cb4..58e6645763 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -375,11 +375,11 @@ class Dispatcher
       $stderr.puts j_done[:stderr_buf] + "\n"
     end
 
-    # Wait the thread
-    j_done[:wait_thr].value
+    # Wait the thread (returns a Process::Status)
+    exit_status = j_done[:wait_thr].value
 
     jobrecord = Job.find_by_uuid(job_done.uuid)
-    if jobrecord.started_at
+    if exit_status.exitstatus != 111 and jobrecord.started_at
       # Clean up state fields in case crunch-job exited without
       # putting the job in a suitable "finished" state.
       jobrecord.running = false
-- 
2.30.2