From caa5dd776dfad5e50592a5cc2824c70ac3474b46 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 25 Sep 2014 14:08:25 -0400 Subject: [PATCH] 3859: Check crunch-job return code properly. Restore job state change validation. --- services/api/app/models/job.rb | 15 ++++++++ services/api/script/crunch-dispatch.rb | 53 ++++++++++++++------------ 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/services/api/app/models/job.rb b/services/api/app/models/job.rb index ea9990d9e7..81744c7643 100644 --- a/services/api/app/models/job.rb +++ b/services/api/app/models/job.rb @@ -14,6 +14,7 @@ class Job < ArvadosModel validate :ensure_script_version_is_commit validate :find_docker_image_locator validate :validate_status + validate :validate_state_change has_many :commit_ancestors, :foreign_key => :descendant, :primary_key => :script_version has_many(:nodes, foreign_key: :job_uuid, primary_key: :uuid) @@ -330,4 +331,18 @@ class Job < ArvadosModel end end + def validate_state_change + if self.state_changed? + if self.state_was.in? [Complete, Failed, Cancelled] + # Once in a finished state, don't permit any changes + errors.add :state, "invalid change from #{self.state_was} to #{self.state}" + return false + elsif self.state_was == Running and not self.state.in? [Complete, Failed, Cancelled] + # From running, can only transition to a finished state + errors.add :state, "invalid change from #{self.state_was} to #{self.state}" + return false + end + end + true + end end diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb index 45fb6dd33f..68af85d18f 100755 --- a/services/api/script/crunch-dispatch.rb +++ b/services/api/script/crunch-dispatch.rb @@ -419,34 +419,37 @@ class Dispatcher end # Wait the thread (returns a Process::Status) - exit_status = j_done[:wait_thr].value + exit_status = j_done[:wait_thr].value.exitstatus jobrecord = Job.find_by_uuid(job_done.uuid) - if exit_status.to_i != 75 and jobrecord.started_at - # Clean up state fields in case crunch-job exited without - # putting the job in a suitable "finished" state. - jobrecord.running = false - jobrecord.finished_at ||= Time.now - if jobrecord.success.nil? - jobrecord.success = false + if exit_status != 0 + # crunch-job exited with some kind of failure. + if exit_status != 75 and jobrecord.started_at + # Clean up state fields in case crunch-job exited without + # putting the job in a suitable "finished" state. + jobrecord.running = false + jobrecord.finished_at ||= Time.now + if jobrecord.success.nil? + jobrecord.success = false + end + jobrecord.save! + else + # Don't fail the job if crunch-job didn't even get as far as + # starting it. If the job failed to run due to an infrastructure + # issue with crunch-job or slurm, we want the job to stay in the + # queue. If crunch-job exited after losing a race to another + # crunch-job process, it exits 75 and we should leave the job + # record alone so the winner of the race do its thing. + # + # There is still an unhandled race condition: If our crunch-job + # process is about to lose a race with another crunch-job + # process, but crashes before getting to its "exit 75" (for + # example, "cannot fork" or "cannot reach API server") then we + # will assume incorrectly that it's our process's fault + # jobrecord.started_at is non-nil, and mark the job as failed + # even though the winner of the race is probably still doing + # fine. end - jobrecord.save! - else - # Don't fail the job if crunch-job didn't even get as far as - # starting it. If the job failed to run due to an infrastructure - # issue with crunch-job or slurm, we want the job to stay in the - # queue. If crunch-job exited after losing a race to another - # crunch-job process, it exits 75 and we should leave the job - # record alone so the winner of the race do its thing. - # - # There is still an unhandled race condition: If our crunch-job - # process is about to lose a race with another crunch-job - # process, but crashes before getting to its "exit 75" (for - # example, "cannot fork" or "cannot reach API server") then we - # will assume incorrectly that it's our process's fault - # jobrecord.started_at is non-nil, and mark the job as failed - # even though the winner of the race is probably still doing - # fine. end # Invalidate the per-job auth token -- 2.30.2