+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
require 'open3'
require 'shellwords'
jobrecord = Job.find_by_uuid(job_done.uuid)
if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
+ $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure")
# Only this crunch-dispatch process can retry the job:
# it's already locked, and there's no way to put it back in the
# Queued state. Put it in our internal todo list unless the job
# has failed this way excessively.
@job_retry_counts[jobrecord.uuid] += 1
exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+ do_what_next = "give up now"
if exit_tempfail
@todo_job_retries[jobrecord.uuid] = jobrecord
- else
- $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+ do_what_next = "re-attempt"
end
+ $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " +
+ "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}")
end
if !exit_tempfail
def check_orphaned_slurm_jobs
act_as_system_user do
- squeue_uuids = squeue_jobs.select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
+ squeue_uuids = squeue_jobs.select{|uuid| uuid.match(/^[0-9a-z]{5}-8i9sb-[0-9a-z]{15}$/)}.
select{|uuid| !@running.has_key?(uuid)}
return if squeue_uuids.size == 0
# An array of job_uuids in squeue
def squeue_jobs
if Rails.configuration.crunch_job_wrapper == :slurm_immediate
- File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
- line.strip
+ p = IO.popen(['squeue', '-a', '-h', '-o', '%j'])
+ begin
+ p.readlines.map {|line| line.strip}
+ ensure
+ p.close
end
else
[]
def scancel slurm_name
cmd = sudo_preface + ['scancel', '-n', slurm_name]
- puts File.popen(cmd).read
+ IO.popen(cmd) do |scancel_pipe|
+ puts scancel_pipe.read
+ end
# Report a failed scancel. $? is the status of the most recently reaped
# child process; the block form of IO.popen above closes the pipe (and
# thereby sets $?) before control reaches this point.
unless $?.success?
  # A bare "$?" inside a double-quoted string is NOT interpolated, so the
  # status must be wrapped in #{} for it to actually appear in the log.
  Rails.logger.error "scancel #{slurm_name.shellescape}: #{$?}"
end