+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
require 'open3'
require 'shellwords'
@fetched_commits[sha1] = ($? == 0)
end
- def tag_commit(commit_hash, tag_name)
+ def tag_commit(job, commit_hash, tag_name)
# @git_tags[T]==V if we know commit V has been tagged T in the
# arvados_internal repository.
if not @git_tags[tag_name]
next
end
ready &&= get_commit repo.server_path, job.script_version
- ready &&= tag_commit job.script_version, job.uuid
+ ready &&= tag_commit job, job.script_version, job.uuid
end
# This should be unnecessary, because API server does it during
# job create/update, but it's still not a bad idea to verify the
# tag is correct before starting the job:
- ready &&= tag_commit job.script_version, job.uuid
+ ready &&= tag_commit job, job.script_version, job.uuid
# The arvados_sdk_version doesn't support use of arbitrary
# remote URLs, so the requested version isn't necessarily copied
# into the internal repository yet.
if job.arvados_sdk_version
ready &&= get_commit @arvados_repo_path, job.arvados_sdk_version
- ready &&= tag_commit job.arvados_sdk_version, "#{job.uuid}-arvados-sdk"
+ ready &&= tag_commit job, job.arvados_sdk_version, "#{job.uuid}-arvados-sdk"
end
if not ready
i, o, e, t = Open3.popen3(*cmd_args)
rescue
$stderr.puts "dispatch: popen3: #{$!}"
- sleep 1
- next
+ # This is a dispatch problem like "Too many open files";
+ # retrying another job right away would be futile. Just return
+ # and hope things are better next time, after (at least) a
+ # did_recently() delay.
+ return
end
$stderr.puts "dispatch: job #{job.uuid}"
pid_done = nil
j_done = nil
- if false
- begin
- pid_done = waitpid(-1, Process::WNOHANG | Process::WUNTRACED)
- if pid_done
- j_done = @running.values.
- select { |j| j[:wait_thr].pid == pid_done }.
- first
- end
- rescue SystemCallError
- # I have @running processes but system reports I have no
- # children. This is likely to happen repeatedly if it happens at
- # all; I will log this no more than once per child process I
- # start.
- if 0 < @running.select { |uuid,j| j[:warned_waitpid_error].nil? }.size
- children = @running.values.collect { |j| j[:wait_thr].pid }.join ' '
- $stderr.puts "dispatch: IPC bug: waitpid() error (#{$!}), but I have children #{children}"
- end
- @running.each do |uuid,j| j[:warned_waitpid_error] = true end
- end
- else
- @running.each do |uuid, j|
- if j[:wait_thr].status == false
- pid_done = j[:wait_thr].pid
- j_done = j
- end
+ @running.each do |uuid, j|
+ if !j[:wait_thr].status
+ pid_done = j[:wait_thr].pid
+ j_done = j
+ break
end
end
jobrecord = Job.find_by_uuid(job_done.uuid)
if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
+ $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure")
# Only this crunch-dispatch process can retry the job:
# it's already locked, and there's no way to put it back in the
# Queued state. Put it in our internal todo list unless the job
# has failed this way excessively.
@job_retry_counts[jobrecord.uuid] += 1
exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+ do_what_next = "give up now"
if exit_tempfail
@todo_job_retries[jobrecord.uuid] = jobrecord
- else
- $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+ do_what_next = "re-attempt"
end
+ $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " +
+ "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}")
end
if !exit_tempfail
def check_orphaned_slurm_jobs
act_as_system_user do
- squeue_uuids = squeue_jobs.select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
+ squeue_uuids = squeue_jobs.select{|uuid| uuid.match(/^[0-9a-z]{5}-8i9sb-[0-9a-z]{15}$/)}.
select{|uuid| !@running.has_key?(uuid)}
return if squeue_uuids.size == 0
# An array of job_uuids in squeue
def squeue_jobs
if Rails.configuration.crunch_job_wrapper == :slurm_immediate
- IO.popen(['squeue', '-a', '-h', '-o', '%j']) do |squeue_pipe|
- squeue_pipe.readlines.map do |line|
- line.strip
- end
+ p = IO.popen(['squeue', '-a', '-h', '-o', '%j'])
+ begin
+ p.readlines.map {|line| line.strip}
+ ensure
+ p.close
end
else
[]