X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/353eb4f4c4fb52e7f2a1c9aaad93e9d6bf0088f4..035b113f60302f6d9c265e6e3a63dbb3c5873153:/services/api/lib/crunch_dispatch.rb diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb index 21843de67e..3cabc1e3ce 100644 --- a/services/api/lib/crunch_dispatch.rb +++ b/services/api/lib/crunch_dispatch.rb @@ -1,3 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + require 'open3' require 'shellwords' @@ -425,8 +429,11 @@ class CrunchDispatch i, o, e, t = Open3.popen3(*cmd_args) rescue $stderr.puts "dispatch: popen3: #{$!}" - sleep 1 - next + # This is a dispatch problem like "Too many open files"; + # retrying another job right away would be futile. Just return + # and hope things are better next time, after (at least) a + # did_recently() delay. + return end $stderr.puts "dispatch: job #{job.uuid}" @@ -629,31 +636,11 @@ class CrunchDispatch pid_done = nil j_done = nil - if false - begin - pid_done = waitpid(-1, Process::WNOHANG | Process::WUNTRACED) - if pid_done - j_done = @running.values. - select { |j| j[:wait_thr].pid == pid_done }. - first - end - rescue SystemCallError - # I have @running processes but system reports I have no - # children. This is likely to happen repeatedly if it happens at - # all; I will log this no more than once per child process I - # start. - if 0 < @running.select { |uuid,j| j[:warned_waitpid_error].nil? }.size - children = @running.values.collect { |j| j[:wait_thr].pid }.join ' ' - $stderr.puts "dispatch: IPC bug: waitpid() error (#{$!}), but I have children #{children}" - end - @running.each do |uuid,j| j[:warned_waitpid_error] = true end - end - else - @running.each do |uuid, j| - if j[:wait_thr].status == false - pid_done = j[:wait_thr].pid - j_done = j - end + @running.each do |uuid, j| + if !j[:wait_thr].status + pid_done = j[:wait_thr].pid + j_done = j + break end end @@ -684,17 +671,20 @@ class CrunchDispatch jobrecord = Job.find_by_uuid(job_done.uuid) if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid) + $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure") # Only this crunch-dispatch process can retry the job: # it's already locked, and there's no way to put it back in the # Queued state. Put it in our internal todo list unless the job # has failed this way excessively. @job_retry_counts[jobrecord.uuid] += 1 exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT + do_what_next = "give up now" if exit_tempfail @todo_job_retries[jobrecord.uuid] = jobrecord - else - $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up") + do_what_next = "re-attempt" end + $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " + + "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}") end if !exit_tempfail @@ -889,7 +879,7 @@ class CrunchDispatch def check_orphaned_slurm_jobs act_as_system_user do - squeue_uuids = squeue_jobs.select{|uuid| uuid.match(HasUuid::UUID_REGEX)}. + squeue_uuids = squeue_jobs.select{|uuid| uuid.match(/^[0-9a-z]{5}-8i9sb-[0-9a-z]{15}$/)}. select{|uuid| !@running.has_key?(uuid)} return if squeue_uuids.size == 0 @@ -963,8 +953,11 @@ class CrunchDispatch # An array of job_uuids in squeue def squeue_jobs if Rails.configuration.crunch_job_wrapper == :slurm_immediate - File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line| - line.strip + p = IO.popen(['squeue', '-a', '-h', '-o', '%j']) + begin + p.readlines.map {|line| line.strip} + ensure + p.close end else [] @@ -973,7 +966,9 @@ class CrunchDispatch def scancel slurm_name cmd = sudo_preface + ['scancel', '-n', slurm_name] - puts File.popen(cmd).read + IO.popen(cmd) do |scancel_pipe| + puts scancel_pipe.read + end if not $?.success? Rails.logger.error "scancel #{slurm_name.shellescape}: $?" end