X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/2b34839cdf95291a7356554e05e50b9ced177dd6..f6b5fd14131be53487ed1652bf5d6cc328171b66:/services/api/lib/crunch_dispatch.rb diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb index 34fed92fe0..230f03e577 100644 --- a/services/api/lib/crunch_dispatch.rb +++ b/services/api/lib/crunch_dispatch.rb @@ -1,3 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + require 'open3' require 'shellwords' @@ -684,17 +688,20 @@ class CrunchDispatch jobrecord = Job.find_by_uuid(job_done.uuid) if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid) + $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure") # Only this crunch-dispatch process can retry the job: # it's already locked, and there's no way to put it back in the # Queued state. Put it in our internal todo list unless the job # has failed this way excessively. @job_retry_counts[jobrecord.uuid] += 1 exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT + do_what_next = "give up now" if exit_tempfail @todo_job_retries[jobrecord.uuid] = jobrecord - else - $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up") + do_what_next = "re-attempt" end + $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " + + "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}") end if !exit_tempfail @@ -871,22 +878,14 @@ class CrunchDispatch end Rails.logger.info "fail_jobs: threshold is #{threshold}" - if Rails.configuration.crunch_job_wrapper == :slurm_immediate - # [["slurm_job_id", "slurm_job_name"], ...] - squeue = File.popen(['squeue', '-h', '-o', '%i %j']).readlines.map do |line| - line.strip.split(' ', 2) - end - else - squeue = [] - end - + squeue = squeue_jobs Job.where('state = ? and started_at < ?', Job::Running, threshold). each do |job| Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}" - squeue.each do |slurm_id, slurm_name| + squeue.each do |slurm_name| if slurm_name == job.uuid - Rails.logger.info "fail_jobs: scancel #{slurm_id} for #{job.uuid}" - scancel slurm_id + Rails.logger.info "fail_jobs: scancel #{job.uuid}" + scancel slurm_name end end fail_job(job, "cleaned up stale job: started before #{threshold}", @@ -897,21 +896,19 @@ class CrunchDispatch def check_orphaned_slurm_jobs act_as_system_user do - if Rails.configuration.crunch_job_wrapper == :slurm_immediate - squeue_uuids = File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line| - line.strip.split(' ', 1) - end.collect{|l| l[0]}. - select{|uuid| uuid.match(HasUuid::UUID_REGEX)}. - select{|uuid| !@running.has_key?(uuid)} - - return if squeue_uuids.size == 0 - - scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state=? or modified_at>?)', - squeue_uuids, 'Running', (Time.now - 60)).collect(&:uuid) - scancel_uuids.each do |uuid| - Rails.logger.info "orphaned job: scancel #{uuid}" - scancel uuid, true - end + squeue_uuids = squeue_jobs.select{|uuid| uuid.match(/^[0-9a-z]{5}-8i9sb-[0-9a-z]{15}$/)}. + select{|uuid| !@running.has_key?(uuid)} + + return if squeue_uuids.size == 0 + + scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state in (?) or modified_at>?)', + squeue_uuids, + ['Running', 'Queued'], + (Time.now - 60)). + collect(&:uuid) + scancel_uuids.each do |uuid| + Rails.logger.info "orphaned job: scancel #{uuid}" + scancel uuid end end end @@ -970,14 +967,27 @@ class CrunchDispatch end end - def scancel slurm_id, use_name=false - scancel_cmd = ['scancel'] - scancel_cmd << '-n' if use_name - scancel_cmd << slurm_id - cmd = sudo_preface + scancel_cmd - puts File.popen(cmd).read + # An array of job_uuids in squeue + def squeue_jobs + if Rails.configuration.crunch_job_wrapper == :slurm_immediate + p = IO.popen(['squeue', '-a', '-h', '-o', '%j']) + begin + p.readlines.map {|line| line.strip} + ensure + p.close + end + else + [] + end + end + + def scancel slurm_name + cmd = sudo_preface + ['scancel', '-n', slurm_name] + IO.popen(cmd) do |scancel_pipe| + puts scancel_pipe.read + end if not $?.success? - Rails.logger.error "scancel #{slurm_id.shellescape}: $?" + Rails.logger.error "scancel #{slurm_name.shellescape}: $?" end end end