Merge branch '13006-api-is_a-filter' into 13006-sync-groups-is_a-filter

[arvados.git] / services / api / lib / crunch_dispatch.rb
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb

index bea1657de22b72c0d5296a4c571afcee3ffc0993..449d7d51626a1963ab39e83e3e95998f50d21b1e 100644 (file)
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
  require 'open3'
  require 'shellwords'
  
@@ -25,6 +29,7 @@ class CrunchDispatch
      @docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
      @docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
      @cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
+    @srun_sync_timeout = ENV['CRUNCH_SRUN_SYNC_TIMEOUT']
  
      @arvados_internal = Rails.configuration.git_internal_dir
      if not File.exist? @arvados_internal
@@ -293,7 +298,7 @@ class CrunchDispatch
      @fetched_commits[sha1] = ($? == 0)
    end
  
-  def tag_commit(commit_hash, tag_name)
+  def tag_commit(job, commit_hash, tag_name)
      # @git_tags[T]==V if we know commit V has been tagged T in the
      # arvados_internal repository.
      if not @git_tags[tag_name]
@@ -377,20 +382,20 @@ class CrunchDispatch
            next
          end
          ready &&= get_commit repo.server_path, job.script_version
-        ready &&= tag_commit job.script_version, job.uuid
+        ready &&= tag_commit job, job.script_version, job.uuid
        end
  
        # This should be unnecessary, because API server does it during
        # job create/update, but it's still not a bad idea to verify the
        # tag is correct before starting the job:
-      ready &&= tag_commit job.script_version, job.uuid
+      ready &&= tag_commit job, job.script_version, job.uuid
  
        # The arvados_sdk_version doesn't support use of arbitrary
        # remote URLs, so the requested version isn't necessarily copied
        # into the internal repository yet.
        if job.arvados_sdk_version
          ready &&= get_commit @arvados_repo_path, job.arvados_sdk_version
-        ready &&= tag_commit job.arvados_sdk_version, "#{job.uuid}-arvados-sdk"
+        ready &&= tag_commit job, job.arvados_sdk_version, "#{job.uuid}-arvados-sdk"
        end
  
        if not ready
@@ -415,6 +420,10 @@ class CrunchDispatch
          cmd_args += ['--docker-run-args', @docker_run_args]
        end
  
+      if @srun_sync_timeout
+        cmd_args += ['--srun-sync-timeout', @srun_sync_timeout]
+      end
+
        if have_job_lock?(job)
          cmd_args << "--force-unlock"
        end
@@ -425,8 +434,11 @@ class CrunchDispatch
          i, o, e, t = Open3.popen3(*cmd_args)
        rescue
          $stderr.puts "dispatch: popen3: #{$!}"
-        sleep 1
-        next
+        # This is a dispatch problem like "Too many open files";
+        # retrying another job right away would be futile. Just return
+        # and hope things are better next time, after (at least) a
+        # did_recently() delay.
+        return
        end
  
        $stderr.puts "dispatch: job #{job.uuid}"
@@ -629,31 +641,11 @@ class CrunchDispatch
      pid_done = nil
      j_done = nil
  
-    if false
-      begin
-        pid_done = waitpid(-1, Process::WNOHANG | Process::WUNTRACED)
-        if pid_done
-          j_done = @running.values.
-            select { |j| j[:wait_thr].pid == pid_done }.
-            first
-        end
-      rescue SystemCallError
-        # I have @running processes but system reports I have no
-        # children. This is likely to happen repeatedly if it happens at
-        # all; I will log this no more than once per child process I
-        # start.
-        if 0 < @running.select { |uuid,j| j[:warned_waitpid_error].nil? }.size
-          children = @running.values.collect { |j| j[:wait_thr].pid }.join ' '
-          $stderr.puts "dispatch: IPC bug: waitpid() error (#{$!}), but I have children #{children}"
-        end
-        @running.each do |uuid,j| j[:warned_waitpid_error] = true end
-      end
-    else
-      @running.each do |uuid, j|
-        if j[:wait_thr].status == false
-          pid_done = j[:wait_thr].pid
-          j_done = j
-        end
+    @running.each do |uuid, j|
+      if !j[:wait_thr].status
+        pid_done = j[:wait_thr].pid
+        j_done = j
+        break
        end
      end
  
@@ -684,17 +676,20 @@ class CrunchDispatch
      jobrecord = Job.find_by_uuid(job_done.uuid)
  
      if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
+      $stderr.puts("dispatch: job #{jobrecord.uuid} was interrupted by node failure")
        # Only this crunch-dispatch process can retry the job:
        # it's already locked, and there's no way to put it back in the
        # Queued state.  Put it in our internal todo list unless the job
        # has failed this way excessively.
        @job_retry_counts[jobrecord.uuid] += 1
        exit_tempfail = @job_retry_counts[jobrecord.uuid] <= RETRY_UNLOCKED_LIMIT
+      do_what_next = "give up now"
        if exit_tempfail
          @todo_job_retries[jobrecord.uuid] = jobrecord
-      else
-        $stderr.puts("dispatch: job #{jobrecord.uuid} exceeded node failure retry limit -- giving up")
+        do_what_next = "re-attempt"
        end
+      $stderr.puts("dispatch: job #{jobrecord.uuid} has been interrupted " +
+                   "#{@job_retry_counts[jobrecord.uuid]}x, will #{do_what_next}")
      end
  
      if !exit_tempfail
@@ -889,7 +884,7 @@ class CrunchDispatch
  
    def check_orphaned_slurm_jobs
      act_as_system_user do
-      squeue_uuids = squeue_jobs.select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
+      squeue_uuids = squeue_jobs.select{|uuid| uuid.match(/^[0-9a-z]{5}-8i9sb-[0-9a-z]{15}$/)}.
                                    select{|uuid| !@running.has_key?(uuid)}
  
        return if squeue_uuids.size == 0