13546: crunch-job has timeout on srun_sync
author Peter Amstutz <pamstutz@veritasgenetics.com>
Fri, 20 Jul 2018 20:42:56 +0000 (16:42 -0400)
committer Peter Amstutz <pamstutz@veritasgenetics.com>
Fri, 20 Jul 2018 20:42:56 +0000 (16:42 -0400)
* Add a global timeout to srun_sync (default 15 minutes); the job is
  terminated if a call to srun_sync exceeds the timeout.

* Default can be adjusted by setting CRUNCH_SRUN_SYNC_TIMEOUT in the
  environment of crunch_dispatch.rb (value is in seconds)

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz@veritasgenetics.com>

sdk/cli/bin/crunch-job
services/api/lib/crunch_dispatch.rb

index 9343fcfbfd2f97bc182daa788f5c45f74b8ae078..b8afe638ac3c6a517058fd3e85a49b90607f150c 100755 (executable)
@@ -132,6 +132,7 @@ my $resume_stash;
 my $cgroup_root = "/sys/fs/cgroup";
 my $docker_bin = "docker.io";
 my $docker_run_args = "";
+my $srun_sync_timeout = 15*60;
 GetOptions('force-unlock' => \$force_unlock,
            'git-dir=s' => \$git_dir,
            'job=s' => \$jobspec,
@@ -141,6 +142,7 @@ GetOptions('force-unlock' => \$force_unlock,
            'cgroup-root=s' => \$cgroup_root,
            'docker-bin=s' => \$docker_bin,
            'docker-run-args=s' => \$docker_run_args,
+           'srun-sync-timeout=i' => \$srun_sync_timeout,
     );
 
 if (defined $job_api_token) {
@@ -2007,6 +2009,8 @@ sub srun_sync
   my ($stdout_r, $stdout_w);
   pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");
 
+  my $started_srun = scalar time;
+
   my $srunpid = fork();
   if ($srunpid == 0)
   {
@@ -2050,6 +2054,12 @@ sub srun_sync
     if (!$busy) {
       select(undef, undef, undef, 0.1);
     }
+    if (($started_srun + $srun_sync_timeout) < scalar time) {
+      # Exceeded general timeout for "srun_sync" operations, likely
+      # means something got stuck on the remote node.
+      Log(undef, "srun_sync exceeded timeout, will fail.");
+      $main::please_freeze = 1;
+    }
     killem(keys %proc) if $main::please_freeze;
   }
   my $exited = $?;
index 73ad7606cc879ef58f7569c960196191c7fb7721..449d7d51626a1963ab39e83e3e95998f50d21b1e 100644 (file)
@@ -29,6 +29,7 @@ class CrunchDispatch
     @docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
     @docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
     @cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
+    @srun_sync_timeout = ENV['CRUNCH_SRUN_SYNC_TIMEOUT']
 
     @arvados_internal = Rails.configuration.git_internal_dir
     if not File.exist? @arvados_internal
@@ -419,6 +420,10 @@ class CrunchDispatch
         cmd_args += ['--docker-run-args', @docker_run_args]
       end
 
+      if @srun_sync_timeout
+        cmd_args += ['--srun-sync-timeout', @srun_sync_timeout]
+      end
+
       if have_job_lock?(job)
         cmd_args << "--force-unlock"
       end