From febdebbb58592be73dcf7d4bd4b2c7ff96657741 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 20 Jul 2018 16:42:56 -0400 Subject: [PATCH] 13546: crunch-job has timeout on srun_sync * Add global timeout to srun_sync, default 15 minutes, terminates job if a call to srun_sync exceeds the timeout. * Default can be adjusted by setting CRUNCH_SRUN_SYNC_TIMEOUT in the environment of crunch_dispatch.rb (value is in seconds) Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- sdk/cli/bin/crunch-job | 10 ++++++++++ services/api/lib/crunch_dispatch.rb | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index 9343fcfbfd..b8afe638ac 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -132,6 +132,7 @@ my $resume_stash; my $cgroup_root = "/sys/fs/cgroup"; my $docker_bin = "docker.io"; my $docker_run_args = ""; +my $srun_sync_timeout = 15*60; GetOptions('force-unlock' => \$force_unlock, 'git-dir=s' => \$git_dir, 'job=s' => \$jobspec, @@ -141,6 +142,7 @@ GetOptions('force-unlock' => \$force_unlock, 'cgroup-root=s' => \$cgroup_root, 'docker-bin=s' => \$docker_bin, 'docker-run-args=s' => \$docker_run_args, + 'srun-sync-timeout=i' => \$srun_sync_timeout, ); if (defined $job_api_token) { @@ -2007,6 +2009,8 @@ sub srun_sync my ($stdout_r, $stdout_w); pipe $stdout_r, $stdout_w or croak("pipe() failed: $!"); + my $started_srun = scalar time; + my $srunpid = fork(); if ($srunpid == 0) { @@ -2050,6 +2054,12 @@ sub srun_sync if (!$busy) { select(undef, undef, undef, 0.1); } + if (($started_srun + $srun_sync_timeout) < scalar time) { + # Exceeded general timeout for "srun_sync" operations, likely + # means something got stuck on the remote node. + Log(undef, "srun_sync exceeded timeout, will fail."); + $main::please_freeze = 1; + } killem(keys %proc) if $main::please_freeze; } my $exited = $?; diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb index 73ad7606cc..449d7d5162 100644 --- a/services/api/lib/crunch_dispatch.rb +++ b/services/api/lib/crunch_dispatch.rb @@ -29,6 +29,7 @@ class CrunchDispatch @docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN'] @docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS'] @cgroup_root = ENV['CRUNCH_CGROUP_ROOT'] + @srun_sync_timeout = ENV['CRUNCH_SRUN_SYNC_TIMEOUT'] @arvados_internal = Rails.configuration.git_internal_dir if not File.exist? @arvados_internal @@ -419,6 +420,10 @@ class CrunchDispatch cmd_args += ['--docker-run-args', @docker_run_args] end + if @srun_sync_timeout + cmd_args += ['--srun-sync-timeout', @srun_sync_timeout] + end + if have_job_lock?(job) cmd_args << "--force-unlock" end -- 2.30.2