# Default values for command-line options; the GetOptions call below
# overwrites each one when the matching flag is given.
my $cgroup_root = "/sys/fs/cgroup";
my $docker_bin = "docker.io";
my $docker_run_args = "";
# Time limit in seconds (15 minutes by default). It is added to a time()
# reading elsewhere in this file to decide when an srun_sync operation
# has run too long.
+my $srun_sync_timeout = 15*60;
# Option specs: "=s" takes a string value, "=i" takes an integer value,
# and a bare name such as force-unlock is a flag that takes no value.
# NOTE(review): the return value of GetOptions is not checked here, so a
# command line that fails to parse is not rejected by this statement --
# confirm whether that is intentional.
GetOptions('force-unlock' => \$force_unlock,
'git-dir=s' => \$git_dir,
'job=s' => \$jobspec,
'cgroup-root=s' => \$cgroup_root,
'docker-bin=s' => \$docker_bin,
'docker-run-args=s' => \$docker_run_args,
+ 'srun-sync-timeout=i' => \$srun_sync_timeout,
);
# NOTE(review): the lines below are not contiguous in the original file
# (the braces do not balance), so the comments describe each visible
# statement on its own rather than the overall control flow.
if (defined $job_api_token) {
my ($stdout_r, $stdout_w);
# Create a read/write handle pair; abort via croak if pipe() fails.
pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");
# Remember when this srun was started (epoch seconds) so the elapsed
# time can later be compared against $srun_sync_timeout.
+ my $started_srun = scalar time;
+
# NOTE(review): fork() returns undef on failure, and undef == 0 is true
# in Perl, so a failed fork would take the same branch as the child --
# confirm whether an explicit defined() check is wanted.
my $srunpid = fork();
if ($srunpid == 0)
{
if (!$busy) {
# Four-argument select with no handles is a sub-second sleep (0.1 s).
select(undef, undef, undef, 0.1);
}
+ if (($started_srun + $srun_sync_timeout) < scalar time) {
+ # Exceeded general timeout for "srun_sync" operations, likely
+ # means something got stuck on the remote node.
+ Log(undef, "srun_sync exceeded timeout, will fail.");
+ $main::please_freeze = 1;
+ }
# When the freeze flag is set (including by the timeout above), pass
# every key of %proc to killem.
# NOTE(review): presumably those keys are child pids -- verify.
killem(keys %proc) if $main::please_freeze;
}
# Save the current value of $? before anything else can overwrite it.
my $exited = $?;
# Settings taken from the environment; ENV[...] yields nil for a variable
# that is not set.
@docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
@docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
@cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
+ @srun_sync_timeout = ENV['CRUNCH_SRUN_SYNC_TIMEOUT']
@arvados_internal = Rails.configuration.git_internal_dir
# NOTE(review): the lines that follow are not contiguous in the original
# file (this `if` and the statement after it come from different places),
# so they are described individually.
if not File.exist? @arvados_internal
cmd_args += ['--docker-run-args', @docker_run_args]
end
# Forward the timeout to the job command only when the variable is set.
# NOTE(review): an empty string is truthy in Ruby, so an empty
# CRUNCH_SRUN_SYNC_TIMEOUT would still be appended here -- confirm that
# the receiving option parser tolerates that.
+ if @srun_sync_timeout
+ cmd_args += ['--srun-sync-timeout', @srun_sync_timeout]
+ end
+
# Add --force-unlock when have_job_lock?(job) is true.
if have_job_lock?(job)
cmd_args << "--force-unlock"
end