X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/1e3f8ceebd90058e902494fae84b1fd57ac6693b..febdebbb58592be73dcf7d4bd4b2c7ff96657741:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index f2e9fc2878..b8afe638ac 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -1,10 +1,9 @@ #!/usr/bin/env perl +# -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*- # Copyright (C) The Arvados Authors. All rights reserved. # # SPDX-License-Identifier: AGPL-3.0 -# -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*- - =head1 NAME crunch-job: Execute job steps, save snapshots as requested, collate output. @@ -133,6 +132,7 @@ my $resume_stash; my $cgroup_root = "/sys/fs/cgroup"; my $docker_bin = "docker.io"; my $docker_run_args = ""; +my $srun_sync_timeout = 15*60; GetOptions('force-unlock' => \$force_unlock, 'git-dir=s' => \$git_dir, 'job=s' => \$jobspec, @@ -142,6 +142,7 @@ GetOptions('force-unlock' => \$force_unlock, 'cgroup-root=s' => \$cgroup_root, 'docker-bin=s' => \$docker_bin, 'docker-run-args=s' => \$docker_run_args, + 'srun-sync-timeout=i' => \$srun_sync_timeout, ); if (defined $job_api_token) { @@ -1188,6 +1189,8 @@ sub reapchildren . $slot[$proc{$pid}->{slot}]->{cpu}); my $jobstepidx = $proc{$pid}->{jobstepidx}; + readfrompipes_after_exit ($jobstepidx); + $children_reaped++; my $elapsed = time - $proc{$pid}->{time}; my $Jobstep = $jobstep[$jobstepidx]; @@ -1259,7 +1262,6 @@ sub reapchildren $Jobstep->{finishtime} = time; $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime}); retry_op(sub { $Jobstep->{'arvados_task'}->save; }, "job_tasks.update API"); - process_stderr_final ($jobstepidx); Log ($jobstepidx, sprintf("task output (%d bytes): %s", length($Jobstep->{'arvados_task'}->{output}), $Jobstep->{'arvados_task'}->{output})); @@ -1562,9 +1564,27 @@ sub preprocess_stderr } -sub process_stderr_final +# Read whatever is still available on its stderr+stdout pipes after +# the given child process has exited. +sub readfrompipes_after_exit { my $jobstepidx = shift; + + # The fact that the child has exited allows some convenient + # simplifications: (1) all data must have already been written, so + # there's no need to wait for more once sysread returns 0; (2) the + # total amount of data available is bounded by the pipe buffer size, + # so it's safe to read everything into one string. + my $buf; + while (0 < sysread ($reader{$jobstepidx}, $buf, 65536)) { + $jobstep[$jobstepidx]->{stderr_at} = time; + $jobstep[$jobstepidx]->{stderr} .= $buf; + } + if ($jobstep[$jobstepidx]->{stdout_r}) { + while (0 < sysread ($jobstep[$jobstepidx]->{stdout_r}, $buf, 65536)) { + $jobstep[$jobstepidx]->{stdout_captured} .= $buf; + } + } preprocess_stderr ($jobstepidx); map { @@ -1989,6 +2009,8 @@ sub srun_sync my ($stdout_r, $stdout_w); pipe $stdout_r, $stdout_w or croak("pipe() failed: $!"); + my $started_srun = scalar time; + my $srunpid = fork(); if ($srunpid == 0) { @@ -2032,12 +2054,17 @@ sub srun_sync if (!$busy) { select(undef, undef, undef, 0.1); } + if (($started_srun + $srun_sync_timeout) < scalar time) { + # Exceeded general timeout for "srun_sync" operations, likely + # means something got stuck on the remote node. + Log(undef, "srun_sync exceeded timeout, will fail."); + $main::please_freeze = 1; + } killem(keys %proc) if $main::please_freeze; } my $exited = $?; - 1 while readfrompipes(); - process_stderr_final ($jobstepidx); + readfrompipes_after_exit ($jobstepidx); Log (undef, "$label: exit ".exit_status_s($exited));