#!/usr/bin/env perl
+# -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0
-# -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
-
=head1 NAME
crunch-job: Execute job steps, save snapshots as requested, collate output.
my $cgroup_root = "/sys/fs/cgroup";
my $docker_bin = "docker.io";
my $docker_run_args = "";
+my $srun_sync_timeout = 15*60;
GetOptions('force-unlock' => \$force_unlock,
'git-dir=s' => \$git_dir,
'job=s' => \$jobspec,
'cgroup-root=s' => \$cgroup_root,
'docker-bin=s' => \$docker_bin,
'docker-run-args=s' => \$docker_run_args,
+ 'srun-sync-timeout=i' => \$srun_sync_timeout,
);
if (defined $job_api_token) {
close($_);
}
fcntl ("writer", F_SETFL, 0) or croak ($!); # no close-on-exec
- open(STDOUT,">&writer");
- open(STDERR,">&writer");
+ open(STDOUT,">&writer") or croak ($!);
+ open(STDERR,">&writer") or croak ($!);
undef $dbh;
undef $sth;
. $slot[$proc{$pid}->{slot}]->{cpu});
my $jobstepidx = $proc{$pid}->{jobstepidx};
+ readfrompipes_after_exit ($jobstepidx);
+
$children_reaped++;
my $elapsed = time - $proc{$pid}->{time};
my $Jobstep = $jobstep[$jobstepidx];
$Jobstep->{finishtime} = time;
$Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
retry_op(sub { $Jobstep->{'arvados_task'}->save; }, "job_tasks.update API");
- process_stderr_final ($jobstepidx);
Log ($jobstepidx, sprintf("task output (%d bytes): %s",
length($Jobstep->{'arvados_task'}->{output}),
$Jobstep->{'arvados_task'}->{output}));
}
-sub process_stderr_final
+# Read whatever is still available on the stderr+stdout pipes of the
+# given child process after it has exited.
+sub readfrompipes_after_exit
{
my $jobstepidx = shift;
+
+ # The fact that the child has exited allows some convenient
+ # simplifications: (1) all data must have already been written, so
+ # there's no need to wait for more once sysread returns 0; (2) the
+ # total amount of data available is bounded by the pipe buffer size,
+ # so it's safe to read everything into one string.
+ my $buf;
+ while (0 < sysread ($reader{$jobstepidx}, $buf, 65536)) {
+ $jobstep[$jobstepidx]->{stderr_at} = time;
+ $jobstep[$jobstepidx]->{stderr} .= $buf;
+ }
+ if ($jobstep[$jobstepidx]->{stdout_r}) {
+ while (0 < sysread ($jobstep[$jobstepidx]->{stdout_r}, $buf, 65536)) {
+ $jobstep[$jobstepidx]->{stdout_captured} .= $buf;
+ }
+ }
preprocess_stderr ($jobstepidx);
map {
my ($stdout_r, $stdout_w);
pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");
+ my $started_srun = scalar time;
+
my $srunpid = fork();
if ($srunpid == 0)
{
close($stdout_r);
fcntl($stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
fcntl($stdout_w, F_SETFL, 0) or croak($!);
- open(STDERR, ">&", $stderr_w);
- open(STDOUT, ">&", $stdout_w);
+ open(STDERR, ">&", $stderr_w) or croak ($!);
+ open(STDOUT, ">&", $stdout_w) or croak ($!);
srun ($srunargs, $execargs, $opts, $stdin);
exit (1);
}
if (!$busy) {
select(undef, undef, undef, 0.1);
}
+ if (($started_srun + $srun_sync_timeout) < scalar time) {
+ # Exceeded the general timeout for "srun_sync" operations; this
+ # likely means something got stuck on the remote node.
+ Log(undef, "srun_sync exceeded timeout, will fail.");
+ $main::please_freeze = 1;
+ }
killem(keys %proc) if $main::please_freeze;
}
my $exited = $?;
- 1 while readfrompipes();
- process_stderr_final ($jobstepidx);
+ readfrompipes_after_exit ($jobstepidx);
Log (undef, "$label: exit ".exit_status_s($exited));
# Hide messages from the install script (unless it fails: shell_or_die
# will show $destdir.log in that case).
-open(STDOUT, ">>", "$destdir.log");
-open(STDERR, ">&", STDOUT);
+open(STDOUT, ">>", "$destdir.log") or die ($!);
+open(STDERR, ">&", STDOUT) or die ($!);
if (-e "$destdir/crunch_scripts/install") {
shell_or_die (undef, "$destdir/crunch_scripts/install", $install_dir);
sub can_run {
my $command_name = shift;
- open(my $which, "-|", "which", $command_name);
+ open(my $which, "-|", "which", $command_name) or die ($!);
while (<$which>) { }
close($which);
return ($? == 0);