#!/usr/bin/env perl
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
# -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
=head1 NAME
$cmd = [$docker_bin, 'ps', '-q'];
}
Log(undef, "Sanity check is `@$cmd`");
-my ($exited, $stdout, $stderr) = srun_sync(
+my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],
$cmd,
{label => "sanity check"});
# Find FUSE mounts under $CRUNCH_TMP and unmount them. Then clean
# up work directories crunch_tmp/work, crunch_tmp/opt,
# crunch_tmp/src*.
- #
- # TODO: When #5036 is done and widely deployed, we can limit mount's
- # -t option to simply fuse.keep.
- my ($exited, $stdout, $stderr) = srun_sync(
+ my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
- ['bash', '-ec', '-o', 'pipefail', 'mount -t fuse,fuse.keep | awk "(index(\$3, \"$CRUNCH_TMP\") == 1){print \$3}" | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid'],
+ ['bash', '-ec', q{
+arv-mount --unmount-timeout 10 --unmount-all ${CRUNCH_TMP}
+rm -rf ${JOB_WORK} ${CRUNCH_INSTALL} ${CRUNCH_TMP}/task ${CRUNCH_TMP}/src* ${CRUNCH_TMP}/*.cid
+ }],
{label => "clean work dirs"});
if ($exited != 0) {
- exit(EX_RETRY_UNLOCKED);
+ exit_retry_unlocked();
}
}
$docker_stream =~ s/^\.//;
my $docker_install_script = qq{
loaded() {
- [[ \$($docker_bin inspect --format="{{.ID}}" \Q$docker_hash\E) = \Q$docker_hash\E ]]
+ id=\$($docker_bin inspect --format="{{.ID}}" \Q$docker_hash\E) || return 1
+ echo "image ID is \$id"
+ [[ \${id} = \Q$docker_hash\E ]]
}
-if loaded 2>/dev/null; then
+if loaded >&2 2>/dev/null; then
echo >&2 "image is already present"
exit 0
fi
echo >&2 "docker image is not present; loading"
arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
-if ! loaded; then
+if ! loaded >&2; then
echo >&2 "`docker load` exited 0, but image is not found (!)"
exit 1
fi
echo >&2 "image loaded successfully"
};
- my ($exited, $stdout, $stderr) = srun_sync(
+ my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
["srun", "--nodelist=" . join(',', @node)],
["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script],
{label => "load docker image"});
if ($exited != 0)
{
- exit(EX_RETRY_UNLOCKED);
+ exit_retry_unlocked();
}
# Determine whether this version of Docker supports memory+swap limits.
- ($exited, $stdout, $stderr) = srun_sync(
+ ($exited, $stdout, $stderr, $tempfail) = srun_sync(
["srun", "--nodes=1"],
[$docker_bin, 'run', '--help'],
{label => "check --memory-swap feature"});
+ if ($tempfail) {
+ exit_retry_unlocked();
+ }
$docker_limitmem = ($stdout =~ /--memory-swap/);
# Find a non-root Docker user to use.
$label = "check whether user '$try_user' is UID 0";
$try_user_arg = "--user=$try_user";
}
- my ($exited, $stdout, $stderr) = srun_sync(
+ my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
["srun", "--nodes=1"],
["/bin/sh", "-ec",
"$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
Log(undef, "Container will run with $dockeruserarg");
}
last;
+ } elsif ($tempfail) {
+ exit_retry_unlocked();
}
}
"mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
$ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
- my ($stdout, $stderr);
- ($exited, $stdout, $stderr) = srun_sync(
+ my ($stdout, $stderr, $tempfail);
+ ($exited, $stdout, $stderr, $tempfail) = srun_sync(
\@srunargs, \@execargs,
{label => "run install script on all workers"},
- $build_script . $git_archive);
+ $build_script . $git_archive);
+ if ($tempfail) {
+ exit_retry_unlocked();
+ }
my $stderr_anything_from_script = 0;
for my $line (split(/\n/, $stderr)) {
} elsif ($working_slot_count < 1) {
save_output_collection();
save_meta();
- exit(EX_RETRY_UNLOCKED);
+ exit_retry_unlocked();
} elsif ($thisround_succeeded == 0 &&
($thisround_failed == 0 || $thisround_failed > 4)) {
my $message = "stop because $thisround_failed tasks failed and none succeeded";
my $collated_output = save_output_collection();
Log (undef, "finish");
-save_meta();
+my $final_log = save_meta();
my $final_state;
-if ($collated_output && $main::success) {
+if ($collated_output && $final_log && $main::success) {
$final_state = 'Complete';
} else {
$final_state = 'Failed';
$st->{node}->{fail_count}++;
}
}
- elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b)/i) {
+ elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b|cannot communicate with node .* aborting job)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
if (defined($job_slot_index)) {
$slot[$job_slot_index]->{node}->{fail_count}++;
$log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
'arv-put',
'--stream',
- '--retries', '3',
+ '--retries', '6',
'--filename', $logfilename,
'-');
$log_pipe_out_buf = "";
});
Log(undef, "log collection is " . $log_coll->{portable_data_hash});
$Job->update_attributes('log' => $log_coll->{portable_data_hash});
+
+ return $log_coll->{portable_data_hash};
}
if ($main::please_freeze || $j->{tempfail}) {
$exited ||= 255;
}
- return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
+ return ($exited, $j->{stdout_captured}, $j->{stderr_captured}, $j->{tempfail});
}
}
}
+# Log that a transient (retryable) failure occurred while we still hold the
+# job lock, then exit with EX_RETRY_UNLOCKED so the caller re-dispatches the
+# job instead of treating it as a permanent failure.  Does not return.
+sub exit_retry_unlocked {
+  Log(undef, "Transient failure with lock acquired; asking for re-dispatch by exiting ".EX_RETRY_UNLOCKED);
+  exit(EX_RETRY_UNLOCKED);
+}
+
sub retry_count {
# Calculate the number of times an operation should be retried,
# assuming exponential backoff, and that we're willing to retry as