my @jobstep_tomerge = ();
my $jobstep_tomerge_level = 0;
my $squeue_checked = 0;
+my $sinfo_checked = 0;
my $latest_refresh = scalar time;
Log (undef, "docker image hash is $docker_hash");
$docker_stream =~ s/^\.//;
my $docker_install_script = qq{
-if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
- arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
+if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then
+ exit 0
+fi
+declare -a exit_codes=("\${PIPESTATUS[@]}")
+if [ 0 != "\${exit_codes[0]}" ]; then
+ exit "\${exit_codes[0]}" # `docker images` failed
+elif [ 1 != "\${exit_codes[1]}" ]; then
+ exit "\${exit_codes[1]}" # `grep` encountered an error
+else
+ # Everything worked fine, but grep didn't find the image on this host.
+ arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
fi
};
.q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
- # $VOLUME_CRUNCHRUNNER and $VOLUME_CERTS will be passed unquoted as
- # arguments to `docker run`. They must contain their own quoting.
- .q{&& VOLUME_CRUNCHRUNNER="" VOLUME_CERTS="" }
- .q{&& if which crunchrunner >/dev/null ; then VOLUME_CRUNCHRUNNER=\\"--volume=$(which crunchrunner):/usr/local/bin/crunchrunner\\" ; fi }
- .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUME_CERTS=\\"--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt\\" ; }
- .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUME_CERTS=\\"--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt\\" ; fi };
+ .q{&& declare -a VOLUMES=() }
+ .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi }
+ .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; }
+ .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi };
$command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
$ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
# Bind mount the crunchrunner binary and host TLS certificates file into
# the container.
- $command .= "\$VOLUME_CRUNCHRUNNER \$VOLUME_CERTS ";
+ $command .= '"${VOLUMES[@]}" ';
while (my ($env_key, $env_val) = each %ENV)
{
}
}
+sub check_sinfo
+{
+ # If a node fails in a multi-node "srun" call during job setup, the call
+ # may hang instead of exiting with a nonzero code. This function checks
+ # "sinfo" for the health of the nodes that were allocated and ensures that
+ # they are all still in the "alloc" state. If a node that is allocated to
+ # this job is not in "alloc" state, then set please_freeze.
+ #
+ # This is only called from srun_sync() for node configuration. If a
+ # node fails doing actual work, there are other recovery mechanisms.
+
+ # Do not call `sinfo` more than once every 15 seconds.
+ return if $sinfo_checked > time - 15;
+ $sinfo_checked = time;
+
+ # The output format "%t" means output node states.
+ my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
+ if ($? != 0)
+ {
+ Log(undef, "warning: sinfo exit status $? ($!)");
+ return;
+ }
+ chop @sinfo;
+
+ foreach (@sinfo)
+ {
+ if ($_ != "alloc" && $_ != "alloc*") {
+ $main::please_freeze = 1;
+ }
+ }
+}
sub release_allocation
{
substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
Log ($jobstepidx, "stderr $line");
if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
- # whoa.
+ # If the allocation is revoked, we can't possibly continue, so mark all
+ # nodes as failed. This will cause the overall exit code to be
+ # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
+ # this job.
$main::please_freeze = 1;
+ foreach my $st (@slot) {
+ $st->{node}->{fail_count}++;
+ }
}
elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
$jobstep[$jobstepidx]->{tempfail} = 1;
return $s;
}
-
sub srun_sync
{
my $srunargs = shift;
if (!$busy || ($latest_refresh + 2 < scalar time)) {
check_refresh_wanted();
check_squeue();
+ check_sinfo();
}
if (!$busy) {
select(undef, undef, undef, 0.1);