."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
.q{&& declare -a VOLUMES=() }
- .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi }
- .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; }
- .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi };
+ .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner:ro") ; fi }
+ .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt:ro") ; }
+ .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt:ro") ; fi };
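# The VOLUMES array is presumably expanded into the "docker run" command
# line assembled later (not shown in this hunk).  The ":ro" suffix mounts
# crunchrunner and the host's CA bundle read-only, so tasks inside the
# container cannot modify either one.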
$command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
$ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
sub check_sinfo
{
- my $last_sinfo_check = $sinfo_checked;
+ # If a node fails in a multi-node "srun" call during job setup, the call
+ # may hang instead of exiting with a nonzero code. This function checks
+ # "sinfo" for the health of the nodes that were allocated and ensures that
+ # they are all still in the "alloc" state. If a node that is allocated to
+ # this job is not in "alloc" state, then set please_freeze.
+ #
+ # This is only called from srun_sync() for node configuration. If a
+ # node fails doing actual work, there are other recovery mechanisms.
# Do not call `sinfo` more than once every 15 seconds.
- return if $last_sinfo_check > time - 15;
+ return if $sinfo_checked > time - 15;
$sinfo_checked = time;
+ # The output format "%t" tells sinfo to output node states.
my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
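# Each line of @sinfo is a SLURM node state such as "alloc", "idle", or
# "down*" (SLURM appends "*" when a node is not responding); any state
# other than "alloc"/"alloc*" means a node has left this job's allocation.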
if ($? != 0)
{
my $line = $1;
substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
Log ($jobstepidx, "stderr $line");
- if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
- # whoa.
+ if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/i) {
+ # If the allocation is revoked, we can't possibly continue, so mark all
+ # nodes as failed. This will cause the overall exit code to be
+ # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
+ # this job.
$main::please_freeze = 1;
+ foreach my $st (@slot) {
+ $st->{node}->{fail_count}++;
+ }
}
- elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
+ elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
if (defined($job_slot_index)) {
$slot[$job_slot_index]->{node}->{fail_count}++;
ban_node_by_slot($job_slot_index);
}
}
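# Failures to create a job step and communication failures are likely
# node-specific, so mark the step as a temporary failure and take the
# suspect node out of rotation so the retry lands elsewhere.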
- elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
+ elsif ($line =~ /srun: error: (Unable to create job step|.*?: Communication connection failure)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
}
}
}
}
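# Docker 1.10+ saves images under content-addressed IDs of the form
# "sha256:<64 hex digits>", while older releases used a bare 64-digit hex
# image ID; accepting an optional "sha256:" prefix handles both styles of
# tarball name.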
- if (defined($filename) and ($filename =~ /^([0-9A-Fa-f]{64})\.tar$/)) {
+ if (defined($filename) and ($filename =~ /^((?:sha256:)?[0-9A-Fa-f]{64})\.tar$/)) {
return ($streamname, $1);
} else {
return (undef, undef);