X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/b54478ea1b7c8aaeaf565d591f32769bcdc09b8f..b8036cfd2acd1bf2910130deb46be8a38eaff253:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index e0aff312cc..7584d3a83d 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -1510,8 +1510,14 @@ sub preprocess_stderr substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), ""; Log ($jobstepidx, "stderr $line"); if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) { - # whoa. + # If the allocation is revoked, we can't possibly continue, so mark all + # nodes as failed. This will cause the overall exit code to be + # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run + # this job. $main::please_freeze = 1; + foreach my $st (@slot) { + $st->{node}->{fail_count}++; + } } elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) { $jobstep[$jobstepidx]->{tempfail} = 1;