# NOTE(review): this span reads as a diff fragment: lines starting with "-"
# are the previous patterns and the "+" lines directly below them are their
# replacements. Every replacement adds the /i flag; the wildcard parts become
# non-greedy (".*?"), and the node-failure pattern additionally allows
# arbitrary text between "srun: error: " and the matched phrase.

# Take the text captured by the preceding match (not visible here) as one
# line of this job step's stderr.
my $line = $1;
# Remove the first 1+length($line) characters from the buffered stderr, i.e.
# the line itself plus one more character (presumably its trailing newline;
# confirm against the pattern that set $1).
substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
# Record the line in the log, tagged with the job step index.
Log ($jobstepidx, "stderr $line");
# Case 1: srun reports that the SLURM job has expired or that the allocation
# cannot be confirmed.
# NOTE(review): $ENV{SLURM_JOB_ID} is interpolated into the pattern without
# \Q...\E; harmless if the id is purely numeric -- confirm.
- if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
+ if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/i) {
# If the allocation is revoked, we can't possibly continue, so mark all
# nodes as failed. This will cause the overall exit code to be
# EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
# NOTE(review): the sentence above is cut off in this view, and the loop that
# declares $st is not visible either; presumably it iterates over every
# slot -- confirm against the full file.
$st->{node}->{fail_count}++;
}
}
# Case 2: srun reports a node failure or an abort caused by an io error.
# Flag the job step as a temporary failure and, when a slot index is known,
# count the failure against that slot's node and ban the node.
- elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
+ elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
if (defined($job_slot_index)) {
$slot[$job_slot_index]->{node}->{fail_count}++;
ban_node_by_slot($job_slot_index);
}
}
# Case 3: srun could not create the job step or hit a communication
# connection failure. Flag the job step as a temporary failure and ban the
# node when a slot index is known; unlike case 2, the node's fail_count is
# not incremented here.
- elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
+ elsif ($line =~ /srun: error: (Unable to create job step|.*?: Communication connection failure)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
}