12027: Recognize a new "node failed" error message.
author Tom Clegg <tom@curoverse.com>
Mon, 7 Aug 2017 13:58:04 +0000 (09:58 -0400)
committer Tom Clegg <tom@curoverse.com>
Mon, 7 Aug 2017 13:58:04 +0000 (09:58 -0400)
"srun: error: Cannot communicate with node 0.  Aborting job."

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curoverse.com>

sdk/cli/bin/crunch-job

index 5a92176e7f02fba11525190ccee511339819e1d2..5e6c3a084ed49d4f5d9e8beff6d9e38f815a2c5c 100755 (executable)
@@ -1544,7 +1544,7 @@ sub preprocess_stderr
         $st->{node}->{fail_count}++;
       }
     }
-    elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b)/i) {
+    elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b|cannot communicate with node .* aborting job)/i) {
       $jobstep[$jobstepidx]->{tempfail} = 1;
       if (defined($job_slot_index)) {
         $slot[$job_slot_index]->{node}->{fail_count}++;