8811: Fixup preprocess_stderr bypass. 8811-srun-sync-tempfail-wip
authorBrett Smith <brett@curoverse.com>
Fri, 1 Apr 2016 19:40:13 +0000 (15:40 -0400)
committerBrett Smith <brett@curoverse.com>
Fri, 1 Apr 2016 19:40:13 +0000 (15:40 -0400)
sdk/cli/bin/crunch-job

index 168f238e87ea7495e4b883b80f4dfe5820b60ffc..05164ac6f042fd477b1a7d71a6c1f17e28cb5ade 100755 (executable)
@@ -1473,6 +1473,9 @@ sub readfrompipes
 sub preprocess_stderr
 {
   my $jobstepidx = shift;
+  # slotindex is only defined for children running Arvados job tasks.
+  # Be prepared to handle the undef case (for setup srun calls, etc.).
+  my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
 
   while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
     my $line = $1;
@@ -1482,19 +1485,16 @@ sub preprocess_stderr
       # whoa.
       $main::please_freeze = 1;
     }
-    elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
-      # Skip the following tempfail checks if this srun proc isn't
-      # attached to a particular worker slot.
-    }
     elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
-      my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
-      $slot[$job_slot_index]->{node}->{fail_count}++;
       $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($job_slot_index);
+      if (defined($job_slot_index)) {
+        $slot[$job_slot_index]->{node}->{fail_count}++;
+        ban_node_by_slot($job_slot_index);
+      }
     }
     elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
       $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+      ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
     }
     elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
       $jobstep[$jobstepidx]->{tempfail} = 1;