From 3a8714e6fcf41c46d1fde0a6a3e4beb1367d181d Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Sun, 24 Jan 2016 19:48:06 -0500 Subject: [PATCH] 8284: Fix confusion between %proc and %jobstep. $proc{$pid}->{jobstep} is an index into @jobstep $proc{$pid}->{jobstepname} is the name we told srun to use $proc{$pid}->{killtime} is a deadline when we should kill the process $jobstep[$jobstepid]->{stderr_at} is the time of last stderr received We were mistakenly using $proc{$pid}->{stderr_at}, which was always undef and therefore always less than $last_squeue_check. This resulted in jobs being killed as "slurm orphans" when the real reason they hadn't been returned by waitpid() was that we hadn't finished consuming their stderr yet. --- sdk/cli/bin/crunch-job | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index 70d05f023c..7c50c282da 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -1340,8 +1340,9 @@ sub check_squeue # squeue check interval (15s) this should make the squeue check an # infrequent event. my $silent_procs = 0; - for my $jobstep (values %proc) + for my $procinfo (values %proc) { + my $jobstep = $jobstep[$procinfo->{jobstep}]; if ($jobstep->{stderr_at} < $last_squeue_check) { $silent_procs++; @@ -1350,17 +1351,18 @@ sub check_squeue return if $silent_procs == 0; # use killem() on procs whose killtime is reached - while (my ($pid, $jobstep) = each %proc) + while (my ($pid, $procinfo) = each %proc) { - if (exists $jobstep->{killtime} - && $jobstep->{killtime} <= time + my $jobstep = $jobstep[$procinfo->{jobstep}]; + if (exists $procinfo->{killtime} + && $procinfo->{killtime} <= time && $jobstep->{stderr_at} < $last_squeue_check) { my $sincewhen = ""; if ($jobstep->{stderr_at}) { $sincewhen = " in last " . (time - $jobstep->{stderr_at}) . 
"s"; } - Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)"); + Log($procinfo->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)"); killem ($pid); } } @@ -1395,12 +1397,12 @@ sub check_squeue } # Check for child procs >60s old and not mentioned by squeue. - while (my ($pid, $jobstep) = each %proc) + while (my ($pid, $procinfo) = each %proc) { - if ($jobstep->{time} < time - 60 - && $jobstep->{jobstepname} - && !exists $ok{$jobstep->{jobstepname}} - && !exists $jobstep->{killtime}) + if ($procinfo->{time} < time - 60 + && $procinfo->{jobstepname} + && !exists $ok{$procinfo->{jobstepname}} + && !exists $procinfo->{killtime}) { # According to slurm, this task has ended (successfully or not) # -- but our srun child hasn't exited. First we must wait (30 @@ -1409,8 +1411,8 @@ sub check_squeue # terminated, we'll conclude some slurm communication # error/delay has caused the task to die without notifying srun, # and we'll kill srun ourselves. - $jobstep->{killtime} = time + 30; - Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited"); + $procinfo->{killtime} = time + 30; + Log($procinfo->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited"); } } } -- 2.30.2