From 1b5c30eb957594e00a09df745df7630f661e3807 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 27 May 2015 15:48:54 -0400 Subject: [PATCH] 6146: Retry install (max 3 attempts) if install script fails with no error messages. Also: if install fails, croak() instead of exit(1) so we still get a log file. --- sdk/cli/bin/crunch-job | 119 ++++++++++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index c748904105..6cdaf904c4 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -118,6 +118,7 @@ $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt"; $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated mkdir ($ENV{"JOB_WORK"}); +my %proc; my $force_unlock; my $git_dir; my $jobspec; @@ -589,56 +590,89 @@ if (!defined $git_archive) { } } else { - Log(undef, "Run install script on all workers"); - - my @srunargs = ("srun", - "--nodelist=$nodelist", - "-D", $ENV{'TMPDIR'}, "--job-name=$job_id"); - my @execargs = ("sh", "-c", - "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -"); - - $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive); - my ($install_stderr_r, $install_stderr_w); - pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!"); - set_nonblocking($install_stderr_r); - my $installpid = fork(); - if ($installpid == 0) - { - close($install_stderr_r); - fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec - open(STDOUT, ">&", $install_stderr_w); - open(STDERR, ">&", $install_stderr_w); - srun (\@srunargs, \@execargs, {}, $build_script . $git_archive); - exit (1); - } - close($install_stderr_w); - my $stderr_buf = ''; - while ($installpid != waitpid(-1, WNOHANG)) { - freeze_if_want_freeze ($installpid); - # Wait up to 0.1 seconds for something to appear on stderr, then - # do a non-blocking read. 
- my $bits = fhbits($install_stderr_r); - select ($bits, undef, $bits, 0.1); - if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf))) + my $install_exited; + my $install_script_tries_left = 3; + for (my $attempts = 0; $attempts < $install_script_tries_left; $attempts++) { + Log(undef, "Run install script on all workers"); + + my @srunargs = ("srun", + "--nodelist=$nodelist", + "-D", $ENV{'TMPDIR'}, "--job-name=$job_id"); + my @execargs = ("sh", "-c", + "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -"); + + $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive); + my ($install_stderr_r, $install_stderr_w); + pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!"); + set_nonblocking($install_stderr_r); + defined(my $installpid = fork()) or croak("fork() failed: $!"); # undef == 0 is true, so a failed fork must not reach the child branch + if ($installpid == 0) { - while ($stderr_buf =~ /^(.*?)\n/) { - my $line = $1; - substr $stderr_buf, 0, 1+length($line), ""; - Log(undef, "stderr $line"); + close($install_stderr_r); + fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec + open(STDOUT, ">&", $install_stderr_w); + open(STDERR, ">&", $install_stderr_w); + srun (\@srunargs, \@execargs, {}, $build_script . $git_archive); + exit (1); + } + close($install_stderr_w); + # Tell freeze_if_want_freeze how to kill the child, otherwise the + # "waitpid(installpid)" loop won't get interrupted by a freeze: + $proc{$installpid} = {}; + my $stderr_buf = ''; + # Track whether anything appears on stderr other than slurm errors + # ("srun: ...") and the "starting: ..." message printed by the + # srun subroutine itself: + my $stderr_anything_from_script = 0; + my $match_our_own_errors = '^(srun: error: |starting: \[)'; + while ($installpid != waitpid(-1, WNOHANG)) { + freeze_if_want_freeze ($installpid); + # Wait up to 0.1 seconds for something to appear on stderr, then + # do a non-blocking read. 
+ my $bits = fhbits($install_stderr_r); + select ($bits, undef, $bits, 0.1); + if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf))) + { + while ($stderr_buf =~ /^(.*?)\n/) { + my $line = $1; + substr $stderr_buf, 0, 1+length($line), ""; + Log(undef, "stderr $line"); + if ($line !~ /$match_our_own_errors/) { + $stderr_anything_from_script = 1; + } + } } } - } - my $install_exited = $?; - close($install_stderr_r); - if (length($stderr_buf) > 0) { - Log(undef, "stderr $stderr_buf") + delete $proc{$installpid}; + $install_exited = $?; + close($install_stderr_r); + if (length($stderr_buf) > 0) { + if ($stderr_buf !~ /$match_our_own_errors/) { + $stderr_anything_from_script = 1; + } + Log(undef, "stderr $stderr_buf") + } + + Log (undef, "Install script exited ".exit_status_s($install_exited)); + last if $install_exited == 0 || $main::please_freeze; + # If the install script fails but doesn't print an error message, + # the next thing anyone is likely to do is just run it again in + # case it was a transient problem like "slurm communication fails + # because the network isn't reliable enough". So we'll just do + # that ourselves (up to 3 attempts in total). OTOH, if there is an + # error message, the problem is more likely to have a real fix and + # we should fail the job so the fixing process can start, instead + # of doing 2 more attempts. + last if $stderr_anything_from_script; } - Log (undef, "Install script exited ".exit_status_s($install_exited)); foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) { unlink($tar_filename); } - exit (1) if $install_exited != 0; + + if ($install_exited != 0) { + croak("Giving up"); + } } foreach (qw (script script_version script_parameters runtime_constraints)) @@ -704,7 +738,6 @@ for (my $ii = $#freeslot; $ii >= 0; $ii--) { } Log(undef, "start level $level with $round_num_freeslots slots"); -my %proc; my @holdslot; my %reader; my $progress_is_dirty = 1; -- 2.30.2