From: Peter Amstutz Date: Wed, 18 Mar 2015 16:23:21 +0000 (-0400) Subject: 5500: Add SLURM "Communication connection failure" to pattern of temporary node X-Git-Tag: 1.1.0~1735^2~1 X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/3365d47ab4f504a1e849852691313cddd89d0f15 5500: Add SLURM "Communication connection failure" to pattern of temporary node failures. "pip install" failure returns temporary error status (111) so the task can be retried. --- diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index cc47bbeb5d..d40df908ee 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -1232,7 +1232,7 @@ sub preprocess_stderr # whoa. $main::please_freeze = 1; } - elsif ($line =~ /srun: error: (Node failure on|Unable to create job step) /) { + elsif ($line =~ /srun: error: (Node failure on|Unable to create job step|.*: Communication connection failure)/) { $jobstep[$job]->{node_fail} = 1; ban_node_by_slot($jobstep[$job]->{slotindex}); } @@ -1876,9 +1876,9 @@ if (@ARGV) { my $venv_dir = "$job_work/.arvados.venv"; my $venv_built = -e "$venv_dir/bin/activate"; if ((!$venv_built) and (-d $python_src) and can_run("virtualenv")) { - shell_or_die("virtualenv", "--quiet", "--system-site-packages", + shell_or_die(undef, "virtualenv", "--quiet", "--system-site-packages", "--python=python2.7", $venv_dir); - shell_or_die("$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src); + shell_or_die(111, "$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src); $venv_built = 1; $Log->("Built Python SDK virtualenv"); } @@ -1974,12 +1974,12 @@ if ((-d $python_dir) and can_run("python2.7") and } if (-e "$destdir/crunch_scripts/install") { - shell_or_die ("$destdir/crunch_scripts/install", $install_dir); + shell_or_die (undef, "$destdir/crunch_scripts/install", $install_dir); } elsif (!-e "./install.sh" && -e "./tests/autotests.sh") { # Old version - shell_or_die ("./tests/autotests.sh", $install_dir); + shell_or_die (undef, "./tests/autotests.sh", $install_dir); } elsif (-e "./install.sh") { - shell_or_die ("./install.sh", $install_dir); + shell_or_die (undef, "./install.sh", $install_dir); } if ($commit) { @@ -2000,15 +2000,24 @@ sub can_run { sub shell_or_die { + my $tempfail = shift; + if ($ENV{"DEBUG"}) { print STDERR "@_\n"; } if (system (@_) != 0) { my $err = $!; - my $exitstatus = sprintf("exit %d signal %d", $? >> 8, $? & 0x7f); + my $code = $?; + my $exitstatus = sprintf("exit %d signal %d", $code >> 8, $code & 0x7f); open STDERR, ">&STDERR_ORIG"; system ("cat $destdir.log >&2"); - die "@_ failed ($err): $exitstatus"; + print STDERR "@_ failed ($err): $exitstatus"; + if ($tempfail) { + exit $tempfail; + } + else { + exit ($code >> 8); + } } }