X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/3386dd9826cf143d078aa8985726516932fafa5b..b54478ea1b7c8aaeaf565d591f32769bcdc09b8f:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index 689609dd03..e0aff312cc 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -355,6 +355,7 @@ my @jobstep_done = (); my @jobstep_tomerge = (); my $jobstep_tomerge_level = 0; my $squeue_checked = 0; +my $sinfo_checked = 0; my $latest_refresh = scalar time; @@ -416,8 +417,17 @@ if ($docker_locator = $Job->{docker_image_locator}) { Log (undef, "docker image hash is $docker_hash"); $docker_stream =~ s/^\.//; my $docker_install_script = qq{ -if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then - arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load +if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then + exit 0 +fi +declare -a exit_codes=("\${PIPESTATUS[@]}") +if [ 0 != "\${exit_codes[0]}" ]; then + exit "\${exit_codes[0]}" # `docker images` failed +elif [ 1 != "\${exit_codes[1]}" ]; then + exit "\${exit_codes[1]}" # `grep` encountered an error +else + # Everything worked fine, but grep didn't find the image on this host. 
+ arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load fi }; @@ -853,9 +863,10 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' /dev/null ; then VOLUME_CRUNCHRUNNER=\"--volume=\$(which crunchrunner):/usr/local/bin/crunchrunner\" ; fi " - ."&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUME_CERTS=\"--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt\" ; fi " - ."&& if test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUME_CERTS=\"--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt\" ; fi "; + .q{&& declare -a VOLUMES=() } + .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi } + .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; } + .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi }; $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec "; $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh"; @@ -922,7 +933,7 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) # Bind mount the crunchrunner binary and host TLS certificates file into # the container. - $command .= "\"\$VOLUME_CRUNCHRUNNER\" \"\$VOLUME_CERTS\" "; + $command .= '"${VOLUMES[@]}" '; while (my ($env_key, $env_val) = each %ENV) { @@ -1152,13 +1163,6 @@ sub reapchildren . 
$slot[$proc{$pid}->{slot}]->{cpu});
   my $jobstepidx = $proc{$pid}->{jobstepidx};
 
-  if (!WIFEXITED($childstatus))
-  {
-    # child did not exit (may be temporarily stopped)
-    Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now.");
-    next;
-  }
-
   $children_reaped++;
   my $elapsed = time - $proc{$pid}->{time};
   my $Jobstep = $jobstep[$jobstepidx];
@@ -1398,6 +1402,37 @@ sub check_squeue
   }
 }
 
+sub check_sinfo
+{
+  # If a node fails in a multi-node "srun" call during job setup, the call
+  # may hang instead of exiting with a nonzero code. This function checks
+  # "sinfo" for the health of the nodes that were allocated and ensures that
+  # they are all still in the "alloc" state. If a node that is allocated to
+  # this job is not in "alloc" state, then set please_freeze.
+  #
+  # This is only called from srun_sync() for node configuration. If a
+  # node fails doing actual work, there are other recovery mechanisms.
+
+  # Do not call `sinfo` more than once every 15 seconds.
+  return if $sinfo_checked > time - 15;
+  $sinfo_checked = time;
+
+  # The output format "%t" means output node states.
+  my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
+  if ($? != 0)
+  {
+    Log(undef, "warning: sinfo exit status $? ($!)");
+    return;
+  }
+  chomp @sinfo;  # strip only a trailing newline; chop would eat any last character
+
+  foreach (@sinfo)
+  {
+    if ($_ ne "alloc" && $_ ne "alloc*") {  # string compare: != numifies every state name to 0
+      $main::please_freeze = 1;
+    }
+  }
+}
 
 sub release_allocation
 {
@@ -1466,6 +1501,9 @@ sub readfrompipes
 sub preprocess_stderr
 {
   my $jobstepidx = shift;
+  # slotindex is only defined for children running Arvados job tasks.
+  # Be prepared to handle the undef case (for setup srun calls, etc.).
+  my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
 
   while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
     my $line = $1;
@@ -1475,19 +1513,16 @@ sub preprocess_stderr
       # whoa.
$main::please_freeze = 1; } - elsif (!exists $jobstep[$jobstepidx]->{slotindex}) { - # Skip the following tempfail checks if this srun proc isn't - # attached to a particular worker slot. - } elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) { - my $job_slot_index = $jobstep[$jobstepidx]->{slotindex}; - $slot[$job_slot_index]->{node}->{fail_count}++; $jobstep[$jobstepidx]->{tempfail} = 1; - ban_node_by_slot($job_slot_index); + if (defined($job_slot_index)) { + $slot[$job_slot_index]->{node}->{fail_count}++; + ban_node_by_slot($job_slot_index); + } } elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) { $jobstep[$jobstepidx]->{tempfail} = 1; - ban_node_by_slot($jobstep[$jobstepidx]->{slotindex}); + ban_node_by_slot($job_slot_index) if (defined($job_slot_index)); } elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) { $jobstep[$jobstepidx]->{tempfail} = 1; @@ -1903,7 +1938,6 @@ sub freezeunquote return $s; } - sub srun_sync { my $srunargs = shift; @@ -1958,6 +1992,7 @@ sub srun_sync if (!$busy || ($latest_refresh + 2 < scalar time)) { check_refresh_wanted(); check_squeue(); + check_sinfo(); } if (!$busy) { select(undef, undef, undef, 0.1); @@ -1977,6 +2012,11 @@ sub srun_sync delete $reader{$jobstepidx}; my $j = pop @jobstep; + # If the srun showed signs of tempfail, ensure the caller treats that as a + # failure case. + if ($main::please_freeze || $j->{tempfail}) { + $exited ||= 255; + } return ($exited, $j->{stdout_captured}, $j->{stderr_captured}); }