X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/ec51cd72eabf0f0bbebf7cd979d1a23a1319a416..91b379c4a76b8278354903158a65e5d47babd363:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index e473710c24..6423c1cf79 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -432,7 +432,7 @@ fi # Determine whether this version of Docker supports memory+swap limits. ($exited, $stdout, $stderr) = srun_sync( - ["srun", "--nodelist=" . $node[0]], + ["srun", "--nodes=1"], [$docker_bin, 'run', '--help'], {label => "check --memory-swap feature"}); $docker_limitmem = ($stdout =~ /--memory-swap/); @@ -455,7 +455,7 @@ fi $try_user_arg = "--user=$try_user"; } my ($exited, $stdout, $stderr) = srun_sync( - ["srun", "--nodelist=" . $node[0]], + ["srun", "--nodes=1"], ["/bin/sh", "-ec", "$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"], {label => $label}); @@ -852,7 +852,13 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' /dev/null ; then VOLUME_CRUNCHRUNNER=\\"--volume=$(which crunchrunner):/usr/local/bin/crunchrunner\\" ; fi } + .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUME_CERTS=\\"--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt\\" ; } + .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUME_CERTS=\\"--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt\\" ; fi }; $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec "; $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh"; @@ -917,6 +923,10 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) # For now, use the same approach as TASK_WORK above. $ENV{"JOB_WORK"} = "/tmp/crunch-job-work"; + # Bind mount the crunchrunner binary and host TLS certificates file into + # the container. + $command .= "\$VOLUME_CRUNCHRUNNER \$VOLUME_CERTS "; + while (my ($env_key, $env_val) = each %ENV) { if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) { @@ -1145,13 +1155,6 @@ sub reapchildren . $slot[$proc{$pid}->{slot}]->{cpu}); my $jobstepidx = $proc{$pid}->{jobstepidx}; - if (!WIFEXITED($childstatus)) - { - # child did not exit (may be temporarily stopped) - Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now."); - next; - } - $children_reaped++; my $elapsed = time - $proc{$pid}->{time}; my $Jobstep = $jobstep[$jobstepidx]; @@ -1459,6 +1462,9 @@ sub readfrompipes sub preprocess_stderr { my $jobstepidx = shift; + # slotindex is only defined for children running Arvados job tasks. + # Be prepared to handle the undef case (for setup srun calls, etc.). + my $job_slot_index = $jobstep[$jobstepidx]->{slotindex}; while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) { my $line = $1; @@ -1468,19 +1474,16 @@ sub preprocess_stderr # whoa. $main::please_freeze = 1; } - elsif (!exists $jobstep[$jobstepidx]->{slotindex}) { - # Skip the following tempfail checks if this srun proc isn't - # attached to a particular worker slot. - } elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) { - my $job_slot_index = $jobstep[$jobstepidx]->{slotindex}; - $slot[$job_slot_index]->{node}->{fail_count}++; $jobstep[$jobstepidx]->{tempfail} = 1; - ban_node_by_slot($job_slot_index); + if (defined($job_slot_index)) { + $slot[$job_slot_index]->{node}->{fail_count}++; + ban_node_by_slot($job_slot_index); + } } elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) { $jobstep[$jobstepidx]->{tempfail} = 1; - ban_node_by_slot($jobstep[$jobstepidx]->{slotindex}); + ban_node_by_slot($job_slot_index) if (defined($job_slot_index)); } elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) { $jobstep[$jobstepidx]->{tempfail} = 1; @@ -1970,6 +1973,11 @@ sub srun_sync delete $reader{$jobstepidx}; my $j = pop @jobstep; + # If the srun showed signs of tempfail, ensure the caller treats that as a + # failure case. + if ($main::please_freeze || $j->{tempfail}) { + $exited ||= 255; + } return ($exited, $j->{stdout_captured}, $j->{stderr_captured}); }