X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/1a87c77a5cfe200921a9d09e995383bc675e7370..0654fa160f872fe20ea4ada42a655f9d154c0833:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index 7584d3a83d..e0b2710798 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -417,18 +417,22 @@ if ($docker_locator = $Job->{docker_image_locator}) { Log (undef, "docker image hash is $docker_hash"); $docker_stream =~ s/^\.//; my $docker_install_script = qq{ -if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then - exit 0 +loaded() { + id=\$($docker_bin inspect --format="{{.ID}}" \Q$docker_hash\E) || return 1 + echo "image ID is \$id" + [[ \${id} = \Q$docker_hash\E ]] +} +if loaded >&2 2>/dev/null; then + echo >&2 "image is already present" + exit 0 fi -declare -a exit_codes=("\${PIPESTATUS[@]}") -if [ 0 != "\${exit_codes[0]}" ]; then - exit "\${exit_codes[0]}" # `docker images` failed -elif [ 1 != "\${exit_codes[1]}" ]; then - exit "\${exit_codes[1]}" # `grep` encountered an error -else - # Everything worked fine, but grep didn't find the image on this host. - arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load +echo >&2 "docker image is not present; loading" +arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load +if ! loaded >&2; then + echo >&2 "`docker load` exited 0, but image is not found (!)" + exit 1 fi +echo >&2 "image loaded successfully" }; my ($exited, $stdout, $stderr) = srun_sync( @@ -715,6 +719,15 @@ foreach (split (/\n/, $Job->{knobs})) { Log (undef, "knob " . $_); } +my $resp = api_call( + 'nodes/list', + 'filters' => [['hostname', 'in', \@node]], + 'order' => 'hostname', + 'limit' => scalar(@node), + ); +for my $n (@{$resp->{items}}) { + Log(undef, "$n->{hostname} $n->{uuid} ".JSON::encode_json($n->{properties})); +} @@ -864,9 +877,9 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) " ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP " .q{&& declare -a VOLUMES=() } - .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi } - .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; } - .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi }; + .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner:ro") ; fi } + .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt:ro") ; } + .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt:ro") ; fi }; $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec "; $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh"; @@ -1509,7 +1522,7 @@ sub preprocess_stderr my $line = $1; substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), ""; Log ($jobstepidx, "stderr $line"); - if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) { + if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/i) { # If the allocation is revoked, we can't possibly continue, so mark all # nodes as failed. This will cause the overall exit code to be # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run @@ -1519,14 +1532,14 @@ sub preprocess_stderr $st->{node}->{fail_count}++; } } - elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) { + elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b)/i) { $jobstep[$jobstepidx]->{tempfail} = 1; if (defined($job_slot_index)) { $slot[$job_slot_index]->{node}->{fail_count}++; ban_node_by_slot($job_slot_index); } } - elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) { + elsif ($line =~ /srun: error: (Unable to create job step|.*?: Communication connection failure)/i) { $jobstep[$jobstepidx]->{tempfail} = 1; ban_node_by_slot($job_slot_index) if (defined($job_slot_index)); } @@ -1593,9 +1606,10 @@ sub create_output_collection import arvados import sys print (arvados.api("v1").collections(). - create(body={"manifest_text": sys.stdin.read()}). + create(body={"manifest_text": sys.stdin.read(), + "owner_uuid": sys.argv[2]}). execute(num_retries=int(sys.argv[1]))["portable_data_hash"]) -}, retry_count()); +}, retry_count(), $Job->{owner_uuid}); my $task_idx = -1; my $manifest_size = 0; @@ -1780,7 +1794,7 @@ sub log_writer_finish() close($log_pipe_in); my $logger_failed = 0; - my $read_result = log_writer_read_output(120); + my $read_result = log_writer_read_output(600); if ($read_result == -1) { $logger_failed = -1; Log (undef, "timed out reading from 'arv-put'"); @@ -1843,6 +1857,7 @@ sub croak my ($package, $file, $line) = caller; my $message = "@_ at $file line $line\n"; Log (undef, $message); + release_allocation(); freeze() if @jobstep_todo; create_output_collection() if @jobstep_todo; cleanup(); @@ -2104,7 +2119,7 @@ sub find_docker_image { } } } - if (defined($filename) and ($filename =~ /^([0-9A-Fa-f]{64})\.tar$/)) { + if (defined($filename) and ($filename =~ /^((?:sha256:)?[0-9A-Fa-f]{64})\.tar$/)) { return ($streamname, $1); } else { return (undef, undef);