X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0d62edcb9d25bf4dcdb20d8872ea7b438e12fc59..3b3c3a0869c2cf528b9e1c45c969bbbd47f6446e:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index 40c9cf325c..35a3b1f23b 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -393,12 +393,12 @@ if (!defined $no_clear_tmp) { # Find FUSE mounts under $CRUNCH_TMP and unmount them. Then clean # up work directories crunch_tmp/work, crunch_tmp/opt, # crunch_tmp/src*. - # - # TODO: When #5036 is done and widely deployed, we can limit mount's - # -t option to simply fuse.keep. my ($exited, $stdout, $stderr) = srun_sync( ["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}], - ['bash', '-ec', '-o', 'pipefail', 'mount -t fuse,fuse.keep | awk "(index(\$3, \"$CRUNCH_TMP\") == 1){print \$3}" | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid'], + ['bash', '-ec', q{ +arv-mount --unmount-timeout 10 --unmount-all ${CRUNCH_TMP} +rm -rf ${JOB_WORK} ${CRUNCH_INSTALL} ${CRUNCH_TMP}/task ${CRUNCH_TMP}/src* ${CRUNCH_TMP}/*.cid + }], {label => "clean work dirs"}); if ($exited != 0) { exit(EX_RETRY_UNLOCKED); @@ -417,18 +417,22 @@ if ($docker_locator = $Job->{docker_image_locator}) { Log (undef, "docker image hash is $docker_hash"); $docker_stream =~ s/^\.//; my $docker_install_script = qq{ -if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then - exit 0 +loaded() { + id=\$($docker_bin inspect --format="{{.ID}}" \Q$docker_hash\E) || return 1 + echo "image ID is \$id" + [[ \${id} = \Q$docker_hash\E ]] +} +if loaded >&2 2>/dev/null; then + echo >&2 "image is already present" + exit 0 fi -declare -a exit_codes=("\${PIPESTATUS[@]}") -if [ 0 != "\${exit_codes[0]}" ]; then - exit "\${exit_codes[0]}" # `docker images` failed -elif [ 1 != "\${exit_codes[1]}" ]; then - exit "\${exit_codes[1]}" # `grep` encountered an error -else - # Everything worked fine, but grep didn't find the image on this host. - arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load +echo >&2 "docker image is not present; loading" +arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load +if ! loaded >&2; then + echo >&2 "`docker load` exited 0, but image is not found (!)" + exit 1 fi +echo >&2 "image loaded successfully" }; my ($exited, $stdout, $stderr) = srun_sync( @@ -715,6 +719,15 @@ foreach (split (/\n/, $Job->{knobs})) { Log (undef, "knob " . $_); } +my $resp = api_call( + 'nodes/list', + 'filters' => [['hostname', 'in', \@node]], + 'order' => 'hostname', + 'limit' => scalar(@node), + ); +for my $n (@{$resp->{items}}) { + Log(undef, "$n->{hostname} $n->{uuid} ".JSON::encode_json($n->{properties})); +} @@ -1117,10 +1130,10 @@ freeze(); my $collated_output = save_output_collection(); Log (undef, "finish"); -save_meta(); +my $final_log = save_meta(); my $final_state; -if ($collated_output && $main::success) { +if ($collated_output && $final_log && $main::success) { $final_state = 'Complete'; } else { $final_state = 'Failed'; @@ -1593,9 +1606,10 @@ sub create_output_collection import arvados import sys print (arvados.api("v1").collections(). - create(body={"manifest_text": sys.stdin.read()}). + create(body={"manifest_text": sys.stdin.read(), + "owner_uuid": sys.argv[2]}). execute(num_retries=int(sys.argv[1]))["portable_data_hash"]) -}, retry_count()); +}, retry_count(), $Job->{owner_uuid}); my $task_idx = -1; my $manifest_size = 0; @@ -1746,7 +1760,7 @@ sub log_writer_start($) $log_pipe_pid = open2($log_pipe_out, $log_pipe_in, 'arv-put', '--stream', - '--retries', '3', + '--retries', '6', '--filename', $logfilename, '-'); $log_pipe_out_buf = ""; @@ -1780,7 +1794,7 @@ sub log_writer_finish() close($log_pipe_in); my $logger_failed = 0; - my $read_result = log_writer_read_output(120); + my $read_result = log_writer_read_output(600); if ($read_result == -1) { $logger_failed = -1; Log (undef, "timed out reading from 'arv-put'"); @@ -1843,6 +1857,7 @@ sub croak my ($package, $file, $line) = caller; my $message = "@_ at $file line $line\n"; Log (undef, $message); + release_allocation(); freeze() if @jobstep_todo; create_output_collection() if @jobstep_todo; cleanup(); @@ -1883,6 +1898,8 @@ sub save_meta }); Log(undef, "log collection is " . $log_coll->{portable_data_hash}); $Job->update_attributes('log' => $log_coll->{portable_data_hash}); + + return $log_coll->{portable_data_hash}; }