Log (undef, "docker image hash is $docker_hash");
$docker_stream =~ s/^\.//;
# Shell script text that makes sure the Docker image identified by
# $docker_hash is available: it first asks `$docker_bin inspect` whether the
# image ID already matches, and otherwise pipes the image tarball from arv-get
# into `$docker_bin load` and checks again.
#
# The backticks in the "exited 0" diagnostic are backslash-escaped (written
# as \\ here because qq{} turns \\ into a single backslash). Inside shell
# double quotes an unescaped `docker load` is a command substitution, so the
# shell would actually run that command instead of printing the text.
my $docker_install_script = qq{
-if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then
- exit 0
+loaded() {
+ id=\$($docker_bin inspect --format="{{.ID}}" \Q$docker_hash\E) || return 1
+ echo "image ID is \$id"
+ [[ \${id} = \Q$docker_hash\E ]]
+}
+if loaded >&2 2>/dev/null; then
+ echo >&2 "image is already present"
+ exit 0
fi
-declare -a exit_codes=("\${PIPESTATUS[@]}")
-if [ 0 != "\${exit_codes[0]}" ]; then
- exit "\${exit_codes[0]}" # `docker images` failed
-elif [ 1 != "\${exit_codes[1]}" ]; then
- exit "\${exit_codes[1]}" # `grep` encountered an error
-else
- # Everything worked fine, but grep didn't find the image on this host.
- arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
+echo >&2 "docker image is not present; loading"
+arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
+if ! loaded >&2; then
+ echo >&2 "\\`docker load\\` exited 0, but image is not found (!)"
+ exit 1
fi
+echo >&2 "image loaded successfully"
};
my ($exited, $stdout, $stderr) = srun_sync(
{
Log (undef, "knob " . $_);
}
# Request the 'nodes/list' records whose hostname is in @node (ordered by
# hostname, at most one result per entry in @node) and log each returned
# item's hostname, uuid, and JSON-encoded properties.
+my $resp = api_call(
+ 'nodes/list',
+ 'filters' => [['hostname', 'in', \@node]],
+ 'order' => 'hostname',
+ 'limit' => scalar(@node),
+ );
+for my $n (@{$resp->{items}}) {
+ Log(undef, "$n->{hostname} $n->{uuid} ".JSON::encode_json($n->{properties}));
+}
."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
.q{&& declare -a VOLUMES=() }
- .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi }
- .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; }
- .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi };
+ .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner:ro") ; fi }
+ .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt:ro") ; }
+ .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt:ro") ; fi };
$command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
$ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
my $line = $1;
substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
Log ($jobstepidx, "stderr $line");
- if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
+ if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/i) {
# If the allocation is revoked, we can't possibly continue, so mark all
# nodes as failed. This will cause the overall exit code to be
# EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
$st->{node}->{fail_count}++;
}
}
- elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
+ elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
if (defined($job_slot_index)) {
$slot[$job_slot_index]->{node}->{fail_count}++;
ban_node_by_slot($job_slot_index);
}
}
- elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
+ elsif ($line =~ /srun: error: (Unable to create job step|.*?: Communication connection failure)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
}
import arvados
import sys
print (arvados.api("v1").collections().
- create(body={"manifest_text": sys.stdin.read()}).
+ create(body={"manifest_text": sys.stdin.read(),
+ "owner_uuid": sys.argv[2]}).
execute(num_retries=int(sys.argv[1]))["portable_data_hash"])
-}, retry_count());
+}, retry_count(), $Job->{owner_uuid});
my $task_idx = -1;
my $manifest_size = 0;
close($log_pipe_in);
my $logger_failed = 0;
- my $read_result = log_writer_read_output(120);
+ my $read_result = log_writer_read_output(600);
if ($read_result == -1) {
$logger_failed = -1;
Log (undef, "timed out reading from 'arv-put'");
my ($package, $file, $line) = caller;
my $message = "@_ at $file line $line\n";
Log (undef, $message);
+ release_allocation();
freeze() if @jobstep_todo;
create_output_collection() if @jobstep_todo;
cleanup();
}
}
}
- if (defined($filename) and ($filename =~ /^([0-9A-Fa-f]{64})\.tar$/)) {
+ if (defined($filename) and ($filename =~ /^((?:sha256:)?[0-9A-Fa-f]{64})\.tar$/)) {
return ($streamname, $1);
} else {
return (undef, undef);