X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/74bfe6e5794b42d3158d9358245802e73942b909..32e3f6eb604d3692f10f16220a78e07c056be00e:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index f153ec8e97..ae210a6f44 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -415,11 +415,13 @@ if (!defined $no_clear_tmp) { # If this job requires a Docker image, install that. my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg); if ($docker_locator = $Job->{docker_image_locator}) { + Log (undef, "Install docker image $docker_locator"); ($docker_stream, $docker_hash) = find_docker_image($docker_locator); if (!$docker_hash) { croak("No Docker image hash found from locator $docker_locator"); } + Log (undef, "docker image hash is $docker_hash"); $docker_stream =~ s/^\.//; my $docker_install_script = qq{ if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then @@ -430,7 +432,7 @@ fi if ($docker_pid == 0) { srun (["srun", "--nodelist=" . join(',', @node)], - ["/bin/sh", "-ec", $docker_install_script]); + ["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script]); exit ($?); } while (1) @@ -441,8 +443,8 @@ fi } if ($? != 0) { - croak("Installing Docker image from $docker_locator exited " - .exit_status_s($?)); + Log(undef, "Installing Docker image from $docker_locator exited " . exit_status_s($?)); + exit(EX_RETRY_UNLOCKED); } # Determine whether this version of Docker supports memory+swap limits. @@ -1342,8 +1344,9 @@ sub check_squeue # squeue check interval (15s) this should make the squeue check an # infrequent event. my $silent_procs = 0; - for my $jobstep (values %proc) + for my $procinfo (values %proc) { + my $jobstep = $jobstep[$procinfo->{jobstep}]; if ($jobstep->{stderr_at} < $last_squeue_check) { $silent_procs++; @@ -1352,17 +1355,18 @@ sub check_squeue return if $silent_procs == 0; # use killem() on procs whose killtime is reached - while (my ($pid, $jobstep) = each %proc) + while (my ($pid, $procinfo) = each %proc) { - if (exists $jobstep->{killtime} - && $jobstep->{killtime} <= time + my $jobstep = $jobstep[$procinfo->{jobstep}]; + if (exists $procinfo->{killtime} + && $procinfo->{killtime} <= time && $jobstep->{stderr_at} < $last_squeue_check) { my $sincewhen = ""; if ($jobstep->{stderr_at}) { $sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s"; } - Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)"); + Log($procinfo->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)"); killem ($pid); } } @@ -1397,12 +1401,12 @@ sub check_squeue } # Check for child procs >60s old and not mentioned by squeue. - while (my ($pid, $jobstep) = each %proc) + while (my ($pid, $procinfo) = each %proc) { - if ($jobstep->{time} < time - 60 - && $jobstep->{jobstepname} - && !exists $ok{$jobstep->{jobstepname}} - && !exists $jobstep->{killtime}) + if ($procinfo->{time} < time - 60 + && $procinfo->{jobstepname} + && !exists $ok{$procinfo->{jobstepname}} + && !exists $procinfo->{killtime}) { # According to slurm, this task has ended (successfully or not) # -- but our srun child hasn't exited. First we must wait (30 @@ -1411,8 +1415,8 @@ sub check_squeue # terminated, we'll conclude some slurm communication # error/delay has caused the task to die without notifying srun, # and we'll kill srun ourselves. - $jobstep->{killtime} = time + 30; - Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited"); + $procinfo->{killtime} = time + 30; + Log($procinfo->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited"); } } }