X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/6c31bea91f136bb300376847ced1fd965e037dd3..714c555bda26a6a27fad7caef382d1d6705ad215:/sdk/cli/bin/crunch-job?ds=sidebyside diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index 5539012c49..28da66d0ca 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -411,7 +411,7 @@ if (!defined $no_clear_tmp) { } # If this job requires a Docker image, install that. -my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem); +my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg); if ($docker_locator = $Job->{docker_image_locator}) { ($docker_stream, $docker_hash) = find_docker_image($docker_locator); if (!$docker_hash) @@ -449,6 +449,42 @@ fi {fork => 1}); $docker_limitmem = ($? == 0); + # Find a non-root Docker user to use. + # Tries the default user for the container, then 'crunch', then 'nobody', + # testing for whether the actual user id is non-zero. This defends against + # mistakes but not malice, but we intend to harden the security in the future + # so we don't want anyone getting used to their jobs running as root in their + # Docker containers. + my @tryusers = ("", "crunch", "nobody"); + foreach my $try_user (@tryusers) { + my $try_user_arg; + if ($try_user eq "") { + Log(undef, "Checking if container default user is not UID 0"); + $try_user_arg = ""; + } else { + Log(undef, "Checking if user '$try_user' is not UID 0"); + $try_user_arg = "--user=$try_user"; + } + srun(["srun", "--nodelist=" . $node[0]], + ["/bin/sh", "-ec", + "a=`$docker_bin run $try_user_arg $docker_hash id --user` && " . + " test \$a -ne 0"], + {fork => 1}); + if ($? == 0) { + $dockeruserarg = $try_user_arg; + if ($try_user eq "") { + Log(undef, "Container will run with default user"); + } else { + Log(undef, "Container will run with $dockeruserarg"); + } + last; + } + } + + if (!defined $dockeruserarg) { + croak("Could not find a user in container that is not UID 0 (tried default user, @tryusers) or there was a problem running 'id' in the container."); + } + if ($Job->{arvados_sdk_version}) { # The job also specifies an Arvados SDK version. Add the SDKs to the # tar file for the build script to install. @@ -784,6 +820,9 @@ update_progress_stats(); THISROUND: for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) { + # Don't create new tasks if we already know the job's final result. + last if defined($main::success); + my $id = $jobstep_todo[$todo_ptr]; my $Jobstep = $jobstep[$id]; if ($Jobstep->{level} != $level) @@ -844,6 +883,9 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'}, "--job-name=$job_id.$id.$$", ); + + my $stdbuf = " stdbuf --output=0 --error=0 "; + my $command = "if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; " ."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} " @@ -854,12 +896,13 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' {"script"}; + + if ($Job->{arvados_sdk_version}) { + $command .= $stdbuf; + $command .= "perl - \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E"; + } else { + $command .= "/bin/sh -c \'mkdir -p \"$ENV{JOB_WORK}\" \"$ENV{TASK_WORK}\" && " . + "if which stdbuf >/dev/null ; then " . + " exec $stdbuf \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" . + " else " . + " exec \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" . + " fi\'"; + } } else { # Non-docker run $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 "; - $command .= "stdbuf --output=0 --error=0 "; + $command .= $stdbuf; $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"}; } @@ -968,7 +1021,7 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) || ($round_num_freeslots > @freeslot && $todo_ptr+1 > $#jobstep_todo)) { - last THISROUND if $main::please_freeze || defined($main::success); + last THISROUND if $main::please_freeze; if ($main::please_info) { $main::please_info = 0; @@ -1129,6 +1182,9 @@ sub reapchildren if (!defined $task_success) { # task did not indicate one way or the other --> fail + Log($jobstepid, sprintf( + "ERROR: Task process exited %d, but never updated its task record to indicate success and record its output.", + exit_status_s($childstatus))); $Jobstep->{'arvados_task'}->{success} = 0; $Jobstep->{'arvados_task'}->save; $task_success = 0; @@ -1645,20 +1701,24 @@ sub log_writer_finish() close($log_pipe_in); + my $logger_failed = 0; my $read_result = log_writer_read_output(120); if ($read_result == -1) { + $logger_failed = -1; Log (undef, "timed out reading from 'arv-put'"); } elsif ($read_result != 0) { + $logger_failed = -2; Log(undef, "failed to read arv-put log manifest to EOF"); } waitpid($log_pipe_pid, 0); if ($?) { + $logger_failed ||= $?; Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?)) } close($log_pipe_out); - my $arv_put_output = $log_pipe_out_buf; + my $arv_put_output = $logger_failed ? undef : $log_pipe_out_buf; $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf = $log_pipe_out_select = undef; @@ -1724,13 +1784,13 @@ sub save_meta my $justcheckpoint = shift; # false if this will be the last meta saved return if $justcheckpoint; # checkpointing is not relevant post-Warehouse.pm return unless log_writer_is_active(); + my $log_manifest = log_writer_finish(); + return unless defined($log_manifest); - my $log_manifest = ""; if ($Job->{log}) { my $prev_log_coll = api_call("collections/get", uuid => $Job->{log}); - $log_manifest .= $prev_log_coll->{manifest_text}; + $log_manifest = $prev_log_coll->{manifest_text} . $log_manifest; } - $log_manifest .= log_writer_finish(); my $log_coll = api_call( "collections/create", ensure_unique_name => 1, collection => {