X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/71e1b1d62a71ad052487f5e8ecb8f36ae17ca8e1..38d8d205385181811235502dbad7c6e27fcea2a7:/sdk/cli/bin/crunch-job?ds=sidebyside diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index 33e775ff14..8167eda9f0 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -390,12 +390,12 @@ if (!defined $no_clear_tmp) { my $cleanpid = fork(); if ($cleanpid == 0) { - # Find FUSE mounts that look like Keep mounts (the mount path has the - # word "keep") and unmount them. Then clean up work directories. - # TODO: When #5036 is done and widely deployed, we can get rid of the - # regular expression and just unmount everything with type fuse.keep. + # Find FUSE mounts under $CRUNCH_TMP and unmount them. + # Then clean up work directories. + # TODO: When #5036 is done and widely deployed, we can limit mount's + # -t option to simply fuse.keep. srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}], - ['bash', '-ec', '-o', 'pipefail', 'mount -t fuse,fuse.keep | awk \'($3 ~ /\ykeep\y/){print $3}\' | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid']); + ['bash', '-ec', '-o', 'pipefail', 'mount -t fuse,fuse.keep | awk "(index(\$3, \"$CRUNCH_TMP\") == 1){print \$3}" | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid']); exit (1); } while (1) @@ -411,7 +411,7 @@ if (!defined $no_clear_tmp) { } # If this job requires a Docker image, install that. -my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem); +my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg); if ($docker_locator = $Job->{docker_image_locator}) { ($docker_stream, $docker_hash) = find_docker_image($docker_locator); if (!$docker_hash) @@ -449,6 +449,42 @@ fi {fork => 1}); $docker_limitmem = ($? == 0); + # Find a non-root Docker user to use. 
+  # Tries the default user for the container, then 'crunch', then 'nobody',
+  # testing for whether the actual user id is non-zero. Each probe uses the
+  # exact --user argument the task will later run with. This defends against
+  # mistakes but not malice; we intend to harden the security in the future, so
+  # we don't want anyone getting used to their jobs running as root in Docker.
+  my @tryusers = ("", "crunch", "nobody");
+  foreach my $try_user (@tryusers) {
+    my $try_user_arg;
+    if ($try_user eq "") {
+      Log(undef, "Checking if container default user is not UID 0");
+      $try_user_arg = "";
+    } else {
+      Log(undef, "Checking if user '$try_user' is not UID 0");
+      $try_user_arg = "--user=$try_user";
+    }
+    srun(["srun", "--nodelist=" . $node[0]],
+         ["/bin/sh", "-ec",
+          "a=`$docker_bin run --rm $try_user_arg $docker_hash id --user` && " .
+          " test \$a -ne 0"],
+         {fork => 1});
+    if ($? == 0) {
+      $dockeruserarg = $try_user_arg;
+      if ($try_user eq "") {
+        Log(undef, "Container will run with default user");
+      } else {
+        Log(undef, "Container will run with $dockeruserarg");
+      }
+      last;
+    }
+  }
+
+  if (!defined $dockeruserarg) {
+    croak("Could not find a user in container that is not UID 0 (tried default user, @tryusers) or there was a problem running 'id' in the container.");
+  }
+
   if ($Job->{arvados_sdk_version}) {
     # The job also specifies an Arvados SDK version. Add the SDKs to the
     # tar file for the build script to install.
@@ -859,7 +895,7 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) { my $cidfile = "$ENV{CRUNCH_TMP}/$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}.cid"; $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 "; - $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i --user=crunch --cidfile=$cidfile --sig-proxy "; + $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy "; # We only set memory limits if Docker lets us limit both memory and swap. # Memory limits alone have been supported longer, but subprocesses tend # to get SIGKILL if they exceed that without any swap limit set. @@ -917,8 +953,13 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) } $command .= "--env=\QHOME=$ENV{HOME}\E "; $command .= "\Q$docker_hash\E "; - $command .= "stdbuf --output=0 --error=0 "; - $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . 
$Job->{"script"}; + + if ($Job->{arvados_sdk_version}) { + $command .= "stdbuf --output=0 --error=0 "; + $command .= "perl - \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E"; + } else { + $command .= "/bin/sh -c \'mkdir -p \"$ENV{JOB_WORK}\" \"$ENV{TASK_WORK}\" && exec \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E\'"; + } } else { # Non-docker run $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 "; @@ -980,14 +1021,14 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) my $gotsome = readfrompipes () + reapchildren (); - if (!$gotsome) + if (!$gotsome || ($latest_refresh + 2 < scalar time)) { check_refresh_wanted(); check_squeue(); update_progress_stats(); select (undef, undef, undef, 0.1); } - elsif (time - $progress_stats_updated >= 30) + elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty) { update_progress_stats(); } @@ -1094,8 +1135,8 @@ sub update_progress_stats $progress_stats_updated = time; return if !$progress_is_dirty; my ($todo, $done, $running) = (scalar @jobstep_todo, - scalar @jobstep_done, - scalar @slot - scalar @freeslot - scalar @holdslot); + scalar @jobstep_done, + scalar keys(%proc)); $Job->{'tasks_summary'} ||= {}; $Job->{'tasks_summary'}->{'todo'} = $todo; $Job->{'tasks_summary'}->{'done'} = $done; @@ -2205,11 +2246,12 @@ if (-d $sdk_root) { my $python_dir = "$install_dir/python"; if ((-d $python_dir) and can_run("python2.7")) { open(my $egg_info_pipe, "-|", - "python2.7 \Q$python_dir/setup.py\E --quiet egg_info 2>&1 >/dev/null"); + "python2.7 \Q$python_dir/setup.py\E egg_info 2>&1 >/dev/null"); my @egg_info_errors = <$egg_info_pipe>; close($egg_info_pipe); + if ($?) { - if (@egg_info_errors and ($egg_info_errors[-1] =~ /\bgit\b/)) { + if (@egg_info_errors and (($egg_info_errors[-1] =~ /\bgit\b/) or ($egg_info_errors[-1] =~ /\[Errno 2\]/))) { # egg_info apparently failed because it couldn't ask git for a build tag. # Specify no build tag. 
open(my $pysdk_cfg, ">>", "$python_dir/setup.cfg"); @@ -2218,7 +2260,7 @@ if ((-d $python_dir) and can_run("python2.7")) { } else { my $egg_info_exit = $? >> 8; foreach my $errline (@egg_info_errors) { - print STDERR_ORIG $errline; + warn $errline; } warn "python setup.py egg_info failed: exit $egg_info_exit"; exit ($egg_info_exit || 1);