X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/b269c28f1d54e8609f36c8aeb77a2b6025172066..3cbdb14acde0092ae4f33e41a4bf10c6a1fce052:/sdk/cli/bin/crunch-job diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job index b38efdc53e..5539012c49 100755 --- a/sdk/cli/bin/crunch-job +++ b/sdk/cli/bin/crunch-job @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*- =head1 NAME @@ -126,7 +126,7 @@ my $jobspec; my $job_api_token; my $no_clear_tmp; my $resume_stash; -my $docker_bin = "/usr/bin/docker.io"; +my $docker_bin = "docker.io"; GetOptions('force-unlock' => \$force_unlock, 'git-dir=s' => \$git_dir, 'job=s' => \$jobspec, @@ -169,8 +169,7 @@ if ($jobspec =~ /^[-a-z\d]+$/) } else { - $Job = JSON::decode_json($jobspec); - $local_job = 1; + $local_job = JSON::decode_json($jobspec); } @@ -178,7 +177,7 @@ else # at least able to run basic commands: they aren't down or severely # misconfigured. my $cmd = ['true']; -if ($Job->{docker_image_locator}) { +if (($Job || $local_job)->{docker_image_locator}) { $cmd = [$docker_bin, 'ps', '-q']; } Log(undef, "Sanity check is `@$cmd`"); @@ -208,15 +207,15 @@ else { if (!$resume_stash) { - map { croak ("No $_ specified") unless $Job->{$_} } + map { croak ("No $_ specified") unless $local_job->{$_} } qw(script script_version script_parameters); } - $Job->{'is_locked_by_uuid'} = $User->{'uuid'}; - $Job->{'started_at'} = gmtime; - $Job->{'state'} = 'Running'; + $local_job->{'is_locked_by_uuid'} = $User->{'uuid'}; + $local_job->{'started_at'} = gmtime; + $local_job->{'state'} = 'Running'; - $Job = api_call("jobs/create", job => $Job); + $Job = api_call("jobs/create", job => $local_job); } $job_id = $Job->{'uuid'}; @@ -391,12 +390,12 @@ if (!defined $no_clear_tmp) { my $cleanpid = fork(); if ($cleanpid == 0) { - # Find FUSE mounts that look like Keep mounts (the mount path has the - # word "keep") and unmount them. Then clean up work directories. - # TODO: When #5036 is done and widely deployed, we can get rid of the - # regular expression and just unmount everything with type fuse.keep. + # Find FUSE mounts under $CRUNCH_TMP and unmount them. + # Then clean up work directories. + # TODO: When #5036 is done and widely deployed, we can limit mount's + # -t option to simply fuse.keep. srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}], - ['bash', '-ec', 'mount -t fuse,fuse.keep | awk \'($3 ~ /\ykeep\y/){print $3}\' | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid']); + ['bash', '-ec', '-o', 'pipefail', 'mount -t fuse,fuse.keep | awk "(index(\$3, \"$CRUNCH_TMP\") == 1){print \$3}" | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid']); exit (1); } while (1) @@ -405,7 +404,10 @@ if (!defined $no_clear_tmp) { freeze_if_want_freeze ($cleanpid); select (undef, undef, undef, 0.1); } - Log (undef, "Cleanup command exited ".exit_status_s($?)); + if ($?) { + Log(undef, "Clean work dirs: exit ".exit_status_s($?)); + exit(EX_RETRY_UNLOCKED); + } } # If this job requires a Docker image, install that. @@ -596,7 +598,7 @@ else { unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) { croak("`$gitcmd rev-list` exited " .exit_status_s($?) - .", '$treeish' not found. Giving up."); + .", '$treeish' not found, giving up"); } $commit = $1; Log(undef, "Version $treeish is commit $commit"); @@ -866,13 +868,6 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) $command .= "--memory=\${MEMLIMIT}k --memory-swap=\${SWAPLIMIT}k "; } - # Dynamically configure the container to use the host system as its - # DNS server. Get the host's global addresses from the ip command, - # and turn them into docker --dns options using gawk. - $command .= - q{$(ip -o address show scope global | - gawk 'match($4, /^([0-9\.:]+)\//, x){print "--dns", x[1]}') }; - # The source tree and $destdir directory (which we have # installed on the worker host) are available in the container, # under the same path. @@ -985,14 +980,14 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++) my $gotsome = readfrompipes () + reapchildren (); - if (!$gotsome) + if (!$gotsome || ($latest_refresh + 2 < scalar time)) { check_refresh_wanted(); check_squeue(); update_progress_stats(); select (undef, undef, undef, 0.1); } - elsif (time - $progress_stats_updated >= 30) + elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty) { update_progress_stats(); } @@ -1099,8 +1094,8 @@ sub update_progress_stats $progress_stats_updated = time; return if !$progress_is_dirty; my ($todo, $done, $running) = (scalar @jobstep_todo, - scalar @jobstep_done, - scalar @slot - scalar @freeslot - scalar @holdslot); + scalar @jobstep_done, + scalar keys(%proc)); $Job->{'tasks_summary'} ||= {}; $Job->{'tasks_summary'}->{'todo'} = $todo; $Job->{'tasks_summary'}->{'done'} = $done; @@ -2042,7 +2037,7 @@ sub set_nonblocking { } __DATA__ -#!/usr/bin/perl +#!/usr/bin/env perl # # This is crunch-job's internal dispatch script. crunch-job running on the API # server invokes this script on individual compute nodes, or localhost if we're @@ -2210,11 +2205,12 @@ if (-d $sdk_root) { my $python_dir = "$install_dir/python"; if ((-d $python_dir) and can_run("python2.7")) { open(my $egg_info_pipe, "-|", - "python2.7 \Q$python_dir/setup.py\E --quiet egg_info 2>&1 >/dev/null"); + "python2.7 \Q$python_dir/setup.py\E egg_info 2>&1 >/dev/null"); my @egg_info_errors = <$egg_info_pipe>; close($egg_info_pipe); + if ($?) { - if (@egg_info_errors and ($egg_info_errors[-1] =~ /\bgit\b/)) { + if (@egg_info_errors and (($egg_info_errors[-1] =~ /\bgit\b/) or ($egg_info_errors[-1] =~ /\[Errno 2\]/))) { # egg_info apparently failed because it couldn't ask git for a build tag. # Specify no build tag. open(my $pysdk_cfg, ">>", "$python_dir/setup.cfg"); @@ -2223,7 +2219,7 @@ if ((-d $python_dir) and can_run("python2.7")) { } else { my $egg_info_exit = $? >> 8; foreach my $errline (@egg_info_errors) { - print STDERR_ORIG $errline; + warn $errline; } warn "python setup.py egg_info failed: exit $egg_info_exit"; exit ($egg_info_exit || 1);