-#!/usr/bin/perl
+#!/usr/bin/env perl
# -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
=head1 NAME
use constant TASK_TEMPFAIL => 111;
use constant EX_TEMPFAIL => 75;
+use constant EX_RETRY_UNLOCKED => 93;
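+# (EX_TEMPFAIL comes from sysexits.h ("temporary failure, try again
+# later").  EX_RETRY_UNLOCKED is crunch-specific: exiting with it
+# signals that the job hit an infrastructure problem and can safely
+# be re-queued.)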
$ENV{"TMPDIR"} ||= "/tmp";
unless (defined $ENV{"CRUNCH_TMP"}) {
}
}
+$ENV{"HOST_CRUNCHRUNNER_BIN"} ||= `which crunchrunner`;
+unless (defined($ENV{"HOST_CERTS"})) {
+ if (-f "/etc/ssl/certs/ca-certificates.crt") {
+ $ENV{"HOST_CERTS"} = "/etc/ssl/certs/ca-certificates.crt";
+ } elsif (-f "/etc/pki/tls/certs/ca-bundle.crt") {
+ $ENV{"HOST_CERTS"} = "/etc/pki/tls/certs/ca-bundle.crt";
+ }
+}
+
# Create the tmp directory if it does not exist
if ( ! -d $ENV{"CRUNCH_TMP"} ) {
make_path $ENV{"CRUNCH_TMP"} or die "Failed to create temporary working directory: " . $ENV{"CRUNCH_TMP"};
$ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
mkdir ($ENV{"JOB_WORK"});
+my %proc;
my $force_unlock;
my $git_dir;
my $jobspec;
my $job_api_token;
my $no_clear_tmp;
my $resume_stash;
+my $cgroup_root = "/sys/fs/cgroup";
+my $docker_bin = "docker.io";
+my $docker_run_args = "";
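+# ("docker.io" is the executable name installed by the Debian-family
+# docker.io package; pass --docker-bin if the binary is plain "docker".
+# --docker-run-args is passed through verbatim to each `docker run`.)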
GetOptions('force-unlock' => \$force_unlock,
'git-dir=s' => \$git_dir,
'job=s' => \$jobspec,
'job-api-token=s' => \$job_api_token,
'no-clear-tmp' => \$no_clear_tmp,
'resume-stash=s' => \$resume_stash,
+ 'cgroup-root=s' => \$cgroup_root,
+ 'docker-bin=s' => \$docker_bin,
+ 'docker-run-args=s' => \$docker_run_args,
);
if (defined $job_api_token) {
$ENV{ARVADOS_API_TOKEN} = $job_api_token;
}
-my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
-my $local_job = 0;
+my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
$SIG{'USR1'} = sub
$main::ENV{CRUNCH_DEBUG} = 0;
};
-
-
my $arv = Arvados->new('apiVersion' => 'v1');
my $Job;
my $sth;
my @jobstep;
-my $User = api_call("users/current");
-
+my $local_job;
if ($jobspec =~ /^[-a-z\d]+$/)
{
# $jobspec is an Arvados UUID, not a JSON job specification
$Job = api_call("jobs/get", uuid => $jobspec);
+ $local_job = 0;
+}
+else
+{
+ $local_job = JSON::decode_json($jobspec);
+}
+
+
+# Make sure our workers (our slurm nodes, localhost, or whatever) are
+# at least able to run basic commands: they aren't down or severely
+# misconfigured.
+my $cmd = ['true'];
+if (($Job || $local_job)->{docker_image_locator}) {
+ $cmd = [$docker_bin, 'ps', '-q'];
+}
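+# (For jobs that use Docker, `docker ps -q` additionally verifies that
+# each node's Docker daemon is up and answering; otherwise running
+# `true` is enough to show the node can execute a command at all.)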
+Log(undef, "Sanity check is `@$cmd`");
+my ($exited, $stdout, $stderr) = srun_sync(
+ ["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],
+ $cmd,
+ {label => "sanity check"});
+if ($exited != 0) {
+ Log(undef, "Sanity check failed: ".exit_status_s($exited));
+ exit EX_TEMPFAIL;
+}
+Log(undef, "Sanity check OK");
+
+
+my $User = api_call("users/current");
+
+if (!$local_job) {
if (!$force_unlock) {
# Claim this job, and make sure nobody else does
eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
}
else
{
- $Job = JSON::decode_json($jobspec);
-
if (!$resume_stash)
{
- map { croak ("No $_ specified") unless $Job->{$_} }
+ map { croak ("No $_ specified") unless $local_job->{$_} }
qw(script script_version script_parameters);
}
- $Job->{'is_locked_by_uuid'} = $User->{'uuid'};
- $Job->{'started_at'} = gmtime;
- $Job->{'state'} = 'Running';
+ $local_job->{'is_locked_by_uuid'} = $User->{'uuid'};
+ $local_job->{'started_at'} = gmtime;
+ $local_job->{'state'} = 'Running';
- $Job = api_call("jobs/create", job => $Job);
+ $Job = api_call("jobs/create", job => $local_job);
}
$job_id = $Job->{'uuid'};
{
Log (undef, "node $nodename - $ncpus slots");
my $node = { name => $nodename,
- ncpus => $ncpus,
- losing_streak => 0,
- hold_until => 0 };
+ ncpus => $ncpus,
+ # The number of consecutive times a task has been dispatched
+ # to this node and failed.
+ losing_streak => 0,
+ # The number of consecutive times that SLURM has reported
+ # a node failure since the last successful task.
+ fail_count => 0,
+ # Don't dispatch work to this node until this time
+ # (in seconds since the epoch) has passed.
+ hold_until => 0 };
foreach my $cpu (1..$ncpus)
{
push @slot, { node => $node,
my @jobstep_done = ();
my @jobstep_tomerge = ();
my $jobstep_tomerge_level = 0;
-my $squeue_checked;
-my $squeue_kill_checked;
+my $squeue_checked = 0;
my $latest_refresh = scalar time;
my $git_tar_count = 0;
if (!defined $no_clear_tmp) {
- # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
- Log (undef, "Clean work dirs");
-
- my $cleanpid = fork();
- if ($cleanpid == 0)
- {
- # Find FUSE mounts that look like Keep mounts (the mount path has the
- # word "keep") and unmount them. Then clean up work directories.
- # TODO: When #5036 is done and widely deployed, we can get rid of the
- # regular expression and just unmount everything with type fuse.keep.
- srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
- ['bash', '-ec', 'mount -t fuse,fuse.keep | awk \'($3 ~ /\ykeep\y/){print $3}\' | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid']);
- exit (1);
- }
- while (1)
- {
- last if $cleanpid == waitpid (-1, WNOHANG);
- freeze_if_want_freeze ($cleanpid);
- select (undef, undef, undef, 0.1);
+ # Find FUSE mounts under $CRUNCH_TMP and unmount them. Then clean
+ # up work directories crunch_tmp/work, crunch_tmp/opt,
+ # crunch_tmp/src*.
+ #
+ # TODO: When #5036 is done and widely deployed, we can limit mount's
+ # -t option to simply fuse.keep.
+ my ($exited, $stdout, $stderr) = srun_sync(
+ ["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
+ ['bash', '-ec', '-o', 'pipefail', 'mount -t fuse,fuse.keep | awk "(index(\$3, \"$CRUNCH_TMP\") == 1){print \$3}" | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid'],
+ {label => "clean work dirs"});
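+  # (fusermount -u -z detaches lazily, so a busy Keep mount can't
+  # block the cleanup; after the detach, rm -rf removes only local
+  # files, never data behind the FUSE mount.)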
+ if ($exited != 0) {
+ exit(EX_RETRY_UNLOCKED);
}
- Log (undef, "Cleanup command exited ".exit_status_s($?));
}
# If this job requires a Docker image, install that.
-my $docker_bin = "/usr/bin/docker.io";
-my ($docker_locator, $docker_stream, $docker_hash);
+my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg);
if ($docker_locator = $Job->{docker_image_locator}) {
+ Log (undef, "Install docker image $docker_locator");
($docker_stream, $docker_hash) = find_docker_image($docker_locator);
if (!$docker_hash)
{
croak("No Docker image hash found from locator $docker_locator");
}
+ Log (undef, "docker image hash is $docker_hash");
$docker_stream =~ s/^\.//;
my $docker_install_script = qq{
if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
fi
};
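+  # (grep -qxF: quiet, whole-line, fixed-string match -- the load is
+  # skipped only when the full untruncated image hash is already
+  # present, so a prefix collision can't fool it.)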
- my $docker_pid = fork();
- if ($docker_pid == 0)
- {
- srun (["srun", "--nodelist=" . join(',', @node)],
- ["/bin/sh", "-ec", $docker_install_script]);
- exit ($?);
- }
- while (1)
+
+ my ($exited, $stdout, $stderr) = srun_sync(
+ ["srun", "--nodelist=" . join(',', @node)],
+ ["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script],
+ {label => "load docker image"});
+ if ($exited != 0)
{
- last if $docker_pid == waitpid (-1, WNOHANG);
- freeze_if_want_freeze ($docker_pid);
- select (undef, undef, undef, 0.1);
+ exit(EX_RETRY_UNLOCKED);
+ }
+
+ # Determine whether this version of Docker supports memory+swap limits.
+ ($exited, $stdout, $stderr) = srun_sync(
+ ["srun", "--nodes=1"],
+ [$docker_bin, 'run', '--help'],
+ {label => "check --memory-swap feature"});
+ $docker_limitmem = ($stdout =~ /--memory-swap/);
+
+  # Find a non-root Docker user to use.
+  # Try the default user for the container, then 'crunch', then 'nobody',
+  # checking each time that the actual user id is non-zero.  This defends
+  # against mistakes, not malice; we intend to harden the security in the
+  # future, and don't want anyone getting used to their jobs running as
+  # root in their Docker containers.
+ my @tryusers = ("", "crunch", "nobody");
+ foreach my $try_user (@tryusers) {
+ my $label;
+ my $try_user_arg;
+ if ($try_user eq "") {
+ $label = "check whether default user is UID 0";
+ $try_user_arg = "";
+ } else {
+ $label = "check whether user '$try_user' is UID 0";
+ $try_user_arg = "--user=$try_user";
+ }
+ my ($exited, $stdout, $stderr) = srun_sync(
+ ["srun", "--nodes=1"],
+ ["/bin/sh", "-ec",
+ "$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
+ {label => $label});
+ chomp($stdout);
+ if ($exited == 0 && $stdout =~ /^\d+$/ && $stdout > 0) {
+ $dockeruserarg = $try_user_arg;
+ if ($try_user eq "") {
+ Log(undef, "Container will run with default user");
+ } else {
+ Log(undef, "Container will run with $dockeruserarg");
+ }
+ last;
+ }
}
- if ($? != 0)
- {
- croak("Installing Docker image from $docker_locator exited "
- .exit_status_s($?));
+
+ if (!defined $dockeruserarg) {
+    croak("Could not find a user in container that is not UID 0 (tried the default user, 'crunch', and 'nobody'), or there was a problem running 'id' in the container.");
}
if ($Job->{arvados_sdk_version}) {
unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
croak("`$gitcmd rev-list` exited "
.exit_status_s($?)
- .", '$treeish' not found. Giving up.");
+ .", '$treeish' not found, giving up");
}
$commit = $1;
Log(undef, "Version $treeish is commit $commit");
}
}
else {
- Log(undef, "Run install script on all workers");
+ my $exited;
+ for (my $attempts = 0; $attempts < 3; $attempts++) {
+ my @srunargs = ("srun",
+ "--nodelist=$nodelist",
+ "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
+ my @execargs = ("sh", "-c",
+ "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
+
+ $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
+ my ($stdout, $stderr);
+ ($exited, $stdout, $stderr) = srun_sync(
+ \@srunargs, \@execargs,
+ {label => "run install script on all workers"},
+ $build_script . $git_archive);
+
+ my $stderr_anything_from_script = 0;
+ for my $line (split(/\n/, $stderr)) {
+ if ($line !~ /^(srun: error: |starting: \[)/) {
+ $stderr_anything_from_script = 1;
+ }
+ }
- my @srunargs = ("srun",
- "--nodelist=$nodelist",
- "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
- my @execargs = ("sh", "-c",
- "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
+ last if $exited == 0 || $main::please_freeze;
- my $installpid = fork();
- if ($installpid == 0)
- {
- srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
- exit (1);
+ # If the install script fails but doesn't print an error message,
+ # the next thing anyone is likely to do is just run it again in
+ # case it was a transient problem like "slurm communication fails
+ # because the network isn't reliable enough". So we'll just do
+ # that ourselves (up to 3 attempts in total). OTOH, if there is an
+ # error message, the problem is more likely to have a real fix and
+ # we should fail the job so the fixing process can start, instead
+ # of doing 2 more attempts.
+ last if $stderr_anything_from_script;
}
- while (1)
- {
- last if $installpid == waitpid (-1, WNOHANG);
- freeze_if_want_freeze ($installpid);
- select (undef, undef, undef, 0.1);
- }
- my $install_exited = $?;
- Log (undef, "Install script exited ".exit_status_s($install_exited));
+
foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
unlink($tar_filename);
}
- exit (1) if $install_exited != 0;
+
+ if ($exited != 0) {
+ croak("Giving up");
+ }
}
foreach (qw (script script_version script_parameters runtime_constraints))
my $thisround_succeeded = 0;
my $thisround_failed = 0;
my $thisround_failed_multiple = 0;
+my $working_slot_count = scalar(@slot);
@jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
or $a <=> $b } @jobstep_todo;
my $level = $jobstep[$jobstep_todo[0]]->{level};
-Log (undef, "start level $level");
+my $initial_tasks_this_level = 0;
+foreach my $id (@jobstep_todo) {
+ $initial_tasks_this_level++ if ($jobstep[$id]->{level} == $level);
+}
+
+# If the number of tasks scheduled at this level #T is smaller than the number
+# of slots available #S, only use the first #T slots, or the first slot on
+# each node, whichever number is greater.
+#
+# When we dispatch tasks later, we'll allocate whole-node resources like RAM
+# based on these numbers. Using fewer slots makes more resources available
+# to each individual task, which is normally the better trade-off when
+# fewer tasks are running and there is less parallelism to exploit.
+#
+# Note that this calculation is not redone if the initial tasks at
+# this level queue more tasks at the same level. This may harm
+# overall task throughput for that level.
+my @freeslot;
+if ($initial_tasks_this_level < @node) {
+ @freeslot = (0..$#node);
+} elsif ($initial_tasks_this_level < @slot) {
+ @freeslot = (0..$initial_tasks_this_level - 1);
+} else {
+ @freeslot = (0..$#slot);
+}
+my $round_num_freeslots = scalar(@freeslot);
+print STDERR "crunch-job have ${round_num_freeslots} free slots for ${initial_tasks_this_level} initial tasks at this level, ".scalar(@node)." nodes, and ".scalar(@slot)." slots\n";
+my %round_max_slots = ();
+for (my $ii = $#freeslot; $ii >= 0; $ii--) {
+ my $this_slot = $slot[$freeslot[$ii]];
+ my $node_name = $this_slot->{node}->{name};
+ $round_max_slots{$node_name} ||= $this_slot->{cpu};
+ last if (scalar(keys(%round_max_slots)) >= @node);
+}
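+# %round_max_slots now maps each node name to the highest slot number
+# in use on it this round.  Slots are numbered 1..ncpus per node, so
+# this is (roughly) the node's task concurrency; CRUNCH_NODE_SLOTS is
+# set from it below when sizing per-task memory limits.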
-my %proc;
-my @freeslot = (0..$#slot);
+Log(undef, "start level $level with $round_num_freeslots slots");
my @holdslot;
my %reader;
my $progress_is_dirty = 1;
update_progress_stats();
-
THISROUND:
for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
{
+ # Don't create new tasks if we already know the job's final result.
+ last if defined($main::success);
+
my $id = $jobstep_todo[$todo_ptr];
my $Jobstep = $jobstep[$id];
if ($Jobstep->{level} != $level)
next;
}
- pipe $reader{$id}, "writer" or croak ($!);
- my $flags = fcntl ($reader{$id}, F_GETFL, 0) or croak ($!);
- fcntl ($reader{$id}, F_SETFL, $flags | O_NONBLOCK) or croak ($!);
+ pipe $reader{$id}, "writer" or croak("pipe() failed: $!");
+ set_nonblocking($reader{$id});
my $childslot = $freeslot[0];
my $childnode = $slot[$childslot]->{node};
$ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
$ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
$ENV{"HOME"} = $ENV{"TASK_WORK"};
- $ENV{"TASK_KEEPMOUNT"} = $ENV{"TASK_WORK"}.".keep";
$ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
- $ENV{"CRUNCH_NODE_SLOTS"} = $slot[$childslot]->{node}->{ncpus};
+ $ENV{"CRUNCH_NODE_SLOTS"} = $round_max_slots{$ENV{TASK_SLOT_NODE}};
$ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
+ my $keep_mnt = $ENV{"TASK_WORK"}.".keep";
+
$ENV{"GZIP"} = "-n";
my @srunargs = (
qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'},
"--job-name=$job_id.$id.$$",
);
+
+ my $stdbuf = " stdbuf --output=0 --error=0 ";
+
+ my $arv_file_cache = "";
+ if (defined($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'})) {
+ $arv_file_cache = "--file-cache=" . ($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'} * 1024 * 1024);
+ }
+
my $command =
- "if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; "
- ."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} "
- ."&& cd $ENV{CRUNCH_TMP} ";
- $command .= "&& exec arv-mount --by-id --allow-other $ENV{TASK_KEEPMOUNT} --exec ";
+ "if [ -e \Q$ENV{TASK_WORK}\E ]; then rm -rf \Q$ENV{TASK_WORK}\E; fi; "
+ ."mkdir -p \Q$ENV{CRUNCH_TMP}\E \Q$ENV{JOB_WORK}\E \Q$ENV{TASK_WORK}\E \Q$keep_mnt\E "
+ ."&& cd \Q$ENV{CRUNCH_TMP}\E "
+ # These environment variables get used explicitly later in
+ # $command. No tool is expected to read these values directly.
+ .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
+ .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
+ ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
+ ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
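+  # Worked example: on a node with MemTotal = 64 GiB and 8 slots in
+  # use this round, MEMLIMIT = (64 GiB * 95%) / 8 ~= 7.6 GiB per task,
+  # and SWAPLIMIT adds the node's entire swap on top of that.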
+
+ $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
+ $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
+ $ENV{TASK_KEEPMOUNT_TMP} = "$keep_mnt/tmp";
+
if ($docker_hash)
{
- my $cidfile = "$ENV{CRUNCH_TMP}/$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}.cid";
- $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
- $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i --user=crunch --cidfile=$cidfile --sig-proxy ";
-
- # Dynamically configure the container to use the host system as its
- # DNS server. Get the host's global addresses from the ip command,
- # and turn them into docker --dns options using gawk.
- $command .=
- q{$(ip -o address show scope global |
- gawk 'match($4, /^([0-9\.:]+)\//, x){print "--dns", x[1]}') };
+ my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
+ my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
+ $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
+ $command .= "$docker_bin run $docker_run_args --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
+ # We only set memory limits if Docker lets us limit both memory and swap.
+ # Memory limits alone have been supported longer, but subprocesses tend
+ # to get SIGKILL if they exceed that without any swap limit set.
+ # See #5642 for additional background.
+ if ($docker_limitmem) {
+ $command .= "--memory=\${MEMLIMIT}k --memory-swap=\${SWAPLIMIT}k ";
+ }
# The source tree and $destdir directory (which we have
# installed on the worker host) are available in the container,
$command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
$command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
- # Currently, we make arv-mount's mount point appear at /keep
- # inside the container (instead of using the same path as the
- # host like we do with CRUNCH_SRC and CRUNCH_INSTALL). However,
- # crunch scripts and utilities must not rely on this. They must
- # use $TASK_KEEPMOUNT.
+ # Currently, we make the "by_pdh" directory in arv-mount's mount
+ # point appear at /keep inside the container (instead of using
+ # the same path as the host like we do with CRUNCH_SRC and
+ # CRUNCH_INSTALL). However, crunch scripts and utilities must
+ # not rely on this. They must use $TASK_KEEPMOUNT.
$command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
$ENV{TASK_KEEPMOUNT} = "/keep";
+ # Ditto TASK_KEEPMOUNT_TMP, as /keep_tmp.
+ $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT_TMP}:/keep_tmp\E ";
+ $ENV{TASK_KEEPMOUNT_TMP} = "/keep_tmp";
+
# TASK_WORK is almost exactly like a docker data volume: it
# starts out empty, is writable, and persists until no
# containers use it any more. We don't use --volumes-from to
# For now, use the same approach as TASK_WORK above.
$ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
+ # Bind mount the crunchrunner binary and host TLS certificates file into
+ # the container.
+ $command .= "--volume=\Q$ENV{HOST_CRUNCHRUNNER_BIN}:/usr/local/bin/crunchrunner\E ";
+ $command .= "--volume=\Q$ENV{HOST_CERTS}:/etc/arvados/ca-certificates.crt\E ";
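+    # (We assume /etc/arvados/ca-certificates.crt is the fixed path
+    # where tools inside the container expect the host's CA bundle.)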
+
while (my ($env_key, $env_val) = each %ENV)
{
if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
}
$command .= "--env=\QHOME=$ENV{HOME}\E ";
$command .= "\Q$docker_hash\E ";
- $command .= "stdbuf --output=0 --error=0 ";
- $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
+
+ if ($Job->{arvados_sdk_version}) {
+ $command .= $stdbuf;
+ $command .= "perl - \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E";
+ } else {
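+    # The image may lack an Arvados SDK, and possibly even stdbuf, so
+    # wrap the crunch script in a shell command that logs the image's
+    # Python SDK version (if any) to stderr, creates the work
+    # directories, and execs through stdbuf only when it is available.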
+ $command .= "/bin/sh -c \'python -c " .
+ '"from pkg_resources import get_distribution as get; print \"Using Arvados SDK version\", get(\"arvados-python-client\").version"' .
+ ">&2 2>/dev/null; " .
+ "mkdir -p \"$ENV{JOB_WORK}\" \"$ENV{TASK_WORK}\" && " .
+ "if which stdbuf >/dev/null ; then " .
+ " exec $stdbuf \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
+ " else " .
+ " exec \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
+ " fi\'";
+ }
} else {
# Non-docker run
- $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 ";
- $command .= "stdbuf --output=0 --error=0 ";
+ $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -poll=10000 ";
+ $command .= $stdbuf;
$command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
}
next;
}
shift @freeslot;
- $proc{$childpid} = { jobstep => $id,
- time => time,
- slot => $childslot,
- jobstepname => "$job_id.$id.$childpid",
- };
+ $proc{$childpid} = {
+ jobstepidx => $id,
+ time => time,
+ slot => $childslot,
+ jobstepname => "$job_id.$id.$childpid",
+ };
croak ("assert failed: \$slot[$childslot]->{'pid'} exists") if exists $slot[$childslot]->{pid};
$slot[$childslot]->{pid} = $childpid;
$Jobstep->{slotindex} = $childslot;
delete $Jobstep->{stderr};
delete $Jobstep->{finishtime};
+ delete $Jobstep->{tempfail};
$Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
$Jobstep->{'arvados_task'}->save;
while (!@freeslot
||
- (@slot > @freeslot && $todo_ptr+1 > $#jobstep_todo))
+ ($round_num_freeslots > @freeslot && $todo_ptr+1 > $#jobstep_todo))
{
- last THISROUND if $main::please_freeze || defined($main::success);
+ last THISROUND if $main::please_freeze;
if ($main::please_info)
{
$main::please_info = 0;
my $gotsome
= readfrompipes ()
+ reapchildren ();
- if (!$gotsome)
+ if (!$gotsome || ($latest_refresh + 2 < scalar time))
{
check_refresh_wanted();
check_squeue();
update_progress_stats();
- select (undef, undef, undef, 0.1);
}
- elsif (time - $progress_stats_updated >= 30)
+ elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty)
{
update_progress_stats();
}
+ if (!$gotsome) {
+ select (undef, undef, undef, 0.1);
+ }
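+    # Count slots on nodes that still look healthy: no consecutive
+    # slurm-reported failures and fewer than 4 holds.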
+ $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
+ $_->{node}->{hold_count} < 4 } @slot);
if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
{
}
# give up if no nodes are succeeding
- if (!grep { $_->{node}->{losing_streak} == 0 &&
- $_->{node}->{hold_count} < 4 } @slot) {
- my $message = "Every node has failed -- giving up on this round";
- Log (undef, $message);
+ if ($working_slot_count < 1) {
+ Log(undef, "Every node has failed -- giving up");
last THISROUND;
}
}
if (!defined $main::success)
{
- if (@jobstep_todo &&
- $thisround_succeeded == 0 &&
- ($thisround_failed == 0 || $thisround_failed > 4))
- {
+ if (!@jobstep_todo) {
+ $main::success = 1;
+ } elsif ($working_slot_count < 1) {
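+    # Every node is failing: save whatever output and logs we have,
+    # then exit EX_RETRY_UNLOCKED so the job can be re-queued instead
+    # of being marked failed.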
+ save_output_collection();
+ save_meta();
+ exit(EX_RETRY_UNLOCKED);
+ } elsif ($thisround_succeeded == 0 &&
+ ($thisround_failed == 0 || $thisround_failed > 4)) {
my $message = "stop because $thisround_failed tasks failed and none succeeded";
Log (undef, $message);
$main::success = 0;
}
- if (!@jobstep_todo)
- {
- $main::success = 1;
- }
}
goto ONELEVEL if !defined $main::success;
release_allocation();
freeze();
-my $collated_output = &create_output_collection();
-
-if (!$collated_output) {
- Log (undef, "Failed to write output collection");
-}
-else {
- Log(undef, "job output $collated_output");
- $Job->update_attributes('output' => $collated_output);
-}
-
+my $collated_output = save_output_collection();
Log (undef, "finish");
save_meta();
$progress_stats_updated = time;
return if !$progress_is_dirty;
my ($todo, $done, $running) = (scalar @jobstep_todo,
- scalar @jobstep_done,
- scalar @slot - scalar @freeslot - scalar @holdslot);
+ scalar @jobstep_done,
+ scalar keys(%proc));
$Job->{'tasks_summary'} ||= {};
$Job->{'tasks_summary'}->{'todo'} = $todo;
$Job->{'tasks_summary'}->{'done'} = $done;
sub reapchildren
{
- my $pid = waitpid (-1, WNOHANG);
- return 0 if $pid <= 0;
-
- my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
- . "."
- . $slot[$proc{$pid}->{slot}]->{cpu});
- my $jobstepid = $proc{$pid}->{jobstep};
- my $elapsed = time - $proc{$pid}->{time};
- my $Jobstep = $jobstep[$jobstepid];
-
- my $childstatus = $?;
- my $exitvalue = $childstatus >> 8;
- my $exitinfo = "exit ".exit_status_s($childstatus);
- $Jobstep->{'arvados_task'}->reload;
- my $task_success = $Jobstep->{'arvados_task'}->{success};
-
- Log ($jobstepid, "child $pid on $whatslot $exitinfo success=$task_success");
-
- if (!defined $task_success) {
- # task did not indicate one way or the other --> fail
- $Jobstep->{'arvados_task'}->{success} = 0;
- $Jobstep->{'arvados_task'}->save;
- $task_success = 0;
- }
+ my $children_reaped = 0;
+ my @successful_task_uuids = ();
- if (!$task_success)
+ while((my $pid = waitpid (-1, WNOHANG)) > 0)
{
- my $temporary_fail;
- $temporary_fail ||= $Jobstep->{node_fail};
- $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
-
- ++$thisround_failed;
- ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;
-
- # Check for signs of a failed or misconfigured node
- if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
- 2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
- # Don't count this against jobstep failure thresholds if this
- # node is already suspected faulty and srun exited quickly
- if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
- $elapsed < 5) {
- Log ($jobstepid, "blaming failure on suspect node " .
- $slot[$proc{$pid}->{slot}]->{node}->{name});
- $temporary_fail ||= 1;
- }
- ban_node_by_slot($proc{$pid}->{slot});
+ my $childstatus = $?;
+
+ my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
+ . "."
+ . $slot[$proc{$pid}->{slot}]->{cpu});
+ my $jobstepidx = $proc{$pid}->{jobstepidx};
+
+ if (!WIFEXITED($childstatus))
+ {
+ # child did not exit (may be temporarily stopped)
+ Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now.");
+ next;
+ }
+
+ $children_reaped++;
+ my $elapsed = time - $proc{$pid}->{time};
+ my $Jobstep = $jobstep[$jobstepidx];
+
+ my $exitvalue = $childstatus >> 8;
+ my $exitinfo = "exit ".exit_status_s($childstatus);
+ $Jobstep->{'arvados_task'}->reload;
+ my $task_success = $Jobstep->{'arvados_task'}->{success};
+
+ Log ($jobstepidx, "child $pid on $whatslot $exitinfo success=$task_success");
+
+ if (!defined $task_success) {
+ # task did not indicate one way or the other --> fail
+ Log($jobstepidx, sprintf(
+ "ERROR: Task process exited %s, but never updated its task record to indicate success and record its output.",
+ exit_status_s($childstatus)));
+ $Jobstep->{'arvados_task'}->{success} = 0;
+ $Jobstep->{'arvados_task'}->save;
+ $task_success = 0;
}
- Log ($jobstepid, sprintf('failure (#%d, %s) after %d seconds',
- ++$Jobstep->{'failures'},
- $temporary_fail ? 'temporary ' : 'permanent',
- $elapsed));
+ if (!$task_success)
+ {
+ my $temporary_fail;
+ $temporary_fail ||= $Jobstep->{tempfail};
+ $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
+
+ ++$thisround_failed;
+ ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;
+
+ # Check for signs of a failed or misconfigured node
+ if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
+ 2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
+ # Don't count this against jobstep failure thresholds if this
+ # node is already suspected faulty and srun exited quickly
+ if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
+ $elapsed < 5) {
+ Log ($jobstepidx, "blaming failure on suspect node " .
+ $slot[$proc{$pid}->{slot}]->{node}->{name});
+ $temporary_fail ||= 1;
+ }
+ ban_node_by_slot($proc{$pid}->{slot});
+ }
- if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
- # Give up on this task, and the whole job
- $main::success = 0;
+ Log ($jobstepidx, sprintf('failure (#%d, %s) after %d seconds',
+ ++$Jobstep->{'failures'},
+ $temporary_fail ? 'temporary' : 'permanent',
+ $elapsed));
+
+ if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
+ # Give up on this task, and the whole job
+ $main::success = 0;
+ }
+ # Put this task back on the todo queue
+ push @jobstep_todo, $jobstepidx;
+ $Job->{'tasks_summary'}->{'failed'}++;
}
- # Put this task back on the todo queue
- push @jobstep_todo, $jobstepid;
- $Job->{'tasks_summary'}->{'failed'}++;
+ else # task_success
+ {
+ push @successful_task_uuids, $Jobstep->{'arvados_task'}->{uuid};
+ ++$thisround_succeeded;
+ $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
+ $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
+ $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
+ push @jobstep_done, $jobstepidx;
+ Log ($jobstepidx, "success in $elapsed seconds");
+ }
+ $Jobstep->{exitcode} = $childstatus;
+ $Jobstep->{finishtime} = time;
+ $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
+ $Jobstep->{'arvados_task'}->save;
+ process_stderr_final ($jobstepidx);
+ Log ($jobstepidx, sprintf("task output (%d bytes): %s",
+ length($Jobstep->{'arvados_task'}->{output}),
+ $Jobstep->{'arvados_task'}->{output}));
+
+ close $reader{$jobstepidx};
+ delete $reader{$jobstepidx};
+ delete $slot[$proc{$pid}->{slot}]->{pid};
+ push @freeslot, $proc{$pid}->{slot};
+ delete $proc{$pid};
+
+ $progress_is_dirty = 1;
}
- else
+
+ if (scalar(@successful_task_uuids) > 0)
{
- ++$thisround_succeeded;
- $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
- $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
- push @jobstep_done, $jobstepid;
- Log ($jobstepid, "success in $elapsed seconds");
- }
- $Jobstep->{exitcode} = $childstatus;
- $Jobstep->{finishtime} = time;
- $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
- $Jobstep->{'arvados_task'}->save;
- process_stderr ($jobstepid, $task_success);
- Log ($jobstepid, sprintf("task output (%d bytes): %s",
- length($Jobstep->{'arvados_task'}->{output}),
- $Jobstep->{'arvados_task'}->{output}));
-
- close $reader{$jobstepid};
- delete $reader{$jobstepid};
- delete $slot[$proc{$pid}->{slot}]->{pid};
- push @freeslot, $proc{$pid}->{slot};
- delete $proc{$pid};
-
- if ($task_success) {
+ Log (undef, sprintf("%d tasks exited (%d succeeded), checking for new tasks from API server.", $children_reaped, scalar(@successful_task_uuids)));
# Load new tasks
my $newtask_list = [];
my $newtask_results;
do {
$newtask_results = api_call(
"job_tasks/list",
- 'where' => {
- 'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
- },
+ 'filters' => [["created_by_job_task_uuid","in",\@successful_task_uuids]],
'order' => 'qsequence',
'offset' => scalar(@$newtask_list),
- );
+ );
push(@$newtask_list, @{$newtask_results->{items}});
} while (@{$newtask_results->{items}});
+ Log (undef, sprintf("Got %d new tasks from API server.", scalar(@$newtask_list)));
foreach my $arvados_task (@$newtask_list) {
my $jobstep = {
'level' => $arvados_task->{'sequence'},
}
}
- $progress_is_dirty = 1;
- 1;
+ return $children_reaped;
}
sub check_refresh_wanted
{
my @stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
- if (@stat && $stat[9] > $latest_refresh) {
+ if (@stat &&
+ $stat[9] > $latest_refresh &&
+ # ...and we have actually locked the job record...
+ $job_id eq $Job->{'uuid'}) {
$latest_refresh = scalar time;
my $Job2 = api_call("jobs/get", uuid => $jobspec);
for my $attr ('cancelled_at',
sub check_squeue
{
- # return if the kill list was checked <4 seconds ago
- if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
- {
- return;
- }
- $squeue_kill_checked = time;
+ my $last_squeue_check = $squeue_checked;
- # use killem() on procs whose killtime is reached
- for (keys %proc)
+ # Do not call `squeue` or check the kill list more than once every
+ # 15 seconds.
+ return if $last_squeue_check > time - 15;
+ $squeue_checked = time;
+
+ # Look for children from which we haven't received stderr data since
+ # the last squeue check. If no such children exist, all procs are
+ # alive and there's no need to even look at squeue.
+ #
+ # As long as the crunchstat poll interval (10s) is shorter than the
+ # squeue check interval (15s) this should make the squeue check an
+ # infrequent event.
+ my $silent_procs = 0;
+ for my $js (map {$jobstep[$_->{jobstepidx}]} values %proc)
{
- if (exists $proc{$_}->{killtime}
- && $proc{$_}->{killtime} <= time)
+ if (!exists($js->{stderr_at}))
+ {
+ $js->{stderr_at} = 0;
+ }
+ if ($js->{stderr_at} < $last_squeue_check)
{
- killem ($_);
+ $silent_procs++;
}
}
+ return if $silent_procs == 0;
- # return if the squeue was checked <60 seconds ago
- if (defined $squeue_checked && $squeue_checked > time - 60)
+ # use killem() on procs whose killtime is reached
+ while (my ($pid, $procinfo) = each %proc)
{
- return;
+ my $js = $jobstep[$procinfo->{jobstepidx}];
+ if (exists $procinfo->{killtime}
+ && $procinfo->{killtime} <= time
+ && $js->{stderr_at} < $last_squeue_check)
+ {
+ my $sincewhen = "";
+ if ($js->{stderr_at}) {
+ $sincewhen = " in last " . (time - $js->{stderr_at}) . "s";
+ }
+ Log($procinfo->{jobstepidx}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
+ killem ($pid);
+ }
}
- $squeue_checked = time;
if (!$have_slurm)
{
return;
}
- # get a list of steps still running
- my @squeue = `squeue -s -h -o '%i %j' && echo ok`;
- chop @squeue;
- if ($squeue[-1] ne "ok")
+ # Get a list of steps still running. Note: squeue(1) says --steps
+ # selects a format (which we override anyway) and allows us to
+ # specify which steps we're interested in (which we don't).
+ # Importantly, it also changes the meaning of %j from "job name" to
+ # "step name" and (although this isn't mentioned explicitly in the
+ # docs) switches from "one line per job" mode to "one line per step"
+ # mode. Without it, we'd just get a list of one job, instead of a
+ # list of N steps.
+ my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%j' --noheader`;
+ if ($? != 0)
{
+ Log(undef, "warning: squeue exit status $? ($!)");
return;
}
- pop @squeue;
+ chop @squeue;
# which of my jobsteps are running, according to squeue?
my %ok;
- foreach (@squeue)
+ for my $jobstepname (@squeue)
{
- if (/^(\d+)\.(\d+) (\S+)/)
- {
- if ($1 eq $ENV{SLURM_JOBID})
- {
- $ok{$3} = 1;
- }
- }
+ $ok{$jobstepname} = 1;
}
- # which of my active child procs (>60s old) were not mentioned by squeue?
- foreach (keys %proc)
+ # Check for child procs >60s old and not mentioned by squeue.
+ while (my ($pid, $procinfo) = each %proc)
{
- if ($proc{$_}->{time} < time - 60
- && !exists $ok{$proc{$_}->{jobstepname}}
- && !exists $proc{$_}->{killtime})
+ if ($procinfo->{time} < time - 60
+ && $procinfo->{jobstepname}
+ && !exists $ok{$procinfo->{jobstepname}}
+ && !exists $procinfo->{killtime})
{
- # kill this proc if it hasn't exited in 30 seconds
- $proc{$_}->{killtime} = time + 30;
+ # According to slurm, this task has ended (successfully or not)
+ # -- but our srun child hasn't exited. First we must wait (30
+ # seconds) in case this is just a race between communication
+ # channels. Then, if our srun child process still hasn't
+ # terminated, we'll conclude some slurm communication
+ # error/delay has caused the task to die without notifying srun,
+ # and we'll kill srun ourselves.
+ $procinfo->{killtime} = time + 30;
+ Log($procinfo->{jobstepidx}, "notice: task is not in slurm queue but srun process $pid has not exited");
}
}
}
if ($have_slurm)
{
Log (undef, "release job allocation");
- system "scancel $ENV{SLURM_JOBID}";
+ system "scancel $ENV{SLURM_JOB_ID}";
}
}
sub readfrompipes
{
my $gotsome = 0;
- foreach my $job (keys %reader)
+ my %fd_job;
+ my $sel = IO::Select->new();
+ foreach my $jobstepidx (keys %reader)
+ {
+ my $fd = $reader{$jobstepidx};
+ $sel->add($fd);
+ $fd_job{$fd} = $jobstepidx;
+
+ if (my $stdout_fd = $jobstep[$jobstepidx]->{stdout_r}) {
+ $sel->add($stdout_fd);
+ $fd_job{$stdout_fd} = $jobstepidx;
+ }
+ }
+ # select on all reader fds with 0.1s timeout
+ my @ready_fds = $sel->can_read(0.1);
+ foreach my $fd (@ready_fds)
{
my $buf;
- while (0 < sysread ($reader{$job}, $buf, 8192))
+ if (0 < sysread ($fd, $buf, 65536))
{
+ $gotsome = 1;
print STDERR $buf if $ENV{CRUNCH_DEBUG};
- $jobstep[$job]->{stderr} .= $buf;
- preprocess_stderr ($job);
- if (length ($jobstep[$job]->{stderr}) > 16384)
+
+ my $jobstepidx = $fd_job{$fd};
+ if ($jobstep[$jobstepidx]->{stdout_r} == $fd) {
+ $jobstep[$jobstepidx]->{stdout_captured} .= $buf;
+ next;
+ }
+
+ $jobstep[$jobstepidx]->{stderr_at} = time;
+ $jobstep[$jobstepidx]->{stderr} .= $buf;
+
+ # Consume everything up to the last \n
+ preprocess_stderr ($jobstepidx);
+
+ if (length ($jobstep[$jobstepidx]->{stderr}) > 16384)
{
- substr ($jobstep[$job]->{stderr}, 0, 8192) = "";
+ # If we get a lot of stderr without a newline, chop off the
+ # front to avoid letting our buffer grow indefinitely.
+ substr ($jobstep[$jobstepidx]->{stderr},
+ 0, length($jobstep[$jobstepidx]->{stderr}) - 8192) = "";
}
- $gotsome = 1;
}
}
return $gotsome;
}
+# Consume all full lines of stderr for a jobstep. Everything after the
+# last newline will remain in $jobstep[$jobstepidx]->{stderr} after
+# returning.
sub preprocess_stderr
{
- my $job = shift;
+ my $jobstepidx = shift;
- while ($jobstep[$job]->{stderr} =~ /^(.*?)\n/) {
+ while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
my $line = $1;
- substr $jobstep[$job]->{stderr}, 0, 1+length($line), "";
- Log ($job, "stderr $line");
+ substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
+ Log ($jobstepidx, "stderr $line");
if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
# whoa.
$main::please_freeze = 1;
}
- elsif ($line =~ /srun: error: (Node failure on|Unable to create job step|.*: Communication connection failure)/) {
- $jobstep[$job]->{node_fail} = 1;
- ban_node_by_slot($jobstep[$job]->{slotindex});
+ elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
+ # Skip the following tempfail checks if this srun proc isn't
+ # attached to a particular worker slot.
+ }
+ elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
+ my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
+ $slot[$job_slot_index]->{node}->{fail_count}++;
+ $jobstep[$jobstepidx]->{tempfail} = 1;
+ ban_node_by_slot($job_slot_index);
+ }
+ elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
+ $jobstep[$jobstepidx]->{tempfail} = 1;
+ ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+ }
+ elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
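+      # Keep client errors point at infrastructure trouble rather
+      # than a bug in the task itself, so mark the failure temporary.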
+ $jobstep[$jobstepidx]->{tempfail} = 1;
}
}
}
-sub process_stderr
+sub process_stderr_final
{
- my $job = shift;
- my $task_success = shift;
- preprocess_stderr ($job);
+ my $jobstepidx = shift;
+ preprocess_stderr ($jobstepidx);
map {
- Log ($job, "stderr $_");
- } split ("\n", $jobstep[$job]->{stderr});
+ Log ($jobstepidx, "stderr $_");
+ } split ("\n", $jobstep[$jobstepidx]->{stderr});
+ $jobstep[$jobstepidx]->{stderr} = '';
}
sub fetch_block
return $joboutput;
}
+# Calls create_output_collection, logs the result, and, if that
+# succeeded, saves it as the output in the job record.  Returns the
+# collated output (false if the collection could not be written).
+sub save_output_collection {
+ my $collated_output = create_output_collection();
+
+ if (!$collated_output) {
+ Log(undef, "Failed to write output collection");
+ }
+ else {
+ Log(undef, "job output $collated_output");
+ $Job->update_attributes('output' => $collated_output);
+ }
+ return $collated_output;
+}
sub killem
{
}
if (!exists $proc{$_}->{"sent_$sig"})
{
- Log ($proc{$_}->{jobstep}, "sending 2x signal $sig to pid $_");
+ Log ($proc{$_}->{jobstepidx}, "sending 2x signal $sig to pid $_");
kill $sig, $_;
select (undef, undef, undef, 0.1);
if ($sig == 2)
# Send log output to Keep via arv-put.
#
# $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
+# $log_pipe_out_buf is a string containing all output read from arv-put so far.
+# $log_pipe_out_select is an IO::Select object around $log_pipe_out.
# $log_pipe_pid is the pid of the arv-put subprocess.
#
# The only functions that should access these variables directly are:
# Starts an arv-put pipe, reading data on stdin and writing it to
# a $logfilename file in an output collection.
#
+# log_writer_read_output([$timeout])
+# Read output from $log_pipe_out and append it to $log_pipe_out_buf.
+# Passes $timeout to the select() call, with a default of 0.01.
+# Returns the result of the last read() call on $log_pipe_out, or
+# -1 if read() wasn't called because select() timed out.
+# Only other log_writer_* functions should need to call this.
+#
# log_writer_send($txt)
# Writes $txt to the output log collection.
#
# Returns a true value if there is currently a live arv-put
# process, false otherwise.
#
-my ($log_pipe_in, $log_pipe_out, $log_pipe_pid);
+my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
+ $log_pipe_pid);
sub log_writer_start($)
{
my $logfilename = shift;
$log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
'arv-put',
- '--portable-data-hash',
- '--project-uuid', $Job->{owner_uuid},
+ '--stream',
'--retries', '3',
- '--name', $logfilename,
'--filename', $logfilename,
'-');
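+  # With --stream, arv-put writes its data to Keep but prints the
+  # resulting manifest text on stdout instead of creating a collection;
+  # save_meta() builds the collection itself so it can prepend the
+  # manifest of any earlier log for the same job.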
+ $log_pipe_out_buf = "";
+ $log_pipe_out_select = IO::Select->new($log_pipe_out);
+}
+
+sub log_writer_read_output {
+ my $timeout = shift || 0.01;
+ my $read = -1;
+ while ($read && $log_pipe_out_select->can_read($timeout)) {
+ $read = read($log_pipe_out, $log_pipe_out_buf, 65536,
+ length($log_pipe_out_buf));
+ }
+ if (!defined($read)) {
+ Log(undef, "error reading log manifest from arv-put: $!");
+ }
+ return $read;
}
sub log_writer_send($)
{
my $txt = shift;
print $log_pipe_in $txt;
+ log_writer_read_output();
}
sub log_writer_finish()
return unless $log_pipe_pid;
close($log_pipe_in);
- my $arv_put_output;
- my $s = IO::Select->new($log_pipe_out);
- if ($s->can_read(120)) {
- sysread($log_pipe_out, $arv_put_output, 1024);
- chomp($arv_put_output);
- } else {
+ my $logger_failed = 0;
+ my $read_result = log_writer_read_output(120);
+ if ($read_result == -1) {
+ $logger_failed = -1;
Log (undef, "timed out reading from 'arv-put'");
+ } elsif ($read_result != 0) {
+ $logger_failed = -2;
+ Log(undef, "failed to read arv-put log manifest to EOF");
}
waitpid($log_pipe_pid, 0);
- $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
if ($?) {
- Log("log_writer_finish: arv-put exited ".exit_status_s($?))
+ $logger_failed ||= $?;
+    Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?));
}
+ close($log_pipe_out);
+ my $arv_put_output = $logger_failed ? undef : $log_pipe_out_buf;
+ $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
+ $log_pipe_out_select = undef;
+
return $arv_put_output;
}
return $log_pipe_pid;
}
-sub Log # ($jobstep_id, $logmessage)
+sub Log # ($jobstepidx, $logmessage)
{
- if ($_[1] =~ /\n/) {
+ my ($jobstepidx, $logmessage) = @_;
+ if ($logmessage =~ /\n/) {
-    for my $line (split (/\n/, $_[1])) {
+    for my $line (split (/\n/, $logmessage)) {
- Log ($_[0], $line);
+ Log ($jobstepidx, $line);
}
return;
}
my $fh = select STDERR; $|=1; select $fh;
- my $message = sprintf ("%s %d %s %s", $job_id, $$, @_);
+ my $task_qseq = '';
+ if (defined($jobstepidx) && exists($jobstep[$jobstepidx]->{arvados_task})) {
+ $task_qseq = $jobstepidx;
+ }
+ my $message = sprintf ("%s %d %s %s", $job_id, $$, $task_qseq, $logmessage);
$message =~ s{([^ -\176])}{"\\" . sprintf ("%03o", ord($1))}ge;
$message .= "\n";
my $datetime;
my $justcheckpoint = shift; # false if this will be the last meta saved
return if $justcheckpoint; # checkpointing is not relevant post-Warehouse.pm
return unless log_writer_is_active();
+ my $log_manifest = log_writer_finish();
+ return unless defined($log_manifest);
+
+ if ($Job->{log}) {
+ my $prev_log_coll = api_call("collections/get", uuid => $Job->{log});
+ $log_manifest = $prev_log_coll->{manifest_text} . $log_manifest;
+ }
- my $loglocator = log_writer_finish();
- Log (undef, "log manifest is $loglocator");
- $Job->{'log'} = $loglocator;
- $Job->update_attributes('log', $loglocator);
+ my $log_coll = api_call(
+ "collections/create", ensure_unique_name => 1, collection => {
+ manifest_text => $log_manifest,
+ owner_uuid => $Job->{owner_uuid},
+ name => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
+ });
+ Log(undef, "log collection is " . $log_coll->{portable_data_hash});
+ $Job->update_attributes('log' => $log_coll->{portable_data_hash});
}
}
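+# srun_sync(\@srunargs, \@execargs, {label => "..."}, $stdin)
+#
+# Run an srun command to completion, mirroring its stderr into our
+# log, and return ($exited, $stdout, $stderr).  While the child runs
+# it is registered in @jobstep and %proc, so check_squeue() and
+# freeze_if_want_freeze() treat it like a task process.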
+sub srun_sync
+{
+ my $srunargs = shift;
+ my $execargs = shift;
+ my $opts = shift || {};
+ my $stdin = shift;
+
+ my $label = exists $opts->{label} ? $opts->{label} : "@$execargs";
+ Log (undef, "$label: start");
+
+ my ($stderr_r, $stderr_w);
+ pipe $stderr_r, $stderr_w or croak("pipe() failed: $!");
+
+ my ($stdout_r, $stdout_w);
+ pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");
+
+ my $srunpid = fork();
+ if ($srunpid == 0)
+ {
+ close($stderr_r);
+ close($stdout_r);
+    fcntl($stderr_w, F_SETFL, 0) or croak($!); # clear O_NONBLOCK etc.
+ fcntl($stdout_w, F_SETFL, 0) or croak($!);
+ open(STDERR, ">&", $stderr_w);
+ open(STDOUT, ">&", $stdout_w);
+ srun ($srunargs, $execargs, $opts, $stdin);
+ exit (1);
+ }
+ close($stderr_w);
+ close($stdout_w);
+
+ set_nonblocking($stderr_r);
+ set_nonblocking($stdout_r);
+
+ # Add entries to @jobstep and %proc so check_squeue() and
+ # freeze_if_want_freeze() can treat it like a job task process.
+ push @jobstep, {
+ stderr => '',
+ stderr_at => 0,
+ stderr_captured => '',
+ stdout_r => $stdout_r,
+ stdout_captured => '',
+ };
+ my $jobstepidx = $#jobstep;
+ $proc{$srunpid} = {
+ jobstepidx => $jobstepidx,
+ };
+ $reader{$jobstepidx} = $stderr_r;
+
+ while ($srunpid != waitpid ($srunpid, WNOHANG)) {
+ my $busy = readfrompipes();
+ if (!$busy || ($latest_refresh + 2 < scalar time)) {
+ check_refresh_wanted();
+ check_squeue();
+ }
+ if (!$busy) {
+ select(undef, undef, undef, 0.1);
+ }
+ killem(keys %proc) if $main::please_freeze;
+ }
+ my $exited = $?;
+
+ 1 while readfrompipes();
+ process_stderr_final ($jobstepidx);
+
+ Log (undef, "$label: exit ".exit_status_s($exited));
+
+ close($stdout_r);
+ close($stderr_r);
+ delete $proc{$srunpid};
+ delete $reader{$jobstepidx};
+
+ my $j = pop @jobstep;
+ return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
+}
+
+
sub srun
{
my $srunargs = shift;
my $show_cmd = Dumper($args);
$show_cmd =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
$show_cmd =~ s/\n/ /g;
- warn "starting: $show_cmd\n";
+ if ($opts->{fork}) {
+ Log(undef, "starting: $show_cmd");
+ } else {
+ # This is a child process: parent is in charge of reading our
+ # stderr and copying it to Log() if needed.
+ warn "starting: $show_cmd\n";
+ }
if (defined $stdin) {
my $child = open STDIN, "-|";
return $tar_contents;
}
+sub set_nonblocking {
+ my $fh = shift;
+ my $flags = fcntl ($fh, F_GETFL, 0) or croak ($!);
+ fcntl ($fh, F_SETFL, $flags | O_NONBLOCK) or croak ($!);
+}
+
__DATA__
-#!/usr/bin/perl
+#!/usr/bin/env perl
#
# This is crunch-job's internal dispatch script. crunch-job running on the API
# server invokes this script on individual compute nodes, or localhost if we're
my %SDK_ENVVARS = ("perl/lib" => "PERLLIB", "ruby/lib" => "RUBYLIB");
my $destdir = $ENV{"CRUNCH_SRC"};
-my $commit = $ENV{"CRUNCH_SRC_COMMIT"};
+my $archive_hash = $ENV{"CRUNCH_GIT_ARCHIVE_HASH"};
my $repo = $ENV{"CRUNCH_SRC_URL"};
my $install_dir = $ENV{"CRUNCH_INSTALL"} || (getcwd() . "/opt");
my $job_work = $ENV{"JOB_WORK"};
my $task_work = $ENV{"TASK_WORK"};
+open(STDOUT_ORIG, ">&", STDOUT);
+open(STDERR_ORIG, ">&", STDERR);
+
for my $dir ($destdir, $job_work, $task_work) {
if ($dir) {
make_path $dir;
remove_tree($task_work, {keep_root => 1});
}
-open(STDOUT_ORIG, ">&", STDOUT);
-open(STDERR_ORIG, ">&", STDERR);
-open(STDOUT, ">>", "$destdir.log");
-open(STDERR, ">&", STDOUT);
-
### Crunch script run mode
if (@ARGV) {
# We want to do routine logging during task 0 only. This gives the user
$Log->("Built Python SDK virtualenv");
}
- my $pip_bin = "pip";
+ my @pysdk_version_cmd = ("python", "-c",
+ "from pkg_resources import get_distribution as get; print get('arvados-python-client').version");
if ($venv_built) {
$Log->("Running in Python SDK virtualenv");
- $pip_bin = "$venv_dir/bin/pip";
+ @pysdk_version_cmd = ();
my $orig_argv = join(" ", map { quotemeta($_); } @ARGV);
@ARGV = ("/bin/sh", "-ec",
". \Q$venv_dir/bin/activate\E; exec $orig_argv");
"\$PATH. Can't install Python SDK.");
}
- my $pkgs = `(\Q$pip_bin\E freeze 2>/dev/null | grep arvados) || dpkg-query --show '*arvados*'`;
- if ($pkgs) {
- $Log->("Using Arvados SDK:");
- foreach my $line (split /\n/, $pkgs) {
- $Log->($line);
+ if (@pysdk_version_cmd) {
+ open(my $pysdk_version_pipe, "-|", @pysdk_version_cmd);
+ my $pysdk_version = <$pysdk_version_pipe>;
+ close($pysdk_version_pipe);
+ if ($? == 0) {
+ chomp($pysdk_version);
+ $Log->("Using Arvados SDK version $pysdk_version");
+ } else {
+ # A lot could've gone wrong here, but pretty much all of it means that
+ # Python won't be able to load the Arvados SDK.
+ $Log->("Warning: Arvados SDK not found");
}
- } else {
- $Log->("Arvados SDK packages not found");
}
while (my ($sdk_dir, $sdk_envkey) = each(%SDK_ENVVARS)) {
}
}
- close(STDOUT);
- close(STDERR);
- open(STDOUT, ">&", STDOUT_ORIG);
- open(STDERR, ">&", STDERR_ORIG);
exec(@ARGV);
die "Cannot exec `@ARGV`: $!";
}
### Installation mode
open L, ">", "$destdir.lock" or die "$destdir.lock: $!";
flock L, LOCK_EX;
-if (readlink ("$destdir.commit") eq $commit && -d $destdir) {
- # This version already installed -> nothing to do.
+if (readlink ("$destdir.archive_hash") eq $archive_hash && -d $destdir) {
+ # This exact git archive (source + arvados sdk) is already installed
+ # here, so there's no need to reinstall it.
+
+ # We must consume our DATA section, though: otherwise the process
+ # feeding it to us will get SIGPIPE.
+ my $buf;
+ while (read(DATA, $buf, 65536)) { }
+
exit(0);
}
-unlink "$destdir.commit";
+unlink "$destdir.archive_hash";
mkdir $destdir;
-if (!open(TARX, "|-", "tar", "-xC", $destdir)) {
- die "Error launching 'tar -xC $destdir': $!";
-}
-# If we send too much data to tar in one write (> 4-5 MiB), it stops, and we
-# get SIGPIPE. We must feed it data incrementally.
-my $tar_input;
-while (read(DATA, $tar_input, 65536)) {
- print TARX $tar_input;
-}
-if(!close(TARX)) {
- die "'tar -xC $destdir' exited $?: $!";
-}
+do {
+ # Ignore SIGPIPE: we check retval of close() instead. See perlipc(1).
+ local $SIG{PIPE} = "IGNORE";
+ warn "Extracting archive: $archive_hash\n";
+  # --ignore-zeros is necessary sometimes: depending on how much NUL
+  # padding tar -A put on our combined archive (which in turn depends
+  # on the length of the component archives), tar without
+  # --ignore-zeros can exit before consuming all of stdin, causing
+  # close() to fail on the resulting SIGPIPE.
+ if (!open(TARX, "|-", "tar", "--ignore-zeros", "-xC", $destdir)) {
+ die "Error launching 'tar -xC $destdir': $!";
+ }
+ # If we send too much data to tar in one write (> 4-5 MiB), it stops, and we
+ # get SIGPIPE. We must feed it data incrementally.
+ my $tar_input;
+ while (read(DATA, $tar_input, 65536)) {
+ print TARX $tar_input;
+ }
+ if(!close(TARX)) {
+ die "'tar -xC $destdir' exited $?: $!";
+ }
+};
mkdir $install_dir;
}
my $python_dir = "$install_dir/python";
-if ((-d $python_dir) and can_run("python2.7") and
- (system("python2.7", "$python_dir/setup.py", "--quiet", "egg_info") != 0)) {
- # egg_info failed, probably when it asked git for a build tag.
- # Specify no build tag.
- open(my $pysdk_cfg, ">>", "$python_dir/setup.cfg");
- print $pysdk_cfg "\n[egg_info]\ntag_build =\n";
- close($pysdk_cfg);
+if ((-d $python_dir) and can_run("python2.7")) {
+ open(my $egg_info_pipe, "-|",
+ "python2.7 \Q$python_dir/setup.py\E egg_info 2>&1 >/dev/null");
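+  # (Note the redirection order: "2>&1 >/dev/null" duplicates stderr
+  # onto our pipe before stdout is discarded, so we capture only the
+  # error output of egg_info.)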
+ my @egg_info_errors = <$egg_info_pipe>;
+ close($egg_info_pipe);
+
+ if ($?) {
+ if (@egg_info_errors and (($egg_info_errors[-1] =~ /\bgit\b/) or ($egg_info_errors[-1] =~ /\[Errno 2\]/))) {
+ # egg_info apparently failed because it couldn't ask git for a build tag.
+ # Specify no build tag.
+ open(my $pysdk_cfg, ">>", "$python_dir/setup.cfg");
+ print $pysdk_cfg "\n[egg_info]\ntag_build =\n";
+ close($pysdk_cfg);
+ } else {
+ my $egg_info_exit = $? >> 8;
+ foreach my $errline (@egg_info_errors) {
+ warn $errline;
+ }
+ warn "python setup.py egg_info failed: exit $egg_info_exit";
+ exit ($egg_info_exit || 1);
+ }
+ }
}
+# Hide messages from the install script (unless it fails: shell_or_die
+# will show $destdir.log in that case).
+open(STDOUT, ">>", "$destdir.log");
+open(STDERR, ">&", STDOUT);
+
if (-e "$destdir/crunch_scripts/install") {
shell_or_die (undef, "$destdir/crunch_scripts/install", $install_dir);
} elsif (!-e "./install.sh" && -e "./tests/autotests.sh") {
shell_or_die (undef, "./install.sh", $install_dir);
}
-if ($commit) {
- unlink "$destdir.commit.new";
- symlink ($commit, "$destdir.commit.new") or die "$destdir.commit.new: $!";
- rename ("$destdir.commit.new", "$destdir.commit") or die "$destdir.commit: $!";
+if ($archive_hash) {
+ unlink "$destdir.archive_hash.new";
+ symlink ($archive_hash, "$destdir.archive_hash.new") or die "$destdir.archive_hash.new: $!";
+ rename ("$destdir.archive_hash.new", "$destdir.archive_hash") or die "$destdir.archive_hash: $!";
}
close L;