use Fcntl ':flock';
use File::Path qw( make_path remove_tree );
+use constant TASK_TEMPFAIL => 111;
use constant EX_TEMPFAIL => 75;
$ENV{"TMPDIR"} ||= "/tmp";
# TODO: When #5036 is done and widely deployed, we can get rid of the
# regular expression and just unmount everything with type fuse.keep.
srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
- ['bash', '-ec', 'mount -t fuse,fuse.keep | awk \'($3 ~ /\ykeep\y/){print $3}\' | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src*']);
+ ['bash', '-ec', 'mount -t fuse,fuse.keep | awk \'($3 ~ /\ykeep\y/){print $3}\' | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid']);
exit (1);
}
while (1)
# If this job requires a Docker image, install that.
my $docker_bin = "/usr/bin/docker.io";
-my ($docker_locator, $docker_stream, $docker_hash);
+my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem);
if ($docker_locator = $Job->{docker_image_locator}) {
($docker_stream, $docker_hash) = find_docker_image($docker_locator);
if (!$docker_hash)
}
$docker_stream =~ s/^\.//;
my $docker_install_script = qq{
-if ! $docker_bin images -q --no-trunc | grep -qxF \Q$docker_hash\E; then
+if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
fi
};
.exit_status_s($?));
}
+ # Determine whether this version of Docker supports memory+swap limits.
+ srun(["srun", "--nodelist=" . $node[0]],
+ ["/bin/sh", "-ec", "$docker_bin run --help | grep -qe --memory-swap="],
+ {fork => 1});
+ $docker_limitmem = ($? == 0);
+
if ($Job->{arvados_sdk_version}) {
# The job also specifies an Arvados SDK version. Add the SDKs to the
# tar file for the build script to install.
THISROUND:
+my $tasks_this_level = 0;
+foreach my $id (@jobstep_todo) {
+ $tasks_this_level++ if ($jobstep[$id]->{level} == $level);
+}
for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
{
my $id = $jobstep_todo[$todo_ptr];
my $childslotname = join (".",
$slot[$childslot]->{node}->{name},
$slot[$childslot]->{cpu});
+
my $childpid = fork();
if ($childpid == 0)
{
$ENV{"GZIP"} = "-n";
+ my $max_node_concurrent_tasks = $ENV{CRUNCH_NODE_SLOTS};
+ if ($tasks_this_level < $max_node_concurrent_tasks) {
+ $max_node_concurrent_tasks = $tasks_this_level;
+ }
+
my @srunargs = (
"srun",
"--nodelist=".$childnode->{name},
my $command =
"if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; "
."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} "
- ."&& cd $ENV{CRUNCH_TMP} ";
+ ."&& cd $ENV{CRUNCH_TMP} "
+ # These environment variables get used explicitly later in
+ # $command. No tool is expected to read these values directly.
+ .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
+ .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
+ ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($max_node_concurrent_tasks * 100) )) "
+ ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
$command .= "&& exec arv-mount --by-id --allow-other $ENV{TASK_KEEPMOUNT} --exec ";
if ($docker_hash)
{
- my $cidfile = "$ENV{CRUNCH_TMP}/$ENV{TASK_UUID}.cid";
+ my $cidfile = "$ENV{CRUNCH_TMP}/$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}.cid";
$command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
$command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i --user=crunch --cidfile=$cidfile --sig-proxy ";
+ # We only set memory limits if Docker lets us limit both memory and swap.
+ # Memory limits alone have been supported longer, but subprocesses tend
+ # to get SIGKILL if they exceed that without any swap limit set.
+ # See #5642 for additional background.
+ if ($docker_limitmem) {
+ $command .= "--memory=\${MEMLIMIT}k --memory-swap=\${SWAPLIMIT}k ";
+ }
# Dynamically configure the container to use the host system as its
# DNS server. Get the host's global addresses from the ip command,
||
(@slot > @freeslot && $todo_ptr+1 > $#jobstep_todo))
{
- last THISROUND if $main::please_freeze;
+ last THISROUND if $main::please_freeze || defined($main::success);
if ($main::please_info)
{
$main::please_info = 0;
{
my $temporary_fail;
$temporary_fail ||= $Jobstep->{node_fail};
- $temporary_fail ||= ($exitvalue == 111);
+ $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
++$thisround_failed;
++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;
# whoa.
$main::please_freeze = 1;
}
- elsif ($line =~ /srun: error: (Node failure on|Unable to create job step) /) {
+ elsif ($line =~ /(srun: error: (Node failure on|Unable to create job step|.*: Communication connection failure))|arvados.errors.Keep/) {
$jobstep[$job]->{node_fail} = 1;
ban_node_by_slot($jobstep[$job]->{slotindex});
}
use File::Path qw( make_path remove_tree );
use POSIX qw(getcwd);
+use constant TASK_TEMPFAIL => 111;
+
# Map SDK subdirectories to the path environments they belong to.
my %SDK_ENVVARS = ("perl/lib" => "PERLLIB", "ruby/lib" => "RUBYLIB");
my $venv_dir = "$job_work/.arvados.venv";
my $venv_built = -e "$venv_dir/bin/activate";
if ((!$venv_built) and (-d $python_src) and can_run("virtualenv")) {
- shell_or_die("virtualenv", "--quiet", "--system-site-packages",
+ shell_or_die(undef, "virtualenv", "--quiet", "--system-site-packages",
"--python=python2.7", $venv_dir);
- shell_or_die("$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src);
+ shell_or_die(TASK_TEMPFAIL, "$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src);
$venv_built = 1;
$Log->("Built Python SDK virtualenv");
}
}
if (-e "$destdir/crunch_scripts/install") {
- shell_or_die ("$destdir/crunch_scripts/install", $install_dir);
+ shell_or_die (undef, "$destdir/crunch_scripts/install", $install_dir);
} elsif (!-e "./install.sh" && -e "./tests/autotests.sh") {
# Old version
- shell_or_die ("./tests/autotests.sh", $install_dir);
+ shell_or_die (undef, "./tests/autotests.sh", $install_dir);
} elsif (-e "./install.sh") {
- shell_or_die ("./install.sh", $install_dir);
+ shell_or_die (undef, "./install.sh", $install_dir);
}
if ($commit) {
sub shell_or_die
{
+ my $exitcode = shift;
+
if ($ENV{"DEBUG"}) {
print STDERR "@_\n";
}
if (system (@_) != 0) {
my $err = $!;
- my $exitstatus = sprintf("exit %d signal %d", $? >> 8, $? & 0x7f);
+ my $code = $?;
+ my $exitstatus = sprintf("exit %d signal %d", $code >> 8, $code & 0x7f);
open STDERR, ">&STDERR_ORIG";
system ("cat $destdir.log >&2");
- die "@_ failed ($err): $exitstatus";
+ warn "@_ failed ($err): $exitstatus";
+ if (defined($exitcode)) {
+ exit $exitcode;
+ }
+ else {
+ exit (($code >> 8) || 1);
+ }
}
}