# Command-line option storage for this crunch-job invocation.
my $job_api_token;
my $no_clear_tmp;
my $resume_stash;
# Mount point of the cgroup filesystem; overridable with --cgroup-root.
+my $cgroup_root = "/sys/fs/cgroup";
my $docker_bin = "docker.io";
my $docker_run_args = "";
# Parse command-line flags into the variables above.
# NOTE(review): $force_unlock is not declared in this excerpt — presumably
# declared earlier in the file; confirm.
GetOptions('force-unlock' => \$force_unlock,
'job-api-token=s' => \$job_api_token,
'no-clear-tmp' => \$no_clear_tmp,
'resume-stash=s' => \$resume_stash,
+ 'cgroup-root=s' => \$cgroup_root,
'docker-bin=s' => \$docker_bin,
'docker-run-args=s' => \$docker_run_args,
);
# If this job requires a Docker image, install that.
my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg);
if ($docker_locator = $Job->{docker_image_locator}) {
+ Log (undef, "Install docker image $docker_locator");
# Resolve the Keep locator to a collection stream name and an image hash.
($docker_stream, $docker_hash) = find_docker_image($docker_locator);
if (!$docker_hash)
{
croak("No Docker image hash found from locator $docker_locator");
}
+ Log (undef, "docker image hash is $docker_hash");
# Strip the leading "." from the stream name (the collection root stream).
$docker_stream =~ s/^\.//;
# Shell script run on every node (via srun below) to load the image into
# the local Docker daemon unless it is already present.
# NOTE(review): the qq{} string is not terminated within this excerpt.
my $docker_install_script = qq{
if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
if ($docker_pid == 0)
{
srun (["srun", "--nodelist=" . join(',', @node)],
- ["/bin/sh", "-ec", $docker_install_script]);
+ ["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script]);
exit ($?);
}
while (1)
}
# If installing the Docker image failed, treat it as a temporary
# (likely node-related) problem: log it and exit with the
# retry-and-unlock status instead of crashing via croak.
if ($? != 0)
{
- croak("Installing Docker image from $docker_locator exited "
- .exit_status_s($?));
+ Log(undef, "Installing Docker image from $docker_locator exited " . exit_status_s($?));
+ exit(EX_RETRY_UNLOCKED);
}
# Determine whether this version of Docker supports memory+swap limits.
{
# Container name embeds the task UUID and the failure count, so each
# retry of a task gets a fresh container name.
my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
# The cidfile lets crunchstat locate the container's cgroup.
my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
# Wrap the docker invocation in crunchstat; the cgroup root comes from
# --cgroup-root (shell-quoted with \Q...\E) instead of being hard-coded.
- $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
+ $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
$command .= "$docker_bin run $docker_run_args --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
# We only set memory limits if Docker lets us limit both memory and swap.
# Memory limits alone have been supported longer, but subprocesses tend
}
} else {
# Non-docker run
- $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 ";
+ $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -poll=10000 ";
$command .= $stdbuf;
# Run the job script via the crunch_scripts wrapper.
$command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
}
check_refresh_wanted();
check_squeue();
update_progress_stats();
- select (undef, undef, undef, 0.1);
}
elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty)
{
update_progress_stats();
}
# Only sleep when nothing was read this pass, so a busy stderr stream
# is drained without an artificial 0.1s delay per iteration.
+ if (!$gotsome) {
+ select (undef, undef, undef, 0.1);
+ }
# Usable slots: nodes that have never failed and are not held too long.
$working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
$_->{node}->{hold_count} < 4 } @slot);
if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
# Drain stderr from each running jobstep's reader pipe.
foreach my $job (keys %reader)
{
my $buf;
# One larger read per pass (instead of looping on 8K reads) so a
# single chatty step cannot monopolize this loop.
- while (0 < sysread ($reader{$job}, $buf, 8192))
+ if (0 < sysread ($reader{$job}, $buf, 65536))
{
print STDERR $buf if $ENV{CRUNCH_DEBUG};
$jobstep[$job]->{stderr_at} = time;
$jobstep[$job]->{stderr} .= $buf;
+
+ # Consume everything up to the last \n
preprocess_stderr ($job);
+
if (length ($jobstep[$job]->{stderr}) > 16384)
{
- substr ($jobstep[$job]->{stderr}, 0, 8192) = "";
+ # If we get a lot of stderr without a newline, chop off the
+ # front to avoid letting our buffer grow indefinitely.
+ substr ($jobstep[$job]->{stderr},
+ 0, length($jobstep[$job]->{stderr}) - 8192) = "";
}
$gotsome = 1;
}
# whoa.
$main::please_freeze = 1;
}
- elsif ($line =~ /srun: error: Node failure on/) {
+ elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
# srun reported a node-level failure (or an I/O abort): count the
# failure against the node, mark this step as a temporary failure so
# it can be retried, and stop scheduling onto the failed slot.
# (Review: the original assigned tempfail twice in a row; the
# redundant duplicate assignment has been removed.)
my $job_slot_index = $jobstep[$job]->{slotindex};
$slot[$job_slot_index]->{node}->{fail_count}++;
$jobstep[$job]->{tempfail} = 1;
ban_node_by_slot($jobstep[$job]->{slotindex});
}
- elsif ($line =~ /arvados\.errors\.Keep/) {
+ elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
# Keep (storage) errors are transient: mark the step as a temporary
# failure so it is retried rather than failing the whole job.
$jobstep[$job]->{tempfail} = 1;
}
}