# Storage for command-line options. Variables declared without a value stay
# undef unless the matching flag is supplied to GetOptions below.
my $job_api_token;
my $no_clear_tmp;
my $resume_stash;
# Path passed to crunchstat's -cgroup-root flag; overridable with --cgroup-root.
+my $cgroup_root = "/sys/fs/cgroup";
# Name of the Docker executable used when building docker command lines;
# overridable with --docker-bin.
my $docker_bin = "docker.io";
# Extra text interpolated as-is into "docker run" command lines;
# overridable with --docker-run-args.
my $docker_run_args = "";
# Map each flag to the variable that receives it. Specs ending in "=s" take a
# string value; the others are plain switches.
# NOTE(review): $force_unlock is declared outside this excerpt.
GetOptions('force-unlock' => \$force_unlock,
'job-api-token=s' => \$job_api_token,
'no-clear-tmp' => \$no_clear_tmp,
'resume-stash=s' => \$resume_stash,
+ 'cgroup-root=s' => \$cgroup_root,
'docker-bin=s' => \$docker_bin,
'docker-run-args=s' => \$docker_run_args,
);
# Determine whether this version of Docker supports memory+swap limits.
# The probe runs "$docker_bin run --help" through srun_sync and searches the
# captured stdout for the --memory-swap option.
($exited, $stdout, $stderr) = srun_sync(
# srun prefix: the '-' variant pinned the probe to $node[0] via --nodelist;
# the '+' variant passes --nodes=1 and names no specific node.
- ["srun", "--nodelist=" . $node[0]],
+ ["srun", "--nodes=1"],
[$docker_bin, 'run', '--help'],
{label => "check --memory-swap feature"});
# True when the help text mentions --memory-swap.
$docker_limitmem = ($stdout =~ /--memory-swap/);
# Build the --user argument for the candidate user.
# NOTE(review): $try_user and the block this closes are outside this excerpt.
$try_user_arg = "--user=$try_user";
}
# Probe the candidate: run "id --user" inside the $docker_hash image through
# /bin/sh -ec, capturing exit status, stdout and stderr.
# $docker_run_args and $try_user_arg are interpolated into the shell string
# without quoting.
my ($exited, $stdout, $stderr) = srun_sync(
# srun prefix: the '-' variant pinned the probe to $node[0] via --nodelist;
# the '+' variant passes --nodes=1 and names no specific node.
- ["srun", "--nodelist=" . $node[0]],
+ ["srun", "--nodes=1"],
["/bin/sh", "-ec",
"$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
{label => $label});
{
# Container name is the task UUID followed by the step's failure count
# (presumably so each retry gets a distinct name -- TODO confirm).
my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
# Path given to docker as --cidfile and to crunchstat as -cgroup-cid.
my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
# Prefix the command with crunchstat. The '+' variant takes the cgroup root
# from $cgroup_root instead of a hard-coded path; \Q...\E backslash-escapes
# every non-word character of that value before it enters the command string.
- $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
+ $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
# Append the docker invocation itself. $docker_run_args is inserted unescaped;
# $dockeruserarg is escaped with \Q...\E.
$command .= "$docker_bin run $docker_run_args --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
# We only set memory limits if Docker lets us limit both memory and swap.
# Memory limits alone have been supported longer, but subprocesses tend
}
} else {
# Non-docker run
# crunchstat is still prepended here, but without the -cgroup-parent and
# -cgroup-cid flags used in the docker branch.
- $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 ";
+ $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -poll=10000 ";
$command .= $stdbuf;
# The script path is built from CRUNCH_SRC and the job's "script" field.
$command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
}
# reapchildren: collect every child process that has already exited, using
# non-blocking waitpid (WNOHANG), and update the job bookkeeping for each one.
# In the '+' variant, UUIDs of tasks that succeeded are gathered during the
# loop; afterwards a single paginated job_tasks/list query fetches the tasks
# they created, and each result is appended to @jobstep and @jobstep_todo.
# Returns $children_reaped.
# NOTE(review): this excerpt elides parts of the loop body, including where
# $jobstepidx, $Jobstep, $task_success and $children_reaped are set.
sub reapchildren
{
my $children_reaped = 0;
- while ((my $pid = waitpid (-1, WNOHANG)) > 0)
+ my @successful_task_uuids = ();
+
+ while((my $pid = waitpid (-1, WNOHANG)) > 0)
{
# Save the exit status of the child just reaped.
my $childstatus = $?;
# Failure path: put the step back on the todo list and count the failure.
push @jobstep_todo, $jobstepidx;
$Job->{'tasks_summary'}->{'failed'}++;
}
- else
+ else # task_success
{
# Remember this task so its child tasks can be fetched after the loop.
+ push @successful_task_uuids, $Jobstep->{'arvados_task'}->{uuid};
++$thisround_succeeded;
# Reset the node's losing streak and hold time.
$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
$slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
# Return the slot to the free list and forget the finished process.
push @freeslot, $proc{$pid}->{slot};
delete $proc{$pid};
# Removed ('-') variant: one job_tasks/list lookup per successful task,
# performed inside the reaping loop.
- if ($task_success) {
- # Load new tasks
- my $newtask_list = [];
- my $newtask_results;
- do {
- $newtask_results = api_call(
- "job_tasks/list",
- 'where' => {
- 'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
- },
- 'order' => 'qsequence',
- 'offset' => scalar(@$newtask_list),
- );
- push(@$newtask_list, @{$newtask_results->{items}});
- } while (@{$newtask_results->{items}});
- foreach my $arvados_task (@$newtask_list) {
- my $jobstep = {
- 'level' => $arvados_task->{'sequence'},
- 'failures' => 0,
- 'arvados_task' => $arvados_task
- };
- push @jobstep, $jobstep;
- push @jobstep_todo, $#jobstep;
- }
- }
$progress_is_dirty = 1;
}
# Added ('+') variant: one batched lookup covering every task that succeeded
# during this call; skipped entirely when none succeeded.
+ if (scalar(@successful_task_uuids) > 0)
+ {
+ Log (undef, sprintf("%d tasks exited (%d succeeded), checking for new tasks from API server.", $children_reaped, scalar(@successful_task_uuids)));
+ # Load new tasks
+ my $newtask_list = [];
+ my $newtask_results;
# Page through the results: offset is the number of items fetched so far,
# and the loop stops once a page comes back with no items.
+ do {
+ $newtask_results = api_call(
+ "job_tasks/list",
+ 'filters' => [["created_by_job_task_uuid","in",\@successful_task_uuids]],
+ 'order' => 'qsequence',
+ 'offset' => scalar(@$newtask_list),
+ );
+ push(@$newtask_list, @{$newtask_results->{items}});
+ } while (@{$newtask_results->{items}});
+ Log (undef, sprintf("Got %d new tasks from API server.", scalar(@$newtask_list)));
# Wrap each returned task in a job step record (level taken from the task's
# 'sequence', zero failures) and queue its index for execution.
+ foreach my $arvados_task (@$newtask_list) {
+ my $jobstep = {
+ 'level' => $arvados_task->{'sequence'},
+ 'failures' => 0,
+ 'arvados_task' => $arvados_task
+ };
+ push @jobstep, $jobstep;
+ push @jobstep_todo, $#jobstep;
+ }
+ }
+
return $children_reaped;
}
# Mark the step as a temporary failure and ban the node behind its slot.
# NOTE(review): the condition guarding this branch is outside this excerpt.
$jobstep[$jobstepidx]->{tempfail} = 1;
ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
}
# A line naming a Keep error also marks the step as a temporary failure, but
# without banning the node. The '-' pattern matched any text containing
# "arvados.errors.Keep"; the '+' pattern requires KeepReadError:,
# KeepWriteError: or KeepRequestError: starting at a word boundary.
- elsif ($line =~ /arvados\.errors\.Keep/) {
+ elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
$jobstep[$jobstepidx]->{tempfail} = 1;
}
}