projects
/
arvados.git
/ blobdiff
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Merge branch '8400-additional-gitignore' of https://github.com/wtsi-hgi/arvados close...
[arvados.git] / sdk / cli / bin / crunch-job
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index baaf795ee927fe48db7000d7e1941af5291133d4..ae210a6f447e42d69ecd9302f414866bb4da6e23 100755 (executable)
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -415,11 +415,13 @@
if (!defined $no_clear_tmp) {
# If this job requires a Docker image, install that.
my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg);
if ($docker_locator = $Job->{docker_image_locator}) {
# If this job requires a Docker image, install that.
my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg);
if ($docker_locator = $Job->{docker_image_locator}) {
+ Log (undef, "Install docker image $docker_locator");
($docker_stream, $docker_hash) = find_docker_image($docker_locator);
if (!$docker_hash)
{
croak("No Docker image hash found from locator $docker_locator");
}
($docker_stream, $docker_hash) = find_docker_image($docker_locator);
if (!$docker_hash)
{
croak("No Docker image hash found from locator $docker_locator");
}
+ Log (undef, "docker image hash is $docker_hash");
$docker_stream =~ s/^\.//;
my $docker_install_script = qq{
if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
$docker_stream =~ s/^\.//;
my $docker_install_script = qq{
if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
@@ -430,7 +432,7 @@
fi
if ($docker_pid == 0)
{
srun (["srun", "--nodelist=" . join(',', @node)],
if ($docker_pid == 0)
{
srun (["srun", "--nodelist=" . join(',', @node)],
- ["/bin/sh", "-ec", $docker_install_script]);
+ ["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script]);
exit ($?);
}
while (1)
exit ($?);
}
while (1)
@@ -441,8 +443,8 @@
fi
}
if ($? != 0)
{
}
if ($? != 0)
{
- croak("Installing Docker image from $docker_locator exited "
- .exit_status_s($?));
+ Log(undef, "Installing Docker image from $docker_locator exited " . exit_status_s($?));
+ exit(EX_RETRY_UNLOCKED);
}
# Determine whether this version of Docker supports memory+swap limits.
}
# Determine whether this version of Docker supports memory+swap limits.
@@ -1057,12 +1059,14 @@
for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
check_refresh_wanted();
check_squeue();
update_progress_stats();
check_refresh_wanted();
check_squeue();
update_progress_stats();
- select (undef, undef, undef, 0.1);
}
elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty)
{
update_progress_stats();
}
}
elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty)
{
update_progress_stats();
}
+ if (!$gotsome) {
+ select (undef, undef, undef, 0.1);
+ }
$working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
$_->{node}->{hold_count} < 4 } @slot);
if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
$working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
$_->{node}->{hold_count} < 4 } @slot);
if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
@@ -1340,8 +1344,9 @@
sub check_squeue
# squeue check interval (15s) this should make the squeue check an
# infrequent event.
my $silent_procs = 0;
# squeue check interval (15s) this should make the squeue check an
# infrequent event.
my $silent_procs = 0;
- for my $jobstep (values %proc)
+ for my $procinfo (values %proc)
{
{
+ my $jobstep = $jobstep[$procinfo->{jobstep}];
if ($jobstep->{stderr_at} < $last_squeue_check)
{
$silent_procs++;
if ($jobstep->{stderr_at} < $last_squeue_check)
{
$silent_procs++;
@@ -1350,17 +1355,18 @@
sub check_squeue
return if $silent_procs == 0;
# use killem() on procs whose killtime is reached
return if $silent_procs == 0;
# use killem() on procs whose killtime is reached
- while (my ($pid, $jobstep) = each %proc)
+ while (my ($pid, $procinfo) = each %proc)
{
{
- if (exists $jobstep->{killtime}
- && $jobstep->{killtime} <= time
+ my $jobstep = $jobstep[$procinfo->{jobstep}];
+ if (exists $procinfo->{killtime}
+ && $procinfo->{killtime} <= time
&& $jobstep->{stderr_at} < $last_squeue_check)
{
my $sincewhen = "";
if ($jobstep->{stderr_at}) {
$sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
}
&& $jobstep->{stderr_at} < $last_squeue_check)
{
my $sincewhen = "";
if ($jobstep->{stderr_at}) {
$sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
}
- Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
+ Log($procinfo->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
killem ($pid);
}
}
killem ($pid);
}
}
@@ -1395,12 +1401,12 @@
sub check_squeue
}
# Check for child procs >60s old and not mentioned by squeue.
}
# Check for child procs >60s old and not mentioned by squeue.
- while (my ($pid, $jobstep) = each %proc)
+ while (my ($pid, $procinfo) = each %proc)
{
{
- if ($jobstep->{time} < time - 60
- && $jobstep->{jobstepname}
- && !exists $ok{$jobstep->{jobstepname}}
- && !exists $jobstep->{killtime})
+ if ($procinfo->{time} < time - 60
+ && $procinfo->{jobstepname}
+ && !exists $ok{$procinfo->{jobstepname}}
+ && !exists $procinfo->{killtime})
{
# According to slurm, this task has ended (successfully or not)
# -- but our srun child hasn't exited. First we must wait (30
{
# According to slurm, this task has ended (successfully or not)
# -- but our srun child hasn't exited. First we must wait (30
@@ -1409,8 +1415,8 @@
sub check_squeue
# terminated, we'll conclude some slurm communication
# error/delay has caused the task to die without notifying srun,
# and we'll kill srun ourselves.
# terminated, we'll conclude some slurm communication
# error/delay has caused the task to die without notifying srun,
# and we'll kill srun ourselves.
- $jobstep->{killtime} = time + 30;
- Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
+ $procinfo->{killtime} = time + 30;
+ Log($procinfo->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
}
}
}
}
}
}
@@ -1432,15 +1438,21 @@
sub readfrompipes
foreach my $job (keys %reader)
{
my $buf;
foreach my $job (keys %reader)
{
my $buf;
- while (0 < sysread ($reader{$job}, $buf, 8192))
+ if (0 < sysread ($reader{$job}, $buf, 65536))
{
print STDERR $buf if $ENV{CRUNCH_DEBUG};
$jobstep[$job]->{stderr_at} = time;
$jobstep[$job]->{stderr} .= $buf;
{
print STDERR $buf if $ENV{CRUNCH_DEBUG};
$jobstep[$job]->{stderr_at} = time;
$jobstep[$job]->{stderr} .= $buf;
+
+ # Consume everything up to the last \n
preprocess_stderr ($job);
preprocess_stderr ($job);
+
if (length ($jobstep[$job]->{stderr}) > 16384)
{
if (length ($jobstep[$job]->{stderr}) > 16384)
{
- substr ($jobstep[$job]->{stderr}, 0, 8192) = "";
+ # If we get a lot of stderr without a newline, chop off the
+ # front to avoid letting our buffer grow indefinitely.
+ substr ($jobstep[$job]->{stderr},
+ 0, length($jobstep[$job]->{stderr}) - 8192) = "";
}
$gotsome = 1;
}
}
$gotsome = 1;
}