my $job_api_token;
my $no_clear_tmp;
my $resume_stash;
+my $docker_bin = "/usr/bin/docker.io";  # default docker executable; replaced when --docker-bin is given
GetOptions('force-unlock' => \$force_unlock,  # no "=s" suffix: a plain on/off flag
'git-dir=s' => \$git_dir,
'job=s' => \$jobspec,
'job-api-token=s' => \$job_api_token,
'no-clear-tmp' => \$no_clear_tmp,
'resume-stash=s' => \$resume_stash,
+ 'docker-bin=s' => \$docker_bin,  # "=s": the option takes a mandatory string value
);
if (defined $job_api_token) {
}
my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
-my $local_job = 0;
$SIG{'USR1'} = sub
$main::ENV{CRUNCH_DEBUG} = 0;
};
-
-
my $arv = Arvados->new('apiVersion' => 'v1');  # Arvados object created for API version v1
my $Job;  # job record; filled in from $jobspec below
my $sth;
my @jobstep;
-my $User = api_call("users/current");
-
+my $local_job;  # assigned 0 or 1 by exactly one branch of the if/else below
if ($jobspec =~ /^[-a-z\d]+$/)
{
# $jobspec is an Arvados UUID, not a JSON job specification
$Job = api_call("jobs/get", uuid => $jobspec);  # obtain the job via the jobs/get call, keyed by that UUID
+ $local_job = 0;
+}
+else
+{
+ $Job = JSON::decode_json($jobspec);  # $jobspec is itself the JSON text of the job
+ $local_job = 1;  # marks a job supplied inline rather than obtained through jobs/get
+}
+
+
+# Make sure our workers (our slurm nodes, localhost, or whatever) are
+# at least able to run basic commands: they aren't down or severely
+# misconfigured.
+my $cmd = ['true'];  # default probe command
+if ($Job->{docker_image_locator}) {  # the job names a Docker image, so probe the docker binary instead
+ $cmd = [$docker_bin, 'ps', '-q'];
+}
+Log(undef, "Sanity check is `@$cmd`");
+srun(["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],  # NOTE(review): SLURM_NNODES may be unset when not running under slurm -- confirm srun() copes
+ $cmd,
+ {fork => 1});  # NOTE(review): presumably runs the probe in a forked child so $? can be checked here -- confirm in srun()
+if ($? != 0) {  # non-zero status left in $? after the probe
+ Log(undef, "Sanity check failed: ".exit_status_s($?));
+ exit EX_TEMPFAIL;  # stop here with the EX_TEMPFAIL exit code
+}
+Log(undef, "Sanity check OK");
+
+
+my $User = api_call("users/current");
+
+if (!$local_job) {
if (!$force_unlock) {
# Claim this job, and make sure nobody else does
eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
}
else
{
- $Job = JSON::decode_json($jobspec);
-
if (!$resume_stash)
{
map { croak ("No $_ specified") unless $Job->{$_} }
}
# If this job requires a Docker image, install that.
-my $docker_bin = "/usr/bin/docker.io";
my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem);
if ($docker_locator = $Job->{docker_image_locator}) {
($docker_stream, $docker_hash) = find_docker_image($docker_locator);
Log ($jobstepid, sprintf('failure (#%d, %s) after %d seconds',
++$Jobstep->{'failures'},
- $temporary_fail ? 'temporary ' : 'permanent',
+ $temporary_fail ? 'temporary' : 'permanent',
$elapsed));
if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
my $show_cmd = Dumper($args);
$show_cmd =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
$show_cmd =~ s/\n/ /g;
- warn "starting: $show_cmd\n";
+ if ($opts->{fork}) {
+ Log(undef, "starting: $show_cmd");
+ } else {
+ # This is a child process: parent is in charge of reading our
+ # stderr and copying it to Log() if needed.
+ warn "starting: $show_cmd\n";
+ }
if (defined $stdin) {
my $child = open STDIN, "-|";