2 # -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
6 crunch-job: Execute job steps, save snapshots as requested, collate output.
10 Obtain job details from Arvados, run tasks on compute nodes (typically
11 invoked by scheduler on controller):
13 crunch-job --job x-y-z
15 Obtain job details from command line, run tasks on local machine
16 (typically invoked by application or developer on VM):
18 crunch-job --job '{"script_version":"/path/to/tree","script":"scriptname",...}'
26 If the job is already locked, steal the lock and run it anyway.
30 Path to .git directory where the specified commit is found.
34 Arvados API authorization token to use during the course of the job.
38 Do not clear per-job/task temporary directories during initial job
39 setup. This can speed up development and debugging when running jobs
44 =head1 RUNNING JOBS LOCALLY
46 crunch-job's log messages appear on stderr along with the job tasks'
47 stderr streams. The log is saved in Keep at each checkpoint and when
50 If the job succeeds, the job's output locator is printed on stdout.
52 While the job is running, the following signals are accepted:
56 =item control-C, SIGINT, SIGQUIT
58 Save a checkpoint, terminate any job tasks that are running, and stop.
62 Save a checkpoint and continue.
66 Refresh node allocation (i.e., check whether any nodes have been added
67 or unallocated) and attributes of the Job record that should affect
68 behavior (e.g., cancel job if cancelled_at becomes non-nil).
76 use POSIX ':sys_wait_h';
77 use POSIX qw(strftime);
78 use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK);
80 use Digest::MD5 qw(md5_hex);
86 use File::Path qw( make_path remove_tree );
88 use constant EX_TEMPFAIL => 75;
# Establish the per-job temporary/work directory layout via environment
# variables, defaulting TMPDIR to /tmp.
# NOTE(review): this chunk appears sampled -- the embedded line numbers jump,
# so closing braces and some statements are not visible here.
90 $ENV{"TMPDIR"} ||= "/tmp";
91 unless (defined $ENV{"CRUNCH_TMP"}) {
92 $ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
# Non-crunch, non-root users get a uid-suffixed dir so concurrent users
# on the same host don't collide ($< is the real uid).
93 if ($ENV{"USER"} ne "crunch" && $< != 0) {
94 # use a tmp dir unique for my uid
95 $ENV{"CRUNCH_TMP"} .= "-$<";
99 # Create the tmp directory if it does not exist
100 if ( ! -d $ENV{"CRUNCH_TMP"} ) {
101 make_path $ENV{"CRUNCH_TMP"} or die "Failed to create temporary working directory: " . $ENV{"CRUNCH_TMP"};
# JOB_WORK is the per-job scratch area; CRUNCH_INSTALL receives the
# installed source tree; CRUNCH_WORK is kept only for old consumers.
104 $ENV{"JOB_WORK"} = $ENV{"CRUNCH_TMP"} . "/work";
105 $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
106 $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
# mkdir result intentionally unchecked here (dir may already exist).
107 mkdir ($ENV{"JOB_WORK"});
# Parse command-line options (see POD above for their meanings).
115 GetOptions('force-unlock' => \$force_unlock,
116 'git-dir=s' => \$git_dir,
117 'job=s' => \$jobspec,
118 'job-api-token=s' => \$job_api_token,
119 'no-clear-tmp' => \$no_clear_tmp,
120 'resume-stash=s' => \$resume_stash,
# A token given on the command line overrides the inherited environment
# for all subsequent Arvados API calls.
123 if (defined $job_api_token) {
124 $ENV{ARVADOS_API_TOKEN} = $job_api_token;
# SLURM mode is detected purely from the environment the scheduler sets.
127 my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
# CRUNCH_DEBUG is normalized to an explicit 0/1 (the selecting condition
# is not visible in this sampled view).
133 $main::ENV{CRUNCH_DEBUG} = 1;
137 $main::ENV{CRUNCH_DEBUG} = 0;
# Connect to the Arvados API and either (a) look up and lock an existing
# job by UUID, or (b) create a new job record from an inline JSON spec.
142 my $arv = Arvados->new('apiVersion' => 'v1');
150 my $User = retry_op(sub { $arv->{'users'}->{'current'}->execute; });
# A bare lowercase-hex/dash token is treated as a job UUID; anything else
# is parsed below as a JSON job specification.
152 if ($jobspec =~ /^[-a-z\d]+$/)
154 # $jobspec is an Arvados UUID, not a JSON job specification
155 $Job = retry_op(sub {
156 $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
158 if (!$force_unlock) {
159 # Claim this job, and make sure nobody else does
160 eval { retry_op(sub {
161 # lock() sets is_locked_by_uuid and changes state to Running.
162 $arv->{'jobs'}->{'lock'}->execute('uuid' => $Job->{'uuid'})
# EX_TEMPFAIL (75) tells the dispatcher this is retryable, not fatal.
165 Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
172 $Job = JSON::decode_json($jobspec);
# Inline specs must carry the minimum fields to be runnable.
176 map { croak ("No $_ specified") unless $Job->{$_} }
177 qw(script script_version script_parameters);
# For a locally-supplied spec we claim it ourselves and create the record.
180 $Job->{'is_locked_by_uuid'} = $User->{'uuid'};
181 $Job->{'started_at'} = gmtime;
182 $Job->{'state'} = 'Running';
184 $Job = retry_op(sub { $arv->{'jobs'}->{'create'}->execute('job' => $Job); });
186 $job_id = $Job->{'uuid'};
# Job log goes to Keep under <uuid>.log.txt via the log-writer pipe below.
188 my $keep_logfile = $job_id . '.log.txt';
189 log_writer_start($keep_logfile);
# max_tasks_per_node == 0 means "no per-node concurrency cap".
191 $Job->{'runtime_constraints'} ||= {};
192 $Job->{'runtime_constraints'}->{'max_tasks_per_node'} ||= 0;
193 my $max_ncpus = $Job->{'runtime_constraints'}->{'max_tasks_per_node'};
# Discover the node allocation (from SLURM via sinfo, or localhost when
# running standalone), build the @node/@slot tables, install signal
# handlers, and create the level-0 first task.
196 Log (undef, "check slurm allocation");
199 # Should use $ENV{SLURM_TASKS_PER_NODE} instead of sinfo? (eg. "4(x3),2,4(x2)")
# Standalone fallback: count local CPUs from /proc/cpuinfo (min 1).
203 my $localcpus = 0 + `grep -cw ^processor /proc/cpuinfo` || 1;
204 push @sinfo, "$localcpus localhost";
206 if (exists $ENV{SLURM_NODELIST})
# Each sinfo line is "<cpus> <nodelist>"; \Q...\E quotes the env value.
208 push @sinfo, `sinfo -h --format='%c %N' --nodes=\Q$ENV{SLURM_NODELIST}\E`;
212 my ($ncpus, $slurm_nodelist) = split;
# Cap per-node task slots at the job's max_tasks_per_node, if set.
213 $ncpus = $max_ncpus if $max_ncpus && $ncpus > $max_ncpus;
# Expand SLURM compressed nodelists like "node[1-3,5],other" into
# individual hostnames. The bracket ranges are expanded via the map below.
216 while ($slurm_nodelist =~ s/^([^\[,]+?(\[.*?\])?)(,|$)//)
219 if ($nodelist =~ /\[((\d+)(-(\d+))?(,(\d+)(-(\d+))?)*)\]/)
222 foreach (split (",", $ranges))
235 push @nodelist, map {
237 $n =~ s/\[[-,\d]+\]/$_/;
244 push @nodelist, $nodelist;
# One slot per (node, cpu) pair; @slot drives task scheduling below.
247 foreach my $nodename (@nodelist)
249 Log (undef, "node $nodename - $ncpus slots");
250 my $node = { name => $nodename,
254 foreach my $cpu (1..$ncpus)
256 push @slot, { node => $node,
260 push @node, @nodelist;
265 # Ensure that we get one jobstep running on each allocated node before
266 # we start overloading nodes with concurrent steps
268 @slot = sort { $a->{cpu} <=> $b->{cpu} } @slot;
271 $Job->update_attributes(
272 'tasks_summary' => { 'failed' => 0,
277 Log (undef, "start");
# Signal policy: INT/QUIT/TSTP request a checkpoint-and-stop (freeze),
# ALRM requests a status dump, CONT resumes, HUP triggers a refresh of
# node allocation and job attributes; TERM aborts via croak.
278 $SIG{'INT'} = sub { $main::please_freeze = 1; };
279 $SIG{'QUIT'} = sub { $main::please_freeze = 1; };
280 $SIG{'TERM'} = \&croak;
281 $SIG{'TSTP'} = sub { $main::please_freeze = 1; };
282 $SIG{'ALRM'} = sub { $main::please_info = 1; };
283 $SIG{'CONT'} = sub { $main::please_continue = 1; };
284 $SIG{'HUP'} = sub { $main::please_refresh = 1; };
286 $main::please_freeze = 0;
287 $main::please_info = 0;
288 $main::please_continue = 0;
289 $main::please_refresh = 0;
290 my $jobsteps_must_output_keys = 0; # becomes 1 when any task outputs a key
# NOCACHE* knobs from the job record are promoted into the environment.
292 grep { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ } split ("\n", $$Job{knobs});
293 $ENV{"CRUNCH_JOB_UUID"} = $job_id;
294 $ENV{"JOB_UUID"} = $job_id;
# Scheduling state: todo/done queues, merge bookkeeping, squeue timer.
297 my @jobstep_todo = ();
298 my @jobstep_done = ();
299 my @jobstep_tomerge = ();
300 my $jobstep_tomerge_level = 0;
302 my $squeue_kill_checked;
303 my $output_in_keep = 0;
304 my $latest_refresh = scalar time;
# Resume from a frozen checkpoint if the job record points at one.
308 if (defined $Job->{thawedfromkey})
310 thaw ($Job->{thawedfromkey});
# Otherwise seed the queue with a single level-0 task.
314 my $first_task = retry_op(sub {
315 $arv->{'job_tasks'}->{'create'}->execute('job_task' => {
316 'job_uuid' => $Job->{'uuid'},
322 push @jobstep, { 'level' => 0,
324 'arvados_task' => $first_task,
326 push @jobstep_todo, 0;
# Take the host-level lock, then (unless --no-clear-tmp) wipe stale
# per-job work/install/src dirs on every allocated node.
332 must_lock_now("$ENV{CRUNCH_TMP}/.lock", "a job is already running here.");
# The install script for workers is read from this script's DATA section.
339 $build_script = <DATA>;
341 my $nodelist = join(",", @node);
343 if (!defined $no_clear_tmp) {
344 # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
345 Log (undef, "Clean work dirs");
# Run the cleanup in a forked child so we can keep servicing freeze
# requests while waiting; keep mounts are fusermount'ed before rm -rf.
347 my $cleanpid = fork();
350 srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
351 ['bash', '-c', 'if mount | grep -q $JOB_WORK/; then for i in $JOB_WORK/*keep; do /bin/fusermount -z -u $i; done; fi; sleep 1; rm -rf $JOB_WORK $CRUNCH_TMP/opt $CRUNCH_TMP/src*']);
# Poll for the child while honoring checkpoint requests.
356 last if $cleanpid == waitpid (-1, WNOHANG);
357 freeze_if_want_freeze ($cleanpid);
358 select (undef, undef, undef, 0.1);
360 Log (undef, "Cleanup command exited ".exit_status_s($?));
# Resolve the job's script_version to a concrete git commit and install
# the source tree on all workers. Three repository forms are handled:
# a remote URL, a local path (./ or ../), or a named hosted repository
# looked up via the API.
365 if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
366 # If we're in user-land (i.e., not called from crunch-dispatch)
367 # script_version can be an absolute directory path, signifying we
368 # should work straight out of that directory instead of using a git
370 $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
371 $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
374 $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
376 # Install requested code version
377 Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
379 $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
381 # If we're running under crunch-dispatch, it will have already
382 # pulled the appropriate source tree into its own repository, and
383 # given us that repo's path as $git_dir.
385 # If we're running a "local" job, we might have to fetch content
386 # from a remote repository.
388 # (Currently crunch-dispatch gives a local path with --git-dir, but
389 # we might as well accept URLs there too in case it changes its
391 my $repo = $git_dir || $Job->{'repository'};
393 # Repository can be remote or local. If remote, we'll need to fetch it
394 # to a local dir before doing `git log` et al.
397 if ($repo =~ m{://|^[^/]*:}) {
398 # $repo is a git url we can clone, like git:// or https:// or
399 # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
400 # not recognized here because distinguishing that from a local
401 # path is too fragile. If you really need something strange here,
402 # use the ssh:// form.
403 $repo_location = 'remote';
404 } elsif ($repo =~ m{^\.*/}) {
405 # $repo is a local path to a git index. We'll also resolve ../foo
406 # to ../foo/.git if the latter is a directory. To help
407 # disambiguate local paths from named hosted repositories, this
408 # form must be given as ./ or ../ if it's a relative path.
409 if (-d "$repo/.git") {
410 $repo = "$repo/.git";
412 $repo_location = 'local';
414 # $repo is none of the above. It must be the name of a hosted
# Named repository: exactly one API match is required; its fetch_url
# is then treated like any other remote.
416 my $arv_repo_list = retry_op(sub {
417 $arv->{'repositories'}->{'list'}->execute(
418 'filters' => [['name','=',$repo]])->{'items'};
420 my $n_found = scalar @{$arv_repo_list};
422 Log(undef, "Repository '$repo' -> "
423 . join(", ", map { $_->{'uuid'} } @{$arv_repo_list}));
426 croak("Error: Found $n_found repositories with name '$repo'.");
428 $repo = $arv_repo_list->[0]->{'fetch_url'};
429 $repo_location = 'remote';
431 Log(undef, "Using $repo_location repository '$repo'");
432 $ENV{"CRUNCH_SRC_URL"} = $repo;
434 # Resolve given script_version (we'll call that $treeish here) to a
435 # commit sha1 ($commit).
436 my $treeish = $Job->{'script_version'};
438 if ($repo_location eq 'remote') {
439 # We minimize excess object-fetching by re-using the same bare
440 # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
441 # just keep adding remotes to it as needed.
442 my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
443 my $gitcmd = "git --git-dir=\Q$local_repo\E";
445 # Set up our local repo for caching remote objects, making
447 if (!-d $local_repo) {
448 make_path($local_repo) or croak("Error: could not create $local_repo");
450 # This works (exits 0 and doesn't delete fetched objects) even
451 # if $local_repo is already initialized:
452 `$gitcmd init --bare`;
454 croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
457 # If $treeish looks like a hash (or abbrev hash) we look it up in
458 # our local cache first, since that's cheaper. (We don't want to
459 # do that with tags/branches though -- those change over time, so
460 # they should always be resolved by the remote repo.)
461 if ($treeish =~ /^[0-9a-f]{3,40}$/s) {
462 # Hide stderr because it's normal for this to fail:
463 my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
# The prefix check guards against an abbrev hash resolving to some
# unrelated ref name instead of the intended commit.
465 $sha1 =~ /^$treeish/ && # Don't use commit 123 @ branch abc!
466 $sha1 =~ /^([0-9a-f]{40})$/s) {
468 Log(undef, "Commit $commit already present in $local_repo")
472 if (!defined $commit) {
473 # If $treeish isn't just a hash or abbrev hash, or isn't here
474 # yet, we need to fetch the remote to resolve it correctly.
476 # First, remove all local heads. This prevents a name that does
477 # not exist on the remote from resolving to (or colliding with)
478 # a previously fetched branch or tag (possibly from a different
480 remove_tree("$local_repo/refs/heads", {keep_root => 1});
482 Log(undef, "Fetching objects from $repo to $local_repo");
483 `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
485 croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
489 # Now that the data is all here, we will use our local repo for
490 # the rest of our git activities.
494 my $gitcmd = "git --git-dir=\Q$repo\E";
495 my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
496 unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
497 croak("`$gitcmd rev-list` exited "
499 .", '$treeish' not found. Giving up.");
502 Log(undef, "Version $treeish is commit $commit");
504 if ($commit ne $Job->{'script_version'}) {
505 # Record the real commit id in the database, frozentokey, logs,
506 # etc. -- instead of an abbreviation or a branch name which can
507 # become ambiguous or point to a different commit in the future.
508 if (!$Job->update_attributes('script_version' => $commit)) {
509 croak("Error: failed to update job's script_version attribute");
# Ship the resolved tree to workers as a tar stream from git archive.
513 $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
514 $git_archive = `$gitcmd archive ''\Q$commit\E`;
516 croak("Error: $gitcmd archive exited ".exit_status_s($?));
520 if (!defined $git_archive) {
521 Log(undef, "Skip install phase (no git archive)");
523 Log(undef, "Warning: This probably means workers have no source tree!");
527 Log(undef, "Run install script on all workers");
529 my @srunargs = ("srun",
530 "--nodelist=$nodelist",
531 "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
# Workers receive the DATA build script followed by the tar archive
# on stdin and unpack/install under CRUNCH_INSTALL.
532 my @execargs = ("sh", "-c",
533 "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
535 # Note: this section is almost certainly unnecessary if we're
536 # running tasks in docker containers.
537 my $installpid = fork();
538 if ($installpid == 0)
540 srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
# Poll the installer child while honoring checkpoint requests.
545 last if $installpid == waitpid (-1, WNOHANG);
546 freeze_if_want_freeze ($installpid);
547 select (undef, undef, undef, 0.1);
549 Log (undef, "Install script exited ".exit_status_s($?));
554 # Grab our lock again (we might have deleted and re-created CRUNCH_TMP above)
555 must_lock_now("$ENV{CRUNCH_TMP}/.lock", "a job is already running here.");
558 # If this job requires a Docker image, install that.
559 my $docker_bin = "/usr/bin/docker.io";
560 my ($docker_locator, $docker_stream, $docker_hash);
561 if ($docker_locator = $Job->{docker_image_locator}) {
562 ($docker_stream, $docker_hash) = find_docker_image($docker_locator);
565 croak("No Docker image hash found from locator $docker_locator");
# Strip the leading "." from the manifest stream name to form a path.
567 $docker_stream =~ s/^\.//;
# Idempotent install: only arv-get|docker-load when the image hash is
# not already present locally.
568 my $docker_install_script = qq{
569 if ! $docker_bin images -q --no-trunc | grep -qxF \Q$docker_hash\E; then
570 arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
# Fork + srun so the image install runs on every node while we keep
# servicing checkpoint requests.
573 my $docker_pid = fork();
574 if ($docker_pid == 0)
576 srun (["srun", "--nodelist=" . join(',', @node)],
577 ["/bin/sh", "-ec", $docker_install_script]);
582 last if $docker_pid == waitpid (-1, WNOHANG);
583 freeze_if_want_freeze ($docker_pid);
584 select (undef, undef, undef, 0.1);
588 croak("Installing Docker image from $docker_locator exited "
# Log job metadata, then run the main per-level scheduling loop: pick the
# lowest-level pending tasks, fork one child per task, and have each
# child exec its work via srun (optionally wrapped in crunchstat,
# arv-mount, and a Docker container).
# Dump job fields and knobs into the log for reproducibility.
593 foreach (qw (script script_version script_parameters runtime_constraints))
597 (ref($Job->{$_}) ? JSON::encode_json($Job->{$_}) : $Job->{$_}));
599 foreach (split (/\n/, $Job->{knobs}))
601 Log (undef, "knob " . $_);
606 $main::success = undef;
# Per-round failure/success counters drive the give-up heuristics below.
612 my $thisround_succeeded = 0;
613 my $thisround_failed = 0;
614 my $thisround_failed_multiple = 0;
# Run tasks level by level: sort by level, then by queue order.
616 @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
617 or $a <=> $b } @jobstep_todo;
618 my $level = $jobstep[$jobstep_todo[0]]->{level};
619 Log (undef, "start level $level");
624 my @freeslot = (0..$#slot);
627 my $progress_is_dirty = 1;
628 my $progress_stats_updated = 0;
630 update_progress_stats();
635 for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
637 my $id = $jobstep_todo[$todo_ptr];
638 my $Jobstep = $jobstep[$id];
# Only launch tasks belonging to the current level this round.
639 if ($Jobstep->{level} != $level)
# Non-blocking pipe to capture the child's stdout/stderr.
644 pipe $reader{$id}, "writer" or croak ($!);
645 my $flags = fcntl ($reader{$id}, F_GETFL, 0) or croak ($!);
646 fcntl ($reader{$id}, F_SETFL, $flags | O_NONBLOCK) or croak ($!);
648 my $childslot = $freeslot[0];
649 my $childnode = $slot[$childslot]->{node};
650 my $childslotname = join (".",
651 $slot[$childslot]->{node}->{name},
652 $slot[$childslot]->{cpu});
653 my $childpid = fork();
# ---- child process: reset signals, redirect output into the pipe,
# build the task environment, and exec srun ----
656 $SIG{'INT'} = 'DEFAULT';
657 $SIG{'QUIT'} = 'DEFAULT';
658 $SIG{'TERM'} = 'DEFAULT';
660 foreach (values (%reader))
664 fcntl ("writer", F_SETFL, 0) or croak ($!); # no close-on-exec
665 open(STDOUT,">&writer");
666 open(STDERR,">&writer");
671 delete $ENV{"GNUPGHOME"};
# TASK_* environment describes this task's identity and placement.
672 $ENV{"TASK_UUID"} = $Jobstep->{'arvados_task'}->{'uuid'};
673 $ENV{"TASK_QSEQUENCE"} = $id;
674 $ENV{"TASK_SEQUENCE"} = $level;
675 $ENV{"JOB_SCRIPT"} = $Job->{script};
# Script parameters become JOB_PARAMETER_<NAME> env vars.
676 while (my ($param, $value) = each %{$Job->{script_parameters}}) {
677 $param =~ tr/a-z/A-Z/;
678 $ENV{"JOB_PARAMETER_$param"} = $value;
680 $ENV{"TASK_SLOT_NODE"} = $slot[$childslot]->{node}->{name};
681 $ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
682 $ENV{"TASK_WORK"} = $ENV{"JOB_WORK"}."/$id.$$";
683 $ENV{"HOME"} = $ENV{"TASK_WORK"};
684 $ENV{"TASK_KEEPMOUNT"} = $ENV{"TASK_WORK"}.".keep";
685 $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
686 $ENV{"CRUNCH_NODE_SLOTS"} = $slot[$childslot]->{node}->{ncpus};
687 $ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
# srun pins the task to its assigned node, 1 task / 1 cpu.
693 "--nodelist=".$childnode->{name},
694 qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'},
695 "--job-name=$job_id.$id.$$",
697 my $build_script_to_send = "";
# The remote command: recreate task work dirs, then (optionally)
# arv-mount + crunchstat + docker-run the job script.
699 "if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; "
700 ."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} "
701 ."&& cd $ENV{CRUNCH_TMP} ";
704 $build_script_to_send = $build_script;
708 $command .= "&& exec arv-mount --by-id --allow-other $ENV{TASK_KEEPMOUNT} --exec ";
711 $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$ENV{TASK_WORK}/docker.cid -poll=10000 ";
712 $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --user=crunch --cidfile=$ENV{TASK_WORK}/docker.cid ";
713 # Dynamically configure the container to use the host system as its
714 # DNS server. Get the host's global addresses from the ip command,
715 # and turn them into docker --dns options using gawk.
717 q{$(ip -o address show scope global |
718 gawk 'match($4, /^([0-9\.:]+)\//, x){print "--dns", x[1]}') };
719 $command .= "--volume=\Q$ENV{CRUNCH_SRC}:/tmp/crunch-src:ro\E ";
720 $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
721 $command .= "--env=\QHOME=/home/crunch\E ";
# Forward ARVADOS_/JOB_/TASK_ env vars into the container, rewriting
# the two whose values are host paths (TASK_WORK, TASK_KEEPMOUNT).
722 while (my ($env_key, $env_val) = each %ENV)
724 if ($env_key =~ /^(ARVADOS|JOB|TASK)_/) {
725 if ($env_key eq "TASK_WORK") {
726 $command .= "--env=\QTASK_WORK=/tmp/crunch-job\E ";
728 elsif ($env_key eq "TASK_KEEPMOUNT") {
729 $command .= "--env=\QTASK_KEEPMOUNT=/keep\E ";
732 $command .= "--env=\Q$env_key=$env_val\E ";
736 $command .= "--env=\QCRUNCH_NODE_SLOTS=$ENV{CRUNCH_NODE_SLOTS}\E ";
737 $command .= "--env=\QCRUNCH_SRC=/tmp/crunch-src\E ";
738 $command .= "\Q$docker_hash\E ";
# stdbuf disables output buffering so log lines arrive promptly.
739 $command .= "stdbuf --output=0 --error=0 ";
740 $command .= "/tmp/crunch-src/crunch_scripts/" . $Job->{"script"};
# Non-docker path: crunchstat + the installed script directly.
743 $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 ";
744 $command .= "stdbuf --output=0 --error=0 ";
745 $command .= "$ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
748 my @execargs = ('bash', '-c', $command);
749 srun (\@srunargs, \@execargs, undef, $build_script_to_send);
750 # exec() failed, we assume nothing happened.
751 Log(undef, "srun() failed on build script");
# ---- parent process: record the child and mark the slot busy ----
755 if (!defined $childpid)
762 $proc{$childpid} = { jobstep => $id,
765 jobstepname => "$job_id.$id.$childpid",
767 croak ("assert failed: \$slot[$childslot]->{'pid'} exists") if exists $slot[$childslot]->{pid};
768 $slot[$childslot]->{pid} = $childpid;
770 Log ($id, "job_task ".$Jobstep->{'arvados_task'}->{'uuid'});
771 Log ($id, "child $childpid started on $childslotname");
772 $Jobstep->{starttime} = time;
773 $Jobstep->{node} = $childnode->{name};
774 $Jobstep->{slotindex} = $childslot;
775 delete $Jobstep->{stderr};
776 delete $Jobstep->{finishtime};
778 $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
779 $Jobstep->{'arvados_task'}->save;
# Remove the launched task from the todo queue.
781 splice @jobstep_todo, $todo_ptr, 1;
784 $progress_is_dirty = 1;
# Wait for running children, apply failure-rate heuristics, manage
# held/banned slots, then (once all levels are done) collate the job's
# output into a collection and set the final job state.
788 (@slot > @freeslot && $todo_ptr+1 > $#jobstep_todo))
790 last THISROUND if $main::please_freeze;
# ALRM status dump: checkpoint and refresh progress stats.
791 if ($main::please_info)
793 $main::please_info = 0;
797 update_progress_stats();
804 check_refresh_wanted();
806 update_progress_stats();
807 select (undef, undef, undef, 0.1);
# Refresh stats at least every 30s even when nothing exits.
809 elsif (time - $progress_stats_updated >= 30)
811 update_progress_stats();
# Give up on the round if repeated failures dominate successes.
813 if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
814 ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
816 my $message = "Repeated failure rate too high ($thisround_failed_multiple/"
817 .($thisround_failed+$thisround_succeeded)
818 .") -- giving up on this round";
819 Log (undef, $message);
823 # move slots from freeslot to holdslot (or back to freeslot) if necessary
824 for (my $i=$#freeslot; $i>=0; $i--) {
825 if ($slot[$freeslot[$i]]->{node}->{hold_until} > scalar time) {
826 push @holdslot, (splice @freeslot, $i, 1);
829 for (my $i=$#holdslot; $i>=0; $i--) {
830 if ($slot[$holdslot[$i]]->{node}->{hold_until} <= scalar time) {
831 push @freeslot, (splice @holdslot, $i, 1);
835 # give up if no nodes are succeeding
836 if (!grep { $_->{node}->{losing_streak} == 0 &&
837 $_->{node}->{hold_count} < 4 } @slot) {
838 my $message = "Every node has failed -- giving up on this round";
839 Log (undef, $message);
# Round over: release holds and reset losing streaks for the next round.
846 push @freeslot, splice @holdslot;
847 map { $slot[$freeslot[$_]]->{node}->{losing_streak} = 0 } (0..$#freeslot);
850 Log (undef, "wait for last ".(scalar keys %proc)." children to finish");
853 if ($main::please_continue) {
854 $main::please_continue = 0;
857 $main::please_info = 0, freeze(), collate_output(), save_meta(1) if $main::please_info;
861 check_refresh_wanted();
863 update_progress_stats();
864 select (undef, undef, undef, 0.1);
865 killem (keys %proc) if $main::please_freeze;
869 update_progress_stats();
870 freeze_if_want_freeze();
# Decide whether to retry another round or stop: all-failure rounds
# (or >4 failures with no successes) end the job.
873 if (!defined $main::success)
876 $thisround_succeeded == 0 &&
877 ($thisround_failed == 0 || $thisround_failed > 4))
879 my $message = "stop because $thisround_failed tasks failed and none succeeded";
880 Log (undef, $message);
889 goto ONELEVEL if !defined $main::success;
892 release_allocation();
894 my $collated_output = &collate_output();
896 if (!$collated_output) {
897 Log(undef, "output undef");
# Re-read the collated manifest and register it as a collection so the
# job's output field holds a portable data hash.
901 open(my $orig_manifest, '-|', 'arv-get', $collated_output)
902 or die "failed to get collated manifest: $!";
903 my $orig_manifest_text = '';
904 while (my $manifest_line = <$orig_manifest>) {
905 $orig_manifest_text .= $manifest_line;
907 my $output = retry_op(sub {
908 $arv->{'collections'}->{'create'}->execute(
909 'collection' => {'manifest_text' => $orig_manifest_text});
911 Log(undef, "output uuid " . $output->{uuid});
912 Log(undef, "output hash " . $output->{portable_data_hash});
913 $Job->update_attributes('output' => $output->{portable_data_hash});
916 Log (undef, "Failed to register output manifest: $@");
920 Log (undef, "finish");
# Final state: Complete only when we have both output and success.
925 if ($collated_output && $main::success) {
926 $final_state = 'Complete';
928 $final_state = 'Failed';
930 $Job->update_attributes('state' => $final_state);
932 exit (($final_state eq 'Complete') ? 0 : 1);
# Push todo/done/running counts into the job's tasks_summary attribute
# and log a one-line status. No-op unless progress is dirty.
936 sub update_progress_stats
938 $progress_stats_updated = time;
939 return if !$progress_is_dirty;
# running = total slots minus free minus held.
940 my ($todo, $done, $running) = (scalar @jobstep_todo,
941 scalar @jobstep_done,
942 scalar @slot - scalar @freeslot - scalar @holdslot);
943 $Job->{'tasks_summary'} ||= {};
944 $Job->{'tasks_summary'}->{'todo'} = $todo;
945 $Job->{'tasks_summary'}->{'done'} = $done;
946 $Job->{'tasks_summary'}->{'running'} = $running;
947 $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
948 Log (undef, "status: $done done, $running running, $todo todo");
949 $progress_is_dirty = 0;
# Child-reaper body (the enclosing sub's header is not visible in this
# sampled view; presumably the reapchildren handler). Reaps one exited
# task child, classifies the result, updates node/slot health, and
# queues retries or successor tasks.
956 my $pid = waitpid (-1, WNOHANG);
957 return 0 if $pid <= 0;
959 my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
961 . $slot[$proc{$pid}->{slot}]->{cpu});
962 my $jobstepid = $proc{$pid}->{jobstep};
963 my $elapsed = time - $proc{$pid}->{time};
964 my $Jobstep = $jobstep[$jobstepid];
966 my $childstatus = $?;
967 my $exitvalue = $childstatus >> 8;
968 my $exitinfo = "exit ".exit_status_s($childstatus);
# Reload the task record to pick up the success flag the task set.
969 $Jobstep->{'arvados_task'}->reload;
970 my $task_success = $Jobstep->{'arvados_task'}->{success};
972 Log ($jobstepid, "child $pid on $whatslot $exitinfo success=$task_success");
974 if (!defined $task_success) {
975 # task did not indicate one way or the other --> fail
976 $Jobstep->{'arvados_task'}->{success} = 0;
977 $Jobstep->{'arvados_task'}->save;
# Exit code 111 and node failures are treated as temporary (retryable).
984 $temporary_fail ||= $Jobstep->{node_fail};
985 $temporary_fail ||= ($exitvalue == 111);
988 ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;
990 # Check for signs of a failed or misconfigured node
991 if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
992 2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
993 # Don't count this against jobstep failure thresholds if this
994 # node is already suspected faulty and srun exited quickly
995 if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
997 Log ($jobstepid, "blaming failure on suspect node " .
998 $slot[$proc{$pid}->{slot}]->{node}->{name});
999 $temporary_fail ||= 1;
1001 ban_node_by_slot($proc{$pid}->{slot});
1004 Log ($jobstepid, sprintf('failure (#%d, %s) after %d seconds',
1005 ++$Jobstep->{'failures'},
1006 $temporary_fail ? 'temporary ' : 'permanent',
# Permanent failure, or 3+ failures, aborts the whole job.
1009 if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
1010 # Give up on this task, and the whole job
1012 $main::please_freeze = 1;
1014 # Put this task back on the todo queue
1015 push @jobstep_todo, $jobstepid;
1016 $Job->{'tasks_summary'}->{'failed'}++;
# Success path: reset node health counters, record completion.
1020 ++$thisround_succeeded;
1021 $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
1022 $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
1023 push @jobstep_done, $jobstepid;
1024 Log ($jobstepid, "success in $elapsed seconds");
1026 $Jobstep->{exitcode} = $childstatus;
1027 $Jobstep->{finishtime} = time;
1028 $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
1029 $Jobstep->{'arvados_task'}->save;
1030 process_stderr ($jobstepid, $task_success);
1031 Log ($jobstepid, "output " . $Jobstep->{'arvados_task'}->{output});
# Release the pipe and slot for reuse.
1033 close $reader{$jobstepid};
1034 delete $reader{$jobstepid};
1035 delete $slot[$proc{$pid}->{slot}]->{pid};
1036 push @freeslot, $proc{$pid}->{slot};
# On success, fetch any tasks the finished task created (paged list)
# and append them to the jobstep queue.
1039 if ($task_success) {
1041 my $newtask_list = [];
1042 my $newtask_results;
1044 $newtask_results = retry_op(sub {
1045 $arv->{'job_tasks'}->{'list'}->execute(
1047 'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
1049 'order' => 'qsequence',
1050 'offset' => scalar(@$newtask_list),
1053 push(@$newtask_list, @{$newtask_results->{items}});
1054 } while (@{$newtask_results->{items}});
1055 foreach my $arvados_task (@$newtask_list) {
1057 'level' => $arvados_task->{'sequence'},
1059 'arvados_task' => $arvados_task
1061 push @jobstep, $jobstep;
1062 push @jobstep_todo, $#jobstep;
1066 $progress_is_dirty = 1;
# If the CRUNCH_REFRESH_TRIGGER file has been touched since our last
# refresh, re-fetch the job record and honor cancellation / unexpected
# state changes by requesting a freeze.
1070 sub check_refresh_wanted
# stat[9] is mtime; compare against the last refresh timestamp.
1072 my @stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
1073 if (@stat && $stat[9] > $latest_refresh) {
1074 $latest_refresh = scalar time;
1075 my $Job2 = retry_op(sub {
1076 $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
# Copy only the cancellation-related attributes into our cached $Job.
1078 for my $attr ('cancelled_at',
1079 'cancelled_by_user_uuid',
1080 'cancelled_by_client_uuid',
1082 $Job->{$attr} = $Job2->{$attr};
1084 if ($Job->{'state'} ne "Running") {
1085 if ($Job->{'state'} eq "Cancelled") {
1086 Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
1088 Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
1091 $main::please_freeze = 1;
# Squeue/kill-list checker body (the sub header is not visible in this
# sampled view; presumably check_squeue). Kills procs whose deadline has
# passed, and schedules kills for children that squeue no longer reports.
1098 # return if the kill list was checked <4 seconds ago
1099 if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
1103 $squeue_kill_checked = time;
1105 # use killem() on procs whose killtime is reached
1108 if (exists $proc{$_}->{killtime}
1109 && $proc{$_}->{killtime} <= time)
1115 # return if the squeue was checked <60 seconds ago
1116 if (defined $squeue_checked && $squeue_checked > time - 60)
1120 $squeue_checked = time;
1124 # here is an opportunity to check for mysterious problems with local procs
1128 # get a list of steps still running
# "echo ok" sentinel distinguishes an empty queue from a failed squeue.
1129 my @squeue = `squeue -s -h -o '%i %j' && echo ok`;
1131 if ($squeue[-1] ne "ok")
1137 # which of my jobsteps are running, according to squeue?
1141 if (/^(\d+)\.(\d+) (\S+)/)
1143 if ($1 eq $ENV{SLURM_JOBID})
1150 # which of my active child procs (>60s old) were not mentioned by squeue?
1151 foreach (keys %proc)
1153 if ($proc{$_}->{time} < time - 60
1154 && !exists $ok{$proc{$_}->{jobstepname}}
1155 && !exists $proc{$_}->{killtime})
1157 # kill this proc if it hasn't exited in 30 seconds
1158 $proc{$_}->{killtime} = time + 30;
# Cancel our SLURM allocation (body gated by a condition not visible in
# this sampled view -- presumably only under SLURM).
1164 sub release_allocation
1168 Log (undef, "release job allocation");
1169 system "scancel $ENV{SLURM_JOBID}";
# Pipe-reader fragment (its sub header is not visible here): drain each
# task child's pipe, append to its stderr buffer, and trim the buffer to
# keep at most ~16KB (drop oldest 8KB).
1177 foreach my $job (keys %reader)
1180 while (0 < sysread ($reader{$job}, $buf, 8192))
1182 print STDERR $buf if $ENV{CRUNCH_DEBUG};
1183 $jobstep[$job]->{stderr} .= $buf;
1184 preprocess_stderr ($job);
1185 if (length ($jobstep[$job]->{stderr}) > 16384)
1187 substr ($jobstep[$job]->{stderr}, 0, 8192) = "";
# Consume complete lines from a jobstep's stderr buffer, log each, and
# react to known srun error patterns (expired allocation => freeze;
# node failure => mark the step and ban the node's slot).
1196 sub preprocess_stderr
1200 while ($jobstep[$job]->{stderr} =~ /^(.*?)\n/) {
# Remove the consumed line (plus its newline) from the buffer.
1202 substr $jobstep[$job]->{stderr}, 0, 1+length($line), "";
1203 Log ($job, "stderr $line");
1204 if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
1206 $main::please_freeze = 1;
1208 elsif ($line =~ /srun: error: (Node failure on|Unable to create job step) /) {
1209 $jobstep[$job]->{node_fail} = 1;
1210 ban_node_by_slot($jobstep[$job]->{slotindex});
# Tail of process_stderr (its sub header is not visible here): flush any
# remaining (unterminated) stderr lines for the finished step.
1219 my $task_success = shift;
1220 preprocess_stderr ($job);
1223 Log ($job, "stderr $_");
1224 } split ("\n", $jobstep[$job]->{stderr});
# fetch_block body (sub header not visible in this sampled view): pipe a
# Keep block through arv-get and return its full contents as a string.
1230 my ($keep, $child_out, $output_block);
# $hash is interpolated with \Q...\E to neutralize shell metacharacters.
1232 my $cmd = "arv-get \Q$hash\E";
1233 open($keep, '-|', $cmd) or die "fetch_block: $cmd: $!";
# Accumulate in 1 MiB reads until EOF (sysread == 0) or error (undef).
1237 my $bytes = sysread($keep, $buf, 1024 * 1024);
1238 if (!defined $bytes) {
1239 die "reading from arv-get: $!";
1240 } elsif ($bytes == 0) {
1241 # sysread returns 0 at the end of the pipe.
1244 # some bytes were read into buf.
1245 $output_block .= $buf;
1249 return $output_block;
# collate_output body (sub header not visible in this sampled view):
# merge every successful task's output into a single manifest via an
# arv-put pipe, then read back the resulting locator.
1254 Log (undef, "collate");
1256 my ($child_out, $child_in);
1257 my $pid = open2($child_out, $child_in, 'arv-put', '--raw',
1258 '--retries', retry_count());
# Skip tasks with no output or that did not succeed.
1262 next if (!exists $_->{'arvados_task'}->{'output'} ||
1263 !$_->{'arvados_task'}->{'success'});
1264 my $output = $_->{'arvados_task'}->{output};
# Output is either a bare block locator (md5+hints), a full manifest,
# or -- for a single-task job -- used directly as the job output.
1265 if ($output !~ /^[0-9a-f]{32}(\+\S+)*$/)
1267 $output_in_keep ||= $output =~ / [0-9a-f]{32}\S*\+K/;
1268 print $child_in $output;
1270 elsif (@jobstep == 1)
1272 $joboutput = $output;
1275 elsif (defined (my $outblock = fetch_block ($output)))
1277 $output_in_keep ||= $outblock =~ / [0-9a-f]{32}\S*\+K/;
1278 print $child_in $outblock;
1282 Log (undef, "XXX fetch_block($output) failed XXX");
# Read arv-put's locator, with a 120s timeout to avoid hanging forever.
1288 if (!defined $joboutput) {
1289 my $s = IO::Select->new($child_out);
1290 if ($s->can_read(120)) {
1291 sysread($child_out, $joboutput, 64 * 1024 * 1024);
1293 # TODO: Ensure exit status == 0.
1295 Log (undef, "timed out reading from 'arv-put'");
1298 # TODO: kill $pid instead of waiting, now that we've decided to
1299 # ignore further output.
# killem body (sub header not visible in this sampled view): escalate
# signals per pid -- SIGINT, then SIGTERM after 4s, then SIGKILL after
# another 4s -- sending each signal twice because srun needs a double
# SIGINT to really interrupt.
1310 my $sig = 2; # SIGINT first
1311 if (exists $proc{$_}->{"sent_$sig"} &&
1312 time - $proc{$_}->{"sent_$sig"} > 4)
1314 $sig = 15; # SIGTERM if SIGINT doesn't work
1316 if (exists $proc{$_}->{"sent_$sig"} &&
1317 time - $proc{$_}->{"sent_$sig"} > 4)
1319 $sig = 9; # SIGKILL if SIGTERM doesn't work
1321 if (!exists $proc{$_}->{"sent_$sig"})
1323 Log ($proc{$_}->{jobstep}, "sending 2x signal $sig to pid $_");
1325 select (undef, undef, undef, 0.1);
1328 kill $sig, $_; # srun wants two SIGINT to really interrupt
# Record when (and how long after start) this signal level was sent.
1330 $proc{$_}->{"sent_$sig"} = time;
1331 $proc{$_}->{"killedafter"} = time - $proc{$_}->{"time"};
# Stray fragment from another sub (select-bitmask setup for the reader
# filehandles) -- surrounding code not visible in this sampled view.
1341 vec($bits,fileno($_),1) = 1;
1347 # Send log output to Keep via arv-put.
1349 # $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
1350 # $log_pipe_pid is the pid of the arv-put subprocess.
1352 # The only functions that should access these variables directly are:
1354 # log_writer_start($logfilename)
1355 # Starts an arv-put pipe, reading data on stdin and writing it to
1356 # a $logfilename file in an output collection.
1358 # log_writer_send($txt)
1359 # Writes $txt to the output log collection.
1361 # log_writer_finish()
1362 # Closes the arv-put pipe and returns the output that it produces.
1364 # log_writer_is_active()
1365 # Returns a true value if there is currently a live arv-put
1366 # process, false otherwise.
1368 my ($log_pipe_in, $log_pipe_out, $log_pipe_pid);
# Start an arv-put subprocess that stores everything written to its stdin
# as a file named $logfilename in a Keep collection.  Populates the
# file-scoped $log_pipe_in / $log_pipe_out / $log_pipe_pid handles used
# by log_writer_send(), log_writer_finish() and log_writer_is_active().
# NOTE(review): source is elided here — some arv-put option lines between
# these statements are not visible; confirm the full argument list
# (e.g. a --retries option) against the original file.
1370 sub log_writer_start($)
1372 my $logfilename = shift;
# open2 gives us both ends of the child's stdio: we write log text to
# $log_pipe_in; arv-put eventually reports the collection's portable
# data hash on $log_pipe_out (consumed by log_writer_finish).
1373 $log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
1374 'arv-put', '--portable-data-hash',
1376 '--filename', $logfilename,
# Append $txt to the log collection by writing it to the stdin of the
# arv-put process started by log_writer_start().  Callers must ensure a
# writer is active (see log_writer_is_active) before calling.
# NOTE(review): the `my $txt = shift;` unpacking line is elided from this
# view — presumably present in the original; verify.
1380 sub log_writer_send($)
1383 print $log_pipe_in $txt;
# Shut down the arv-put log pipe and return the output it produced
# (expected to be the log collection's portable data hash).
# Returns early (undef) if no writer is active.
1386 sub log_writer_finish()
1388 return unless $log_pipe_pid;
# Closing our write end signals EOF to arv-put so it finalizes the
# collection and prints its result.
1390 close($log_pipe_in);
# Wait up to 120 seconds for arv-put to emit its output; a hung child
# should not block job teardown forever.
1393 my $s = IO::Select->new($log_pipe_out);
1394 if ($s->can_read(120)) {
1395 sysread($log_pipe_out, $arv_put_output, 1024);
1396 chomp($arv_put_output);
1398 Log (undef, "timed out reading from 'arv-put'");
# Reap the child, then clear all three handles so
# log_writer_is_active() reports false afterwards.
1401 waitpid($log_pipe_pid, 0);
1402 $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
# Log a nonzero exit status for diagnosis (condition line is elided in
# this view — presumably `if ($?)` or similar; verify in original).
1404 Log("log_writer_finish: arv-put exited ".exit_status_s($?))
1407 return $arv_put_output;
# True iff an arv-put log pipe is currently open (the pid is set by
# log_writer_start and cleared by log_writer_finish).
1410 sub log_writer_is_active() {
1411 return $log_pipe_pid;
# Emit one log line tagged with the job id, our pid, and the jobstep id,
# to stderr and (when a log writer is active) to the Keep log collection.
1414 sub Log # ($jobstep_id, $logmessage)
# Multi-line messages are split and handled line by line (the per-line
# handling between these two lines is elided in this view — presumably a
# recursive Log call per line; verify against the original).
1416 if ($_[1] =~ /\n/) {
1417 for my $line (split (/\n/, $_[1])) {
# Temporarily select STDERR to turn on autoflush, then restore the
# previously selected handle.
1422 my $fh = select STDERR; $|=1; select $fh;
# Message format: "<job_id> <pid> <jobstep_id> <text>".
1423 my $message = sprintf ("%s %d %s %s", $job_id, $$, @_);
# Escape every byte outside printable ASCII (space..'~') as \NNN octal
# so log lines stay single-line and printable.
1424 $message =~ s{([^ -\176])}{"\\" . sprintf ("%03o", ord($1))}ge;
# Build a UTC timestamp only when something will consume it: the Keep
# log writer, or an interactive (tty) stderr.
1427 if (log_writer_is_active() || -t STDERR) {
1428 my @gmtime = gmtime;
1429 $datetime = sprintf ("%04d-%02d-%02d_%02d:%02d:%02d",
1430 $gmtime[5]+1900, $gmtime[4]+1, @gmtime[3,2,1,0]);
# Prefix the timestamp only on a tty; non-tty stderr gets the bare
# message (a wrapper is presumably adding its own timestamps — verify).
1432 print STDERR ((-t STDERR) ? ($datetime." ".$message) : $message);
1434 if (log_writer_is_active()) {
1435 log_writer_send($datetime . " " . $message);
1442 my ($package, $file, $line) = caller;
1443 my $message = "@_ at $file line $line\n";
1444 Log (undef, $message);
1445 freeze() if @jobstep_todo;
1446 collate_output() if @jobstep_todo;
1448 save_meta() if log_writer_is_active();
1455 if ($Job->{'state'} eq 'Cancelled') {
1456 $Job->update_attributes('finished_at' => scalar gmtime);
1458 $Job->update_attributes('state' => 'Failed');
1465 my $justcheckpoint = shift; # false if this will be the last meta saved
1466 return if $justcheckpoint; # checkpointing is not relevant post-Warehouse.pm
1468 my $loglocator = log_writer_finish();
1469 Log (undef, "log manifest is $loglocator");
1470 $Job->{'log'} = $loglocator;
1471 $Job->update_attributes('log', $loglocator);
# If a freeze was requested (SIGINT/SIGQUIT handler sets
# $main::please_freeze), release the node allocation and kill any child
# processes passed in @_ plus everything already tracked in %proc, then
# reap them.  (The freeze/collate/exit steps themselves are elided from
# this view — verify tail of the sub in the original.)
1475 sub freeze_if_want_freeze
1477 if ($main::please_freeze)
1479 release_allocation();
1482 # kill some srun procs before freeze+stop
# Register each caller-supplied pid in %proc (with an empty record) so
# the killem/waitpid loop below covers them too.
1483 map { $proc{$_} = {} } @_;
1486 killem (keys %proc);
# Brief 0.1s pause to give the signalled children time to exit.
1487 select (undef, undef, undef, 0.1);
# Reap every child that has exited and drop it from the process table.
1489 while (($died = waitpid (-1, WNOHANG)) > 0)
1491 delete $proc{$died};
1506 Log (undef, "Freeze not implemented");
1513 croak ("Thaw not implemented");
1529 $s =~ s{\\(.)}{$1 eq "n" ? "\n" : $1}ge;
1536 my $srunargs = shift;
1537 my $execargs = shift;
1538 my $opts = shift || {};
1540 my $args = $have_slurm ? [@$srunargs, @$execargs] : $execargs;
1541 print STDERR (join (" ",
1542 map { / / ? "'$_'" : $_ }
1545 if $ENV{CRUNCH_DEBUG};
1547 if (defined $stdin) {
1548 my $child = open STDIN, "-|";
1549 defined $child or die "no fork: $!";
1551 print $stdin or die $!;
1552 close STDOUT or die $!;
1557 return system (@$args) if $opts->{fork};
1560 warn "ENV size is ".length(join(" ",%ENV));
1561 die "exec failed: $!: @$args";
# Temporarily ban the node hosting a slot from receiving new jobsteps:
# set its hold_until 60 seconds into the future and bump its hold_count.
# NOTE(review): the `my $slotid = shift;` unpacking line is elided from
# this view — presumably present in the original; verify.
1565 sub ban_node_by_slot {
1566 # Don't start any new jobsteps on this node for 60 seconds
1568 $slot[$slotid]->{node}->{hold_until} = 60 + scalar time;
1569 $slot[$slotid]->{node}->{hold_count}++;
1570 Log (undef, "backing off node " . $slot[$slotid]->{node}->{name} . " for 60 seconds");
1575 my ($lockfile, $error_message) = @_;
1576 open L, ">", $lockfile or croak("$lockfile: $!");
1577 if (!flock L, LOCK_EX|LOCK_NB) {
1578 croak("Can't lock $lockfile: $error_message\n");
1582 sub find_docker_image {
1583 # Given a Keep locator, check to see if it contains a Docker image.
1584 # If so, return its stream name and Docker hash.
1585 # If not, return undef for both values.
1586 my $locator = shift;
1587 my ($streamname, $filename);
# Fetch the collection record via the Arvados API, retrying with
# backoff on transient failures (retry_op).
1588 my $image = retry_op(sub {
1589 $arv->{collections}->{get}->execute(uuid => $locator);
# Walk the collection's manifest: one stream per line, tokens are the
# stream name followed by block locators and file entries.
1592 foreach my $line (split(/\n/, $image->{manifest_text})) {
1593 my @tokens = split(/\s+/, $line);
1595 $streamname = shift(@tokens);
# File entries look like "pos:size:filename"; a valid image collection
# must contain exactly one file.
1596 foreach my $filedata (grep(/^\d+:\d+:/, @tokens)) {
1597 if (defined($filename)) {
1598 return (undef, undef); # More than one file in the Collection.
# limit=3 keeps any ':' characters inside the filename intact.
1600 $filename = (split(/:/, $filedata, 3))[2];
# A Docker image file is named "<64-hex-digit image hash>.tar"; return
# the stream name and the captured hash.
1605 if (defined($filename) and ($filename =~ /^([0-9A-Fa-f]{64})\.tar$/)) {
1606 return ($streamname, $1);
1608 return (undef, undef);
1613 # Calculate the number of times an operation should be retried,
1614 # assuming exponential backoff, and that we're willing to retry as
1615 # long as tasks have been running. Enforce a minimum of 3 retries.
1616 my ($starttime, $endtime, $timediff, $retries);
1618 $starttime = $jobstep[0]->{starttime};
1619 $endtime = $jobstep[-1]->{finishtime};
1621 if (!defined($starttime)) {
1623 } elsif (!defined($endtime)) {
1624 $timediff = time - $starttime;
1626 $timediff = ($endtime - $starttime) - (time - $endtime);
1628 if ($timediff > 0) {
1629 $retries = int(log($timediff) / log(2));
1631 $retries = 1; # Use the minimum.
1633 return ($retries > 3) ? $retries : 3;
1637 # Given a function reference, call it with the remaining arguments. If
1638 # it dies, retry it with exponential backoff until it succeeds, or until
1639 # the current retry_count is exhausted.
1640 my $operation = shift;
1641 my $retries = retry_count();
1642 foreach my $try_count (0..$retries) {
1643 my $next_try = time + (2 ** $try_count);
1644 my $result = eval { $operation->(@_); };
1647 } elsif ($try_count < $retries) {
1648 my $sleep_time = $next_try - time;
1649 sleep($sleep_time) if ($sleep_time > 0);
1652 # Ensure the error message ends in a newline, so Perl doesn't add
1653 # retry_op's line number to it.
1659 # Given a $?, return a human-readable exit code string like "0" or
1660 # "1" or "0 with signal 1" or "1 with signal 11".
1661 my $exitcode = shift;
1662 my $s = $exitcode >> 8;
1663 if ($exitcode & 0x7f) {
1664 $s .= " with signal " . ($exitcode & 0x7f);
1666 if ($exitcode & 0x80) {
1667 $s .= " with core dump";
1675 # checkout-and-build
1678 use File::Path qw( make_path );
1680 my $destdir = $ENV{"CRUNCH_SRC"};
1681 my $commit = $ENV{"CRUNCH_SRC_COMMIT"};
1682 my $repo = $ENV{"CRUNCH_SRC_URL"};
1683 my $task_work = $ENV{"TASK_WORK"};
1685 for my $dir ($destdir, $task_work) {
1688 -e $dir or die "Failed to create temporary directory ($dir): $!";
1692 open L, ">", "$destdir.lock" or die "$destdir.lock: $!";
1694 if (readlink ("$destdir.commit") eq $commit && -d $destdir) {
1697 die "Cannot exec `@ARGV`: $!";
1703 unlink "$destdir.commit";
1704 open STDOUT, ">", "$destdir.log";
1705 open STDERR, ">&STDOUT";
1708 my @git_archive_data = <DATA>;
1709 if (@git_archive_data) {
1710 open TARX, "|-", "tar", "-C", $destdir, "-xf", "-";
1711 print TARX @git_archive_data;
1713 die "'tar -C $destdir -xf -' exited $?: $!";
1718 chomp ($pwd = `pwd`);
1719 my $install_dir = $ENV{"CRUNCH_INSTALL"} || "$pwd/opt";
1722 for my $src_path ("$destdir/arvados/sdk/python") {
1724 shell_or_die ("virtualenv", $install_dir);
1725 shell_or_die ("cd $src_path && ./build.sh && $install_dir/bin/python setup.py install");
1729 if (-e "$destdir/crunch_scripts/install") {
1730 shell_or_die ("$destdir/crunch_scripts/install", $install_dir);
1731 } elsif (!-e "./install.sh" && -e "./tests/autotests.sh") {
1733 shell_or_die ("./tests/autotests.sh", $install_dir);
1734 } elsif (-e "./install.sh") {
1735 shell_or_die ("./install.sh", $install_dir);
1739 unlink "$destdir.commit.new";
1740 symlink ($commit, "$destdir.commit.new") or die "$destdir.commit.new: $!";
1741 rename ("$destdir.commit.new", "$destdir.commit") or die "$destdir.commit: $!";
1748 die "Cannot exec `@ARGV`: $!";
1755 if ($ENV{"DEBUG"}) {
1756 print STDERR "@_\n";
1759 or die "@_ failed: $! exit 0x".sprintf("%x",$?);