sdk/cli/bin/crunch-job

   1 #!/usr/bin/perl
   2 # -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
   3
   4 =head1 NAME
   5
   6 crunch-job: Execute job steps, save snapshots as requested, collate output.
   7
   8 =head1 SYNOPSIS
   9
  10 Obtain job details from Arvados, run tasks on compute nodes (typically
  11 invoked by scheduler on controller):
  12
  13  crunch-job --job x-y-z --git-dir /path/to/repo/.git
  14
  15 Obtain job details from command line, run tasks on local machine
  16 (typically invoked by application or developer on VM):
  17
  18  crunch-job --job '{"script_version":"/path/to/working/tree","script":"scriptname",...}'
  19
  20  crunch-job --job '{"repository":"https://github.com/curoverse/arvados.git","script_version":"master","script":"scriptname",...}'
  21
  22 =head1 OPTIONS
  23
  24 =over
  25
  26 =item --force-unlock
  27
  28 If the job is already locked, steal the lock and run it anyway.
  29
  30 =item --git-dir
  31
  32 Path to a .git directory (or a git URL) where the commit given in the
  33 job's C<script_version> attribute is to be found. If this is I<not>
  34 given, the job's C<repository> attribute will be used.
  35
  36 =item --job-api-token
  37
  38 Arvados API authorization token to use during the course of the job.
  39
  40 =item --no-clear-tmp
  41
  42 Do not clear per-job/task temporary directories during initial job
  43 setup. This can speed up development and debugging when running jobs
  44 locally.
  45
  46 =item --job
  47
  48 UUID of the job to run, or a JSON-encoded job resource without a
  49 UUID. If the latter is given, a new job object will be created.
  50
  51 =back
  52
  53 =head1 RUNNING JOBS LOCALLY
  54
  55 crunch-job's log messages appear on stderr along with the job tasks'
  56 stderr streams. The log is saved in Keep at each checkpoint and when
  57 the job finishes.
  58
  59 If the job succeeds, the job's output locator is printed on stdout.
  60
  61 While the job is running, the following signals are accepted:
  62
  63 =over
  64
  65 =item control-C, SIGINT, SIGQUIT
  66
  67 Save a checkpoint, terminate any job tasks that are running, and stop.
  68
  69 =item SIGALRM
  70
  71 Save a checkpoint and continue.
  72
  73 =item SIGHUP
  74
  75 Refresh node allocation (i.e., check whether any nodes have been added
  76 or deallocated) and re-read the attributes of the Job record that affect
  77 behavior (e.g., cancel the job if cancelled_at becomes non-nil).
  78
  79 =back
  80
  81 =cut
  82
  83
  84 use strict;
  85 use POSIX ':sys_wait_h';
  86 use POSIX qw(strftime);
  87 use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK);
  88 use Arvados;
  89 use Cwd qw(realpath);
  90 use Data::Dumper;
  91 use Digest::MD5 qw(md5_hex);
  92 use Getopt::Long;
  93 use IPC::Open2;
  94 use IO::Select;
  95 use File::Temp;
  96 use Fcntl ':flock';
  97 use File::Path qw( make_path remove_tree );
  98
  99 use constant EX_TEMPFAIL => 75;
 100 # Scratch-space layout: derive CRUNCH_TMP, JOB_WORK and CRUNCH_INSTALL from
     # TMPDIR (which defaults to /tmp) and publish them through %ENV.
 101 $ENV{"TMPDIR"} ||= "/tmp";
 102 unless (defined $ENV{"CRUNCH_TMP"}) {
 103   $ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
 104   if ($ENV{"USER"} ne "crunch" && $< != 0) {
 105     # use a tmp dir unique for my uid
 106     $ENV{"CRUNCH_TMP"} .= "-$<";
 107   }
 108 }
 109
 110 # Create the tmp directory if it does not exist
 111 if ( ! -d $ENV{"CRUNCH_TMP"} ) {
 112   make_path $ENV{"CRUNCH_TMP"} or die "Failed to create temporary working directory: " . $ENV{"CRUNCH_TMP"};
 113 }
 114 # JOB_WORK and CRUNCH_INSTALL live directly under CRUNCH_TMP.
 115 $ENV{"JOB_WORK"} = $ENV{"CRUNCH_TMP"} . "/work";
 116 $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
 117 $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
     # The result of mkdir is not checked (presumably because the directory
     # may already exist -- confirm).
 118 mkdir ($ENV{"JOB_WORK"});
 119 # Command-line options.  --resume-stash is accepted here although it is
     # not listed in the OPTIONS section of the POD.
 120 my $force_unlock;
 121 my $git_dir;
 122 my $jobspec;
 123 my $job_api_token;
 124 my $no_clear_tmp;
 125 my $resume_stash;
 126 GetOptions('force-unlock' => \$force_unlock,
 127            'git-dir=s' => \$git_dir,
 128            'job=s' => \$jobspec,
 129            'job-api-token=s' => \$job_api_token,
 130            'no-clear-tmp' => \$no_clear_tmp,
 131            'resume-stash=s' => \$resume_stash,
 132     );
 133 # A --job-api-token value replaces any ARVADOS_API_TOKEN in the environment.
 134 if (defined $job_api_token) {
 135   $ENV{ARVADOS_API_TOKEN} = $job_api_token;
 136 }
 137 # SLURM is considered available only when both variables are present.
 138 my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
     # NOTE(review): $local_job is not referenced again in the part of the
     # file visible here -- confirm it is still used.
 139 my $local_job = 0;
 140
 141 # SIGUSR1 sets CRUNCH_DEBUG to 1 in the environment; SIGUSR2 sets it to 0.
 142 $SIG{'USR1'} = sub
 143 {
 144   $main::ENV{CRUNCH_DEBUG} = 1;
 145 };
 146 $SIG{'USR2'} = sub
 147 {
 148   $main::ENV{CRUNCH_DEBUG} = 0;
 149 };
 150
 151
 152 # Obtain the Job record: fetch (and normally lock) an existing job when
     # --job looks like a UUID, otherwise create a new job from the JSON given
     # on the command line.  api_call, Log and croak are helpers that are not
     # defined in the visible part of this file.
 153 my $arv = Arvados->new('apiVersion' => 'v1');
 154
 155 my $Job;
 156 my $job_id;
     # NOTE(review): in the visible code $dbh and $sth are only ever undef'ed
     # (in the task child) -- confirm they are still needed.
 157 my $dbh;
 158 my $sth;
 159 my @jobstep;
 160 # The current user's uuid is recorded below as the lock owner of new jobs.
 161 my $User = api_call("users/current");
 162 # NOTE(review): if --job was not given, $jobspec is undef here; the match
     # fails and decode_json(undef) is attempted in the else branch.
 163 if ($jobspec =~ /^[-a-z\d]+$/)
 164 {
 165   # $jobspec is an Arvados UUID, not a JSON job specification
 166   $Job = api_call("jobs/get", uuid => $jobspec);
 167   if (!$force_unlock) {
 168     # Claim this job, and make sure nobody else does
 169     eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
 170     if ($@) {
 171       Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
 172       exit EX_TEMPFAIL;
 173     };
 174   }
 175 }
 176 else
 177 {
 178   $Job = JSON::decode_json($jobspec);
 179   # These three attributes are mandatory unless --resume-stash was given.
 180   if (!$resume_stash)
 181   {
 182     map { croak ("No $_ specified") unless $Job->{$_} }
 183     qw(script script_version script_parameters);
 184   }
 185   # Mark the new job as locked by the current user and already running.
 186   $Job->{'is_locked_by_uuid'} = $User->{'uuid'};
 187   $Job->{'started_at'} = gmtime;
 188   $Job->{'state'} = 'Running';
 189
 190   $Job = api_call("jobs/create", job => $Job);
 191 }
 192 $job_id = $Job->{'uuid'};
 193 # Start the log writer, default the runtime constraints, and log where
     # this script is running from.
 194 my $keep_logfile = $job_id . '.log.txt';
 195 log_writer_start($keep_logfile);
 196 # A max_tasks_per_node of 0 disables the per-node cap applied to $ncpus.
 197 $Job->{'runtime_constraints'} ||= {};
 198 $Job->{'runtime_constraints'}->{'max_tasks_per_node'} ||= 0;
 199 my $max_ncpus = $Job->{'runtime_constraints'}->{'max_tasks_per_node'};
 200 # If `gem list` exits non-zero, no version suffix is added to the log line.
 201 my $gem_versions = `gem list --quiet arvados-cli 2>/dev/null`;
 202 if ($? == 0) {
 203   $gem_versions =~ s/^arvados-cli \(/ with arvados-cli Gem version(s) /;
 204   chomp($gem_versions);
 205   chop($gem_versions);  # Closing parentheses
 206 } else {
 207   $gem_versions = "";
 208 }
 209 Log(undef,
 210     "running from " . ((-e $0) ? realpath($0) : "stdin") . $gem_versions);
 211 # Build @node (node names) and @slot (one entry per usable CPU per node).
 212 Log (undef, "check slurm allocation");
 213 my @slot;
 214 my @node;
 215 # Should use $ENV{SLURM_TASKS_PER_NODE} instead of sinfo? (eg. "4(x3),2,4(x2)")
 216 my @sinfo;
 217 if (!$have_slurm)
 218 {
     # Without SLURM, describe the local machine as one node named
     # "localhost"; fall back to 1 CPU if the grep count is 0.
 219   my $localcpus = 0 + `grep -cw ^processor /proc/cpuinfo` || 1;
 220   push @sinfo, "$localcpus localhost";
 221 }
 222 if (exists $ENV{SLURM_NODELIST})
 223 {
 224   push @sinfo, `sinfo -h --format='%c %N' --nodes=\Q$ENV{SLURM_NODELIST}\E`;
 225 }
     # Each @sinfo entry is "<cpu count> <node list>".
 226 foreach (@sinfo)
 227 {
 228   my ($ncpus, $slurm_nodelist) = split;
 229   $ncpus = $max_ncpus if $max_ncpus && $ncpus > $max_ncpus;
 230   # Expand bracketed ranges such as "name[1-3,5]" into individual node names.
 231   my @nodelist;
 232   while ($slurm_nodelist =~ s/^([^\[,]+?(\[.*?\])?)(,|$)//)
 233   {
 234     my $nodelist = $1;
 235     if ($nodelist =~ /\[((\d+)(-(\d+))?(,(\d+)(-(\d+))?)*)\]/)
 236     {
 237       my $ranges = $1;
 238       foreach (split (",", $ranges))
 239       {
 240         my ($a, $b);
 241         if (/(\d+)-(\d+)/)
 242         {
 243           $a = $1;
 244           $b = $2;
 245         }
 246         else
 247         {
 248           $a = $_;
 249           $b = $_;
 250         }
           # Substitute each value of the range for the bracket expression.
 251         push @nodelist, map {
 252           my $n = $nodelist;
 253           $n =~ s/\[[-,\d]+\]/$_/;
 254           $n;
 255         } ($a..$b);
 256       }
 257     }
 258     else
 259     {
 260       push @nodelist, $nodelist;
 261     }
 262   }
 263   foreach my $nodename (@nodelist)
 264   {
 265     Log (undef, "node $nodename - $ncpus slots");
       # All slots of a node share one $node hash, so losing_streak and
       # hold_until are tracked per node rather than per slot.
 266     my $node = { name => $nodename,
 267                  ncpus => $ncpus,
 268                  losing_streak => 0,
 269                  hold_until => 0 };
 270     foreach my $cpu (1..$ncpus)
 271     {
 272       push @slot, { node => $node,
 273                     cpu => $cpu };
 274     }
 275   }
 276   push @node, @nodelist;
 277 }
 278
 279
 280
 281 # Ensure that we get one jobstep running on each allocated node before
 282 # we start overloading nodes with concurrent steps
 283
 284 @slot = sort { $a->{cpu} <=> $b->{cpu} } @slot;
 285
 286 # Reset the task counters on the Job record, install the job-control
     # signal handlers, and export the job UUID to the environment.
 287 $Job->update_attributes(
 288   'tasks_summary' => { 'failed' => 0,
 289                        'todo' => 1,
 290                        'running' => 0,
 291                        'done' => 0 });
 292 # Except for TERM (which calls croak), the handlers only raise flags.
 293 Log (undef, "start");
 294 $SIG{'INT'} = sub { $main::please_freeze = 1; };
 295 $SIG{'QUIT'} = sub { $main::please_freeze = 1; };
 296 $SIG{'TERM'} = \&croak;
 297 $SIG{'TSTP'} = sub { $main::please_freeze = 1; };
 298 $SIG{'ALRM'} = sub { $main::please_info = 1; };
 299 $SIG{'CONT'} = sub { $main::please_continue = 1; };
 300 $SIG{'HUP'} = sub { $main::please_refresh = 1; };
 301
 302 $main::please_freeze = 0;
 303 $main::please_info = 0;
 304 $main::please_continue = 0;
 305 $main::please_refresh = 0;
 306 my $jobsteps_must_output_keys = 0;      # becomes 1 when any task outputs a key
 307 # Copy NOCACHE*=value lines of the job's knobs attribute into %ENV.
 308 grep { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ } split ("\n", $$Job{knobs});
 309 $ENV{"CRUNCH_JOB_UUID"} = $job_id;
 310 $ENV{"JOB_UUID"} = $job_id;
 311
 312 # Task bookkeeping.  @jobstep_todo holds indexes into @jobstep.
 313 my @jobstep_todo = ();
 314 my @jobstep_done = ();
 315 my @jobstep_tomerge = ();
 316 my $jobstep_tomerge_level = 0;
 317 my $squeue_checked;
 318 my $squeue_kill_checked;
 319 my $latest_refresh = scalar time;
 320
 321
 322 # Either restore saved state via thaw(), or create the initial level-0 task.
 323 if (defined $Job->{thawedfromkey})
 324 {
 325   thaw ($Job->{thawedfromkey});
 326 }
 327 else
 328 {
 329   my $first_task = api_call("job_tasks/create", job_task => {
 330     'job_uuid' => $Job->{'uuid'},
 331     'sequence' => 0,
 332     'qsequence' => 0,
 333     'parameters' => {},
 334   });
 335   push @jobstep, { 'level' => 0,
 336                    'failures' => 0,
 337                    'arvados_task' => $first_task,
 338                  };
 339   push @jobstep_todo, 0;
 340 }
 341
 342 # Without SLURM, take the lock file under CRUNCH_TMP before going further.
 343 if (!$have_slurm)
 344 {
 345   must_lock_now("$ENV{CRUNCH_TMP}/.lock", "a job is already running here.");
 346 }
 347 # The build script is read from this file's DATA section.
 348 my $build_script = handle_readall(\*DATA);
 349 my $nodelist = join(",", @node);
 350 my $git_tar_count = 0;
 351 # Unless --no-clear-tmp was given, run a cleanup command on every node in
     # a forked child (via srun) and poll for its exit, so that a freeze
     # request can still be honoured while waiting.
 352 if (!defined $no_clear_tmp) {
 353   # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
 354   Log (undef, "Clean work dirs");
 355
 356   my $cleanpid = fork();
       if (!defined $cleanpid)
       {
         # fork() returns undef on failure, and undef == 0 is true, so
         # without this check the parent would run the child branch below.
         croak ("Failed to fork cleanup process: $!");
       }
 357   if ($cleanpid == 0)
 358   {
 359     # Find FUSE mounts that look like Keep mounts (the mount path has the
 360     # word "keep") and unmount them.  Then clean up work directories.
 361     # TODO: When #5036 is done and widely deployed, we can get rid of the
 362     # regular expression and just unmount everything with type fuse.keep.
 363     srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
 364           ['bash', '-ec', 'mount -t fuse,fuse.keep | awk \'($3 ~ /\ykeep\y/){print $3}\' | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid']);
         # Only reached if srun() returns instead of replacing this process.
 365     exit (1);
 366   }
       # Parent: poll every 0.1s until the cleanup child has been reaped.
 367   while (1)
 368   {
 369     last if $cleanpid == waitpid (-1, WNOHANG);
 370     freeze_if_want_freeze ($cleanpid);
 371     select (undef, undef, undef, 0.1);
 372   }
 373   Log (undef, "Cleanup command exited ".exit_status_s($?));
 374 }
 375 # Docker support: when the job names an image, load it on every node
     # (unless already present) and optionally pack the requested SDK version.
 376 # If this job requires a Docker image, install that.
 377 my $docker_bin = "/usr/bin/docker.io";
 378 my ($docker_locator, $docker_stream, $docker_hash);
 379 if ($docker_locator = $Job->{docker_image_locator}) {
 380   ($docker_stream, $docker_hash) = find_docker_image($docker_locator);
 381   if (!$docker_hash)
 382   {
 383     croak("No Docker image hash found from locator $docker_locator");
 384   }
       # Drop the leading "." of the stream name before building the path.
 385   $docker_stream =~ s/^\.//;
 386   my $docker_install_script = qq{
 387 if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
 388     arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
 389 fi
 390 };
 391   my $docker_pid = fork();
       if (!defined $docker_pid)
       {
         # fork() returns undef on failure, and undef == 0 is true, so
         # without this check the parent would run the child branch below.
         croak ("Failed to fork Docker install process: $!");
       }
 392   if ($docker_pid == 0)
 393   {
 394     srun (["srun", "--nodelist=" . join(',', @node)],
 395           ["/bin/sh", "-ec", $docker_install_script]);
 396     exit ($?);
 397   }
       # Parent: poll every 0.1s until the install child has been reaped.
 398   while (1)
 399   {
 400     last if $docker_pid == waitpid (-1, WNOHANG);
 401     freeze_if_want_freeze ($docker_pid);
 402     select (undef, undef, undef, 0.1);
 403   }
 404   if ($? != 0)
 405   {
 406     croak("Installing Docker image from $docker_locator exited "
 407           .exit_status_s($?));
 408   }
 409
 410   if ($Job->{arvados_sdk_version}) {
 411     # The job also specifies an Arvados SDK version.  Add the SDKs to the
 412     # tar file for the build script to install.
 413     Log(undef, sprintf("Packing Arvados SDK version %s for installation",
 414                        $Job->{arvados_sdk_version}));
         # NOTE(review): $git_dir is undef when --git-dir was not given --
         # confirm this path is only taken when it is set.
 415     add_git_archive("git", "--git-dir=$git_dir", "archive",
 416                     "--prefix=.arvados.sdk/",
 417                     $Job->{arvados_sdk_version}, "sdk");
 418   }
 419 }
 420 # Decide where the job's source code comes from and, unless it is a plain
     # working directory, resolve script_version to a full commit sha1 and
     # queue a git archive of that commit.
 421 if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
 422   # If script_version looks like an absolute path, *and* the --git-dir
 423   # argument was not given -- which implies we were not invoked by
 424   # crunch-dispatch -- we will use the given path as a working
 425   # directory instead of resolving script_version to a git commit (or
 426   # doing anything else with git).
 427   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
 428   $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
 429 }
 430 else {
 431   # Resolve the given script_version to a git commit sha1. Also, if
 432   # the repository is remote, clone it into our local filesystem: this
 433   # ensures "git archive" will work, and is necessary to reliably
 434   # resolve a symbolic script_version like "master^".
 435   $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
 436
 437   Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
 438   # Provisional value; overwritten with the resolved sha1 further down.
 439   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
 440
 441   # If we're running under crunch-dispatch, it will have already
 442   # pulled the appropriate source tree into its own repository, and
 443   # given us that repo's path as $git_dir.
 444   #
 445   # If we're running a "local" job, we might have to fetch content
 446   # from a remote repository.
 447   #
 448   # (Currently crunch-dispatch gives a local path with --git-dir, but
 449   # we might as well accept URLs there too in case it changes its
 450   # mind.)
 451   my $repo = $git_dir || $Job->{'repository'};
 452
 453   # Repository can be remote or local. If remote, we'll need to fetch it
 454   # to a local dir before doing `git log` et al.
 455   my $repo_location;
 456
 457   if ($repo =~ m{://|^[^/]*:}) {
 458     # $repo is a git url we can clone, like git:// or https:// or
 459     # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
 460     # not recognized here because distinguishing that from a local
 461     # path is too fragile. If you really need something strange here,
 462     # use the ssh:// form.
 463     $repo_location = 'remote';
 464   } elsif ($repo =~ m{^\.*/}) {
 465     # $repo is a local path to a git index. We'll also resolve ../foo
 466     # to ../foo/.git if the latter is a directory. To help
 467     # disambiguate local paths from named hosted repositories, this
 468     # form must be given as ./ or ../ if it's a relative path.
 469     if (-d "$repo/.git") {
 470       $repo = "$repo/.git";
 471     }
 472     $repo_location = 'local';
 473   } else {
 474     # $repo is none of the above. It must be the name of a hosted
 475     # repository.
 476     my $arv_repo_list = api_call("repositories/list",
 477                                  'filters' => [['name','=',$repo]]);
 478     my @repos_found = @{$arv_repo_list->{'items'}};
 479     my $n_found = $arv_repo_list->{'serverResponse'}->{'items_available'};
 480     if ($n_found > 0) {
 481       Log(undef, "Repository '$repo' -> "
 482           . join(", ", map { $_->{'uuid'} } @repos_found));
 483     }
         # Exactly one match is required; zero or several matches abort the job.
 484     if ($n_found != 1) {
 485       croak("Error: Found $n_found repositories with name '$repo'.");
 486     }
 487     $repo = $repos_found[0]->{'fetch_url'};
 488     $repo_location = 'remote';
 489   }
 490   Log(undef, "Using $repo_location repository '$repo'");
 491   $ENV{"CRUNCH_SRC_URL"} = $repo;
 492
 493   # Resolve given script_version (we'll call that $treeish here) to a
 494   # commit sha1 ($commit).
 495   my $treeish = $Job->{'script_version'};
 496   my $commit;
 497   if ($repo_location eq 'remote') {
 498     # We minimize excess object-fetching by re-using the same bare
 499     # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
 500     # just keep adding remotes to it as needed.
 501     my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
 502     my $gitcmd = "git --git-dir=\Q$local_repo\E";
 503
 504     # Set up our local repo for caching remote objects, making
 505     # archives, etc.
 506     if (!-d $local_repo) {
 507       make_path($local_repo) or croak("Error: could not create $local_repo");
 508     }
 509     # This works (exits 0 and doesn't delete fetched objects) even
 510     # if $local_repo is already initialized:
 511     `$gitcmd init --bare`;
 512     if ($?) {
 513       croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
 514     }
 515
 516     # If $treeish looks like a hash (or abbrev hash) we look it up in
 517     # our local cache first, since that's cheaper. (We don't want to
 518     # do that with tags/branches though -- those change over time, so
 519     # they should always be resolved by the remote repo.)
 520     if ($treeish =~ /^[0-9a-f]{7,40}$/s) {
 521       # Hide stderr because it's normal for this to fail:
 522       my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
 523       if ($? == 0 &&
 524           # Careful not to resolve a branch named abcdeff to commit 1234567:
 525           $sha1 =~ /^$treeish/ &&
 526           $sha1 =~ /^([0-9a-f]{40})$/s) {
 527         $commit = $1;
 528         Log(undef, "Commit $commit already present in $local_repo");
 529       }
 530     }
 531
 532     if (!defined $commit) {
 533       # If $treeish isn't just a hash or abbrev hash, or isn't here
 534       # yet, we need to fetch the remote to resolve it correctly.
 535
 536       # First, remove all local heads. This prevents a name that does
 537       # not exist on the remote from resolving to (or colliding with)
 538       # a previously fetched branch or tag (possibly from a different
 539       # remote).
 540       remove_tree("$local_repo/refs/heads", {keep_root => 1});
 541
 542       Log(undef, "Fetching objects from $repo to $local_repo");
 543       `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
 544       if ($?) {
 545         croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
 546       }
 547     }
 548
 549     # Now that the data is all here, we will use our local repo for
 550     # the rest of our git activities.
 551     $repo = $local_repo;
 552   }
 553   # Resolve $treeish in $repo (the local cache when the source was remote).
 554   my $gitcmd = "git --git-dir=\Q$repo\E";
 555   my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
 556   unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
 557     croak("`$gitcmd rev-list` exited "
 558           .exit_status_s($?)
 559           .", '$treeish' not found. Giving up.");
 560   }
       # $1 is the 40-hex-digit capture from the match in the condition above.
 561   $commit = $1;
 562   Log(undef, "Version $treeish is commit $commit");
 563
 564   if ($commit ne $Job->{'script_version'}) {
 565     # Record the real commit id in the database, frozentokey, logs,
 566     # etc. -- instead of an abbreviation or a branch name which can
 567     # become ambiguous or point to a different commit in the future.
 568     if (!$Job->update_attributes('script_version' => $commit)) {
 569       croak("Error: failed to update job's script_version attribute");
 570     }
 571   }
 572
 573   $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
 574   add_git_archive("$gitcmd archive ''\Q$commit\E");
 575 }
 576 # Install phase: feed the build script plus the combined git archive to
     # "perl -" on every node via srun, in a forked child, and wait for it.
 577 my $git_archive = combined_git_archive();
 578 if (!defined $git_archive) {
 579   Log(undef, "Skip install phase (no git archive)");
 580   if ($have_slurm) {
 581     Log(undef, "Warning: This probably means workers have no source tree!");
 582   }
 583 }
 584 else {
 585   Log(undef, "Run install script on all workers");
 586
 587   my @srunargs = ("srun",
 588                   "--nodelist=$nodelist",
 589                   "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
 590   my @execargs = ("sh", "-c",
 591                   "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
 592
 593   my $installpid = fork();
       if (!defined $installpid)
       {
         # fork() returns undef on failure, and undef == 0 is true, so
         # without this check the parent would run the child branch below.
         croak ("Failed to fork install process: $!");
       }
 594   if ($installpid == 0)
 595   {
 596     srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
         # Only reached if srun() returns instead of replacing this process.
 597     exit (1);
 598   }
       # Parent: poll every 0.1s until the install child has been reaped.
 599   while (1)
 600   {
 601     last if $installpid == waitpid (-1, WNOHANG);
 602     freeze_if_want_freeze ($installpid);
 603     select (undef, undef, undef, 0.1);
 604   }
       # Save $? before unlink() or Log() can disturb it.
 605   my $install_exited = $?;
 606   Log (undef, "Install script exited ".exit_status_s($install_exited));
       # Remove the temporary tar files whether or not the install succeeded.
 607   foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
 608     unlink($tar_filename);
 609   }
 610   exit (1) if $install_exited != 0;
 611 }
 612 # Log the main job attributes; reference values are JSON-encoded first.
 613 foreach (qw (script script_version script_parameters runtime_constraints))
 614 {
 615   Log (undef,
 616        "$_ " .
 617        (ref($Job->{$_}) ? JSON::encode_json($Job->{$_}) : $Job->{$_}));
 618 }
 619 foreach (split (/\n/, $Job->{knobs}))
 620 {
 621   Log (undef, "knob " . $_);
 622 }
 623
 624
 625 # undef means "outcome not decided yet"; set to 0 or 1 after a round.
 626 $main::success = undef;
 627
 628
 629 # Start of one scheduling round.  Execution returns here through
     # "goto ONELEVEL" for as long as $main::success is still undefined.
 630 ONELEVEL:
 631
 632 my $thisround_succeeded = 0;
 633 my $thisround_failed = 0;
 634 my $thisround_failed_multiple = 0;
 635 # Order pending tasks by level, then by index; this round runs the lowest level.
 636 @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
 637                        or $a <=> $b } @jobstep_todo;
 638 my $level = $jobstep[$jobstep_todo[0]]->{level};
 639 Log (undef, "start level $level");
 640
 641
 642 # %proc is keyed by child pid, %reader by jobstep index; every slot starts free.
 643 my %proc;
 644 my @freeslot = (0..$#slot);
 645 my @holdslot;
 646 my %reader;
 647 my $progress_is_dirty = 1;
 648 my $progress_stats_updated = 0;
 649
 650 update_progress_stats();
 651
 652
 653 # Launch every pending task of the current $level: fork one child per
     # task on the first free slot, have the child exec the task through srun
     # with its output on a pipe, then service pipes and exited children
     # until a slot is free again.
 654 THISROUND:
 655 for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
 656 {
 657   my $id = $jobstep_todo[$todo_ptr];
 658   my $Jobstep = $jobstep[$id];
 659   if ($Jobstep->{level} != $level)
 660   {
 661     next;
 662   }
 663   # The parent reads the task's stdout/stderr from this pipe, non-blocking.
 664   pipe $reader{$id}, "writer" or croak ($!);
 665   my $flags = fcntl ($reader{$id}, F_GETFL, 0) or croak ($!);
 666   fcntl ($reader{$id}, F_SETFL, $flags | O_NONBLOCK) or croak ($!);
 667
 668   my $childslot = $freeslot[0];
 669   my $childnode = $slot[$childslot]->{node};
 670   my $childslotname = join (".",
 671                             $slot[$childslot]->{node}->{name},
 672                             $slot[$childslot]->{cpu});
 673
 674   my $childpid = fork();
       # fork() returns undef on failure and undef == 0 is true, so the
       # defined() test keeps the parent out of the child branch; the failure
       # itself is handled after the branch, where the task is left in
       # @jobstep_todo.
 675   if (defined $childpid && $childpid == 0)
 676   {
 677     $SIG{'INT'} = 'DEFAULT';
 678     $SIG{'QUIT'} = 'DEFAULT';
 679     $SIG{'TERM'} = 'DEFAULT';
 680
 681     foreach (values (%reader))
 682     {
 683       close($_);
 684     }
         # NOTE(review): F_SETFL sets file status flags; close-on-exec is a
         # descriptor flag (F_SETFD) -- confirm the intent of this call.
 685     fcntl ("writer", F_SETFL, 0) or croak ($!); # no close-on-exec
 686     open(STDOUT,">&writer");
 687     open(STDERR,">&writer");
 688
 689     undef $dbh;
 690     undef $sth;
 691
 692     delete $ENV{"GNUPGHOME"};
 693     $ENV{"TASK_UUID"} = $Jobstep->{'arvados_task'}->{'uuid'};
 694     $ENV{"TASK_QSEQUENCE"} = $id;
 695     $ENV{"TASK_SEQUENCE"} = $level;
 696     $ENV{"JOB_SCRIPT"} = $Job->{script};
         # Each script parameter is exported as JOB_PARAMETER_<UPPERCASED NAME>.
 697     while (my ($param, $value) = each %{$Job->{script_parameters}}) {
 698       $param =~ tr/a-z/A-Z/;
 699       $ENV{"JOB_PARAMETER_$param"} = $value;
 700     }
 701     $ENV{"TASK_SLOT_NODE"} = $slot[$childslot]->{node}->{name};
 702     $ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
 703     $ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
 704     $ENV{"HOME"} = $ENV{"TASK_WORK"};
 705     $ENV{"TASK_KEEPMOUNT"} = $ENV{"TASK_WORK"}.".keep";
 706     $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 707     $ENV{"CRUNCH_NODE_SLOTS"} = $slot[$childslot]->{node}->{ncpus};
 708     $ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
 709
 710     $ENV{"GZIP"} = "-n";
 711
 712     my @srunargs = (
 713       "srun",
 714       "--nodelist=".$childnode->{name},
 715       qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'},
 716       "--job-name=$job_id.$id.$$",
 717         );
         # Shell command run on the node: recreate the task work directory,
         # then exec arv-mount, which runs the rest of the command.
 718     my $command =
 719         "if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; "
 720         ."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} "
 721         ."&& cd $ENV{CRUNCH_TMP} ";
 722     $command .= "&& exec arv-mount --by-id --allow-other $ENV{TASK_KEEPMOUNT} --exec ";
 723     if ($docker_hash)
 724     {
 725       my $cidfile = "$ENV{CRUNCH_TMP}/$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}.cid";
 726       $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
 727       $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i --user=crunch --cidfile=$cidfile --sig-proxy ";
 728
 729       # Dynamically configure the container to use the host system as its
 730       # DNS server.  Get the host's global addresses from the ip command,
 731       # and turn them into docker --dns options using gawk.
 732       $command .=
 733           q{$(ip -o address show scope global |
 734               gawk 'match($4, /^([0-9\.:]+)\//, x){print "--dns", x[1]}') };
 735
 736       # The source tree and $destdir directory (which we have
 737       # installed on the worker host) are available in the container,
 738       # under the same path.
 739       $command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
 740       $command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
 741
 742       # Currently, we make arv-mount's mount point appear at /keep
 743       # inside the container (instead of using the same path as the
 744       # host like we do with CRUNCH_SRC and CRUNCH_INSTALL). However,
 745       # crunch scripts and utilities must not rely on this. They must
 746       # use $TASK_KEEPMOUNT.
 747       $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
 748       $ENV{TASK_KEEPMOUNT} = "/keep";
 749
 750       # TASK_WORK is almost exactly like a docker data volume: it
 751       # starts out empty, is writable, and persists until no
 752       # containers use it any more. We don't use --volumes-from to
 753       # share it with other containers: it is only accessible to this
 754       # task, and it goes away when this task stops.
 755       #
 756       # However, a docker data volume is writable only by root unless
 757       # the mount point already happens to exist in the container with
 758       # different permissions. Therefore, we [1] assume /tmp already
 759       # exists in the image and is writable by the crunch user; [2]
 760       # avoid putting TASK_WORK inside CRUNCH_TMP (which won't be
 761       # writable if they are created by docker while setting up the
 762       # other --volumes); and [3] create $TASK_WORK inside the
 763       # container using $build_script.
 764       $command .= "--volume=/tmp ";
 765       $ENV{"TASK_WORK"} = "/tmp/crunch-job-task-work/$childslotname";
 766       $ENV{"HOME"} = $ENV{"TASK_WORK"};
 767       $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 768
 769       # TODO: Share a single JOB_WORK volume across all task
 770       # containers on a given worker node, and delete it when the job
 771       # ends (and, in case that doesn't work, when the next job
 772       # starts).
 773       #
 774       # For now, use the same approach as TASK_WORK above.
 775       $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
 776
           # Pass every ARVADOS_/CRUNCH_/JOB_/TASK_ variable into the container.
 777       while (my ($env_key, $env_val) = each %ENV)
 778       {
 779         if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
 780           $command .= "--env=\Q$env_key=$env_val\E ";
 781         }
 782       }
 783       $command .= "--env=\QHOME=$ENV{HOME}\E ";
 784       $command .= "\Q$docker_hash\E ";
 785       $command .= "stdbuf --output=0 --error=0 ";
 786       $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
 787     } else {
 788       # Non-docker run
 789       $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 ";
 790       $command .= "stdbuf --output=0 --error=0 ";
 791       $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
 792     }
 793
 794     my @execargs = ('bash', '-c', $command);
 795     srun (\@srunargs, \@execargs, undef, $build_script);
 796     # exec() failed, we assume nothing happened.
 797     die "srun() failed on build script\n";
 798   }
       # Parent (or failed fork): the write end is not needed in this process.
 799   close("writer");
 800   if (!defined $childpid)
 801   {
         # fork() failed: drop the pipe; the task stays in @jobstep_todo.
 802     close $reader{$id};
 803     delete $reader{$id};
 804     next;
 805   }
 806   shift @freeslot;
 807   $proc{$childpid} = { jobstep => $id,
 808                        time => time,
 809                        slot => $childslot,
 810                        jobstepname => "$job_id.$id.$childpid",
 811                      };
 812   croak ("assert failed: \$slot[$childslot]->{'pid'} exists") if exists $slot[$childslot]->{pid};
 813   $slot[$childslot]->{pid} = $childpid;
 814
 815   Log ($id, "job_task ".$Jobstep->{'arvados_task'}->{'uuid'});
 816   Log ($id, "child $childpid started on $childslotname");
 817   $Jobstep->{starttime} = time;
 818   $Jobstep->{node} = $childnode->{name};
 819   $Jobstep->{slotindex} = $childslot;
 820   delete $Jobstep->{stderr};
 821   delete $Jobstep->{finishtime};
 822
 823   $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
 824   $Jobstep->{'arvados_task'}->save;
 825   # The task is running now: remove it from the todo list and step the
       # pointer back so the loop increment lands on the next entry.
 826   splice @jobstep_todo, $todo_ptr, 1;
 827   --$todo_ptr;
 828
 829   $progress_is_dirty = 1;
 830   # Wait here while no slot is free, or while tasks are still running and
       # nothing is left in the todo list beyond $todo_ptr.
 831   while (!@freeslot
 832          ||
 833          (@slot > @freeslot && $todo_ptr+1 > $#jobstep_todo))
 834   {
 835     last THISROUND if $main::please_freeze || defined($main::success);
 836     if ($main::please_info)
 837     {
 838       $main::please_info = 0;
 839       freeze();
 840       create_output_collection();
 841       save_meta(1);
 842       update_progress_stats();
 843     }
 844     my $gotsome
 845         = readfrompipes ()
 846         + reapchildren ();
 847     if (!$gotsome)
 848     {
 849       check_refresh_wanted();
 850       check_squeue();
 851       update_progress_stats();
 852       select (undef, undef, undef, 0.1);
 853     }
 854     elsif (time - $progress_stats_updated >= 30)
 855     {
 856       update_progress_stats();
 857     }
 858     if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
 859         ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
 860     {
 861       my $message = "Repeated failure rate too high ($thisround_failed_multiple/"
 862           .($thisround_failed+$thisround_succeeded)
 863           .") -- giving up on this round";
 864       Log (undef, $message);
 865       last THISROUND;
 866     }
 867
 868     # move slots from freeslot to holdslot (or back to freeslot) if necessary
 869     for (my $i=$#freeslot; $i>=0; $i--) {
 870       if ($slot[$freeslot[$i]]->{node}->{hold_until} > scalar time) {
 871         push @holdslot, (splice @freeslot, $i, 1);
 872       }
 873     }
 874     for (my $i=$#holdslot; $i>=0; $i--) {
 875       if ($slot[$holdslot[$i]]->{node}->{hold_until} <= scalar time) {
 876         push @freeslot, (splice @holdslot, $i, 1);
 877       }
 878     }
 879
 880     # give up if no nodes are succeeding
 881     if (!grep { $_->{node}->{losing_streak} == 0 &&
 882                     $_->{node}->{hold_count} < 4 } @slot) {
 883       my $message = "Every node has failed -- giving up on this round";
 884       Log (undef, $message);
 885       last THISROUND;
 886     }
 887   }
 888 }
 889
 890
# Move every held slot back onto the free list, then reset the losing
# streak of each free slot's node.
push @freeslot, splice @holdslot;
map { $slot[$freeslot[$_]]->{node}->{losing_streak} = 0 } (0..$#freeslot);


Log (undef, "wait for last ".(scalar keys %proc)." children to finish");
# Drain loop: keep reading child output and reaping children until
# %proc is empty.
while (%proc)
{
  if ($main::please_continue) {
    # NOTE(review): THISROUND is a label defined outside this section of
    # the file -- presumably this resumes starting tasks; confirm.
    $main::please_continue = 0;
    goto THISROUND;
  }
  # On an info request: clear the flag, then freeze(), collate the
  # output so far and call save_meta(1).
  $main::please_info = 0, freeze(), create_output_collection(), save_meta(1) if $main::please_info;
  readfrompipes ();
  if (!reapchildren())
  {
    # No child was reaped this time: do the periodic checks and pause
    # for 0.1 seconds before polling again.
    check_refresh_wanted();
    check_squeue();
    update_progress_stats();
    select (undef, undef, undef, 0.1);
    killem (keys %proc) if $main::please_freeze;
  }
}

update_progress_stats();
# Exits the program (see freeze_if_want_freeze) if a freeze was requested.
freeze_if_want_freeze();


# Decide the job's fate only if nothing has decided it already.
if (!defined $main::success)
{
  # Tasks remain but none succeeded this round, and either nothing
  # failed at all or more than 4 tasks failed: give up.
  if (@jobstep_todo &&
      $thisround_succeeded == 0 &&
      ($thisround_failed == 0 || $thisround_failed > 4))
  {
    my $message = "stop because $thisround_failed tasks failed and none succeeded";
    Log (undef, $message);
    $main::success = 0;
  }
  # Nothing left to do: the job succeeded.
  if (!@jobstep_todo)
  {
    $main::success = 1;
  }
}

# Still undecided: jump back for more work.
# NOTE(review): ONELEVEL is a label defined outside this section -- confirm
# what it restarts.
goto ONELEVEL if !defined $main::success;


# The outcome is known: release resources, collate the task outputs and
# record the result on the job.
release_allocation();
freeze();
my $collated_output = &create_output_collection();

if (!$collated_output) {
  Log (undef, "Failed to write output collection");
}
else {
  Log(undef, "job output $collated_output");
  $Job->update_attributes('output' => $collated_output);
}

Log (undef, "finish");

save_meta();

# The job is Complete only if an output collection was created AND the
# tasks succeeded; anything else is Failed.
my $final_state;
if ($collated_output && $main::success) {
  $final_state = 'Complete';
} else {
  $final_state = 'Failed';
}
$Job->update_attributes('state' => $final_state);

# Exit status 0 for Complete, 1 for Failed.
exit (($final_state eq 'Complete') ? 0 : 1);
 962
 963
 964
 965 sub update_progress_stats
 966 {
 967   $progress_stats_updated = time;
 968   return if !$progress_is_dirty;
 969   my ($todo, $done, $running) = (scalar @jobstep_todo,
 970                                  scalar @jobstep_done,
 971                                  scalar @slot - scalar @freeslot - scalar @holdslot);
 972   $Job->{'tasks_summary'} ||= {};
 973   $Job->{'tasks_summary'}->{'todo'} = $todo;
 974   $Job->{'tasks_summary'}->{'done'} = $done;
 975   $Job->{'tasks_summary'}->{'running'} = $running;
 976   $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
 977   Log (undef, "status: $done done, $running running, $todo todo");
 978   $progress_is_dirty = 0;
 979 }
 980
 981
 982
# Reap at most one finished child process without blocking, and do all
# of the bookkeeping for the task it was running: decide success or
# failure, update failure/success counters and node back-off state,
# save the task record, flush its stderr, free its slot and (on
# success) queue any new tasks it created.
#
# Returns 0 if no child was ready to be reaped, 1 after handling one.
#
# NOTE(review): waitpid(-1, ...) can return any child of this process;
# there is no check that $pid is actually a key of %proc before it is
# used below -- confirm no other children can be reaped here.
sub reapchildren
{
  my $pid = waitpid (-1, WNOHANG);
  return 0 if $pid <= 0;

  # "<node name>.<cpu>" of the slot the child was running on, for logging.
  my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
                  . "."
                  . $slot[$proc{$pid}->{slot}]->{cpu});
  my $jobstepid = $proc{$pid}->{jobstep};
  my $elapsed = time - $proc{$pid}->{time};
  my $Jobstep = $jobstep[$jobstepid];

  # $? still holds the status of the child reaped by waitpid above.
  my $childstatus = $?;
  my $exitvalue = $childstatus >> 8;
  my $exitinfo = "exit ".exit_status_s($childstatus);
  # Re-read the task record to pick up the success flag.
  $Jobstep->{'arvados_task'}->reload;
  my $task_success = $Jobstep->{'arvados_task'}->{success};

  Log ($jobstepid, "child $pid on $whatslot $exitinfo success=$task_success");

  if (!defined $task_success) {
    # task did not indicate one way or the other --> fail
    $Jobstep->{'arvados_task'}->{success} = 0;
    $Jobstep->{'arvados_task'}->save;
    $task_success = 0;
  }

  if (!$task_success)
  {
    # A failure is "temporary" if the jobstep was flagged node_fail or
    # the child exited with value 111.
    my $temporary_fail;
    $temporary_fail ||= $Jobstep->{node_fail};
    $temporary_fail ||= ($exitvalue == 111);

    ++$thisround_failed;
    # Count separately the failures of tasks that had already failed before.
    ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;

    # Check for signs of a failed or misconfigured node
    if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
        2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
      # Don't count this against jobstep failure thresholds if this
      # node is already suspected faulty and srun exited quickly
      if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
          $elapsed < 5) {
        Log ($jobstepid, "blaming failure on suspect node " .
             $slot[$proc{$pid}->{slot}]->{node}->{name});
        $temporary_fail ||= 1;
      }
      ban_node_by_slot($proc{$pid}->{slot});
    }

    # The jobstep's failure count is incremented as part of building
    # this log message.
    Log ($jobstepid, sprintf('failure (#%d, %s) after %d seconds',
                             ++$Jobstep->{'failures'},
                             $temporary_fail ? 'temporary ' : 'permanent',
                             $elapsed));

    if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
      # Give up on this task, and the whole job
      $main::success = 0;
    }
    # Put this task back on the todo queue
    # (this happens even when the job has just been given up on)
    push @jobstep_todo, $jobstepid;
    $Job->{'tasks_summary'}->{'failed'}++;
  }
  else
  {
    # Success clears the node's losing streak and any back-off.
    ++$thisround_succeeded;
    $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
    $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
    push @jobstep_done, $jobstepid;
    Log ($jobstepid, "success in $elapsed seconds");
  }
  # Record the raw wait status and finish time, and save the task record.
  $Jobstep->{exitcode} = $childstatus;
  $Jobstep->{finishtime} = time;
  $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
  $Jobstep->{'arvados_task'}->save;
  # Log whatever is left in the task's stderr buffer.
  process_stderr ($jobstepid, $task_success);
  Log ($jobstepid, sprintf("task output (%d bytes): %s",
                           length($Jobstep->{'arvados_task'}->{output}),
                           $Jobstep->{'arvados_task'}->{output}));

  # Release everything tied to the child: its stderr reader, its slot
  # (returned to the free list) and its %proc entry.
  close $reader{$jobstepid};
  delete $reader{$jobstepid};
  delete $slot[$proc{$pid}->{slot}]->{pid};
  push @freeslot, $proc{$pid}->{slot};
  delete $proc{$pid};

  if ($task_success) {
    # Load new tasks
    # The list call is repeated with an increasing offset until it
    # returns no more items.
    my $newtask_list = [];
    my $newtask_results;
    do {
      $newtask_results = api_call(
        "job_tasks/list",
        'where' => {
          'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
        },
        'order' => 'qsequence',
        'offset' => scalar(@$newtask_list),
      );
      push(@$newtask_list, @{$newtask_results->{items}});
    } while (@{$newtask_results->{items}});
    # Wrap each new task in a jobstep record and queue its index.
    foreach my $arvados_task (@$newtask_list) {
      my $jobstep = {
        'level' => $arvados_task->{'sequence'},
        'failures' => 0,
        'arvados_task' => $arvados_task
      };
      push @jobstep, $jobstep;
      push @jobstep_todo, $#jobstep;
    }
  }

  # Task counts changed, so the next update_progress_stats call must report.
  $progress_is_dirty = 1;
  1;
}
1098
# If the refresh trigger file ($ENV{CRUNCH_REFRESH_TRIGGER}) has been
# modified since the last refresh, re-fetch the job record and copy its
# cancellation attributes and state into $Job.  If the job is no longer
# "Running", mark the job as failed and request a freeze.
sub check_refresh_wanted
{
  my @trigger_stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
  # Element 9 of stat() is the file's modification time.
  return unless @trigger_stat && $trigger_stat[9] > $latest_refresh;

  $latest_refresh = scalar time;
  my $fresh_job = api_call("jobs/get", uuid => $jobspec);
  foreach my $attr ('cancelled_at',
                    'cancelled_by_user_uuid',
                    'cancelled_by_client_uuid',
                    'state') {
    $Job->{$attr} = $fresh_job->{$attr};
  }
  return if $Job->{'state'} eq "Running";

  if ($Job->{'state'} eq "Cancelled") {
    Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
  } else {
    Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
  }
  $main::success = 0;
  $main::please_freeze = 1;
}
1122
# Periodic housekeeping for child processes.
#
# At most every 4 seconds: call killem() on every child whose killtime
# has been reached.  At most every 60 seconds (and only when running
# under SLURM): ask squeue which job steps are still running, and give
# any child older than 60 seconds that squeue does not mention a
# killtime 30 seconds in the future.
sub check_squeue
{
  # return if the kill list was checked <4 seconds ago
  return if defined $squeue_kill_checked && $squeue_kill_checked > time - 4;
  $squeue_kill_checked = time;

  # use killem() on procs whose killtime is reached
  foreach my $pid (keys %proc) {
    killem ($pid)
        if exists $proc{$pid}->{killtime} && $proc{$pid}->{killtime} <= time;
  }

  # return if the squeue was checked <60 seconds ago
  return if defined $squeue_checked && $squeue_checked > time - 60;
  $squeue_checked = time;

  # here is an opportunity to check for mysterious problems with local procs
  return if !$have_slurm;

  # get a list of steps still running; the trailing "ok" line shows
  # that squeue itself succeeded
  my @steps = `squeue -s -h -o '%i %j' && echo ok`;
  chop @steps;
  return if $steps[-1] ne "ok";
  pop @steps;

  # which of my jobsteps are running, according to squeue?
  my %ok;
  foreach my $step (@steps) {
    next unless $step =~ /^(\d+)\.(\d+) (\S+)/;
    $ok{$3} = 1 if $1 eq $ENV{SLURM_JOBID};
  }

  # which of my active child procs (>60s old) were not mentioned by squeue?
  foreach my $pid (keys %proc) {
    next unless ($proc{$pid}->{time} < time - 60
                 && !exists $ok{$proc{$pid}->{jobstepname}}
                 && !exists $proc{$pid}->{killtime});
    # kill this proc if it hasn't exited in 30 seconds
    $proc{$pid}->{killtime} = time + 30;
  }
}
1189
1190
# When running under SLURM, cancel this job's allocation with scancel.
# Does nothing otherwise.
sub release_allocation
{
  return unless $have_slurm;

  Log (undef, "release job allocation");
  system "scancel $ENV{SLURM_JOBID}";
}
1199
1200
# Read whatever is available from every child's stderr pipe (%reader),
# append it to that jobstep's stderr buffer and log the complete lines.
# Returns 1 if any data was read, 0 otherwise.
sub readfrompipes
{
  my $activity = 0;
  for my $jobstepid (keys %reader) {
    my $chunk;
    while (sysread ($reader{$jobstepid}, $chunk, 8192) > 0) {
      print STDERR $chunk if $ENV{CRUNCH_DEBUG};
      $jobstep[$jobstepid]->{stderr} .= $chunk;
      # Log and remove any complete lines now in the buffer.
      preprocess_stderr ($jobstepid);
      # Cap the buffer: once it exceeds 16384 bytes, drop the oldest 8192.
      substr ($jobstep[$jobstepid]->{stderr}, 0, 8192, "")
          if length ($jobstep[$jobstepid]->{stderr}) > 16384;
      $activity = 1;
    }
  }
  return $activity;
}
1221
1222
# Consume every complete line from the front of a jobstep's stderr
# buffer: log it, and react to srun error messages.  An expired or
# unconfirmable SLURM allocation requests a freeze; a node failure or
# a failure to create the job step flags the jobstep as node_fail and
# backs off its node.  Any trailing partial line stays in the buffer.
sub preprocess_stderr
{
  my ($job) = @_;

  while ($jobstep[$job]->{stderr} =~ /^(.*?)\n/) {
    my $line = $1;
    # Drop the line and its newline from the buffer.
    substr ($jobstep[$job]->{stderr}, 0, length($line) + 1) = "";
    Log ($job, "stderr $line");

    if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
      # whoa.
      $main::please_freeze = 1;
      next;
    }
    if ($line =~ /srun: error: (Node failure on|Unable to create job step) /) {
      $jobstep[$job]->{node_fail} = 1;
      ban_node_by_slot($jobstep[$job]->{slotindex});
    }
  }
}
1241
1242
# Final stderr flush for a finished jobstep: log all complete lines via
# preprocess_stderr, then log whatever remains in the buffer.
# The second argument (task success flag) is accepted but not used.
sub process_stderr
{
  my ($job, $task_success) = @_;
  preprocess_stderr ($job);

  foreach my $leftover (split ("\n", $jobstep[$job]->{stderr})) {
    Log ($job, "stderr $leftover");
  }
}
1253
# Fetch the data for the given locator by running arv-get and reading
# its standard output.  Returns the data as a string, or undef if
# arv-get could not be started, a read failed, or arv-get exited with
# a non-zero status.
sub fetch_block
{
  my ($hash) = @_;
  my $keep;
  unless (open($keep, "-|", "arv-get", "--retries", retry_count(), $hash)) {
    Log(undef, "fetch_block run error from arv-get $hash: $!");
    return undef;
  }

  my $output_block = "";
  for (;;) {
    my $buf;
    my $bytes = sysread($keep, $buf, 1024 * 1024);
    unless (defined $bytes) {
      Log(undef, "fetch_block read error from arv-get: $!");
      $output_block = undef;
      last;
    }
    # sysread returns 0 at the end of the pipe.
    last if $bytes == 0;
    # some bytes were read into buf.
    $output_block .= $buf;
  }

  # Closing the pipe waits for arv-get and sets $? to its exit status.
  close $keep;
  if ($?) {
    Log(undef, "fetch_block arv-get exited " . exit_status_s($?));
    $output_block = undef;
  }
  return $output_block;
}
1285
# Create a collection by concatenating the output of all tasks (each
# task's output is either a manifest fragment, a locator for a
# manifest fragment stored in Keep, or nothing at all). Return the
# portable_data_hash of the new collection.
#
# Returns undef if the helper process exits non-zero or does not
# produce output within 120 seconds.  If a task's output cannot be
# fetched, $main::success is set to 0 and collation continues with the
# remaining tasks.
sub create_output_collection
{
  Log (undef, "collate");

  # Start a Python helper that reads the manifest text from its stdin,
  # creates the collection through the API and prints the resulting
  # portable_data_hash.  The retry count is passed as its only argument.
  my ($child_out, $child_in);
  my $pid = open2($child_out, $child_in, 'python', '-c', q{
import arvados
import sys
print (arvados.api("v1").collections().
       create(body={"manifest_text": sys.stdin.read()}).
       execute(num_retries=int(sys.argv[1]))["portable_data_hash"])
}, retry_count());

  # Feed each task's output to the helper, in jobstep order.
  my $task_idx = -1;
  my $manifest_size = 0;
  for (@jobstep)
  {
    ++$task_idx;
    my $output = $_->{'arvados_task'}->{output};
    next if (!defined($output));
    my $next_write;
    # 32 hex digits, optionally followed by "+..." parts, is treated as
    # a locator whose content must be fetched; anything else is written
    # as-is.
    if ($output =~ /^[0-9a-f]{32}(\+\S+)*$/) {
      $next_write = fetch_block($output);
    } else {
      $next_write = $output;
    }
    if (defined($next_write)) {
      if (!defined(syswrite($child_in, $next_write))) {
        # There's been an error writing.  Stop the loop.
        # We'll log details about the exit code later.
        last;
      } else {
        $manifest_size += length($next_write);
      }
    } else {
      # fetch_block failed: note the failure but keep collating.
      my $uuid = $_->{'arvados_task'}->{'uuid'};
      Log (undef, "Error retrieving '$output' output by task $task_idx ($uuid)");
      $main::success = 0;
    }
  }
  # Closing the helper's stdin signals the end of the manifest text.
  close($child_in);
  Log(undef, "collated output manifest text to send to API server is $manifest_size bytes with access tokens");

  # Wait up to 120 seconds for the helper to print its result.
  my $joboutput;
  my $s = IO::Select->new($child_out);
  if ($s->can_read(120)) {
    sysread($child_out, $joboutput, 1024 * 1024);
    waitpid($pid, 0);
    if ($?) {
      Log(undef, "output collection creation exited " . exit_status_s($?));
      $joboutput = undef;
    } else {
      chomp($joboutput);
    }
  } else {
    Log (undef, "timed out while creating output collection");
    # Escalate through signals 2, 2, 2, 15, 15, 9 one second apart,
    # stopping once waitpid reports that there is no such child left.
    foreach my $signal (2, 2, 2, 15, 15, 9) {
      kill($signal, $pid);
      last if waitpid($pid, WNOHANG) == -1;
      sleep(1);
    }
  }
  close($child_out);

  return $joboutput;
}
1356
1357
# Signal each of the given child pids, escalating over repeated calls:
# SIGINT first, SIGTERM once SIGINT was sent more than 4 seconds ago,
# SIGKILL once SIGTERM was sent more than 4 seconds ago.  Each signal
# is sent only once per child (SIGINT is delivered twice in a row),
# and the send time is recorded in the child's %proc entry.
sub killem
{
  foreach my $pid (@_) {
    my $sig = 2;                # SIGINT first
    # Step up to SIGTERM, then SIGKILL, whenever the signal currently
    # selected was already sent more than 4 seconds ago.
    foreach my $stronger_sig (15, 9) {
      $sig = $stronger_sig
          if (exists $proc{$pid}->{"sent_$sig"} &&
              time - $proc{$pid}->{"sent_$sig"} > 4);
    }

    # This signal has already been sent; give it time to work.
    next if exists $proc{$pid}->{"sent_$sig"};

    Log ($proc{$pid}->{jobstep}, "sending 2x signal $sig to pid $pid");
    kill $sig, $pid;
    select (undef, undef, undef, 0.1);
    # srun wants two SIGINT to really interrupt
    kill $sig, $pid if $sig == 2;
    $proc{$pid}->{"sent_$sig"} = time;
    $proc{$pid}->{"killedafter"} = time - $proc{$pid}->{"time"};
  }
}
1387
1388
# Build a bit vector with one bit set for the file descriptor number of
# each filehandle given.  Returns undef when called with no handles.
sub fhbits
{
  my $mask;
  vec($mask, fileno($_), 1) = 1 foreach @_;
  return $mask;
}
1397
1398
1399 # Send log output to Keep via arv-put.
1400 #
1401 # $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
1402 # $log_pipe_pid is the pid of the arv-put subprocess.
1403 #
1404 # The only functions that should access these variables directly are:
1405 #
1406 # log_writer_start($logfilename)
1407 #     Starts an arv-put pipe, reading data on stdin and writing it to
1408 #     a $logfilename file in an output collection.
1409 #
1410 # log_writer_send($txt)
1411 #     Writes $txt to the output log collection.
1412 #
1413 # log_writer_finish()
1414 #     Closes the arv-put pipe and returns the output that it produces.
1415 #
1416 # log_writer_is_active()
1417 #     Returns a true value if there is currently a live arv-put
1418 #     process, false otherwise.
1419 #
my ($log_pipe_in, $log_pipe_out, $log_pipe_pid);

# Start the arv-put subprocess that stores the job log.  arv-put reads
# the log text from its stdin ("-") and saves it as a file named
# $logfilename in a collection of the same name, owned by the job's
# owner.  Sets $log_pipe_in, $log_pipe_out and $log_pipe_pid.
sub log_writer_start($)
{
  my ($logfilename) = @_;
  my @arv_put_cmd = ('arv-put',
                     '--portable-data-hash',
                     '--project-uuid', $Job->{owner_uuid},
                     '--retries', '3',
                     '--name', $logfilename,
                     '--filename', $logfilename,
                     '-');
  $log_pipe_pid = open2($log_pipe_out, $log_pipe_in, @arv_put_cmd);
}
1434
# Write $txt to the arv-put pipe opened by log_writer_start.
sub log_writer_send($)
{
  my ($txt) = @_;
  print {$log_pipe_in} $txt;
}
1440
# Close the arv-put pipe, wait for arv-put to exit, and return the
# text it printed (with the trailing newline removed).  Returns
# nothing if no log writer is running.  If arv-put prints nothing
# within 120 seconds, or exits with a non-zero status, a message is
# logged; in the timeout case the return value is undef.
sub log_writer_finish()
{
  return unless $log_pipe_pid;

  # Closing arv-put's stdin tells it the log is complete.
  close($log_pipe_in);
  my $arv_put_output;

  my $s = IO::Select->new($log_pipe_out);
  if ($s->can_read(120)) {
    sysread($log_pipe_out, $arv_put_output, 1024);
    chomp($arv_put_output);
  } else {
    Log (undef, "timed out reading from 'arv-put'");
  }

  waitpid($log_pipe_pid, 0);
  # Capture the exit status right away, before anything else can touch $?.
  my $arv_put_status = $?;
  # Clear the writer state first so that the Log call below does not
  # try to send its message down the pipe that was just closed.
  $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
  if ($arv_put_status) {
    # Log takes ($jobstep_id, $message); this message is not tied to a
    # jobstep, so the first argument must be undef.
    Log (undef, "log_writer_finish: arv-put exited ".exit_status_s($arv_put_status));
  }

  return $arv_put_output;
}
1464
# True while an arv-put log writer is running.  The subprocess pid
# doubles as the "active" flag: log_writer_start sets it and
# log_writer_finish clears it.
sub log_writer_is_active()
{
  return $log_pipe_pid;
}
1468
# Write one log message to stderr and, when a log writer is active, to
# the job log.  Each line is "<job id> <pid> <jobstep id> <message>",
# with every character outside the printable ASCII range replaced by a
# backslash and its 3-digit octal code.  A timestamp is prepended in
# the job log, and on stderr only when stderr is a terminal.
sub Log                         # ($jobstep_id, $logmessage)
{
  # A multi-line message is logged as one entry per line.
  if ($_[1] =~ /\n/) {
    Log ($_[0], $_) foreach split (/\n/, $_[1]);
    return;
  }

  # Make STDERR unbuffered, leaving the selected filehandle as it was.
  my $previous_fh = select(STDERR);
  $| = 1;
  select($previous_fh);

  my $message = sprintf ("%s %d %s %s", $job_id, $$, @_);
  $message =~ s{([^ -\176])}{"\\" . sprintf ("%03o", ord($1))}ge;
  $message .= "\n";

  my $writer_active = log_writer_is_active();
  my $stderr_is_tty = -t STDERR;

  # The timestamp is only needed by the two destinations that show it.
  my $datetime;
  if ($writer_active || $stderr_is_tty) {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime;
    $datetime = sprintf ("%04d-%02d-%02d_%02d:%02d:%02d",
                         $year+1900, $mon+1, $mday, $hour, $min, $sec);
  }

  if ($stderr_is_tty) {
    print STDERR $datetime." ".$message;
  } else {
    print STDERR $message;
  }

  log_writer_send($datetime . " " . $message) if $writer_active;
}
1493
1494
# Fatal-error exit path: log the message together with the caller's
# file and line, collate whatever output exists if tasks are still
# pending, mark the job as finished/failed, save the log, then die.
sub croak
{
  my (undef, $file, $line) = caller;
  Log (undef, "@_ at $file line $line\n");

  if (@jobstep_todo) {
    freeze();
    create_output_collection();
  }
  cleanup();
  save_meta();
  die;
}
1506
1507
# Final job record update on an abnormal exit: a cancelled job gets a
# finished_at time, any other job is marked Failed.  Does nothing if
# there is no job record.
sub cleanup
{
  return unless $Job;

  my @update = ($Job->{'state'} eq 'Cancelled')
      ? ('finished_at' => scalar gmtime)
      : ('state' => 'Failed');
  $Job->update_attributes(@update);
}
1517
1518
# Finish the log writer and record the resulting log locator on the
# job.  Called with a true argument ("just a checkpoint") this does
# nothing; it also does nothing when no log writer is active.
sub save_meta
{
  # false if this will be the last meta saved; checkpointing is not
  # relevant post-Warehouse.pm
  my ($justcheckpoint) = @_;
  return if $justcheckpoint || !log_writer_is_active();

  my $loglocator = log_writer_finish();
  Log (undef, "log manifest is $loglocator");
  $Job->{'log'} = $loglocator;
  $Job->update_attributes('log' => $loglocator);
}
1530
1531
# If a freeze has been requested ($main::please_freeze), shut the job
# down and exit with status 1: release the allocation, kill and reap
# any child pids passed as arguments, collate output, update the job
# record and save the log.  Returns without doing anything otherwise.
sub freeze_if_want_freeze
{
  return unless $main::please_freeze;

  release_allocation();
  if (@_) {
    # kill some srun procs before freeze+stop
    $proc{$_} = {} foreach @_;
    while (%proc) {
      killem (keys %proc);
      select (undef, undef, undef, 0.1);
      # Forget every child that has exited by now.
      while ((my $died = waitpid (-1, WNOHANG)) > 0) {
        delete $proc{$died};
      }
    }
  }

  freeze();
  create_output_collection();
  cleanup();
  save_meta();
  exit 1;
}
1559
1560
# Placeholder: freezing job state is not implemented, so this only
# logs a message saying so.
sub freeze
{
  Log (undef, "Freeze not implemented");
  return;
}
1566
1567
# Placeholder: restoring frozen job state is not implemented, so
# calling this is a fatal error.
sub thaw
{
  croak ("Thaw not implemented");
}
1572
1573
# Escape a string so that it fits on one line: each backslash becomes
# two backslashes and each newline becomes a backslash followed by "n".
sub freezequote
{
  my ($s) = @_;
  # One pass over the string handles both characters.
  $s =~ s/([\\\n])/$1 eq "\n" ? "\\n" : "\\\\"/ge;
  return $s;
}
1581
1582
# Reverse freezequote: a backslash followed by "n" becomes a newline,
# and a backslash followed by any other character becomes that
# character.
sub freezeunquote
{
  my ($s) = @_;
  my %unescaped = ("n" => "\n");
  $s =~ s{\\(.)}{exists $unescaped{$1} ? $unescaped{$1} : $1}ge;
  return $s;
}
1589
1590
# Run a command, through srun when SLURM is in use.
#
#   $srunargs  arrayref: the srun command and options, used only when
#              $have_slurm is true
#   $execargs  arrayref: the command to run
#   $opts      optional hashref; with {fork => 1} the command is run
#              with system() and its result returned, otherwise this
#              process is replaced via exec()
#   $stdin     optional string to feed to the command's standard input
#
# The command line is echoed with warn(), with anything that follows
# "TOKEN=" masked out.  Dies if exec fails.
sub srun
{
  my ($srunargs, $execargs, $opts, $stdin) = @_;
  $opts ||= {};
  my $args = $have_slurm ? [@$srunargs, @$execargs] : $execargs;

  $Data::Dumper::Terse = 1;
  $Data::Dumper::Indent = 0;
  (my $show_cmd = Dumper($args)) =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
  $show_cmd =~ s/\n/ /g;
  warn "starting: $show_cmd\n";

  if (defined $stdin) {
    # Fork a child whose stdout becomes our STDIN, and have it print
    # the given text.
    my $child = open STDIN, "-|";
    defined $child or die "no fork: $!";
    if ($child == 0) {
      print $stdin or die $!;
      close STDOUT or die $!;
      exit 0;
    }
  }

  if ($opts->{fork}) {
    return system (@$args);
  }

  # exec only returns if it failed.
  exec @$args;
  warn "ENV size is ".length(join(" ",%ENV));
  die "exec failed: $!: @$args";
}
1622
1623
# Back off the node that owns the given slot: set its hold_until time
# 60 seconds into the future and count the hold against the node.
sub ban_node_by_slot {
  my ($slotid) = @_;
  # Don't start any new jobsteps on this node for this many seconds
  my $hold_seconds = 60;

  $slot[$slotid]->{node}->{hold_until} = $hold_seconds + scalar time;
  $slot[$slotid]->{node}->{hold_count} += 1;
  Log (undef, "backing off node " . $slot[$slotid]->{node}->{name} . " for $hold_seconds seconds");
}
1631
# Open $lockfile for writing and take an exclusive, non-blocking lock
# on it, calling croak() if the file cannot be opened or locked.
# $error_message is appended to the "Can't lock" message.  The lock is
# held on the global filehandle L.
sub must_lock_now
{
  my $lockfile = shift;
  my $error_message = shift;

  if (!open(L, ">", $lockfile)) {
    croak("$lockfile: $!");
  }
  flock(L, LOCK_EX|LOCK_NB)
      or croak("Can't lock $lockfile: $error_message\n");
}
1640
# Given a Keep locator, check to see if it contains a Docker image:
# a collection holding exactly one file, named <64 hex digits>.tar.
# If so, return its stream name and Docker hash.
# If not, return undef for both values.
sub find_docker_image {
  my ($locator) = @_;
  my ($streamname, $filename);

  my $image = api_call("collections/get", uuid => $locator);
  my @manifest_lines = $image ? split(/\n/, $image->{manifest_text}) : ();
  foreach my $line (@manifest_lines) {
    my @tokens = split(/\s+/, $line);
    next unless @tokens;
    # The first token on a manifest line is the stream name.
    $streamname = shift(@tokens);
    # File tokens look like "<digits>:<digits>:<name>".
    foreach my $filedata (grep { /^\d+:\d+:/ } @tokens) {
      # More than one file in the Collection.
      return (undef, undef) if defined($filename);
      (undef, undef, $filename) = split(/:/, $filedata, 3);
    }
  }

  if (defined($filename) and ($filename =~ /^([0-9A-Fa-f]{64})\.tar$/)) {
    return ($streamname, $1);
  }
  return (undef, undef);
}
1668
# Calculate the number of times an operation should be retried,
# assuming exponential backoff, and that we're willing to retry as
# long as tasks have been running.  Enforce a minimum of 3 retries.
sub retry_count {
  my ($starttime, $endtime);
  if (@jobstep) {
    $starttime = $jobstep[0]->{starttime};
    $endtime = $jobstep[-1]->{finishtime};
  }

  # How long tasks have been running, in seconds.
  my $timediff = 0;
  if (defined($starttime)) {
    $timediff = defined($endtime)
        ? ($endtime - $starttime) - (time - $endtime)
        : time - $starttime;
  }

  # With delays doubling each time, log2 of the duration is the number
  # of retries that fit; anything non-positive falls back to the minimum.
  my $retries = ($timediff > 0) ? int(log($timediff) / log(2)) : 1;
  return 3 if $retries <= 3;
  return $retries;
}
1692
# Call a function, retrying with exponential backoff while it dies.
#
# Pass in two function references followed by any arguments.  The
# first function is called with those arguments; its result is
# returned as soon as a call completes without dying.  After each
# failure that will be retried, the second function is called with the
# current try count (0-based), next try time, and error message.  When
# the current retry_count is exhausted, the last error is re-thrown.
sub retry_op {
  my $operation = shift;
  my $retry_callback = shift;
  my $max_retries = retry_count();

  for (my $attempt = 0; $attempt <= $max_retries; $attempt++) {
    # The wait before the next try doubles with every attempt.
    my $next_try = time + (2 ** $attempt);
    my $result = eval { $operation->(@_); };
    return $result unless $@;
    last if $attempt >= $max_retries;

    $retry_callback->($attempt, $next_try, $@);
    my $sleep_time = $next_try - time;
    sleep($sleep_time) if ($sleep_time > 0);
  }

  # Ensure the error message ends in a newline, so Perl doesn't add
  # retry_op's line number to it.
  chomp($@);
  die($@ . "\n");
}
1719
# Call an Arvados API method, retrying on failure.
#
# Pass in a /-separated API method name, and arguments for it.
# This function will call that method, retrying as needed until
# the current retry_count is exhausted, logging each failure that is
# going to be retried.
sub api_call {
  my $method_name = shift;

  # Callback for retry_op: log a one-line version of the error along
  # with when the next attempt will happen.
  my $log_api_retry = sub {
    my ($try_count, $next_try_at, $errmsg) = @_;
    # Strip this script's "at FILE line N" suffix and flatten whitespace.
    $errmsg =~ s/\s*\bat \Q$0\E line \d+\.?\s*//;
    $errmsg =~ s/\s/ /g;
    $errmsg =~ s/\s+$//;
    my $retry_msg = ($next_try_at < time)
        ? "Retrying."
        : "Retrying at " . strftime("%Y-%m-%dT%H:%M:%SZ", gmtime($next_try_at)) . ".";
    Log(undef, "API method $method_name failed: $errmsg. $retry_msg");
  };

  # Walk down from the API client to the requested method object.
  my $method = $arv;
  $method = $method->{$_} foreach split(/\//, $method_name);

  return retry_op(sub { $method->execute(@_); }, $log_api_retry, @_);
}
1745
# Given a $?, return a human-readable exit code string like "0" or
# "1" or "0 with signal 1" or "1 with signal 11".  " with core dump"
# is appended when the core-dump bit is set.
sub exit_status_s {
  my ($exitcode) = @_;
  # High byte: exit value.  Low 7 bits: signal.  Bit 0x80: core dumped.
  my @parts = ($exitcode >> 8);
  push @parts, " with signal " . ($exitcode & 0x7f) if $exitcode & 0x7f;
  push @parts, " with core dump" if $exitcode & 0x80;
  return join("", @parts);
}
1759
# Pass in a glob reference to a file handle.
# Read everything that remains on it and return that as a string.
sub handle_readall {
  my ($fh_glob_ref) = @_;
  # With the input record separator undefined, one read returns the
  # whole rest of the file.
  return do { local $/ = undef; <$fh_glob_ref> };
}
1767
# Return the path of this job's Nth saved git archive:
# $ENV{CRUNCH_TMP}/git.<job id>.<n>.tar
sub tar_filename_n {
  my ($n) = @_;
  my $basename = sprintf("git.%s.%d.tar", $job_id, $n);
  return "$ENV{CRUNCH_TMP}/$basename";
}
1772
# Pass in a git archive command as a string or list, a la system().
# This method runs it and saves its output in the next numbered tar
# file, to be included in the archive sent to the build script.
# Calls croak() if the file cannot be opened or git exits non-zero.
sub add_git_archive {
  my $git_input;
  my $tar_path = tar_filename_n(++$git_tar_count);
  open(GIT_ARCHIVE, ">", $tar_path)
      or croak("Failed to save git archive: $!");

  # Run git with its stdout connected straight to the archive file.
  my $git_pid = open2(">&GIT_ARCHIVE", $git_input, @_);
  close($git_input);
  waitpid($git_pid, 0);
  my $git_status = $?;
  close(GIT_ARCHIVE);

  croak("Failed to save git archive: git exited " . exit_status_s($git_status))
      if $git_status;
}
1790
# Combine all saved tar archives into a single archive, then return its
# contents in a string.  Return undef if no archives have been saved.
# Archives 2..N are appended to archive 1 with "tar -A".
sub combined_git_archive {
  return undef if $git_tar_count < 1;

  my $base_tar_name = tar_filename_n(1);
  for my $n (2..$git_tar_count) {
    my $tar_exit = system("tar", "-Af", $base_tar_name, tar_filename_n($n));
    next if $tar_exit == 0;
    croak("Error preparing build archive: tar -A exited " .
          exit_status_s($tar_exit));
  }

  open(GIT_TAR, "<", $base_tar_name)
      or croak("Could not open build archive: $!");
  my $tar_contents = handle_readall(\*GIT_TAR);
  close(GIT_TAR);
  return $tar_contents;
}
1812
1813 __DATA__
1814 #!/usr/bin/perl
1815 #
1816 # This is crunch-job's internal dispatch script.  crunch-job running on the API
1817 # server invokes this script on individual compute nodes, or localhost if we're
1818 # running a job locally.  It gets called in two modes:
1819 #
1820 # * No arguments: Installation mode.  Read a tar archive from the DATA
1821 #   file handle; it includes the Crunch script's source code, and
1822 #   maybe SDKs as well.  Those should be installed in the proper
1823 #   locations.  This runs outside of any Docker container, so don't try to
1824 #   introspect Crunch's runtime environment.
1825 #
1826 # * With arguments: Crunch script run mode.  This script should set up the
1827 #   environment, then run the command specified in the arguments.  This runs
1828 #   inside any Docker container.
1829
1830 use Fcntl ':flock';
1831 use File::Path qw( make_path remove_tree );
1832 use POSIX qw(getcwd);
1833
1834 # Map SDK subdirectories to the path environments they belong to.
1835 my %SDK_ENVVARS = ("perl/lib" => "PERLLIB", "ruby/lib" => "RUBYLIB");
1836
1837 my $destdir = $ENV{"CRUNCH_SRC"};
1838 my $commit = $ENV{"CRUNCH_SRC_COMMIT"};
1839 my $repo = $ENV{"CRUNCH_SRC_URL"};
1840 my $install_dir = $ENV{"CRUNCH_INSTALL"} || (getcwd() . "/opt");
1841 my $job_work = $ENV{"JOB_WORK"};
1842 my $task_work = $ENV{"TASK_WORK"};
1843
1844 for my $dir ($destdir, $job_work, $task_work) {
1845   if ($dir) {
1846     make_path $dir;
1847     -e $dir or die "Failed to create temporary directory ($dir): $!";
1848   }
1849 }
1850
1851 if ($task_work) {
1852   remove_tree($task_work, {keep_root => 1});
1853 }
1854
1855 open(STDOUT_ORIG, ">&", STDOUT);
1856 open(STDERR_ORIG, ">&", STDERR);
1857 open(STDOUT, ">>", "$destdir.log");
1858 open(STDERR, ">&", STDOUT);
1859
### Crunch script run mode
# When arguments are given, they are the command to run.  Build (or reuse)
# the Python SDK virtualenv under $job_work when possible, extend the SDK
# path variables, restore the original stdout/stderr and exec the command.
if (@ARGV) {
  # We want to do routine logging during task 0 only.  This gives the user
  # the information they need, but avoids repeating the information for every
  # task.
  #
  # $Log->($format, @args) is formatted with printf when @args is given.
  # A lone message is printed verbatim, so text we did not write ourselves
  # (e.g. package listings) cannot be misread as a format string.
  my $Log;
  if ($ENV{TASK_SEQUENCE} eq "0") {
    $Log = sub {
      my $msg = shift;
      if (@_) {
        printf STDERR_ORIG "[Crunch] $msg\n", @_;
      } else {
        print STDERR_ORIG "[Crunch] $msg\n";
      }
    };
  } else {
    $Log = sub { };
  }

  # Quote one word for /bin/sh.  Single quotes keep every character literal
  # (including newlines) and preserve empty strings as arguments; an embedded
  # single quote is written as '\''.
  my $sh_quote = sub {
    my $word = shift;
    $word =~ s/'/'\\''/g;
    return "'$word'";
  };

  my $python_src = "$install_dir/python";
  my $venv_dir = "$job_work/.arvados.venv";
  my $venv_built = -e "$venv_dir/bin/activate";
  if ((!$venv_built) and (-d $python_src) and can_run("virtualenv")) {
    shell_or_die("virtualenv", "--quiet", "--system-site-packages",
                 "--python=python2.7", $venv_dir);
    shell_or_die("$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src);
    $venv_built = 1;
    $Log->("Built Python SDK virtualenv");
  }

  my $pip_bin = "pip";
  if ($venv_built) {
    $Log->("Running in Python SDK virtualenv");
    $pip_bin = "$venv_dir/bin/pip";
    # Wrap the command in a shell that sources the virtualenv's activate
    # script before exec'ing the original argument list.
    my $orig_argv = join(" ", map { $sh_quote->($_) } @ARGV);
    @ARGV = ("/bin/sh", "-ec",
             ". " . $sh_quote->("$venv_dir/bin/activate") .
             "; exec $orig_argv");
  } elsif (-d $python_src) {
    $Log->("Warning: virtualenv not found inside Docker container default " .
           "\$PATH. Can't install Python SDK.");
  }

  # Report which Arvados SDK packages are visible: pip's view first, falling
  # back to dpkg-query when pip lists nothing matching "arvados".
  my $pkgs_cmd = "(" . $sh_quote->($pip_bin) .
      " freeze 2>/dev/null | grep arvados) || dpkg-query --show '*arvados*'";
  my $pkgs = `$pkgs_cmd`;
  if ($pkgs) {
    $Log->("Using Arvados SDK:");
    foreach my $line (split /\n/, $pkgs) {
      $Log->($line);
    }
  } else {
    $Log->("Arvados SDK packages not found");
  }

  # Prepend each installed SDK directory to its path environment variable.
  while (my ($sdk_dir, $sdk_envkey) = each(%SDK_ENVVARS)) {
    my $sdk_path = "$install_dir/$sdk_dir";
    if (-d $sdk_path) {
      if ($ENV{$sdk_envkey}) {
        $ENV{$sdk_envkey} = "$sdk_path:" . $ENV{$sdk_envkey};
      } else {
        $ENV{$sdk_envkey} = $sdk_path;
      }
      $Log->("Arvados SDK added to %s", $sdk_envkey);
    }
  }

  # Give the command the original stdout/stderr rather than the log file.
  close(STDOUT);
  close(STDERR);
  open(STDOUT, ">&", STDOUT_ORIG);
  open(STDERR, ">&", STDERR_ORIG);
  exec(@ARGV);
  die "Cannot exec `@ARGV`: $!";
}
1927
### Installation mode
# Take an exclusive lock on "$destdir.lock" before touching $destdir.  The
# lock handle L stays open while installation proceeds.
open L, ">", "$destdir.lock" or die "$destdir.lock: $!";
flock L, LOCK_EX;

# "$destdir.commit" is a symlink whose target names the installed commit.
# readlink() returns undef when the link is missing, so both sides must be
# defined before comparing: otherwise an unset commit would compare equal to
# a missing link and the installation would be skipped.
my $installed_commit = readlink("$destdir.commit");
if (defined($installed_commit) && defined($commit) &&
    $installed_commit eq $commit && -d $destdir) {
  # This version already installed -> nothing to do.

  # Still read our DATA section to the end before exiting.
  # NOTE(review): presumably the archive is streamed to us after __DATA__,
  # and leaving it unread could hit the sender with SIGPIPE -- confirm
  # against the caller.
  my $discard;
  while (read(DATA, $discard, 65536)) { }

  exit(0);
}
1935
# Remove the old commit marker and make sure the destination exists before
# unpacking into it.
unlink "$destdir.commit";
mkdir $destdir;

open(TARX, "|-", "tar", "-xC", $destdir)
  or die "Error launching 'tar -xC $destdir': $!";

# If we send too much data to tar in one write (> 4-5 MiB), it stops, and we
# get SIGPIPE.  We must feed it data incrementally.
my $tar_input;
print TARX $tar_input while read(DATA, $tar_input, 65536);

# close() on a pipe waits for tar and fails if tar exited non-zero.
close(TARX)
  or die "'tar -xC $destdir' exited $?: $!";
1951
mkdir $install_dir;

# If the extracted tree carries SDK sources under .arvados.sdk/sdk, move each
# language directory that is present into $install_dir.  The candidates are
# "python" plus the first path component of every %SDK_ENVVARS key.
my $sdk_root = "$destdir/.arvados.sdk/sdk";
if (-d $sdk_root) {
  my @sdk_langs = ("python");
  foreach my $sdk_subdir (keys(%SDK_ENVVARS)) {
    my ($lang) = split /\//, $sdk_subdir, 2;
    push @sdk_langs, $lang;
  }
  foreach my $sdk_lang (@sdk_langs) {
    next unless -d "$sdk_root/$sdk_lang";
    rename("$sdk_root/$sdk_lang", "$install_dir/$sdk_lang")
      or die "Failed to install $sdk_lang SDK: $!";
  }
}
1965
# Try "setup.py egg_info" for the installed Python SDK.  If it fails, append
# an [egg_info] section with an empty tag_build to the SDK's setup.cfg.
my $python_dir = "$install_dir/python";
if ((-d $python_dir) and can_run("python2.7") and
    (system("python2.7", "$python_dir/setup.py", "--quiet", "egg_info") != 0)) {
  # egg_info failed, probably when it asked git for a build tag.
  # Specify no build tag.
  my $setup_cfg = "$python_dir/setup.cfg";
  if (open(my $pysdk_cfg, ">>", $setup_cfg)) {
    print $pysdk_cfg "\n[egg_info]\ntag_build =\n";
    # Buffered write errors only surface at close time.
    close($pysdk_cfg) or warn "Error writing $setup_cfg: $!\n";
  } else {
    # Installation continues; the warning makes the cause visible if a
    # later step fails.
    warn "Cannot append to $setup_cfg: $!\n";
  }
}
1975
# Pick at most one install script, in order of preference, and run it with
# $install_dir as its only argument.
my $installer;
if (-e "$destdir/crunch_scripts/install") {
  $installer = "$destdir/crunch_scripts/install";
} elsif (-e "./install.sh") {
  $installer = "./install.sh";
} elsif (-e "./tests/autotests.sh") {
  # Old version
  $installer = "./tests/autotests.sh";
}
shell_or_die ($installer, $install_dir) if defined $installer;
1984
# Record the installed commit as the target of the "$destdir.commit" symlink.
# The link is created under a temporary name and renamed into place.
if ($commit) {
  my $pending_link = "$destdir.commit.new";
  my $final_link = "$destdir.commit";
  unlink $pending_link;
  symlink ($commit, $pending_link) or die "$pending_link: $!";
  rename ($pending_link, $final_link) or die "$final_link: $!";
}

# Closing the lock file handle ends the installation section.
close L;
1992
# can_run($command_name)
#
# Returns true if `which $command_name` exits successfully, false otherwise.
# The output of `which` is read and discarded.
sub can_run {
  my $command_name = shift;
  # If `which` itself cannot be started, the command cannot be confirmed.
  # Without this check the result would come from a stale $?.
  open(my $which, "-|", "which", $command_name) or return 0;
  # Drain the output into a lexical so the caller's $_ is left alone.
  while (defined(my $discard = <$which>)) { }
  # close() on a pipe waits for the child and sets $? to its status.
  close($which);
  return ($? == 0);
}
2000
# shell_or_die(@command)
#
# Runs @command with system().  When the DEBUG environment variable is set,
# the command is first echoed to STDERR.  If the command does not return 0,
# STDERR is pointed back at STDERR_ORIG, the contents of "$destdir.log" are
# copied to it, and the sub dies with the command, $! and the decoded exit
# status.
sub shell_or_die
{
  if ($ENV{"DEBUG"}) {
    print STDERR "@_\n";
  }
  if (system (@_) != 0) {
    # Save the failure details before later calls overwrite $! and $?.
    my $err = $!;
    my $exitstatus = sprintf("exit %d signal %d", $? >> 8, $? & 0x7f);
    open(STDERR, ">&", STDERR_ORIG);
    # Copy the log in-process rather than via a shell command, so a log path
    # containing spaces or shell metacharacters is handled correctly.
    if (open(my $log, "<", "$destdir.log")) {
      my $chunk;
      print STDERR $chunk while read($log, $chunk, 65536);
      close($log);
    }
    die "@_ failed ($err): $exitstatus";
  }
}
2014
2015 __DATA__