sdk/cli/bin/crunch-job

   1 #!/usr/bin/perl
   2 # -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
   3
   4 =head1 NAME
   5
   6 crunch-job: Execute job steps, save snapshots as requested, collate output.
   7
   8 =head1 SYNOPSIS
   9
  10 Obtain job details from Arvados, run tasks on compute nodes (typically
  11 invoked by scheduler on controller):
  12
  13  crunch-job --job x-y-z --git-dir /path/to/repo/.git
  14
  15 Obtain job details from command line, run tasks on local machine
  16 (typically invoked by application or developer on VM):
  17
  18  crunch-job --job '{"script_version":"/path/to/working/tree","script":"scriptname",...}'
  19
  20  crunch-job --job '{"repository":"https://github.com/curoverse/arvados.git","script_version":"master","script":"scriptname",...}'
  21
  22 =head1 OPTIONS
  23
  24 =over
  25
  26 =item --force-unlock
  27
  28 If the job is already locked, steal the lock and run it anyway.
  29
  30 =item --git-dir
  31
  32 Path to a .git directory (or a git URL) where the commit given in the
  33 job's C<script_version> attribute is to be found. If this is I<not>
  34 given, the job's C<repository> attribute will be used.
  35
  36 =item --job-api-token
  37
  38 Arvados API authorization token to use during the course of the job.
  39
  40 =item --no-clear-tmp
  41
  42 Do not clear per-job/task temporary directories during initial job
  43 setup. This can speed up development and debugging when running jobs
  44 locally.
  45
  46 =item --job
  47
  48 UUID of the job to run, or a JSON-encoded job resource without a
  49 UUID. If the latter is given, a new job object will be created.
  50
  51 =back
  52
  53 =head1 RUNNING JOBS LOCALLY
  54
  55 crunch-job's log messages appear on stderr along with the job tasks'
  56 stderr streams. The log is saved in Keep at each checkpoint and when
  57 the job finishes.
  58
  59 If the job succeeds, the job's output locator is printed on stdout.
  60
  61 While the job is running, the following signals are accepted:
  62
  63 =over
  64
  65 =item control-C, SIGINT, SIGQUIT
  66
  67 Save a checkpoint, terminate any job tasks that are running, and stop.
  68
  69 =item SIGALRM
  70
  71 Save a checkpoint and continue.
  72
  73 =item SIGHUP
  74
  75 Refresh node allocation (i.e., check whether any nodes have been added
  76 or unallocated) and attributes of the Job record that should affect
  77 behavior (e.g., cancel job if cancelled_at becomes non-nil).
  78
  79 =back
  80
  81 =cut
  82
  83
  84 use strict;
  85 use POSIX ':sys_wait_h';
  86 use POSIX qw(strftime);
  87 use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK);
  88 use Arvados;
  89 use Cwd qw(realpath);
  90 use Data::Dumper;
  91 use Digest::MD5 qw(md5_hex);
  92 use Getopt::Long;
  93 use IPC::Open2;
  94 use IO::Select;
  95 use File::Temp;
  96 use Fcntl ':flock';
  97 use File::Path qw( make_path remove_tree );
  98
  99 use constant EX_TEMPFAIL => 75;
 100
 101 $ENV{"TMPDIR"} ||= "/tmp";
 102 unless (defined $ENV{"CRUNCH_TMP"}) {
 103   $ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
 104   if ($ENV{"USER"} ne "crunch" && $< != 0) {
 105     # use a tmp dir unique for my uid
 106     $ENV{"CRUNCH_TMP"} .= "-$<";
 107   }
 108 }
 109
 110 # Create the tmp directory if it does not exist
 111 if ( ! -d $ENV{"CRUNCH_TMP"} ) {
 112   make_path $ENV{"CRUNCH_TMP"} or die "Failed to create temporary working directory: " . $ENV{"CRUNCH_TMP"};
 113 }
 114
 115 $ENV{"JOB_WORK"} = $ENV{"CRUNCH_TMP"} . "/work";
 116 $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
 117 $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
 118 mkdir ($ENV{"JOB_WORK"});
 119
 120 my $force_unlock;
 121 my $git_dir;
 122 my $jobspec;
 123 my $job_api_token;
 124 my $no_clear_tmp;
 125 my $resume_stash;
 126 GetOptions('force-unlock' => \$force_unlock,
 127            'git-dir=s' => \$git_dir,
 128            'job=s' => \$jobspec,
 129            'job-api-token=s' => \$job_api_token,
 130            'no-clear-tmp' => \$no_clear_tmp,
 131            'resume-stash=s' => \$resume_stash,
 132     );
 133
 134 if (defined $job_api_token) {
 135   $ENV{ARVADOS_API_TOKEN} = $job_api_token;
 136 }
 137
 138 my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
 139 my $local_job = 0;
 140
 141
 142 $SIG{'USR1'} = sub
 143 {
 144   $main::ENV{CRUNCH_DEBUG} = 1;
 145 };
 146 $SIG{'USR2'} = sub
 147 {
 148   $main::ENV{CRUNCH_DEBUG} = 0;
 149 };
 150
 151
 152
 153 my $arv = Arvados->new('apiVersion' => 'v1');
 154
 155 my $Job;
 156 my $job_id;
 157 my $dbh;
 158 my $sth;
 159 my @jobstep;
 160
 161 my $User = api_call("users/current");
 162
 163 if ($jobspec =~ /^[-a-z\d]+$/)
 164 {
 165   # $jobspec is an Arvados UUID, not a JSON job specification
 166   $Job = api_call("jobs/get", uuid => $jobspec);
 167   if (!$force_unlock) {
 168     # Claim this job, and make sure nobody else does
 169     eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
 170     if ($@) {
 171       Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
 172       exit EX_TEMPFAIL;
 173     };
 174   }
 175 }
 176 else
 177 {
 178   $Job = JSON::decode_json($jobspec);
 179
 180   if (!$resume_stash)
 181   {
 182     map { croak ("No $_ specified") unless $Job->{$_} }
 183     qw(script script_version script_parameters);
 184   }
 185
 186   $Job->{'is_locked_by_uuid'} = $User->{'uuid'};
 187   $Job->{'started_at'} = gmtime;
 188   $Job->{'state'} = 'Running';
 189
 190   $Job = api_call("jobs/create", job => $Job);
 191 }
 192 $job_id = $Job->{'uuid'};
 193
 194 my $keep_logfile = $job_id . '.log.txt';
 195 log_writer_start($keep_logfile);
 196
 197 $Job->{'runtime_constraints'} ||= {};
 198 $Job->{'runtime_constraints'}->{'max_tasks_per_node'} ||= 0;
 199 my $max_ncpus = $Job->{'runtime_constraints'}->{'max_tasks_per_node'};
 200
 201 my $gem_versions = `gem list --quiet arvados-cli 2>/dev/null`;
 202 if ($? == 0) {
 203   $gem_versions =~ s/^arvados-cli \(/ with arvados-cli Gem version(s) /;
 204   chomp($gem_versions);
 205   chop($gem_versions);  # Closing parentheses
 206 } else {
 207   $gem_versions = "";
 208 }
 209 Log(undef,
 210     "running from " . ((-e $0) ? realpath($0) : "stdin") . $gem_versions);
 211
 212 Log (undef, "check slurm allocation");
 213 my @slot;
 214 my @node;
 215 # Should use $ENV{SLURM_TASKS_PER_NODE} instead of sinfo? (eg. "4(x3),2,4(x2)")
 216 my @sinfo;
 217 if (!$have_slurm)
 218 {
 219   my $localcpus = 0 + `grep -cw ^processor /proc/cpuinfo` || 1;
 220   push @sinfo, "$localcpus localhost";
 221 }
 222 if (exists $ENV{SLURM_NODELIST})
 223 {
 224   push @sinfo, `sinfo -h --format='%c %N' --nodes=\Q$ENV{SLURM_NODELIST}\E`;
 225 }
 226 foreach (@sinfo)
 227 {
 228   my ($ncpus, $slurm_nodelist) = split;
 229   $ncpus = $max_ncpus if $max_ncpus && $ncpus > $max_ncpus;
 230
 231   my @nodelist;
 232   while ($slurm_nodelist =~ s/^([^\[,]+?(\[.*?\])?)(,|$)//)
 233   {
 234     my $nodelist = $1;
 235     if ($nodelist =~ /\[((\d+)(-(\d+))?(,(\d+)(-(\d+))?)*)\]/)
 236     {
 237       my $ranges = $1;
 238       foreach (split (",", $ranges))
 239       {
 240         my ($a, $b);
 241         if (/(\d+)-(\d+)/)
 242         {
 243           $a = $1;
 244           $b = $2;
 245         }
 246         else
 247         {
 248           $a = $_;
 249           $b = $_;
 250         }
 251         push @nodelist, map {
 252           my $n = $nodelist;
 253           $n =~ s/\[[-,\d]+\]/$_/;
 254           $n;
 255         } ($a..$b);
 256       }
 257     }
 258     else
 259     {
 260       push @nodelist, $nodelist;
 261     }
 262   }
 263   foreach my $nodename (@nodelist)
 264   {
 265     Log (undef, "node $nodename - $ncpus slots");
 266     my $node = { name => $nodename,
 267                  ncpus => $ncpus,
 268                  losing_streak => 0,
 269                  hold_until => 0 };
 270     foreach my $cpu (1..$ncpus)
 271     {
 272       push @slot, { node => $node,
 273                     cpu => $cpu };
 274     }
 275   }
 276   push @node, @nodelist;
 277 }
 278
 279
 280
 281 # Ensure that we get one jobstep running on each allocated node before
 282 # we start overloading nodes with concurrent steps
 283
 284 @slot = sort { $a->{cpu} <=> $b->{cpu} } @slot;
 285
 286
 287 $Job->update_attributes(
 288   'tasks_summary' => { 'failed' => 0,
 289                        'todo' => 1,
 290                        'running' => 0,
 291                        'done' => 0 });
 292
 293 Log (undef, "start");
 294 $SIG{'INT'} = sub { $main::please_freeze = 1; };
 295 $SIG{'QUIT'} = sub { $main::please_freeze = 1; };
 296 $SIG{'TERM'} = \&croak;
 297 $SIG{'TSTP'} = sub { $main::please_freeze = 1; };
 298 $SIG{'ALRM'} = sub { $main::please_info = 1; };
 299 $SIG{'CONT'} = sub { $main::please_continue = 1; };
 300 $SIG{'HUP'} = sub { $main::please_refresh = 1; };
 301
 302 $main::please_freeze = 0;
 303 $main::please_info = 0;
 304 $main::please_continue = 0;
 305 $main::please_refresh = 0;
 306 my $jobsteps_must_output_keys = 0;      # becomes 1 when any task outputs a key
 307
 308 grep { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ } split ("\n", $$Job{knobs});
 309 $ENV{"CRUNCH_JOB_UUID"} = $job_id;
 310 $ENV{"JOB_UUID"} = $job_id;
 311
 312
 313 my @jobstep_todo = ();
 314 my @jobstep_done = ();
 315 my @jobstep_tomerge = ();
 316 my $jobstep_tomerge_level = 0;
 317 my $squeue_checked;
 318 my $squeue_kill_checked;
 319 my $latest_refresh = scalar time;
 320
 321
 322
 323 if (defined $Job->{thawedfromkey})
 324 {
 325   thaw ($Job->{thawedfromkey});
 326 }
 327 else
 328 {
 329   my $first_task = api_call("job_tasks/create", job_task => {
 330     'job_uuid' => $Job->{'uuid'},
 331     'sequence' => 0,
 332     'qsequence' => 0,
 333     'parameters' => {},
 334   });
 335   push @jobstep, { 'level' => 0,
 336                    'failures' => 0,
 337                    'arvados_task' => $first_task,
 338                  };
 339   push @jobstep_todo, 0;
 340 }
 341
 342
 343 if (!$have_slurm)
 344 {
 345   must_lock_now("$ENV{CRUNCH_TMP}/.lock", "a job is already running here.");
 346 }
 347
 348 my $build_script = handle_readall(\*DATA);
 349 my $nodelist = join(",", @node);
 350 my $git_tar_count = 0;
 351
 352 if (!defined $no_clear_tmp) {
 353   # Clean out crunch_tmp/work, crunch_tmp/opt, crunch_tmp/src*
 354   Log (undef, "Clean work dirs");
 355
 356   my $cleanpid = fork();
 357   if ($cleanpid == 0)
 358   {
 359     # Find FUSE mounts that look like Keep mounts (the mount path has the
 360     # word "keep") and unmount them.  Then clean up work directories.
 361     # TODO: When #5036 is done and widely deployed, we can get rid of the
 362     # regular expression and just unmount everything with type fuse.keep.
 363     srun (["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
 364           ['bash', '-ec', 'mount -t fuse,fuse.keep | awk \'($3 ~ /\ykeep\y/){print $3}\' | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src*']);
 365     exit (1);
 366   }
 367   while (1)
 368   {
 369     last if $cleanpid == waitpid (-1, WNOHANG);
 370     freeze_if_want_freeze ($cleanpid);
 371     select (undef, undef, undef, 0.1);
 372   }
 373   Log (undef, "Cleanup command exited ".exit_status_s($?));
 374 }
 375
 376 # If this job requires a Docker image, install that.
 377 my $docker_bin = "/usr/bin/docker.io";
 378 my ($docker_locator, $docker_stream, $docker_hash);
 379 if ($docker_locator = $Job->{docker_image_locator}) {
 380   ($docker_stream, $docker_hash) = find_docker_image($docker_locator);
 381   if (!$docker_hash)
 382   {
 383     croak("No Docker image hash found from locator $docker_locator");
 384   }
 385   $docker_stream =~ s/^\.//;
 386   my $docker_install_script = qq{
 387 if ! $docker_bin images -q --no-trunc | grep -qxF \Q$docker_hash\E; then
 388     arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
 389 fi
 390 };
 391   my $docker_pid = fork();
 392   if ($docker_pid == 0)
 393   {
 394     srun (["srun", "--nodelist=" . join(',', @node)],
 395           ["/bin/sh", "-ec", $docker_install_script]);
 396     exit ($?);
 397   }
 398   while (1)
 399   {
 400     last if $docker_pid == waitpid (-1, WNOHANG);
 401     freeze_if_want_freeze ($docker_pid);
 402     select (undef, undef, undef, 0.1);
 403   }
 404   if ($? != 0)
 405   {
 406     croak("Installing Docker image from $docker_locator exited "
 407           .exit_status_s($?));
 408   }
 409
 410   if ($Job->{arvados_sdk_version}) {
 411     # The job also specifies an Arvados SDK version.  Add the SDKs to the
 412     # tar file for the build script to install.
 413     Log(undef, sprintf("Packing Arvados SDK version %s for installation",
 414                        $Job->{arvados_sdk_version}));
 415     add_git_archive("git", "--git-dir=$git_dir", "archive",
 416                     "--prefix=.arvados.sdk/",
 417                     $Job->{arvados_sdk_version}, "sdk");
 418   }
 419 }
 420
 421 if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
 422   # If script_version looks like an absolute path, *and* the --git-dir
 423   # argument was not given -- which implies we were not invoked by
 424   # crunch-dispatch -- we will use the given path as a working
 425   # directory instead of resolving script_version to a git commit (or
 426   # doing anything else with git).
 427   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
 428   $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
 429 }
 430 else {
 431   # Resolve the given script_version to a git commit sha1. Also, if
 432   # the repository is remote, clone it into our local filesystem: this
 433   # ensures "git archive" will work, and is necessary to reliably
 434   # resolve a symbolic script_version like "master^".
 435   $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
 436
 437   Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
 438
 439   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
 440
 441   # If we're running under crunch-dispatch, it will have already
 442   # pulled the appropriate source tree into its own repository, and
 443   # given us that repo's path as $git_dir.
 444   #
 445   # If we're running a "local" job, we might have to fetch content
 446   # from a remote repository.
 447   #
 448   # (Currently crunch-dispatch gives a local path with --git-dir, but
 449   # we might as well accept URLs there too in case it changes its
 450   # mind.)
 451   my $repo = $git_dir || $Job->{'repository'};
 452
 453   # Repository can be remote or local. If remote, we'll need to fetch it
 454   # to a local dir before doing `git log` et al.
 455   my $repo_location;
 456
 457   if ($repo =~ m{://|^[^/]*:}) {
 458     # $repo is a git url we can clone, like git:// or https:// or
 459     # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
 460     # not recognized here because distinguishing that from a local
 461     # path is too fragile. If you really need something strange here,
 462     # use the ssh:// form.
 463     $repo_location = 'remote';
 464   } elsif ($repo =~ m{^\.*/}) {
 465     # $repo is a local path to a git index. We'll also resolve ../foo
 466     # to ../foo/.git if the latter is a directory. To help
 467     # disambiguate local paths from named hosted repositories, this
 468     # form must be given as ./ or ../ if it's a relative path.
 469     if (-d "$repo/.git") {
 470       $repo = "$repo/.git";
 471     }
 472     $repo_location = 'local';
 473   } else {
 474     # $repo is none of the above. It must be the name of a hosted
 475     # repository.
 476     my $arv_repo_list = api_call("repositories/list",
 477                                  'filters' => [['name','=',$repo]]);
 478     my @repos_found = @{$arv_repo_list->{'items'}};
 479     my $n_found = $arv_repo_list->{'serverResponse'}->{'items_available'};
 480     if ($n_found > 0) {
 481       Log(undef, "Repository '$repo' -> "
 482           . join(", ", map { $_->{'uuid'} } @repos_found));
 483     }
 484     if ($n_found != 1) {
 485       croak("Error: Found $n_found repositories with name '$repo'.");
 486     }
 487     $repo = $repos_found[0]->{'fetch_url'};
 488     $repo_location = 'remote';
 489   }
 490   Log(undef, "Using $repo_location repository '$repo'");
 491   $ENV{"CRUNCH_SRC_URL"} = $repo;
 492
 493   # Resolve given script_version (we'll call that $treeish here) to a
 494   # commit sha1 ($commit).
 495   my $treeish = $Job->{'script_version'};
 496   my $commit;
 497   if ($repo_location eq 'remote') {
 498     # We minimize excess object-fetching by re-using the same bare
 499     # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
 500     # just keep adding remotes to it as needed.
 501     my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
 502     my $gitcmd = "git --git-dir=\Q$local_repo\E";
 503
 504     # Set up our local repo for caching remote objects, making
 505     # archives, etc.
 506     if (!-d $local_repo) {
 507       make_path($local_repo) or croak("Error: could not create $local_repo");
 508     }
 509     # This works (exits 0 and doesn't delete fetched objects) even
 510     # if $local_repo is already initialized:
 511     `$gitcmd init --bare`;
 512     if ($?) {
 513       croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
 514     }
 515
 516     # If $treeish looks like a hash (or abbrev hash) we look it up in
 517     # our local cache first, since that's cheaper. (We don't want to
 518     # do that with tags/branches though -- those change over time, so
 519     # they should always be resolved by the remote repo.)
 520     if ($treeish =~ /^[0-9a-f]{7,40}$/s) {
 521       # Hide stderr because it's normal for this to fail:
 522       my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
 523       if ($? == 0 &&
 524           # Careful not to resolve a branch named abcdeff to commit 1234567:
 525           $sha1 =~ /^$treeish/ &&
 526           $sha1 =~ /^([0-9a-f]{40})$/s) {
 527         $commit = $1;
 528         Log(undef, "Commit $commit already present in $local_repo");
 529       }
 530     }
 531
 532     if (!defined $commit) {
 533       # If $treeish isn't just a hash or abbrev hash, or isn't here
 534       # yet, we need to fetch the remote to resolve it correctly.
 535
 536       # First, remove all local heads. This prevents a name that does
 537       # not exist on the remote from resolving to (or colliding with)
 538       # a previously fetched branch or tag (possibly from a different
 539       # remote).
 540       remove_tree("$local_repo/refs/heads", {keep_root => 1});
 541
 542       Log(undef, "Fetching objects from $repo to $local_repo");
 543       `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
 544       if ($?) {
 545         croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
 546       }
 547     }
 548
 549     # Now that the data is all here, we will use our local repo for
 550     # the rest of our git activities.
 551     $repo = $local_repo;
 552   }
 553
 554   my $gitcmd = "git --git-dir=\Q$repo\E";
 555   my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
 556   unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
 557     croak("`$gitcmd rev-list` exited "
 558           .exit_status_s($?)
 559           .", '$treeish' not found. Giving up.");
 560   }
 561   $commit = $1;
 562   Log(undef, "Version $treeish is commit $commit");
 563
 564   if ($commit ne $Job->{'script_version'}) {
 565     # Record the real commit id in the database, frozentokey, logs,
 566     # etc. -- instead of an abbreviation or a branch name which can
 567     # become ambiguous or point to a different commit in the future.
 568     if (!$Job->update_attributes('script_version' => $commit)) {
 569       croak("Error: failed to update job's script_version attribute");
 570     }
 571   }
 572
 573   $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
 574   add_git_archive("$gitcmd archive ''\Q$commit\E");
 575 }
 576
 577 my $git_archive = combined_git_archive();
 578 if (!defined $git_archive) {
 579   Log(undef, "Skip install phase (no git archive)");
 580   if ($have_slurm) {
 581     Log(undef, "Warning: This probably means workers have no source tree!");
 582   }
 583 }
 584 else {
 585   Log(undef, "Run install script on all workers");
 586
 587   my @srunargs = ("srun",
 588                   "--nodelist=$nodelist",
 589                   "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
 590   my @execargs = ("sh", "-c",
 591                   "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
 592
 593   my $installpid = fork();
 594   if ($installpid == 0)
 595   {
 596     srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
 597     exit (1);
 598   }
 599   while (1)
 600   {
 601     last if $installpid == waitpid (-1, WNOHANG);
 602     freeze_if_want_freeze ($installpid);
 603     select (undef, undef, undef, 0.1);
 604   }
 605   my $install_exited = $?;
 606   Log (undef, "Install script exited ".exit_status_s($install_exited));
 607   foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
 608     unlink($tar_filename);
 609   }
 610   exit (1) if $install_exited != 0;
 611 }
 612
 613 foreach (qw (script script_version script_parameters runtime_constraints))
 614 {
 615   Log (undef,
 616        "$_ " .
 617        (ref($Job->{$_}) ? JSON::encode_json($Job->{$_}) : $Job->{$_}));
 618 }
 619 foreach (split (/\n/, $Job->{knobs}))
 620 {
 621   Log (undef, "knob " . $_);
 622 }
 623
 624
 625
 626 $main::success = undef;
 627
 628
 629
 630 ONELEVEL:
 631
 632 my $thisround_succeeded = 0;
 633 my $thisround_failed = 0;
 634 my $thisround_failed_multiple = 0;
 635
 636 @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
 637                        or $a <=> $b } @jobstep_todo;
 638 my $level = $jobstep[$jobstep_todo[0]]->{level};
 639 Log (undef, "start level $level");
 640
 641
 642
 643 my %proc;
 644 my @freeslot = (0..$#slot);
 645 my @holdslot;
 646 my %reader;
 647 my $progress_is_dirty = 1;
 648 my $progress_stats_updated = 0;
 649
 650 update_progress_stats();
 651
 652
 653
 654 THISROUND:
 655 for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
 656 {
 657   my $id = $jobstep_todo[$todo_ptr];
 658   my $Jobstep = $jobstep[$id];
 659   if ($Jobstep->{level} != $level)
 660   {
 661     next;
 662   }
 663
 664   pipe $reader{$id}, "writer" or croak ($!);
 665   my $flags = fcntl ($reader{$id}, F_GETFL, 0) or croak ($!);
 666   fcntl ($reader{$id}, F_SETFL, $flags | O_NONBLOCK) or croak ($!);
 667
 668   my $childslot = $freeslot[0];
 669   my $childnode = $slot[$childslot]->{node};
 670   my $childslotname = join (".",
 671                             $slot[$childslot]->{node}->{name},
 672                             $slot[$childslot]->{cpu});
 673   my $childpid = fork();
 674   if ($childpid == 0)
 675   {
 676     $SIG{'INT'} = 'DEFAULT';
 677     $SIG{'QUIT'} = 'DEFAULT';
 678     $SIG{'TERM'} = 'DEFAULT';
 679
 680     foreach (values (%reader))
 681     {
 682       close($_);
 683     }
 684     fcntl ("writer", F_SETFL, 0) or croak ($!); # no close-on-exec
 685     open(STDOUT,">&writer");
 686     open(STDERR,">&writer");
 687
 688     undef $dbh;
 689     undef $sth;
 690
 691     delete $ENV{"GNUPGHOME"};
 692     $ENV{"TASK_UUID"} = $Jobstep->{'arvados_task'}->{'uuid'};
 693     $ENV{"TASK_QSEQUENCE"} = $id;
 694     $ENV{"TASK_SEQUENCE"} = $level;
 695     $ENV{"JOB_SCRIPT"} = $Job->{script};
 696     while (my ($param, $value) = each %{$Job->{script_parameters}}) {
 697       $param =~ tr/a-z/A-Z/;
 698       $ENV{"JOB_PARAMETER_$param"} = $value;
 699     }
 700     $ENV{"TASK_SLOT_NODE"} = $slot[$childslot]->{node}->{name};
 701     $ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
 702     $ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
 703     $ENV{"HOME"} = $ENV{"TASK_WORK"};
 704     $ENV{"TASK_KEEPMOUNT"} = $ENV{"TASK_WORK"}.".keep";
 705     $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 706     $ENV{"CRUNCH_NODE_SLOTS"} = $slot[$childslot]->{node}->{ncpus};
 707     $ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
 708
 709     $ENV{"GZIP"} = "-n";
 710
 711     my @srunargs = (
 712       "srun",
 713       "--nodelist=".$childnode->{name},
 714       qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'},
 715       "--job-name=$job_id.$id.$$",
 716         );
 717     my $command =
 718         "if [ -e $ENV{TASK_WORK} ]; then rm -rf $ENV{TASK_WORK}; fi; "
 719         ."mkdir -p $ENV{CRUNCH_TMP} $ENV{JOB_WORK} $ENV{TASK_WORK} $ENV{TASK_KEEPMOUNT} "
 720         ."&& cd $ENV{CRUNCH_TMP} ";
 721     $command .= "&& exec arv-mount --by-id --allow-other $ENV{TASK_KEEPMOUNT} --exec ";
 722     if ($docker_hash)
 723     {
 724       my $cidfile = "$ENV{CRUNCH_TMP}/$ENV{TASK_UUID}.cid";
 725       $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
 726       $command .= "$docker_bin run --rm=true --attach=stdout --attach=stderr --attach=stdin -i --user=crunch --cidfile=$cidfile --sig-proxy ";
 727
 728       # Dynamically configure the container to use the host system as its
 729       # DNS server.  Get the host's global addresses from the ip command,
 730       # and turn them into docker --dns options using gawk.
 731       $command .=
 732           q{$(ip -o address show scope global |
 733               gawk 'match($4, /^([0-9\.:]+)\//, x){print "--dns", x[1]}') };
 734
 735       # The source tree and $destdir directory (which we have
 736       # installed on the worker host) are available in the container,
 737       # under the same path.
 738       $command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
 739       $command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
 740
 741       # Currently, we make arv-mount's mount point appear at /keep
 742       # inside the container (instead of using the same path as the
 743       # host like we do with CRUNCH_SRC and CRUNCH_INSTALL). However,
 744       # crunch scripts and utilities must not rely on this. They must
 745       # use $TASK_KEEPMOUNT.
 746       $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
 747       $ENV{TASK_KEEPMOUNT} = "/keep";
 748
 749       # TASK_WORK is almost exactly like a docker data volume: it
 750       # starts out empty, is writable, and persists until no
 751       # containers use it any more. We don't use --volumes-from to
 752       # share it with other containers: it is only accessible to this
 753       # task, and it goes away when this task stops.
 754       #
 755       # However, a docker data volume is writable only by root unless
 756       # the mount point already happens to exist in the container with
 757       # different permissions. Therefore, we [1] assume /tmp already
 758       # exists in the image and is writable by the crunch user; [2]
 759       # avoid putting TASK_WORK inside CRUNCH_TMP (which won't be
 760       # writable if they are created by docker while setting up the
 761       # other --volumes); and [3] create $TASK_WORK inside the
 762       # container using $build_script.
 763       $command .= "--volume=/tmp ";
 764       $ENV{"TASK_WORK"} = "/tmp/crunch-job-task-work/$childslotname";
 765       $ENV{"HOME"} = $ENV{"TASK_WORK"};
 766       $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 767
 768       # TODO: Share a single JOB_WORK volume across all task
 769       # containers on a given worker node, and delete it when the job
 770       # ends (and, in case that doesn't work, when the next job
 771       # starts).
 772       #
 773       # For now, use the same approach as TASK_WORK above.
 774       $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
 775
 776       while (my ($env_key, $env_val) = each %ENV)
 777       {
 778         if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
 779           $command .= "--env=\Q$env_key=$env_val\E ";
 780         }
 781       }
 782       $command .= "--env=\QHOME=$ENV{HOME}\E ";
 783       $command .= "\Q$docker_hash\E ";
 784       $command .= "stdbuf --output=0 --error=0 ";
 785       $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
 786     } else {
 787       # Non-docker run
 788       $command .= "crunchstat -cgroup-root=/sys/fs/cgroup -poll=10000 ";
 789       $command .= "stdbuf --output=0 --error=0 ";
 790       $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
 791     }
 792
 793     my @execargs = ('bash', '-c', $command);
 794     srun (\@srunargs, \@execargs, undef, $build_script);
 795     # exec() failed, we assume nothing happened.
 796     die "srun() failed on build script\n";
 797   }
 798   close("writer");
 799   if (!defined $childpid)
 800   {
 801     close $reader{$id};
 802     delete $reader{$id};
 803     next;
 804   }
 805   shift @freeslot;
 806   $proc{$childpid} = { jobstep => $id,
 807                        time => time,
 808                        slot => $childslot,
 809                        jobstepname => "$job_id.$id.$childpid",
 810                      };
 811   croak ("assert failed: \$slot[$childslot]->{'pid'} exists") if exists $slot[$childslot]->{pid};
 812   $slot[$childslot]->{pid} = $childpid;
 813
 814   Log ($id, "job_task ".$Jobstep->{'arvados_task'}->{'uuid'});
 815   Log ($id, "child $childpid started on $childslotname");
 816   $Jobstep->{starttime} = time;
 817   $Jobstep->{node} = $childnode->{name};
 818   $Jobstep->{slotindex} = $childslot;
 819   delete $Jobstep->{stderr};
 820   delete $Jobstep->{finishtime};
 821
 822   $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
 823   $Jobstep->{'arvados_task'}->save;
 824
 825   splice @jobstep_todo, $todo_ptr, 1;
 826   --$todo_ptr;
 827
 828   $progress_is_dirty = 1;
 829
 830   while (!@freeslot
 831          ||
 832          (@slot > @freeslot && $todo_ptr+1 > $#jobstep_todo))
 833   {
 834     last THISROUND if $main::please_freeze;
 835     if ($main::please_info)
 836     {
 837       $main::please_info = 0;
 838       freeze();
 839       create_output_collection();
 840       save_meta(1);
 841       update_progress_stats();
 842     }
 843     my $gotsome
 844         = readfrompipes ()
 845         + reapchildren ();
 846     if (!$gotsome)
 847     {
 848       check_refresh_wanted();
 849       check_squeue();
 850       update_progress_stats();
 851       select (undef, undef, undef, 0.1);
 852     }
 853     elsif (time - $progress_stats_updated >= 30)
 854     {
 855       update_progress_stats();
 856     }
 857     if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
 858         ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
 859     {
 860       my $message = "Repeated failure rate too high ($thisround_failed_multiple/"
 861           .($thisround_failed+$thisround_succeeded)
 862           .") -- giving up on this round";
 863       Log (undef, $message);
 864       last THISROUND;
 865     }
 866
 867     # move slots from freeslot to holdslot (or back to freeslot) if necessary
 868     for (my $i=$#freeslot; $i>=0; $i--) {
 869       if ($slot[$freeslot[$i]]->{node}->{hold_until} > scalar time) {
 870         push @holdslot, (splice @freeslot, $i, 1);
 871       }
 872     }
 873     for (my $i=$#holdslot; $i>=0; $i--) {
 874       if ($slot[$holdslot[$i]]->{node}->{hold_until} <= scalar time) {
 875         push @freeslot, (splice @holdslot, $i, 1);
 876       }
 877     }
 878
 879     # give up if no nodes are succeeding
 880     if (!grep { $_->{node}->{losing_streak} == 0 &&
 881                     $_->{node}->{hold_count} < 4 } @slot) {
 882       my $message = "Every node has failed -- giving up on this round";
 883       Log (undef, $message);
 884       last THISROUND;
 885     }
 886   }
 887 }
 888
 889
# The scheduling loop above has ended: return every held slot to the
# free list and clear the losing streak of each free slot's node.
push @freeslot, splice @holdslot;
map { $slot[$freeslot[$_]]->{node}->{losing_streak} = 0 } (0..$#freeslot);


# Wait for the children that are still running, draining their stderr
# pipes and honouring the control flags in the meantime.
Log (undef, "wait for last ".(scalar keys %proc)." children to finish");
while (%proc)
{
  if ($main::please_continue) {
    # Clear the flag and jump back to the THISROUND label, which is
    # defined earlier in the file, instead of waiting any longer.
    $main::please_continue = 0;
    goto THISROUND;
  }
  # On an info request: clear the flag, then freeze, collate output and
  # save a checkpoint (save_meta(1)).
  $main::please_info = 0, freeze(), create_output_collection(), save_meta(1) if $main::please_info;
  readfrompipes ();
  if (!reapchildren())
  {
    # Nothing was reaped this time: do the periodic housekeeping, then
    # pause for 0.1 seconds before polling again.
    check_refresh_wanted();
    check_squeue();
    update_progress_stats();
    select (undef, undef, undef, 0.1);
    # When a freeze has been requested, signal every remaining child.
    killem (keys %proc) if $main::please_freeze;
  }
}

update_progress_stats();
# Exits the process (status 1) if a freeze was requested.
freeze_if_want_freeze();


# Decide the job outcome unless something has already decided it.
if (!defined $main::success)
{
  # Tasks remain, none succeeded this round, and either nothing failed
  # at all or more than 4 tasks failed --> declare the job failed.
  if (@jobstep_todo &&
      $thisround_succeeded == 0 &&
      ($thisround_failed == 0 || $thisround_failed > 4))
  {
    my $message = "stop because $thisround_failed tasks failed and none succeeded";
    Log (undef, $message);
    $main::success = 0;
  }
  # Nothing left to do --> the job succeeded.
  if (!@jobstep_todo)
  {
    $main::success = 1;
  }
}

# Outcome still undecided: jump to the ONELEVEL label (defined earlier
# in the file).
goto ONELEVEL if !defined $main::success;


# The job is finished one way or the other: release the allocation,
# collate the task outputs and record the result on the job record.
release_allocation();
freeze();
my $collated_output = &create_output_collection();

if (!$collated_output) {
  Log (undef, "Failed to write output collection");
}
else {
  Log(undef, "output hash " . $collated_output);
  $Job->update_attributes('output' => $collated_output);
}

Log (undef, "finish");

# Final (non-checkpoint) save: closes the log writer and stores the log
# locator on the job.
save_meta();

# The job is 'Complete' only when it succeeded AND its output was
# collated; everything else is 'Failed'.
my $final_state;
if ($collated_output && $main::success) {
  $final_state = 'Complete';
} else {
  $final_state = 'Failed';
}
$Job->update_attributes('state' => $final_state);

# Exit status 0 for a completed job, 1 otherwise.
exit (($final_state eq 'Complete') ? 0 : 1);
 961
 962
 963
 964 sub update_progress_stats
 965 {
 966   $progress_stats_updated = time;
 967   return if !$progress_is_dirty;
 968   my ($todo, $done, $running) = (scalar @jobstep_todo,
 969                                  scalar @jobstep_done,
 970                                  scalar @slot - scalar @freeslot - scalar @holdslot);
 971   $Job->{'tasks_summary'} ||= {};
 972   $Job->{'tasks_summary'}->{'todo'} = $todo;
 973   $Job->{'tasks_summary'}->{'done'} = $done;
 974   $Job->{'tasks_summary'}->{'running'} = $running;
 975   $Job->update_attributes('tasks_summary' => $Job->{'tasks_summary'});
 976   Log (undef, "status: $done done, $running running, $todo todo");
 977   $progress_is_dirty = 0;
 978 }
 979
 980
 981
# Reap at most one exited child process without blocking and do all the
# bookkeeping for the task it was running: record success or failure,
# update the node's failure statistics, release the slot, and (on
# success) queue any new tasks created by the finished task.
#
# Returns 0 when no child had exited, 1 when a child was reaped.
sub reapchildren
{
  my $pid = waitpid (-1, WNOHANG);
  return 0 if $pid <= 0;

  # Human-readable "nodename.cpu" label of the slot the child ran on.
  my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
                  . "."
                  . $slot[$proc{$pid}->{slot}]->{cpu});
  my $jobstepid = $proc{$pid}->{jobstep};
  my $elapsed = time - $proc{$pid}->{time};
  my $Jobstep = $jobstep[$jobstepid];

  # Capture the wait status of the reaped child right away.
  my $childstatus = $?;
  my $exitvalue = $childstatus >> 8;
  my $exitinfo = "exit ".exit_status_s($childstatus);
  # NOTE(review): reload presumably re-fetches the task record so that
  # the 'success' value set by the task itself is visible -- confirm.
  $Jobstep->{'arvados_task'}->reload;
  my $task_success = $Jobstep->{'arvados_task'}->{success};

  Log ($jobstepid, "child $pid on $whatslot $exitinfo success=$task_success");

  if (!defined $task_success) {
    # task did not indicate one way or the other --> fail
    $Jobstep->{'arvados_task'}->{success} = 0;
    $Jobstep->{'arvados_task'}->save;
    $task_success = 0;
  }

  if (!$task_success)
  {
    # A failure is "temporary" when a node failure was flagged for this
    # step or the child exited with value 111.
    my $temporary_fail;
    $temporary_fail ||= $Jobstep->{node_fail};
    $temporary_fail ||= ($exitvalue == 111);

    ++$thisround_failed;
    # Count separately the tasks that had already failed at least once.
    ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;

    # Check for signs of a failed or misconfigured node
    if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
        2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
      # Don't count this against jobstep failure thresholds if this
      # node is already suspected faulty and srun exited quickly
      if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
          $elapsed < 5) {
        Log ($jobstepid, "blaming failure on suspect node " .
             $slot[$proc{$pid}->{slot}]->{node}->{name});
        $temporary_fail ||= 1;
      }
      ban_node_by_slot($proc{$pid}->{slot});
    }

    # Note: the failure counter is incremented as part of building the
    # log message.
    Log ($jobstepid, sprintf('failure (#%d, %s) after %d seconds',
                             ++$Jobstep->{'failures'},
                             $temporary_fail ? 'temporary ' : 'permanent',
                             $elapsed));

    if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
      # Give up on this task, and the whole job
      $main::success = 0;
      $main::please_freeze = 1;
    }
    # Put this task back on the todo queue
    push @jobstep_todo, $jobstepid;
    $Job->{'tasks_summary'}->{'failed'}++;
  }
  else
  {
    # Success clears the node's losing streak and any hold on it.
    ++$thisround_succeeded;
    $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
    $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
    push @jobstep_done, $jobstepid;
    Log ($jobstepid, "success in $elapsed seconds");
  }
  # Record the raw wait status and the finish time on the step and on
  # the task record.
  $Jobstep->{exitcode} = $childstatus;
  $Jobstep->{finishtime} = time;
  $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
  $Jobstep->{'arvados_task'}->save;
  # Log whatever stderr is still buffered for this step.
  process_stderr ($jobstepid, $task_success);
  Log ($jobstepid, "output " . $Jobstep->{'arvados_task'}->{output});

  # Release everything tied to the child: its stderr reader, its slot
  # and its entry in the process table.
  close $reader{$jobstepid};
  delete $reader{$jobstepid};
  delete $slot[$proc{$pid}->{slot}]->{pid};
  push @freeslot, $proc{$pid}->{slot};
  delete $proc{$pid};

  if ($task_success) {
    # Load new tasks
    # The list is fetched page by page: each request starts at the
    # number of items collected so far, and the loop stops at the first
    # empty page.
    my $newtask_list = [];
    my $newtask_results;
    do {
      $newtask_results = api_call(
        "job_tasks/list",
        'where' => {
          'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
        },
        'order' => 'qsequence',
        'offset' => scalar(@$newtask_list),
      );
      push(@$newtask_list, @{$newtask_results->{items}});
    } while (@{$newtask_results->{items}});
    # Register each new task as a jobstep and queue it.
    foreach my $arvados_task (@$newtask_list) {
      my $jobstep = {
        'level' => $arvados_task->{'sequence'},
        'failures' => 0,
        'arvados_task' => $arvados_task
      };
      push @jobstep, $jobstep;
      push @jobstep_todo, $#jobstep;
    }
  }

  $progress_is_dirty = 1;
  1;
}
1096
# Re-read the job's cancellation/state attributes from the API when the
# file named by $ENV{CRUNCH_REFRESH_TRIGGER} has been modified since the
# last refresh.  If the job is no longer "Running", mark the job as
# failed and request a freeze.
sub check_refresh_wanted
{
  # Element 9 of stat() is the modification time; it is undefined when
  # the trigger file cannot be stat'ed.
  my @trigger_stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
  return unless @trigger_stat && $trigger_stat[9] > $latest_refresh;

  $latest_refresh = scalar time;
  my $fresh_job = api_call("jobs/get", uuid => $jobspec);
  my @refreshed_attrs = ('cancelled_at',
                         'cancelled_by_user_uuid',
                         'cancelled_by_client_uuid',
                         'state');
  foreach my $attr (@refreshed_attrs) {
    $Job->{$attr} = $fresh_job->{$attr};
  }

  return if $Job->{'state'} eq "Running";

  if ($Job->{'state'} eq "Cancelled") {
    Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
  }
  else {
    Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
  }
  $main::success = 0;
  $main::please_freeze = 1;
}
1120
# Periodic watchdog for child processes.
#
# At most every 4 seconds: signal (via killem) every child whose
# killtime has been reached.
# At most every 60 seconds, and only under SLURM: ask squeue which job
# steps are still running, and give every child that is older than 60
# seconds and unknown to squeue a kill deadline 30 seconds from now.
sub check_squeue
{
  # The kill list was examined less than 4 seconds ago: nothing to do.
  return if defined $squeue_kill_checked && $squeue_kill_checked > time - 4;
  $squeue_kill_checked = time;

  # Enforce kill deadlines that have expired.
  foreach my $pid (keys %proc) {
    next unless exists $proc{$pid}->{killtime};
    killem ($pid) if $proc{$pid}->{killtime} <= time;
  }

  # squeue itself was consulted less than 60 seconds ago: done.
  return if defined $squeue_checked && $squeue_checked > time - 60;
  $squeue_checked = time;

  # Without SLURM there is no squeue to consult (local procs could be
  # checked for problems here).
  return if !$have_slurm;

  # List the running steps; the trailing "ok" line proves that squeue
  # itself exited successfully.
  my @squeue = `squeue -s -h -o '%i %j' && echo ok`;
  chop @squeue;
  return if $squeue[-1] ne "ok";
  pop @squeue;

  # Collect the names of the steps squeue reports for our own SLURM job.
  my %ok;
  foreach my $entry (@squeue) {
    next unless $entry =~ /^(\d+)\.(\d+) (\S+)/;
    $ok{$3} = 1 if $1 eq $ENV{SLURM_JOBID};
  }

  # Children older than 60 seconds that squeue did not mention and that
  # have no deadline yet get 30 seconds to exit on their own.
  foreach my $pid (keys %proc) {
    next unless $proc{$pid}->{time} < time - 60;
    next if exists $ok{$proc{$pid}->{jobstepname}};
    next if exists $proc{$pid}->{killtime};
    $proc{$pid}->{killtime} = time + 30;
  }
}
1187
1188
# Cancel our SLURM job allocation with scancel.  Does nothing when the
# job is not running under SLURM.
sub release_allocation
{
  return unless $have_slurm;

  Log (undef, "release job allocation");
  system "scancel $ENV{SLURM_JOBID}";
}
1197
1198
# Drain whatever is currently readable from every child's stderr pipe
# into the per-step stderr buffer, logging complete lines as they
# arrive.  Returns 1 if any data was read, 0 otherwise.
sub readfrompipes
{
  my $read_anything = 0;
  for my $jobstepid (keys %reader) {
    my $chunk;
    while (0 < sysread ($reader{$jobstepid}, $chunk, 8192)) {
      $read_anything = 1;
      print STDERR $chunk if $ENV{CRUNCH_DEBUG};
      $jobstep[$jobstepid]->{stderr} .= $chunk;

      # Log and remove every complete line now in the buffer.
      preprocess_stderr ($jobstepid);

      # Bound the buffer: once more than 16 KiB of unterminated output
      # has piled up, drop the oldest 8 KiB.
      if (length ($jobstep[$jobstepid]->{stderr}) > 16384) {
        substr ($jobstep[$jobstepid]->{stderr}, 0, 8192, "");
      }
    }
  }
  return $read_anything;
}
1219
1220
# Log and remove every complete (newline-terminated) line from a
# jobstep's stderr buffer, reacting to known srun error messages:
#  - an expired or unconfirmable allocation requests a freeze;
#  - a node failure or a failed step creation flags the step as
#    node_fail and puts the step's node on hold.
# Argument: index of the jobstep in @jobstep.
sub preprocess_stderr
{
  my ($job) = @_;

  # Each successful substitution strips the first line, newline included.
  while ($jobstep[$job]->{stderr} =~ s/^(.*?)\n//) {
    my $line = $1;
    Log ($job, "stderr $line");

    my $allocation_lost =
        ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/);
    if ($allocation_lost) {
      # The allocation is gone; nothing more can run.
      $main::please_freeze = 1;
      next;
    }

    if ($line =~ /srun: error: (Node failure on|Unable to create job step) /) {
      $jobstep[$job]->{node_fail} = 1;
      ban_node_by_slot($jobstep[$job]->{slotindex});
    }
  }
}
1239
1240
# Flush a finished jobstep's stderr buffer to the log: complete lines
# are handled by preprocess_stderr, then whatever remains (output
# without a trailing newline) is logged line by line.
# Arguments: jobstep index, task success flag (currently unused).
sub process_stderr
{
  my ($job, $task_success) = @_;

  preprocess_stderr ($job);

  foreach my $leftover (split ("\n", $jobstep[$job]->{stderr})) {
    Log ($job, "stderr $leftover");
  }
}
1251
# Fetch the data stored in Keep under the given locator by running
# "arv-get <locator>" and collecting its standard output.
#
# Argument: the Keep locator to fetch.
# Returns:  the fetched data as a string, or undef if arv-get could not
#           be closed cleanly or exited with a non-zero status (the
#           failure is logged).
# Dies:     if arv-get cannot be started or if reading from it fails.
sub fetch_block
{
  my $hash = shift;
  my $keep;

  # List-form pipe open: arv-get is executed directly, so no shell ever
  # gets to interpret the locator.
  open($keep, '-|', 'arv-get', $hash) or die "fetch_block: arv-get $hash: $!";

  my $output_block = '';
  while (1) {
    my $buf;
    my $bytes = sysread($keep, $buf, 1024 * 1024);
    if (!defined $bytes) {
      die "reading from arv-get: $!";
    } elsif ($bytes == 0) {
      # sysread returns 0 at the end of the pipe.
      last;
    } else {
      # some bytes were read into buf.
      $output_block .= $buf;
    }
  }

  # Closing a piped handle waits for the child and stores its wait
  # status in $?.  Without this check a failed arv-get would be
  # indistinguishable from an empty block, and the caller could never
  # detect the error.
  if (!close($keep)) {
    my $reason = $? ? "exited " . exit_status_s($?) : "close failed: $!";
    Log (undef, "fetch_block: arv-get $hash $reason");
    return;
  }
  return $output_block;
}
1276
# Create a collection by concatenating the output of all tasks (each
# task's output is either a manifest fragment, a locator for a
# manifest fragment stored in Keep, or nothing at all). Return the
# portable_data_hash of the new collection.
#
# The manifest text is piped to a python helper that creates the
# collection through the Arvados API and prints its
# portable_data_hash.  The return value is false (undef or empty) when
# the helper times out or prints nothing.  When a task's output cannot
# be fetched from Keep, the error is logged and $main::success is set
# to 0.
sub create_output_collection
{
  Log (undef, "collate");

  my ($child_out, $child_in);
  my $pid = open2($child_out, $child_in, 'python', '-c',
                  'import arvados; ' .
                  'import sys; ' .
                  'print arvados.api()' .
                  '.collections()' .
                  '.create(body={"manifest_text":sys.stdin.read()})' .
                  '.execute()["portable_data_hash"]'
      );

  my $task_idx = -1;
  for my $step (@jobstep)
  {
    ++$task_idx;
    my $output = $step->{'arvados_task'}->{output};
    # Skip tasks without output.  Testing for definedness (rather than
    # mere existence of the key) also skips tasks whose output
    # attribute is present but null.
    next unless defined $output;
    if ($output !~ /^[0-9a-f]{32}(\+\S+)*$/)
    {
      # Not a locator: the output is the manifest fragment itself.
      print $child_in $output;
    }
    elsif (defined (my $outblock = fetch_block ($output)))
    {
      # A locator: the manifest fragment is stored in Keep.
      print $child_in $outblock;
    }
    else
    {
      my $uuid = $step->{'arvados_task'}->{'uuid'};
      Log (undef, "Error retrieving '$output' output by task $task_idx ($uuid)");
      $main::success = 0;
    }
  }
  # Closing the helper's stdin lets its sys.stdin.read() return.
  $child_in->close;

  my $joboutput;
  my $s = IO::Select->new($child_out);
  if ($s->can_read(120)) {
    sysread($child_out, $joboutput, 64 * 1024 * 1024);
    chomp($joboutput);
  } else {
    Log (undef, "timed out while creating output collection");
  }
  # TODO: kill $pid instead of waiting, now that we've decided to
  # ignore further output.
  waitpid($pid, 0);
  if ($?) {
    # Make a failing helper visible in the log.
    Log (undef, "output collection writer exited " . exit_status_s($?));
  }

  return $joboutput;
}
1333
1334
# Send the next signal in the escalation sequence to each given pid
# (all must be keys of %proc): SIGINT first, SIGTERM once the SIGINT is
# more than 4 seconds old, SIGKILL once the SIGTERM is more than 4
# seconds old.  Each signal is sent only once per process; the time it
# was sent is remembered in the process entry as "sent_<signal>".
sub killem
{
  foreach my $pid (@_) {
    # Start with SIGINT and escalate while the current signal was
    # already sent more than 4 seconds ago.
    my $sig = 2;
    foreach my $stronger_sig (15, 9) {
      last unless (exists $proc{$pid}->{"sent_$sig"} &&
                   time - $proc{$pid}->{"sent_$sig"} > 4);
      $sig = $stronger_sig;
    }

    # This signal has been sent before and has not timed out yet.
    next if exists $proc{$pid}->{"sent_$sig"};

    Log ($proc{$pid}->{jobstep}, "sending 2x signal $sig to pid $pid");
    kill $sig, $pid;
    select (undef, undef, undef, 0.1);
    # srun wants two SIGINT to really interrupt
    kill $sig, $pid if $sig == 2;

    $proc{$pid}->{"sent_$sig"} = time;
    $proc{$pid}->{"killedafter"} = time - $proc{$pid}->{"time"};
  }
}
1364
1365
# Build a bit vector with one bit set for the file descriptor number of
# each given filehandle.  Returns undef when no handles are given.
sub fhbits
{
  my @handles = @_;
  my $mask;
  foreach my $handle (@handles) {
    vec($mask, fileno($handle), 1) = 1;
  }
  return $mask;
}
1374
1375
# Send log output to Keep via arv-put.
#
# $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
# $log_pipe_pid is the pid of the arv-put subprocess.
#
# All three are undefined while no arv-put process is running:
# log_writer_start() sets them and log_writer_finish() resets them to
# undef.
#
# The only functions that should access these variables directly are:
#
# log_writer_start($logfilename)
#     Starts an arv-put pipe, reading data on stdin and writing it to
#     a $logfilename file in an output collection.
#
# log_writer_send($txt)
#     Writes $txt to the output log collection.
#
# log_writer_finish()
#     Closes the arv-put pipe and returns the output that it produces.
#
# log_writer_is_active()
#     Returns a true value if there is currently a live arv-put
#     process, false otherwise.
#
my ($log_pipe_in, $log_pipe_out, $log_pipe_pid);
1398
# Start the arv-put subprocess that stores the job log in Keep as a
# file named $logfilename.  arv-put reads the log text from its stdin
# ("-").  The pipe handles and the child's pid are kept in the
# $log_pipe_* variables.
sub log_writer_start($)
{
  my ($logfilename) = @_;
  my @arv_put_cmd = ('arv-put', '--portable-data-hash',
                     '--retries', '3',
                     '--filename', $logfilename,
                     '-');
  $log_pipe_pid = open2($log_pipe_out, $log_pipe_in, @arv_put_cmd);
}
1408
# Write $txt to the stdin of the running arv-put log writer.
sub log_writer_send($)
{
  my ($txt) = @_;
  print {$log_pipe_in} $txt;
}
1414
# Shut down the arv-put log writer and return what it printed (chomped).
#
# Closes arv-put's stdin, waits up to 120 seconds for its output, reaps
# the process and resets the log pipe state, so that
# log_writer_is_active() is false afterwards.  A non-zero exit status of
# arv-put is logged.
#
# Returns nothing when no log writer is running, and undef when arv-put
# produced no output within the timeout.
sub log_writer_finish()
{
  return unless $log_pipe_pid;

  # Closing its stdin tells arv-put that the log is complete.
  close($log_pipe_in);
  my $arv_put_output;

  my $s = IO::Select->new($log_pipe_out);
  if ($s->can_read(120)) {
    sysread($log_pipe_out, $arv_put_output, 1024);
    chomp($arv_put_output);
  } else {
    Log (undef, "timed out reading from 'arv-put'");
  }

  waitpid($log_pipe_pid, 0);
  # Save the wait status immediately so that nothing can clobber $?.
  my $arv_put_status = $?;
  $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
  if ($arv_put_status) {
    # Log() takes ($jobstep_id, $message): the undef jobstep id must be
    # passed explicitly, otherwise the message is taken for the id and
    # the message text is missing.
    Log (undef, "log_writer_finish: arv-put exited ".exit_status_s($arv_put_status));
  }

  return $arv_put_output;
}
1438
# True (the pid of the arv-put subprocess) while a log writer is
# running, false otherwise.
sub log_writer_is_active() {
  my $writer_pid = $log_pipe_pid;
  return $writer_pid;
}
1442
# Write one log entry to stderr and, while the log writer is active, to
# the job log in Keep.
#
# Each entry has the form "<job id> <our pid> <jobstep id> <message>";
# every character outside the printable ASCII range is replaced by a
# backslash followed by its 3-digit octal code.  A UTC timestamp is
# prepended for the Keep log, and for stderr when stderr is a terminal.
# A message containing newlines is logged as one entry per line.
sub Log                         # ($jobstep_id, $logmessage)
{
  if ($_[1] =~ /\n/) {
    foreach my $piece (split (/\n/, $_[1])) {
      Log ($_[0], $piece);
    }
    return;
  }

  # Make STDERR unbuffered without changing the selected handle.
  my $previously_selected = select STDERR;
  $| = 1;
  select $previously_selected;

  my $message = sprintf ("%s %d %s %s", $job_id, $$, @_);
  $message =~ s{([^ -\176])}{"\\" . sprintf ("%03o", ord($1))}ge;
  $message .= "\n";

  my $stderr_is_tty = -t STDERR;
  my $writer_active = log_writer_is_active();

  # The timestamp is only built when one of the sinks will use it.
  my $datetime;
  if ($writer_active || $stderr_is_tty) {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime;
    $datetime = sprintf ("%04d-%02d-%02d_%02d:%02d:%02d",
                         $year+1900, $mon+1, $mday, $hour, $min, $sec);
  }

  print STDERR ($stderr_is_tty ? ($datetime." ".$message) : $message);

  log_writer_send($datetime . " " . $message) if $writer_active;
}
1467
1468
# Fatal error handler: log the message together with the caller's file
# and line, salvage what can be salvaged (freeze and collate the output
# if tasks remain, update the job record, save the log), then die.
sub croak
{
  my (undef, $file, $line) = caller;
  Log (undef, "@_ at $file line $line\n");

  if (@jobstep_todo) {
    freeze();
    create_output_collection();
  }
  cleanup();
  save_meta();
  die;
}
1480
1481
# Record the end of an unsuccessful run on the job record: a cancelled
# job gets a finished_at timestamp, any other job is marked 'Failed'.
# Does nothing when there is no job record.
sub cleanup
{
  return if !$Job;

  my @update = ($Job->{'state'} eq 'Cancelled')
      ? ('finished_at' => scalar gmtime)
      : ('state' => 'Failed');
  $Job->update_attributes(@update);
}
1491
1492
# Finish the log writer and store the resulting log locator on the job
# record.  A true argument means "checkpoint only"; checkpoints are
# skipped entirely.  Nothing happens either when no log writer is
# active.
sub save_meta
{
  my ($justcheckpoint) = @_;   # false if this will be the last meta saved

  # checkpointing is not relevant post-Warehouse.pm
  return if $justcheckpoint || !log_writer_is_active();

  my $loglocator = log_writer_finish();
  Log (undef, "log manifest is $loglocator");
  $Job->{'log'} = $loglocator;
  $Job->update_attributes('log' => $loglocator);
}
1504
1505
# When a freeze has been requested ($main::please_freeze): release the
# allocation, kill and reap the processes whose pids are given as
# arguments, save what can be saved and exit with status 1.  Returns
# without doing anything when no freeze was requested.
sub freeze_if_want_freeze
{
  return unless $main::please_freeze;

  release_allocation();

  if (@_) {
    # kill some srun procs before freeze+stop
    foreach my $pid (@_) {
      $proc{$pid} = {};
    }
    # Keep signalling until every process has been reaped.
    while (%proc) {
      killem (keys %proc);
      select (undef, undef, undef, 0.1);
      while ((my $died = waitpid (-1, WNOHANG)) > 0) {
        delete $proc{$died};
      }
    }
  }

  freeze();
  create_output_collection();
  cleanup();
  save_meta();
  exit 1;
}
1533
1534
# Placeholder: freezing is not implemented, so only log that fact.
sub freeze
{
  my $notice = "Freeze not implemented";
  Log (undef, $notice);
  return;
}
1540
1541
# Placeholder: thawing is not implemented, so abort through croak.
sub thaw
{
  my $notice = "Thaw not implemented";
  croak ($notice);
}
1546
1547
# Escape a string so that it fits on a single line: every backslash is
# doubled and every newline becomes the two characters backslash, "n".
# Returns the escaped copy.
sub freezequote
{
  my ($text) = @_;
  my %escape_for = ("\\" => "\\\\", "\n" => "\\n");
  $text =~ s/([\\\n])/$escape_for{$1}/g;
  return $text;
}
1555
1556
# Undo freezequote: a backslash followed by "n" becomes a newline, and
# a backslash followed by any other character (except a newline)
# becomes that character.  Returns the unescaped copy.
sub freezeunquote
{
  my ($text) = @_;
  my %char_for = ("n" => "\n");
  $text =~ s{\\(.)}{ exists $char_for{$1} ? $char_for{$1} : $1 }ge;
  return $text;
}
1563
1564
# Run a command, through srun when SLURM is available.
#
# Arguments:
#   $srunargs - array ref: the srun command and its options (only used
#               under SLURM)
#   $execargs - array ref: the command to run and its arguments
#   $opts     - optional hash ref; {fork => 1} runs the command with
#               system() and returns its status instead of exec()ing
#   $stdin    - optional string fed to the command's standard input
#
# Without the fork option this function does not return: it replaces
# the current process, or dies if exec fails.
sub srun
{
  my ($srunargs, $execargs, $opts, $stdin) = @_;
  $opts ||= {};
  my $cmd = $have_slurm ? [@$srunargs, @$execargs] : $execargs;

  # Announce the command on stderr as a compact one-line dump, with the
  # value of anything that looks like a TOKEN=... assignment masked.
  $Data::Dumper::Terse = 1;
  $Data::Dumper::Indent = 0;
  my $show_cmd = Dumper($cmd);
  $show_cmd =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
  $show_cmd =~ s/\n/ /g;
  warn "starting: $show_cmd\n";

  if (defined $stdin) {
    # Fork a helper whose STDOUT is connected to our STDIN; the helper
    # writes the requested input and exits.
    my $feeder_pid = open STDIN, "-|";
    die "no fork: $!" unless defined $feeder_pid;
    if ($feeder_pid == 0) {
      print $stdin or die $!;
      close STDOUT or die $!;
      exit 0;
    }
  }

  if ($opts->{fork}) {
    return system (@$cmd);
  }

  exec @$cmd;
  # Only reached when exec itself failed.
  warn "ENV size is ".length(join(" ",%ENV));
  die "exec failed: $!: @$cmd";
}
1596
1597
sub ban_node_by_slot {
  # Put the node that owns the given slot on hold, so that no new
  # jobsteps are started on it for 60 seconds, and count the hold.
  my ($slotid) = @_;
  my $node = ($slot[$slotid]->{node} ||= {});
  $node->{hold_until} = 60 + scalar time;
  $node->{hold_count}++;
  Log (undef, "backing off node " . $node->{name} . " for 60 seconds");
}
1605
# Take an exclusive, non-blocking lock on $lockfile, creating or
# truncating the file.  Aborts through croak, quoting $error_message,
# when the file cannot be opened or is already locked.  The lock is
# held on the global filehandle L.
sub must_lock_now
{
  my $lockfile = shift;
  my $error_message = shift;

  unless (open(L, ">", $lockfile)) {
    croak("$lockfile: $!");
  }
  flock(L, LOCK_EX|LOCK_NB)
      or croak("Can't lock $lockfile: $error_message\n");
}
1614
sub find_docker_image {
  # Look up the collection with the given Keep locator and decide
  # whether it holds a Docker image: exactly one file, named
  # <64 hex digits>.tar.
  # Returns (stream name, Docker hash) if so, (undef, undef) otherwise.
  my ($locator) = @_;
  my $image = api_call("collections/get", uuid => $locator);

  my ($streamname, $filename);
  my @manifest_lines = $image ? split(/\n/, $image->{manifest_text}) : ();
  foreach my $line (@manifest_lines) {
    my @tokens = split(/\s+/, $line);
    next unless @tokens;

    # The first token of a manifest line is the stream name; file
    # tokens look like "position:size:name".
    $streamname = shift(@tokens);
    foreach my $filedata (grep(/^\d+:\d+:/, @tokens)) {
      # More than one file in the Collection.
      return (undef, undef) if defined($filename);
      $filename = (split(/:/, $filedata, 3))[2];
    }
  }

  return (undef, undef) unless defined($filename);
  return (undef, undef) unless $filename =~ /^([0-9A-Fa-f]{64})\.tar$/;
  return ($streamname, $1);
}
1642
sub retry_count {
  # Work out how many times an operation should be retried, assuming
  # exponential backoff: roughly log2 of the time span derived from the
  # first jobstep's start time and the last jobstep's finish time.
  # Never returns less than 3.
  my $starttime;
  my $endtime;
  if (@jobstep) {
    $starttime = $jobstep[0]->{starttime};
    $endtime = $jobstep[-1]->{finishtime};
  }

  # No start time recorded: no span.  Started but no finish time on
  # the last jobstep: time since the start.  Otherwise: the start-to-
  # finish span, reduced by the time elapsed since the finish.
  my $timediff = !defined($starttime) ? 0
               : !defined($endtime)   ? time - $starttime
               :                        ($endtime - $starttime) - (time - $endtime);

  my $retries = ($timediff > 0) ? int(log($timediff) / log(2)) : 1;
  return 3 if $retries <= 3;
  return $retries;
}
1666
sub retry_op {
  # Arguments: an operation (code ref), a retry callback (code ref),
  # and the arguments to call the operation with.
  # The operation is tried up to retry_count() + 1 times with
  # exponential backoff; its result is returned as soon as it
  # succeeds.  After every failure that will be retried, the callback
  # is called with the 0-based try count, the time of the next try and
  # the error message.  When every try has failed, the last error is
  # rethrown.
  my ($operation, $retry_callback, @op_args) = @_;
  my $retries = retry_count();
  my $last_error;

  foreach my $try_count (0..$retries) {
    my $next_try = time + (2 ** $try_count);
    my $result = eval { $operation->(@op_args); };
    $last_error = $@;
    return $result unless $last_error;

    # The final failure is neither reported to the callback nor
    # followed by a sleep.
    next unless $try_count < $retries;

    $retry_callback->($try_count, $next_try, $last_error);
    my $sleep_time = $next_try - time;
    sleep($sleep_time) if ($sleep_time > 0);
  }

  # Ensure the error message ends in a newline, so Perl doesn't add
  # retry_op's line number to it.
  chomp($last_error);
  die($last_error . "\n");
}
1693
sub api_call {
  # Call an Arvados API method, given as a /-separated name (for
  # example "jobs/get") followed by the method's arguments.  Failed
  # calls are retried through retry_op; every failure that will be
  # retried is logged.  Returns the result of the call.
  my ($method_name, @method_args) = @_;

  # Walk down from the API client to the requested method.
  my $method = $arv;
  foreach my $key (split(/\//, $method_name)) {
    $method = $method->{$key};
  }

  my $report_retry = sub {
    my ($try_count, $next_try_at, $errmsg) = @_;

    # Reduce the error to one tidy line without this script's file and
    # line annotation.
    for ($errmsg) {
      s/\s*\bat \Q$0\E line \d+\.?\s*//;
      s/\s/ /g;
      s/\s+$//;
    }

    my $retry_msg = "Retrying.";
    if ($next_try_at >= time) {
      my $next_try_fmt = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($next_try_at);
      $retry_msg = "Retrying at $next_try_fmt.";
    }
    Log(undef, "API method $method_name failed: $errmsg. $retry_msg");
  };

  return retry_op(sub { $method->execute(@_); }, $report_retry, @method_args);
}
1719
sub exit_status_s {
  # Turn a wait status (a $? value) into a human-readable string: the
  # exit value, followed by " with signal N" when a signal number is
  # set and by " with core dump" when the core dump bit is set.
  # Examples: "0", "1", "0 with signal 11",
  # "0 with signal 11 with core dump".
  my ($wait_status) = @_;
  my @parts = ($wait_status >> 8);
  my $signal = $wait_status & 0x7f;
  push @parts, "signal " . $signal if $signal;
  push @parts, "core dump" if $wait_status & 0x80;
  return join(" with ", @parts);
}
1733
sub handle_readall {
  # Takes a file handle (for example a glob reference) and returns
  # everything that can still be read from it as one string.
  my ($handle) = @_;
  # With the input record separator undefined, one read returns the
  # whole remaining content; "local" restores the separator on return.
  local $/;
  return readline($handle);
}
1741
sub tar_filename_n {
  # Path of the $n-th saved git archive of this job, located in the
  # directory named by $ENV{CRUNCH_TMP}.
  my ($n) = @_;
  my $basename = sprintf("git.%s.%d.tar", $job_id, $n);
  return sprintf("%s/%s", $ENV{CRUNCH_TMP}, $basename);
}
1746
sub add_git_archive {
  # Takes a git archive command as a string or list, a la system().
  # Runs it and saves its standard output as the next numbered tar file
  # (see tar_filename_n), for later inclusion in the archive sent to
  # the build script.  Aborts through croak if the file cannot be
  # created or the command exits with a non-zero status.
  my $tar_path = tar_filename_n(++$git_tar_count);
  open(GIT_ARCHIVE, ">", $tar_path)
      or croak("Failed to save git archive: $!");

  # The command writes straight into the tar file; it gets no input.
  my $git_input;
  my $git_pid = open2(">&GIT_ARCHIVE", $git_input, @_);
  close($git_input);
  waitpid($git_pid, 0);
  close(GIT_ARCHIVE);

  croak("Failed to save git archive: git exited " . exit_status_s($?))
      if $?;
}
1764
sub combined_git_archive {
  # Append every saved tar archive after the first to the first one
  # ("tar -A"), then return the contents of the combined archive as a
  # string.  Returns undef if no archives have been saved.
  return undef if $git_tar_count < 1;

  my $base_tar_name = tar_filename_n(1);
  for my $n (2..$git_tar_count) {
    my $tar_to_append = tar_filename_n($n);
    my $tar_exit = system("tar", "-Af", $base_tar_name, $tar_to_append);
    next if $tar_exit == 0;
    croak("Error preparing build archive: tar -A exited " .
          exit_status_s($tar_exit));
  }

  open(GIT_TAR, "<", $base_tar_name)
      or croak("Could not open build archive: $!");
  my $tar_contents = handle_readall(\*GIT_TAR);
  close(GIT_TAR);
  return $tar_contents;
}
1786
1787 __DATA__
1788 #!/usr/bin/perl
1789 #
1790 # This is crunch-job's internal dispatch script.  crunch-job running on the API
1791 # server invokes this script on individual compute nodes, or localhost if we're
1792 # running a job locally.  It gets called in two modes:
1793 #
1794 # * No arguments: Installation mode.  Read a tar archive from the DATA
1795 #   file handle; it includes the Crunch script's source code, and
1796 #   maybe SDKs as well.  Those should be installed in the proper
1797 #   locations.  This runs outside of any Docker container, so don't try to
1798 #   introspect Crunch's runtime environment.
1799 #
1800 # * With arguments: Crunch script run mode.  This script should set up the
1801 #   environment, then run the command specified in the arguments.  This runs
1802 #   inside any Docker container.
1803
1804 use Fcntl ':flock';
1805 use File::Path qw( make_path remove_tree );
1806 use POSIX qw(getcwd);
1807
# Map SDK subdirectories to the path environments they belong to.
my %SDK_ENVVARS = ("perl/lib" => "PERLLIB", "ruby/lib" => "RUBYLIB");

# Everything this script needs to know comes from the environment.
my $destdir = $ENV{"CRUNCH_SRC"};
my $commit = $ENV{"CRUNCH_SRC_COMMIT"};
my $repo = $ENV{"CRUNCH_SRC_URL"};
# The install directory defaults to "opt" under the current directory.
my $install_dir = $ENV{"CRUNCH_INSTALL"} || (getcwd() . "/opt");
my $job_work = $ENV{"JOB_WORK"};
my $task_work = $ENV{"TASK_WORK"};

# Create each configured working directory, and stop if one of them
# still does not exist afterwards.
for my $dir ($destdir, $job_work, $task_work) {
  if ($dir) {
    make_path $dir;
    -e $dir or die "Failed to create temporary directory ($dir): $!";
  }
}

# Empty the task work directory but keep the directory itself.
if ($task_work) {
  remove_tree($task_work, {keep_root => 1});
}

# Keep duplicates of the original STDOUT/STDERR, then send both streams
# to "$destdir.log" (appending) for the duration of the setup work.
open(STDOUT_ORIG, ">&", STDOUT);
open(STDERR_ORIG, ">&", STDERR);
open(STDOUT, ">>", "$destdir.log");
open(STDERR, ">&", STDOUT);
1833
### Crunch script run mode
# Entered when arguments are given: prepare the environment (Python SDK
# virtualenv, SDK library paths), then exec the command in @ARGV.  This
# branch never returns: it ends in exec or die.
if (@ARGV) {
  # We want to do routine logging during task 0 only.  This gives the user
  # the information they need, but avoids repeating the information for every
  # task.
  # Note that $Log passes its first argument to printf as part of the
  # format string; any further arguments fill in its placeholders.
  my $Log;
  if ($ENV{TASK_SEQUENCE} eq "0") {
    $Log = sub {
      my $msg = shift;
      printf STDERR_ORIG "[Crunch] $msg\n", @_;
    };
  } else {
    $Log = sub { };
  }

  # Build the Python SDK virtualenv once per job work directory: only
  # when it has no activate script yet, the SDK source is installed and
  # virtualenv can be run.
  # NOTE(review): can_run and shell_or_die are not defined in this part
  # of the file -- presumably helpers defined further down; confirm.
  my $python_src = "$install_dir/python";
  my $venv_dir = "$job_work/.arvados.venv";
  my $venv_built = -e "$venv_dir/bin/activate";
  if ((!$venv_built) and (-d $python_src) and can_run("virtualenv")) {
    shell_or_die("virtualenv", "--quiet", "--system-site-packages",
                 "--python=python2.7", $venv_dir);
    shell_or_die("$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src);
    $venv_built = 1;
    $Log->("Built Python SDK virtualenv");
  }

  my $pip_bin = "pip";
  if ($venv_built) {
    # Wrap the command in a shell that activates the virtualenv first;
    # every original argument is quoted for the shell with quotemeta.
    $Log->("Running in Python SDK virtualenv");
    $pip_bin = "$venv_dir/bin/pip";
    my $orig_argv = join(" ", map { quotemeta($_); } @ARGV);
    @ARGV = ("/bin/sh", "-ec",
             ". \Q$venv_dir/bin/activate\E; exec $orig_argv");
  } elsif (-d $python_src) {
    $Log->("Warning: virtualenv not found inside Docker container default " .
           "\$PATH. Can't install Python SDK.");
  }

  # Report the installed Arvados packages: those listed by pip, or
  # failing that those listed by dpkg-query.
  my $pkgs = `(\Q$pip_bin\E freeze 2>/dev/null | grep arvados) || dpkg-query --show '*arvados*'`;
  if ($pkgs) {
    $Log->("Using Arvados SDK:");
    foreach my $line (split /\n/, $pkgs) {
      $Log->($line);
    }
  } else {
    $Log->("Arvados SDK packages not found");
  }

  # Put each installed SDK library directory at the front of the
  # matching search path variable.
  while (my ($sdk_dir, $sdk_envkey) = each(%SDK_ENVVARS)) {
    my $sdk_path = "$install_dir/$sdk_dir";
    if (-d $sdk_path) {
      if ($ENV{$sdk_envkey}) {
        $ENV{$sdk_envkey} = "$sdk_path:" . $ENV{$sdk_envkey};
      } else {
        $ENV{$sdk_envkey} = $sdk_path;
      }
      $Log->("Arvados SDK added to %s", $sdk_envkey);
    }
  }

  # Restore the original STDOUT/STDERR before handing over to the
  # command.
  close(STDOUT);
  close(STDERR);
  open(STDOUT, ">&", STDOUT_ORIG);
  open(STDERR, ">&", STDERR_ORIG);
  exec(@ARGV);
  die "Cannot exec `@ARGV`: $!";
}
1901
1902 ### Installation mode
# Take an exclusive lock on "$destdir.lock" before touching the install
# tree. The handle L stays open, so the lock is held until L is closed or
# the process exits.
open(L, ">", "$destdir.lock") or die "$destdir.lock: $!";
flock(L, LOCK_EX);

# The "$destdir.commit" symlink records which commit is installed. If it
# already names the requested commit and the tree exists, there is
# nothing to do.
exit(0) if ((readlink("$destdir.commit") eq $commit) and (-d $destdir));
1909
# Remove the installed-commit marker before unpacking, and make sure the
# destination directory exists.
unlink("$destdir.commit");
mkdir($destdir);

# Unpack the archive that follows __DATA__ by piping it into tar.
open(TARX, "|-", "tar", "-xC", $destdir)
    or die "Error launching 'tar -xC $destdir': $!";
# If we send too much data to tar in one write (> 4-5 MiB), it stops, and we
# get SIGPIPE.  We must feed it data incrementally, 64 KiB at a time.
my $tar_input;
print TARX $tar_input while (read(DATA, $tar_input, 65536));
# Closing the pipe waits for tar; a false result means tar failed.
close(TARX) or die "'tar -xC $destdir' exited $?: $!";
1925
mkdir($install_dir);

# Move each bundled SDK out of the unpacked tree into $install_dir. The
# languages considered are "python" plus the first path component of
# every key in %SDK_ENVVARS.
my $sdk_root = "$destdir/.arvados.sdk/sdk";
if (-d $sdk_root) {
  my @sdk_langs = ("python",
                   map { (split /\//, $_, 2)[0]; } keys(%SDK_ENVVARS));
  foreach my $sdk_lang (@sdk_langs) {
    my $sdk_src = "$sdk_root/$sdk_lang";
    # Skip languages that were not bundled (or were already moved).
    next unless (-d $sdk_src);
    rename($sdk_src, "$install_dir/$sdk_lang")
        or die "Failed to install $sdk_lang SDK: $!";
  }
}
1939
# If the Python SDK is installed and python2.7 is available, try running
# "setup.py egg_info". When that fails, append an [egg_info] section with
# an empty tag_build to setup.cfg.
my $python_dir = "$install_dir/python";
if ((-d $python_dir) and can_run("python2.7") and
    (system("python2.7", "$python_dir/setup.py", "--quiet", "egg_info") != 0)) {
  # egg_info failed, probably when it asked git for a build tag.
  # Specify no build tag.
  my $setup_cfg = "$python_dir/setup.cfg";
  # Fail loudly if the override cannot be written: printing to a handle
  # that never opened would silently lose it.
  open(my $pysdk_cfg, ">>", $setup_cfg) or die "$setup_cfg: $!";
  print $pysdk_cfg "\n[egg_info]\ntag_build =\n";
  # Buffered write errors only surface at close.
  close($pysdk_cfg) or die "$setup_cfg: $!";
}
1949
# Run at most one install hook, passing it $install_dir. Preference
# order: the unpacked tree's crunch_scripts/install, then ./install.sh,
# then ./tests/autotests.sh.
if (-e "$destdir/crunch_scripts/install") {
    shell_or_die ("$destdir/crunch_scripts/install", $install_dir);
} elsif (-e "./install.sh") {
    shell_or_die ("./install.sh", $install_dir);
} elsif (-e "./tests/autotests.sh") {
    # Old version
    shell_or_die ("./tests/autotests.sh", $install_dir);
}
1958
# Record the installed commit as the target of the "$destdir.commit"
# symlink. The link is created under a temporary name and then renamed
# into place, so the final name changes in a single rename step.
if ($commit) {
    my $new_link = "$destdir.commit.new";
    unlink($new_link);
    symlink($commit, $new_link) or die "$new_link: $!";
    rename($new_link, "$destdir.commit") or die "$destdir.commit: $!";
}

# Done installing: close the lock file handle.
close(L);
1966
# can_run($command_name)
#
# Return true if `which $command_name` exits with status 0, false
# otherwise -- including when `which` itself cannot be started.
sub can_run {
  my $command_name = shift;
  # If the pipe cannot be opened, $? still holds the status of some
  # earlier child process; report "not runnable" instead of trusting it.
  open(my $which, "-|", "which", $command_name) or return 0;
  # Read and discard which's output. Localize $_ so the caller's value
  # is not clobbered by the read loop.
  local $_;
  while (<$which>) { }
  # close() waits for the child and stores its exit status in $?.
  close($which);
  return ($? == 0);
}
1974
# shell_or_die(@command)
#
# Run @command with system() and return normally if it exits with status
# 0. Otherwise point STDERR back at STDERR_ORIG, copy "$destdir.log" to
# it, and die with a message describing how the command failed.
#
# When $ENV{DEBUG} is set, the command is echoed to STDERR first.
sub shell_or_die
{
  if ($ENV{"DEBUG"}) {
    print STDERR "@_\n";
  }
  if (system (@_) != 0) {
    # Capture both status variables before anything else overwrites them.
    my $err = $!;
    my $status = $?;
    # system() yields -1 when the command could not be started at all.
    # There is no exit code or signal to decode in that case (shifting -1
    # would print a huge bogus exit code and signal 127); $err says why.
    my $exitstatus = ($status == -1)
        ? "could not execute command"
        : sprintf("exit %d signal %d", $status >> 8, $status & 0x7f);
    open STDERR, ">&STDERR_ORIG";
    # NOTE(review): "$destdir.log" presumably holds the output captured
    # during installation -- confirm against where STDERR is redirected.
    # The path is quoted so shell metacharacters in it are harmless.
    system ("cat \Q$destdir.log\E >&2");
    die "@_ failed ($err): $exitstatus";
  }
}
1988
1989 __DATA__