sdk/cli/bin/crunch-job

   1 #!/usr/bin/env perl
   2 # -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
   3
   4 =head1 NAME
   5
   6 crunch-job: Execute job steps, save snapshots as requested, collate output.
   7
   8 =head1 SYNOPSIS
   9
  10 Obtain job details from Arvados, run tasks on compute nodes (typically
  11 invoked by scheduler on controller):
  12
  13  crunch-job --job x-y-z --git-dir /path/to/repo/.git
  14
  15 Obtain job details from command line, run tasks on local machine
  16 (typically invoked by application or developer on VM):
  17
  18  crunch-job --job '{"script_version":"/path/to/working/tree","script":"scriptname",...}'
  19
  20  crunch-job --job '{"repository":"https://github.com/curoverse/arvados.git","script_version":"master","script":"scriptname",...}'
  21
  22 =head1 OPTIONS
  23
  24 =over
  25
  26 =item --force-unlock
  27
  28 If the job is already locked, steal the lock and run it anyway.
  29
  30 =item --git-dir
  31
  32 Path to a .git directory (or a git URL) where the commit given in the
  33 job's C<script_version> attribute is to be found. If this is I<not>
  34 given, the job's C<repository> attribute will be used.
  35
  36 =item --job-api-token
  37
  38 Arvados API authorization token to use during the course of the job.
  39
  40 =item --no-clear-tmp
  41
  42 Do not clear per-job/task temporary directories during initial job
  43 setup. This can speed up development and debugging when running jobs
  44 locally.
  45
  46 =item --job
  47
  48 UUID of the job to run, or a JSON-encoded job resource without a
  49 UUID. If the latter is given, a new job object will be created.
  50
  51 =back
  52
  53 =head1 RUNNING JOBS LOCALLY
  54
  55 crunch-job's log messages appear on stderr along with the job tasks'
  56 stderr streams. The log is saved in Keep at each checkpoint and when
  57 the job finishes.
  58
  59 If the job succeeds, the job's output locator is printed on stdout.
  60
  61 While the job is running, the following signals are accepted:
  62
  63 =over
  64
  65 =item control-C, SIGINT, SIGQUIT
  66
  67 Save a checkpoint, terminate any job tasks that are running, and stop.
  68
  69 =item SIGALRM
  70
  71 Save a checkpoint and continue.
  72
  73 =item SIGHUP
  74
  75 Refresh node allocation (i.e., check whether any nodes have been added
  76 or unallocated) and attributes of the Job record that should affect
  77 behavior (e.g., cancel job if cancelled_at becomes non-nil).
  78
  79 =back
  80
  81 =cut
  82
  83
  84 use strict;
  85 use POSIX ':sys_wait_h';
  86 use POSIX qw(strftime);
  87 use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK);
  88 use Arvados;
  89 use Cwd qw(realpath);
  90 use Data::Dumper;
  91 use Digest::MD5 qw(md5_hex);
  92 use Getopt::Long;
  93 use IPC::Open2;
  94 use IO::Select;
  95 use File::Temp;
  96 use Fcntl ':flock';
  97 use File::Path qw( make_path remove_tree );
  98
  99 use constant TASK_TEMPFAIL => 111;
 100 use constant EX_TEMPFAIL => 75;
 101 use constant EX_RETRY_UNLOCKED => 93;
 102
 103 $ENV{"TMPDIR"} ||= "/tmp";
 104 unless (defined $ENV{"CRUNCH_TMP"}) {
 105   $ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
 106   if ($ENV{"USER"} ne "crunch" && $< != 0) {
 107     # use a tmp dir unique for my uid
 108     $ENV{"CRUNCH_TMP"} .= "-$<";
 109   }
 110 }
 111
 112 # Create the tmp directory if it does not exist
 113 if ( ! -d $ENV{"CRUNCH_TMP"} ) {
 114   make_path $ENV{"CRUNCH_TMP"} or die "Failed to create temporary working directory: " . $ENV{"CRUNCH_TMP"};
 115 }
 116
 117 $ENV{"JOB_WORK"} = $ENV{"CRUNCH_TMP"} . "/work";
 118 $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
 119 $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
 120 mkdir ($ENV{"JOB_WORK"});
 121
 122 my %proc;
 123 my $force_unlock;
 124 my $git_dir;
 125 my $jobspec;
 126 my $job_api_token;
 127 my $no_clear_tmp;
 128 my $resume_stash;
 129 my $cgroup_root = "/sys/fs/cgroup";
 130 my $docker_bin = "docker.io";
 131 my $docker_run_args = "";
 132 GetOptions('force-unlock' => \$force_unlock,
 133            'git-dir=s' => \$git_dir,
 134            'job=s' => \$jobspec,
 135            'job-api-token=s' => \$job_api_token,
 136            'no-clear-tmp' => \$no_clear_tmp,
 137            'resume-stash=s' => \$resume_stash,
 138            'cgroup-root=s' => \$cgroup_root,
 139            'docker-bin=s' => \$docker_bin,
 140            'docker-run-args=s' => \$docker_run_args,
 141     );
 142
 143 if (defined $job_api_token) {
 144   $ENV{ARVADOS_API_TOKEN} = $job_api_token;
 145 }
 146
 147 my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
 148
 149
 150 $SIG{'USR1'} = sub
 151 {
 152   $main::ENV{CRUNCH_DEBUG} = 1;
 153 };
 154 $SIG{'USR2'} = sub
 155 {
 156   $main::ENV{CRUNCH_DEBUG} = 0;
 157 };
 158
 159 my $arv = Arvados->new('apiVersion' => 'v1');
 160
 161 my $Job;
 162 my $job_id;
 163 my $dbh;
 164 my $sth;
 165 my @jobstep;
 166
 167 my $local_job;
 168 if ($jobspec =~ /^[-a-z\d]+$/)
 169 {
 170   # $jobspec is an Arvados UUID, not a JSON job specification
 171   $Job = api_call("jobs/get", uuid => $jobspec);
 172   $local_job = 0;
 173 }
 174 else
 175 {
 176   $local_job = JSON::decode_json($jobspec);
 177 }
 178
 179
 180 # Make sure our workers (our slurm nodes, localhost, or whatever) are
 181 # at least able to run basic commands: they aren't down or severely
 182 # misconfigured.
 183 my $cmd = ['true'];
 184 if (($Job || $local_job)->{docker_image_locator}) {
 185   $cmd = [$docker_bin, 'ps', '-q'];
 186 }
 187 Log(undef, "Sanity check is `@$cmd`");
 188 my ($exited, $stdout, $stderr) = srun_sync(
 189   ["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],
 190   $cmd,
 191   {label => "sanity check"});
 192 if ($exited != 0) {
 193   Log(undef, "Sanity check failed: ".exit_status_s($exited));
 194   exit EX_TEMPFAIL;
 195 }
 196 Log(undef, "Sanity check OK");
 197
 198
 199 my $User = api_call("users/current");
 200
 201 if (!$local_job) {
 202   if (!$force_unlock) {
 203     # Claim this job, and make sure nobody else does
 204     eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
 205     if ($@) {
 206       Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
 207       exit EX_TEMPFAIL;
 208     };
 209   }
 210 }
 211 else
 212 {
 213   if (!$resume_stash)
 214   {
 215     map { croak ("No $_ specified") unless $local_job->{$_} }
 216     qw(script script_version script_parameters);
 217   }
 218
 219   $local_job->{'is_locked_by_uuid'} = $User->{'uuid'};
 220   $local_job->{'started_at'} = gmtime;
 221   $local_job->{'state'} = 'Running';
 222
 223   $Job = api_call("jobs/create", job => $local_job);
 224 }
 225 $job_id = $Job->{'uuid'};
 226
 227 my $keep_logfile = $job_id . '.log.txt';
 228 log_writer_start($keep_logfile);
 229
 230 $Job->{'runtime_constraints'} ||= {};
 231 $Job->{'runtime_constraints'}->{'max_tasks_per_node'} ||= 0;
 232 my $max_ncpus = $Job->{'runtime_constraints'}->{'max_tasks_per_node'};
 233
 234 my $gem_versions = `gem list --quiet arvados-cli 2>/dev/null`;
 235 if ($? == 0) {
 236   $gem_versions =~ s/^arvados-cli \(/ with arvados-cli Gem version(s) /;
 237   chomp($gem_versions);
 238   chop($gem_versions);  # Closing parentheses
 239 } else {
 240   $gem_versions = "";
 241 }
 242 Log(undef,
 243     "running from " . ((-e $0) ? realpath($0) : "stdin") . $gem_versions);
 244
 245 Log (undef, "check slurm allocation");
 246 my @slot;
 247 my @node;
 248 # Should use $ENV{SLURM_TASKS_PER_NODE} instead of sinfo? (eg. "4(x3),2,4(x2)")
 249 my @sinfo;
 250 if (!$have_slurm)
 251 {
 252   my $localcpus = 0 + `grep -cw ^processor /proc/cpuinfo` || 1;
 253   push @sinfo, "$localcpus localhost";
 254 }
 255 if (exists $ENV{SLURM_NODELIST})
 256 {
 257   push @sinfo, `sinfo -h --format='%c %N' --nodes=\Q$ENV{SLURM_NODELIST}\E`;
 258 }
 259 foreach (@sinfo)
 260 {
 261   my ($ncpus, $slurm_nodelist) = split;
 262   $ncpus = $max_ncpus if $max_ncpus && $ncpus > $max_ncpus;
 263
 264   my @nodelist;
 265   while ($slurm_nodelist =~ s/^([^\[,]+?(\[.*?\])?)(,|$)//)
 266   {
 267     my $nodelist = $1;
 268     if ($nodelist =~ /\[((\d+)(-(\d+))?(,(\d+)(-(\d+))?)*)\]/)
 269     {
 270       my $ranges = $1;
 271       foreach (split (",", $ranges))
 272       {
 273         my ($a, $b);
 274         if (/(\d+)-(\d+)/)
 275         {
 276           $a = $1;
 277           $b = $2;
 278         }
 279         else
 280         {
 281           $a = $_;
 282           $b = $_;
 283         }
 284         push @nodelist, map {
 285           my $n = $nodelist;
 286           $n =~ s/\[[-,\d]+\]/$_/;
 287           $n;
 288         } ($a..$b);
 289       }
 290     }
 291     else
 292     {
 293       push @nodelist, $nodelist;
 294     }
 295   }
 296   foreach my $nodename (@nodelist)
 297   {
 298     Log (undef, "node $nodename - $ncpus slots");
 299     my $node = { name => $nodename,
 300                  ncpus => $ncpus,
 301                  # The number of consecutive times a task has been dispatched
 302                  # to this node and failed.
 303                  losing_streak => 0,
 304                  # The number of consecutive times that SLURM has reported
 305                  # a node failure since the last successful task.
 306                  fail_count => 0,
 307                  # Don't dispatch work to this node until this time
 308                  # (in seconds since the epoch) has passed.
 309                  hold_until => 0 };
 310     foreach my $cpu (1..$ncpus)
 311     {
 312       push @slot, { node => $node,
 313                     cpu => $cpu };
 314     }
 315   }
 316   push @node, @nodelist;
 317 }
 318
 319
 320
 321 # Ensure that we get one jobstep running on each allocated node before
 322 # we start overloading nodes with concurrent steps
 323
 324 @slot = sort { $a->{cpu} <=> $b->{cpu} } @slot;
 325
 326
 327 $Job->update_attributes(
 328   'tasks_summary' => { 'failed' => 0,
 329                        'todo' => 1,
 330                        'running' => 0,
 331                        'done' => 0 });
 332
 333 Log (undef, "start");
 334 $SIG{'INT'} = sub { $main::please_freeze = 1; };
 335 $SIG{'QUIT'} = sub { $main::please_freeze = 1; };
 336 $SIG{'TERM'} = \&croak;
 337 $SIG{'TSTP'} = sub { $main::please_freeze = 1; };
 338 $SIG{'ALRM'} = sub { $main::please_info = 1; };
 339 $SIG{'CONT'} = sub { $main::please_continue = 1; };
 340 $SIG{'HUP'} = sub { $main::please_refresh = 1; };
 341
 342 $main::please_freeze = 0;
 343 $main::please_info = 0;
 344 $main::please_continue = 0;
 345 $main::please_refresh = 0;
 346 my $jobsteps_must_output_keys = 0;      # becomes 1 when any task outputs a key
 347
 348 grep { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ } split ("\n", $$Job{knobs});
 349 $ENV{"CRUNCH_JOB_UUID"} = $job_id;
 350 $ENV{"JOB_UUID"} = $job_id;
 351
 352
 353 my @jobstep_todo = ();
 354 my @jobstep_done = ();
 355 my @jobstep_tomerge = ();
 356 my $jobstep_tomerge_level = 0;
 357 my $squeue_checked = 0;
 358 my $sinfo_checked = 0;
 359 my $latest_refresh = scalar time;
 360
 361
 362
 363 if (defined $Job->{thawedfromkey})
 364 {
 365   thaw ($Job->{thawedfromkey});
 366 }
 367 else
 368 {
 369   my $first_task = api_call("job_tasks/create", job_task => {
 370     'job_uuid' => $Job->{'uuid'},
 371     'sequence' => 0,
 372     'qsequence' => 0,
 373     'parameters' => {},
 374   });
 375   push @jobstep, { 'level' => 0,
 376                    'failures' => 0,
 377                    'arvados_task' => $first_task,
 378                  };
 379   push @jobstep_todo, 0;
 380 }
 381
 382
 383 if (!$have_slurm)
 384 {
 385   must_lock_now("$ENV{CRUNCH_TMP}/.lock", "a job is already running here.");
 386 }
 387
 388 my $build_script = handle_readall(\*DATA);
 389 my $nodelist = join(",", @node);
 390 my $git_tar_count = 0;
 391
 392 if (!defined $no_clear_tmp) {
 393   # Find FUSE mounts under $CRUNCH_TMP and unmount them.  Then clean
 394   # up work directories crunch_tmp/work, crunch_tmp/opt,
 395   # crunch_tmp/src*.
 396   #
 397   # TODO: When #5036 is done and widely deployed, we can limit mount's
 398   # -t option to simply fuse.keep.
 399   my ($exited, $stdout, $stderr) = srun_sync(
 400     ["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
 401     ['bash', '-ec', '-o', 'pipefail', 'mount -t fuse,fuse.keep | awk "(index(\$3, \"$CRUNCH_TMP\") == 1){print \$3}" | xargs -r -n 1 fusermount -u -z; sleep 1; rm -rf $JOB_WORK $CRUNCH_INSTALL $CRUNCH_TMP/task $CRUNCH_TMP/src* $CRUNCH_TMP/*.cid'],
 402     {label => "clean work dirs"});
 403   if ($exited != 0) {
 404     exit(EX_RETRY_UNLOCKED);
 405   }
 406 }
 407
 408 # If this job requires a Docker image, install that.
 409 my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg);
 410 if ($docker_locator = $Job->{docker_image_locator}) {
 411   Log (undef, "Install docker image $docker_locator");
 412   ($docker_stream, $docker_hash) = find_docker_image($docker_locator);
 413   if (!$docker_hash)
 414   {
 415     croak("No Docker image hash found from locator $docker_locator");
 416   }
 417   Log (undef, "docker image hash is $docker_hash");
 418   $docker_stream =~ s/^\.//;
 419   my $docker_install_script = qq{
 420 if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then
 421     exit 0
 422 fi
 423 declare -a exit_codes=("\${PIPESTATUS[@]}")
 424 if [ 0 != "\${exit_codes[0]}" ]; then
 425    exit "\${exit_codes[0]}"  # `docker images` failed
 426 elif [ 1 != "\${exit_codes[1]}" ]; then
 427    exit "\${exit_codes[1]}"  # `grep` encountered an error
 428 else
 429    # Everything worked fine, but grep didn't find the image on this host.
 430    arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
 431 fi
 432 };
 433
 434   my ($exited, $stdout, $stderr) = srun_sync(
 435     ["srun", "--nodelist=" . join(',', @node)],
 436     ["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script],
 437     {label => "load docker image"});
 438   if ($exited != 0)
 439   {
 440     exit(EX_RETRY_UNLOCKED);
 441   }
 442
 443   # Determine whether this version of Docker supports memory+swap limits.
 444   ($exited, $stdout, $stderr) = srun_sync(
 445     ["srun", "--nodes=1"],
 446     [$docker_bin, 'run', '--help'],
 447     {label => "check --memory-swap feature"});
 448   $docker_limitmem = ($stdout =~ /--memory-swap/);
 449
 450   # Find a non-root Docker user to use.
 451   # Tries the default user for the container, then 'crunch', then 'nobody',
 452   # testing for whether the actual user id is non-zero.  This defends against
 453   # mistakes but not malice, but we intend to harden the security in the future
 454   # so we don't want anyone getting used to their jobs running as root in their
 455   # Docker containers.
 456   my @tryusers = ("", "crunch", "nobody");
 457   foreach my $try_user (@tryusers) {
 458     my $label;
 459     my $try_user_arg;
 460     if ($try_user eq "") {
 461       $label = "check whether default user is UID 0";
 462       $try_user_arg = "";
 463     } else {
 464       $label = "check whether user '$try_user' is UID 0";
 465       $try_user_arg = "--user=$try_user";
 466     }
 467     my ($exited, $stdout, $stderr) = srun_sync(
 468       ["srun", "--nodes=1"],
 469       ["/bin/sh", "-ec",
 470        "$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
 471       {label => $label});
 472     chomp($stdout);
 473     if ($exited == 0 && $stdout =~ /^\d+$/ && $stdout > 0) {
 474       $dockeruserarg = $try_user_arg;
 475       if ($try_user eq "") {
 476         Log(undef, "Container will run with default user");
 477       } else {
 478         Log(undef, "Container will run with $dockeruserarg");
 479       }
 480       last;
 481     }
 482   }
 483
 484   if (!defined $dockeruserarg) {
 485     croak("Could not find a user in container that is not UID 0 (tried default user, @tryusers) or there was a problem running 'id' in the container.");
 486   }
 487
 488   if ($Job->{arvados_sdk_version}) {
 489     # The job also specifies an Arvados SDK version.  Add the SDKs to the
 490     # tar file for the build script to install.
 491     Log(undef, sprintf("Packing Arvados SDK version %s for installation",
 492                        $Job->{arvados_sdk_version}));
 493     add_git_archive("git", "--git-dir=$git_dir", "archive",
 494                     "--prefix=.arvados.sdk/",
 495                     $Job->{arvados_sdk_version}, "sdk");
 496   }
 497 }
 498
 499 if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
 500   # If script_version looks like an absolute path, *and* the --git-dir
 501   # argument was not given -- which implies we were not invoked by
 502   # crunch-dispatch -- we will use the given path as a working
 503   # directory instead of resolving script_version to a git commit (or
 504   # doing anything else with git).
 505   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
 506   $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
 507 }
 508 else {
 509   # Resolve the given script_version to a git commit sha1. Also, if
 510   # the repository is remote, clone it into our local filesystem: this
 511   # ensures "git archive" will work, and is necessary to reliably
 512   # resolve a symbolic script_version like "master^".
 513   $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
 514
 515   Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
 516
 517   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};
 518
 519   # If we're running under crunch-dispatch, it will have already
 520   # pulled the appropriate source tree into its own repository, and
 521   # given us that repo's path as $git_dir.
 522   #
 523   # If we're running a "local" job, we might have to fetch content
 524   # from a remote repository.
 525   #
 526   # (Currently crunch-dispatch gives a local path with --git-dir, but
 527   # we might as well accept URLs there too in case it changes its
 528   # mind.)
 529   my $repo = $git_dir || $Job->{'repository'};
 530
 531   # Repository can be remote or local. If remote, we'll need to fetch it
 532   # to a local dir before doing `git log` et al.
 533   my $repo_location;
 534
 535   if ($repo =~ m{://|^[^/]*:}) {
 536     # $repo is a git url we can clone, like git:// or https:// or
 537     # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
 538     # not recognized here because distinguishing that from a local
 539     # path is too fragile. If you really need something strange here,
 540     # use the ssh:// form.
 541     $repo_location = 'remote';
 542   } elsif ($repo =~ m{^\.*/}) {
 543     # $repo is a local path to a git index. We'll also resolve ../foo
 544     # to ../foo/.git if the latter is a directory. To help
 545     # disambiguate local paths from named hosted repositories, this
 546     # form must be given as ./ or ../ if it's a relative path.
 547     if (-d "$repo/.git") {
 548       $repo = "$repo/.git";
 549     }
 550     $repo_location = 'local';
 551   } else {
 552     # $repo is none of the above. It must be the name of a hosted
 553     # repository.
 554     my $arv_repo_list = api_call("repositories/list",
 555                                  'filters' => [['name','=',$repo]]);
 556     my @repos_found = @{$arv_repo_list->{'items'}};
 557     my $n_found = $arv_repo_list->{'serverResponse'}->{'items_available'};
 558     if ($n_found > 0) {
 559       Log(undef, "Repository '$repo' -> "
 560           . join(", ", map { $_->{'uuid'} } @repos_found));
 561     }
 562     if ($n_found != 1) {
 563       croak("Error: Found $n_found repositories with name '$repo'.");
 564     }
 565     $repo = $repos_found[0]->{'fetch_url'};
 566     $repo_location = 'remote';
 567   }
 568   Log(undef, "Using $repo_location repository '$repo'");
 569   $ENV{"CRUNCH_SRC_URL"} = $repo;
 570
 571   # Resolve given script_version (we'll call that $treeish here) to a
 572   # commit sha1 ($commit).
 573   my $treeish = $Job->{'script_version'};
 574   my $commit;
 575   if ($repo_location eq 'remote') {
 576     # We minimize excess object-fetching by re-using the same bare
 577     # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
 578     # just keep adding remotes to it as needed.
 579     my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
 580     my $gitcmd = "git --git-dir=\Q$local_repo\E";
 581
 582     # Set up our local repo for caching remote objects, making
 583     # archives, etc.
 584     if (!-d $local_repo) {
 585       make_path($local_repo) or croak("Error: could not create $local_repo");
 586     }
 587     # This works (exits 0 and doesn't delete fetched objects) even
 588     # if $local_repo is already initialized:
 589     `$gitcmd init --bare`;
 590     if ($?) {
 591       croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
 592     }
 593
 594     # If $treeish looks like a hash (or abbrev hash) we look it up in
 595     # our local cache first, since that's cheaper. (We don't want to
 596     # do that with tags/branches though -- those change over time, so
 597     # they should always be resolved by the remote repo.)
 598     if ($treeish =~ /^[0-9a-f]{7,40}$/s) {
 599       # Hide stderr because it's normal for this to fail:
 600       my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
 601       if ($? == 0 &&
 602           # Careful not to resolve a branch named abcdeff to commit 1234567:
 603           $sha1 =~ /^$treeish/ &&
 604           $sha1 =~ /^([0-9a-f]{40})$/s) {
 605         $commit = $1;
 606         Log(undef, "Commit $commit already present in $local_repo");
 607       }
 608     }
 609
 610     if (!defined $commit) {
 611       # If $treeish isn't just a hash or abbrev hash, or isn't here
 612       # yet, we need to fetch the remote to resolve it correctly.
 613
 614       # First, remove all local heads. This prevents a name that does
 615       # not exist on the remote from resolving to (or colliding with)
 616       # a previously fetched branch or tag (possibly from a different
 617       # remote).
 618       remove_tree("$local_repo/refs/heads", {keep_root => 1});
 619
 620       Log(undef, "Fetching objects from $repo to $local_repo");
 621       `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
 622       if ($?) {
 623         croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
 624       }
 625     }
 626
 627     # Now that the data is all here, we will use our local repo for
 628     # the rest of our git activities.
 629     $repo = $local_repo;
 630   }
 631
 632   my $gitcmd = "git --git-dir=\Q$repo\E";
 633   my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
 634   unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
 635     croak("`$gitcmd rev-list` exited "
 636           .exit_status_s($?)
 637           .", '$treeish' not found, giving up");
 638   }
 639   $commit = $1;
 640   Log(undef, "Version $treeish is commit $commit");
 641
 642   if ($commit ne $Job->{'script_version'}) {
 643     # Record the real commit id in the database, frozentokey, logs,
 644     # etc. -- instead of an abbreviation or a branch name which can
 645     # become ambiguous or point to a different commit in the future.
 646     if (!$Job->update_attributes('script_version' => $commit)) {
 647       croak("Error: failed to update job's script_version attribute");
 648     }
 649   }
 650
 651   $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
 652   add_git_archive("$gitcmd archive ''\Q$commit\E");
 653 }
 654
 655 my $git_archive = combined_git_archive();
 656 if (!defined $git_archive) {
 657   Log(undef, "Skip install phase (no git archive)");
 658   if ($have_slurm) {
 659     Log(undef, "Warning: This probably means workers have no source tree!");
 660   }
 661 }
 662 else {
 663   my $exited;
 664   my $install_script_tries_left = 3;
 665   for (my $attempts = 0; $attempts < 3; $attempts++) {
 666     my @srunargs = ("srun",
 667                     "--nodelist=$nodelist",
 668                     "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
 669     my @execargs = ("sh", "-c",
 670                     "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
 671
 672     $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
 673     my ($stdout, $stderr);
 674     ($exited, $stdout, $stderr) = srun_sync(
 675       \@srunargs, \@execargs,
 676       {label => "run install script on all workers"},
 677       $build_script . $git_archive);
 678
 679     my $stderr_anything_from_script = 0;
 680     for my $line (split(/\n/, $stderr)) {
 681       if ($line !~ /^(srun: error: |starting: \[)/) {
 682         $stderr_anything_from_script = 1;
 683       }
 684     }
 685
 686     last if $exited == 0 || $main::please_freeze;
 687
 688     # If the install script fails but doesn't print an error message,
 689     # the next thing anyone is likely to do is just run it again in
 690     # case it was a transient problem like "slurm communication fails
 691     # because the network isn't reliable enough". So we'll just do
 692     # that ourselves (up to 3 attempts in total). OTOH, if there is an
 693     # error message, the problem is more likely to have a real fix and
 694     # we should fail the job so the fixing process can start, instead
 695     # of doing 2 more attempts.
 696     last if $stderr_anything_from_script;
 697   }
 698
 699   foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
 700     unlink($tar_filename);
 701   }
 702
 703   if ($exited != 0) {
 704     croak("Giving up");
 705   }
 706 }
 707
 708 foreach (qw (script script_version script_parameters runtime_constraints))
 709 {
 710   Log (undef,
 711        "$_ " .
 712        (ref($Job->{$_}) ? JSON::encode_json($Job->{$_}) : $Job->{$_}));
 713 }
 714 foreach (split (/\n/, $Job->{knobs}))
 715 {
 716   Log (undef, "knob " . $_);
 717 }
 718 my $resp = api_call(
 719   'nodes/list',
 720   'filters' => [['hostname', 'in', \@node]],
 721   'order' => 'hostname',
 722   'limit' => scalar(@node),
 723     );
 724 for my $n (@{$resp->{items}}) {
 725   Log(undef, "$n->{hostname} $n->{uuid} ".JSON::encode_json($n->{properties}));
 726 }
 727
 728
 729
 730 $main::success = undef;
 731
 732
 733
 734 ONELEVEL:
 735
 736 my $thisround_succeeded = 0;
 737 my $thisround_failed = 0;
 738 my $thisround_failed_multiple = 0;
 739 my $working_slot_count = scalar(@slot);
 740
 741 @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
 742                        or $a <=> $b } @jobstep_todo;
 743 my $level = $jobstep[$jobstep_todo[0]]->{level};
 744
 745 my $initial_tasks_this_level = 0;
 746 foreach my $id (@jobstep_todo) {
 747   $initial_tasks_this_level++ if ($jobstep[$id]->{level} == $level);
 748 }
 749
 750 # If the number of tasks scheduled at this level #T is smaller than the number
 751 # of slots available #S, only use the first #T slots, or the first slot on
 752 # each node, whichever number is greater.
 753 #
 754 # When we dispatch tasks later, we'll allocate whole-node resources like RAM
 755 # based on these numbers.  Using fewer slots makes more resources available
 756 # to each individual task, which should normally be a better strategy when
 757 # there are fewer of them running with less parallelism.
 758 #
 759 # Note that this calculation is not redone if the initial tasks at
 760 # this level queue more tasks at the same level.  This may harm
 761 # overall task throughput for that level.
 762 my @freeslot;
 763 if ($initial_tasks_this_level < @node) {
 764   @freeslot = (0..$#node);
 765 } elsif ($initial_tasks_this_level < @slot) {
 766   @freeslot = (0..$initial_tasks_this_level - 1);
 767 } else {
 768   @freeslot = (0..$#slot);
 769 }
 770 my $round_num_freeslots = scalar(@freeslot);
 771 print STDERR "crunch-job have ${round_num_freeslots} free slots for ${initial_tasks_this_level} initial tasks at this level, ".scalar(@node)." nodes, and ".scalar(@slot)." slots\n";
 772
 773 my %round_max_slots = ();
 774 for (my $ii = $#freeslot; $ii >= 0; $ii--) {
 775   my $this_slot = $slot[$freeslot[$ii]];
 776   my $node_name = $this_slot->{node}->{name};
 777   $round_max_slots{$node_name} ||= $this_slot->{cpu};
 778   last if (scalar(keys(%round_max_slots)) >= @node);
 779 }
 780
 781 Log(undef, "start level $level with $round_num_freeslots slots");
 782 my @holdslot;
 783 my %reader;
 784 my $progress_is_dirty = 1;
 785 my $progress_stats_updated = 0;
 786
 787 update_progress_stats();
 788
 789
 790 THISROUND:
 791 for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
 792 {
 793   # Don't create new tasks if we already know the job's final result.
 794   last if defined($main::success);
 795
 796   my $id = $jobstep_todo[$todo_ptr];
 797   my $Jobstep = $jobstep[$id];
 798   if ($Jobstep->{level} != $level)
 799   {
 800     next;
 801   }
 802
 803   pipe $reader{$id}, "writer" or croak("pipe() failed: $!");
 804   set_nonblocking($reader{$id});
 805
 806   my $childslot = $freeslot[0];
 807   my $childnode = $slot[$childslot]->{node};
 808   my $childslotname = join (".",
 809                             $slot[$childslot]->{node}->{name},
 810                             $slot[$childslot]->{cpu});
 811
 812   my $childpid = fork();
 813   if ($childpid == 0)
 814   {
 815     $SIG{'INT'} = 'DEFAULT';
 816     $SIG{'QUIT'} = 'DEFAULT';
 817     $SIG{'TERM'} = 'DEFAULT';
 818
 819     foreach (values (%reader))
 820     {
 821       close($_);
 822     }
 823     fcntl ("writer", F_SETFL, 0) or croak ($!); # no close-on-exec
 824     open(STDOUT,">&writer");
 825     open(STDERR,">&writer");
 826
 827     undef $dbh;
 828     undef $sth;
 829
 830     delete $ENV{"GNUPGHOME"};
 831     $ENV{"TASK_UUID"} = $Jobstep->{'arvados_task'}->{'uuid'};
 832     $ENV{"TASK_QSEQUENCE"} = $id;
 833     $ENV{"TASK_SEQUENCE"} = $level;
 834     $ENV{"JOB_SCRIPT"} = $Job->{script};
 835     while (my ($param, $value) = each %{$Job->{script_parameters}}) {
 836       $param =~ tr/a-z/A-Z/;
 837       $ENV{"JOB_PARAMETER_$param"} = $value;
 838     }
 839     $ENV{"TASK_SLOT_NODE"} = $slot[$childslot]->{node}->{name};
 840     $ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
 841     $ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
 842     $ENV{"HOME"} = $ENV{"TASK_WORK"};
 843     $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 844     $ENV{"CRUNCH_NODE_SLOTS"} = $round_max_slots{$ENV{TASK_SLOT_NODE}};
 845     $ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
 846
 847     my $keep_mnt = $ENV{"TASK_WORK"}.".keep";
 848
 849     $ENV{"GZIP"} = "-n";
 850
 851     my @srunargs = (
 852       "srun",
 853       "--nodelist=".$childnode->{name},
 854       qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'},
 855       "--job-name=$job_id.$id.$$",
 856         );
 857
 858     my $stdbuf = " stdbuf --output=0 --error=0 ";
 859
 860     my $arv_file_cache = "";
 861     if (defined($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'})) {
 862       $arv_file_cache = "--file-cache=" . ($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'} * 1024 * 1024);
 863     }
 864
 865     my $command =
 866         "if [ -e \Q$ENV{TASK_WORK}\E ]; then rm -rf \Q$ENV{TASK_WORK}\E; fi; "
 867         ."mkdir -p \Q$ENV{CRUNCH_TMP}\E \Q$ENV{JOB_WORK}\E \Q$ENV{TASK_WORK}\E \Q$keep_mnt\E "
 868         ."&& cd \Q$ENV{CRUNCH_TMP}\E "
 869         # These environment variables get used explicitly later in
 870         # $command.  No tool is expected to read these values directly.
 871         .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
 872         .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
 873         ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
 874         ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
 875         .q{&& declare -a VOLUMES=() }
 876         .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner:ro") ; fi }
 877         .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt:ro") ; }
 878         .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt:ro") ; fi };
 879
 880     $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
 881     $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
 882     $ENV{TASK_KEEPMOUNT_TMP} = "$keep_mnt/tmp";
 883
 884     if ($docker_hash)
 885     {
 886       my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
 887       my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
 888       $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
 889       $command .= "$docker_bin run $docker_run_args --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
 890       # We only set memory limits if Docker lets us limit both memory and swap.
 891       # Memory limits alone have been supported longer, but subprocesses tend
 892       # to get SIGKILL if they exceed that without any swap limit set.
 893       # See #5642 for additional background.
 894       if ($docker_limitmem) {
 895         $command .= "--memory=\${MEMLIMIT}k --memory-swap=\${SWAPLIMIT}k ";
 896       }
 897
 898       # The source tree and $destdir directory (which we have
 899       # installed on the worker host) are available in the container,
 900       # under the same path.
 901       $command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
 902       $command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
 903
 904       # Currently, we make the "by_pdh" directory in arv-mount's mount
 905       # point appear at /keep inside the container (instead of using
 906       # the same path as the host like we do with CRUNCH_SRC and
 907       # CRUNCH_INSTALL). However, crunch scripts and utilities must
 908       # not rely on this. They must use $TASK_KEEPMOUNT.
 909       $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
 910       $ENV{TASK_KEEPMOUNT} = "/keep";
 911
 912       # Ditto TASK_KEEPMOUNT_TMP, as /keep_tmp.
 913       $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT_TMP}:/keep_tmp\E ";
 914       $ENV{TASK_KEEPMOUNT_TMP} = "/keep_tmp";
 915
 916       # TASK_WORK is almost exactly like a docker data volume: it
 917       # starts out empty, is writable, and persists until no
 918       # containers use it any more. We don't use --volumes-from to
 919       # share it with other containers: it is only accessible to this
 920       # task, and it goes away when this task stops.
 921       #
 922       # However, a docker data volume is writable only by root unless
 923       # the mount point already happens to exist in the container with
 924       # different permissions. Therefore, we [1] assume /tmp already
 925       # exists in the image and is writable by the crunch user; [2]
 926       # avoid putting TASK_WORK inside CRUNCH_TMP (which won't be
 927       # writable if they are created by docker while setting up the
 928       # other --volumes); and [3] create $TASK_WORK inside the
 929       # container using $build_script.
 930       $command .= "--volume=/tmp ";
 931       $ENV{"TASK_WORK"} = "/tmp/crunch-job-task-work/$childslotname";
 932       $ENV{"HOME"} = $ENV{"TASK_WORK"};
 933       $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 934
 935       # TODO: Share a single JOB_WORK volume across all task
 936       # containers on a given worker node, and delete it when the job
 937       # ends (and, in case that doesn't work, when the next job
 938       # starts).
 939       #
 940       # For now, use the same approach as TASK_WORK above.
 941       $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
 942
 943       # Bind mount the crunchrunner binary and host TLS certificates file into
 944       # the container.
 945       $command .= '"${VOLUMES[@]}" ';
 946
 947       while (my ($env_key, $env_val) = each %ENV)
 948       {
 949         if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
 950           $command .= "--env=\Q$env_key=$env_val\E ";
 951         }
 952       }
 953       $command .= "--env=\QHOME=$ENV{HOME}\E ";
 954       $command .= "\Q$docker_hash\E ";
 955
 956       if ($Job->{arvados_sdk_version}) {
 957         $command .= $stdbuf;
 958         $command .= "perl - \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E";
 959       } else {
 960         $command .= "/bin/sh -c \'python -c " .
 961             '"from pkg_resources import get_distribution as get; print \"Using Arvados SDK version\", get(\"arvados-python-client\").version"' .
 962             ">&2 2>/dev/null; " .
 963             "mkdir -p \"$ENV{JOB_WORK}\" \"$ENV{TASK_WORK}\" && " .
 964             "if which stdbuf >/dev/null ; then " .
 965             "  exec $stdbuf \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
 966             " else " .
 967             "  exec \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
 968             " fi\'";
 969       }
 970     } else {
 971       # Non-docker run
 972       $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -poll=10000 ";
 973       $command .= $stdbuf;
 974       $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
 975     }
 976
 977     my @execargs = ('bash', '-c', $command);
 978     srun (\@srunargs, \@execargs, undef, $build_script);
 979     # exec() failed, we assume nothing happened.
 980     die "srun() failed on build script\n";
 981   }
 982   close("writer");
 983   if (!defined $childpid)
 984   {
 985     close $reader{$id};
 986     delete $reader{$id};
 987     next;
 988   }
 989   shift @freeslot;
 990   $proc{$childpid} = {
 991     jobstepidx => $id,
 992     time => time,
 993     slot => $childslot,
 994     jobstepname => "$job_id.$id.$childpid",
 995   };
 996   croak ("assert failed: \$slot[$childslot]->{'pid'} exists") if exists $slot[$childslot]->{pid};
 997   $slot[$childslot]->{pid} = $childpid;
 998
 999   Log ($id, "job_task ".$Jobstep->{'arvados_task'}->{'uuid'});
1000   Log ($id, "child $childpid started on $childslotname");
1001   $Jobstep->{starttime} = time;
1002   $Jobstep->{node} = $childnode->{name};
1003   $Jobstep->{slotindex} = $childslot;
1004   delete $Jobstep->{stderr};
1005   delete $Jobstep->{finishtime};
1006   delete $Jobstep->{tempfail};
1007
1008   $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
1009   $Jobstep->{'arvados_task'}->save;
1010
1011   splice @jobstep_todo, $todo_ptr, 1;
1012   --$todo_ptr;
1013
1014   $progress_is_dirty = 1;
1015
1016   while (!@freeslot
1017          ||
1018          ($round_num_freeslots > @freeslot && $todo_ptr+1 > $#jobstep_todo))
1019   {
1020     last THISROUND if $main::please_freeze;
1021     if ($main::please_info)
1022     {
1023       $main::please_info = 0;
1024       freeze();
1025       create_output_collection();
1026       save_meta(1);
1027       update_progress_stats();
1028     }
1029     my $gotsome
1030         = readfrompipes ()
1031         + reapchildren ();
1032     if (!$gotsome || ($latest_refresh + 2 < scalar time))
1033     {
1034       check_refresh_wanted();
1035       check_squeue();
1036       update_progress_stats();
1037     }
1038     elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty)
1039     {
1040       update_progress_stats();
1041     }
1042     if (!$gotsome) {
1043       select (undef, undef, undef, 0.1);
1044     }
1045     $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
1046                                         $_->{node}->{hold_count} < 4 } @slot);
1047     if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
1048         ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
1049     {
1050       my $message = "Repeated failure rate too high ($thisround_failed_multiple/"
1051           .($thisround_failed+$thisround_succeeded)
1052           .") -- giving up on this round";
1053       Log (undef, $message);
1054       last THISROUND;
1055     }
1056
1057     # move slots from freeslot to holdslot (or back to freeslot) if necessary
1058     for (my $i=$#freeslot; $i>=0; $i--) {
1059       if ($slot[$freeslot[$i]]->{node}->{hold_until} > scalar time) {
1060         push @holdslot, (splice @freeslot, $i, 1);
1061       }
1062     }
1063     for (my $i=$#holdslot; $i>=0; $i--) {
1064       if ($slot[$holdslot[$i]]->{node}->{hold_until} <= scalar time) {
1065         push @freeslot, (splice @holdslot, $i, 1);
1066       }
1067     }
1068
1069     # give up if no nodes are succeeding
1070     if ($working_slot_count < 1) {
1071       Log(undef, "Every node has failed -- giving up");
1072       last THISROUND;
1073     }
1074   }
1075 }
1076
1077
# The scheduling loop for this round has ended.  Return every held slot
# to the free list and clear each free slot's node losing streak.
push @freeslot, splice @holdslot;
map { $slot[$freeslot[$_]]->{node}->{losing_streak} = 0 } (0..$#freeslot);


# Wait for every child that is still running, servicing its pipes and
# reacting to control flags while we wait.
Log (undef, "wait for last ".(scalar keys %proc)." children to finish");
while (%proc)
{
  if ($main::please_continue) {
    # Asked to resume scheduling: jump back to the THISROUND label
    # (defined earlier in the file, outside this section).
    $main::please_continue = 0;
    goto THISROUND;
  }
  # On an info request: clear the flag, then freeze, collate output and
  # save the log (comma operators make this a single conditional statement).
  $main::please_info = 0, freeze(), create_output_collection(), save_meta(1) if $main::please_info;
  readfrompipes ();
  if (!reapchildren())
  {
    # Nothing exited this time around: do periodic housekeeping, pause
    # for 0.1s (4-arg select used as a sub-second sleep), and signal
    # the remaining children if a freeze was requested.
    check_refresh_wanted();
    check_squeue();
    update_progress_stats();
    select (undef, undef, undef, 0.1);
    killem (keys %proc) if $main::please_freeze;
  }
}

update_progress_stats();
freeze_if_want_freeze();


# Decide the job outcome if nothing has decided it already.
if (!defined $main::success)
{
  if (!@jobstep_todo) {
    # Nothing left to do: the job succeeded.
    $main::success = 1;
  } elsif ($working_slot_count < 1) {
    # No usable slots remain: save what we have and exit with
    # EX_RETRY_UNLOCKED instead of marking the job failed.
    save_output_collection();
    save_meta();
    exit(EX_RETRY_UNLOCKED);
  } elsif ($thisround_succeeded == 0 &&
           ($thisround_failed == 0 || $thisround_failed > 4)) {
    # Nothing succeeded this round, and either nothing ran to
    # completion at all or more than 4 tasks failed: give up.
    my $message = "stop because $thisround_failed tasks failed and none succeeded";
    Log (undef, $message);
    $main::success = 0;
  }
}

# Outcome still undecided: go back to the ONELEVEL label (defined
# earlier in the file, outside this section) for another pass.
goto ONELEVEL if !defined $main::success;


# The job is over: release resources, save output and log, and record
# the final state.
release_allocation();
freeze();
my $collated_output = save_output_collection();
Log (undef, "finish");

save_meta();

# The job is Complete only if it succeeded AND an output collection
# was actually saved.
my $final_state;
if ($collated_output && $main::success) {
  $final_state = 'Complete';
} else {
  $final_state = 'Failed';
}
$Job->update_attributes('state' => $final_state);

exit (($final_state eq 'Complete') ? 0 : 1);
1140
1141
1142
# Record the current todo/done/running task counts in the job record
# and log them.  The "last updated" timestamp is always refreshed, but
# the job record is only written when $progress_is_dirty is set; the
# flag is cleared afterwards.
sub update_progress_stats
{
  $progress_stats_updated = time;
  return unless $progress_is_dirty;

  my $n_todo = @jobstep_todo;
  my $n_done = @jobstep_done;
  my $n_running = keys %proc;

  # Reuse the existing summary hash (creating it on first use) so any
  # other counters stored in it are kept.
  my $summary = ($Job->{'tasks_summary'} ||= {});
  @{$summary}{'todo', 'done', 'running'} = ($n_todo, $n_done, $n_running);
  $Job->update_attributes('tasks_summary' => $summary);

  Log (undef, "status: $n_done done, $n_running running, $n_todo todo");
  $progress_is_dirty = 0;
}
1158
1159
1160
# Reap every child process that has already exited (without blocking),
# record the outcome of the corresponding job task, release its slot,
# and append to the todo list any new tasks that were created by tasks
# which succeeded.
#
# Returns the number of children reaped (0 if none had exited).
sub reapchildren
{
  my $children_reaped = 0;
  my @successful_task_uuids = ();

  # WNOHANG: only collect children that have already exited.
  # NOTE(review): every pid returned here is assumed to have an entry
  # in %proc -- confirm no other kind of child can be reaped by this
  # waitpid(-1).
  while((my $pid = waitpid (-1, WNOHANG)) > 0)
  {
    my $childstatus = $?;

    # "<node name>.<cpu>" of the slot the child was running on.
    my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
                    . "."
                    . $slot[$proc{$pid}->{slot}]->{cpu});
    my $jobstepidx = $proc{$pid}->{jobstepidx};

    $children_reaped++;
    my $elapsed = time - $proc{$pid}->{time};
    my $Jobstep = $jobstep[$jobstepidx];

    # The exit code is the high byte of the wait status.
    my $exitvalue = $childstatus >> 8;
    my $exitinfo = "exit ".exit_status_s($childstatus);
    # Refresh the task record before reading its success flag
    # (presumably reload re-fetches it from the API server -- confirm).
    $Jobstep->{'arvados_task'}->reload;
    my $task_success = $Jobstep->{'arvados_task'}->{success};

    Log ($jobstepidx, "child $pid on $whatslot $exitinfo success=$task_success");

    if (!defined $task_success) {
      # task did not indicate one way or the other --> fail
      Log($jobstepidx, sprintf(
            "ERROR: Task process exited %s, but never updated its task record to indicate success and record its output.",
            exit_status_s($childstatus)));
      $Jobstep->{'arvados_task'}->{success} = 0;
      $Jobstep->{'arvados_task'}->save;
      $task_success = 0;
    }

    if (!$task_success)
    {
      # A failure is temporary if stderr processing flagged it as such
      # or the child exited with TASK_TEMPFAIL.
      my $temporary_fail;
      $temporary_fail ||= $Jobstep->{tempfail};
      $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);

      ++$thisround_failed;
      # Count separately the failures of tasks that had already failed
      # at least once before this attempt.
      ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;

      # Check for signs of a failed or misconfigured node
      if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
          2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
        # Don't count this against jobstep failure thresholds if this
        # node is already suspected faulty and srun exited quickly
        if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
            $elapsed < 5) {
          Log ($jobstepidx, "blaming failure on suspect node " .
               $slot[$proc{$pid}->{slot}]->{node}->{name});
          $temporary_fail ||= 1;
        }
        ban_node_by_slot($proc{$pid}->{slot});
      }

      # Note: the task's failure counter is incremented as part of
      # building this log message.
      Log ($jobstepidx, sprintf('failure (#%d, %s) after %d seconds',
                                ++$Jobstep->{'failures'},
                                $temporary_fail ? 'temporary' : 'permanent',
                                $elapsed));

      if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
        # Give up on this task, and the whole job
        $main::success = 0;
      }
      # Put this task back on the todo queue
      push @jobstep_todo, $jobstepidx;
      $Job->{'tasks_summary'}->{'failed'}++;
    }
    else # task_success
    {
      # Success clears the node's failure bookkeeping.
      push @successful_task_uuids, $Jobstep->{'arvados_task'}->{uuid};
      ++$thisround_succeeded;
      $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
      $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
      $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
      push @jobstep_done, $jobstepidx;
      Log ($jobstepidx, "success in $elapsed seconds");
    }
    # Record the raw wait status and finish time, then flush whatever
    # stderr is still buffered for this task.
    $Jobstep->{exitcode} = $childstatus;
    $Jobstep->{finishtime} = time;
    $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
    $Jobstep->{'arvados_task'}->save;
    process_stderr_final ($jobstepidx);
    Log ($jobstepidx, sprintf("task output (%d bytes): %s",
                              length($Jobstep->{'arvados_task'}->{output}),
                              $Jobstep->{'arvados_task'}->{output}));

    # Release the child's pipe and slot, and forget the process.
    close $reader{$jobstepidx};
    delete $reader{$jobstepidx};
    delete $slot[$proc{$pid}->{slot}]->{pid};
    push @freeslot, $proc{$pid}->{slot};
    delete $proc{$pid};

    $progress_is_dirty = 1;
  }

  if (scalar(@successful_task_uuids) > 0)
  {
    Log (undef, sprintf("%d tasks exited (%d succeeded), checking for new tasks from API server.", $children_reaped, scalar(@successful_task_uuids)));
    # Load new tasks
    my $newtask_list = [];
    my $newtask_results;
    # Page through the results, using the number of tasks fetched so
    # far as the offset, until a request returns no items.
    do {
      $newtask_results = api_call(
        "job_tasks/list",
        'filters' => [["created_by_job_task_uuid","in",\@successful_task_uuids]],
        'order' => 'qsequence',
        'offset' => scalar(@$newtask_list),
          );
      push(@$newtask_list, @{$newtask_results->{items}});
    } while (@{$newtask_results->{items}});
    Log (undef, sprintf("Got %d new tasks from API server.", scalar(@$newtask_list)));
    # Register each new task as a jobstep and queue it.
    foreach my $arvados_task (@$newtask_list) {
      my $jobstep = {
        'level' => $arvados_task->{'sequence'},
        'failures' => 0,
        'arvados_task' => $arvados_task
      };
      push @jobstep, $jobstep;
      push @jobstep_todo, $#jobstep;
    }
  }

  return $children_reaped;
}
1289
# If the refresh trigger file ($ENV{CRUNCH_REFRESH_TRIGGER}) has been
# modified since our last refresh, re-read the job's cancellation
# attributes and state from the API server.  If the job is no longer
# "Running", mark it failed and request a freeze.
sub check_refresh_wanted
{
  my @trigger_stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
  return unless @trigger_stat;
  # Element 9 of the stat list is the file's mtime.
  return unless $trigger_stat[9] > $latest_refresh;
  # ...and we have actually locked the job record...
  return unless $job_id eq $Job->{'uuid'};

  $latest_refresh = scalar time;
  my $fresh_job = api_call("jobs/get", uuid => $jobspec);
  my @refreshed_attrs = ('cancelled_at',
                         'cancelled_by_user_uuid',
                         'cancelled_by_client_uuid',
                         'state');
  foreach my $attr (@refreshed_attrs) {
    $Job->{$attr} = $fresh_job->{$attr};
  }

  return if $Job->{'state'} eq "Running";

  my $message = ($Job->{'state'} eq "Cancelled")
      ? "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'}
      : "Job state unexpectedly changed to " . $Job->{'state'};
  Log (undef, $message);
  $main::success = 0;
  $main::please_freeze = 1;
}
1316
# Detect srun children whose slurm job step has ended without the srun
# process itself exiting, and kill them.
#
# Runs at most once every 15 seconds.  It returns early when every
# running child has produced stderr since the previous check.
# Otherwise it (1) kills children whose previously scheduled killtime
# has passed and which are still silent, and (2) when slurm is in use,
# asks `squeue` which steps are still running and schedules a killtime
# 30 seconds ahead for any child older than 60s that squeue no longer
# lists.
#
# No meaningful return value.
sub check_squeue
{
  my $last_squeue_check = $squeue_checked;

  # Do not call `squeue` or check the kill list more than once every
  # 15 seconds.
  return if $last_squeue_check > time - 15;
  $squeue_checked = time;

  # Look for children from which we haven't received stderr data since
  # the last squeue check. If no such children exist, all procs are
  # alive and there's no need to even look at squeue.
  #
  # As long as the crunchstat poll interval (10s) is shorter than the
  # squeue check interval (15s) this should make the squeue check an
  # infrequent event.
  my $silent_procs = 0;
  for my $js (map {$jobstep[$_->{jobstepidx}]} values %proc)
  {
    if (!exists($js->{stderr_at}))
    {
      $js->{stderr_at} = 0;
    }
    if ($js->{stderr_at} < $last_squeue_check)
    {
      $silent_procs++;
    }
  }
  return if $silent_procs == 0;

  # use killem() on procs whose killtime is reached
  #
  # Iterate over a snapshot of the keys rather than each(): the loop
  # body calls Log() and killem(), and each() relies on a hash iterator
  # that any other keys/values/each call on %proc would disturb.
  for my $pid (keys %proc)
  {
    my $procinfo = $proc{$pid};
    my $js = $jobstep[$procinfo->{jobstepidx}];
    if (exists $procinfo->{killtime}
        && $procinfo->{killtime} <= time
        && $js->{stderr_at} < $last_squeue_check)
    {
      my $sincewhen = "";
      if ($js->{stderr_at}) {
        $sincewhen = " in last " . (time - $js->{stderr_at}) . "s";
      }
      Log($procinfo->{jobstepidx}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
      killem ($pid);
    }
  }

  if (!$have_slurm)
  {
    # here is an opportunity to check for mysterious problems with local procs
    return;
  }

  # Get a list of steps still running.  Note: squeue(1) says --steps
  # selects a format (which we override anyway) and allows us to
  # specify which steps we're interested in (which we don't).
  # Importantly, it also changes the meaning of %j from "job name" to
  # "step name" and (although this isn't mentioned explicitly in the
  # docs) switches from "one line per job" mode to "one line per step"
  # mode. Without it, we'd just get a list of one job, instead of a
  # list of N steps.
  my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%j' --noheader`;
  if ($? != 0)
  {
    Log(undef, "warning: squeue exit status $? ($!)");
    return;
  }
  # chomp (not chop): strip only the line terminator, so a final line
  # without a trailing newline does not lose its last character.
  chomp @squeue;

  # which of my jobsteps are running, according to squeue?
  my %ok;
  for my $jobstepname (@squeue)
  {
    $ok{$jobstepname} = 1;
  }

  # Check for child procs >60s old and not mentioned by squeue.
  for my $pid (keys %proc)
  {
    my $procinfo = $proc{$pid};
    if ($procinfo->{time} < time - 60
        && $procinfo->{jobstepname}
        && !exists $ok{$procinfo->{jobstepname}}
        && !exists $procinfo->{killtime})
    {
      # According to slurm, this task has ended (successfully or not)
      # -- but our srun child hasn't exited. First we must wait (30
      # seconds) in case this is just a race between communication
      # channels. Then, if our srun child process still hasn't
      # terminated, we'll conclude some slurm communication
      # error/delay has caused the task to die without notifying srun,
      # and we'll kill srun ourselves.
      $procinfo->{killtime} = time + 30;
      Log($procinfo->{jobstepidx}, "notice: task is not in slurm queue but srun process $pid has not exited");
    }
  }
}
1413
# Verify that every node allocated to this job is still in the "alloc"
# state according to `sinfo`; if any is not, set $main::please_freeze.
# Runs at most once every 15 seconds.  No meaningful return value.
sub check_sinfo
{
  # If a node fails in a multi-node "srun" call during job setup, the call
  # may hang instead of exiting with a nonzero code.  This function checks
  # "sinfo" for the health of the nodes that were allocated and ensures that
  # they are all still in the "alloc" state.  If a node that is allocated to
  # this job is not in "alloc" state, then set please_freeze.
  #
  # This is only called from srun_sync() for node configuration.  If a
  # node fails doing actual work, there are other recovery mechanisms.

  # Do not call `sinfo` more than once every 15 seconds.
  return if $sinfo_checked > time - 15;
  $sinfo_checked = time;

  # The output format "%t" means output node states.
  my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
  if ($? != 0)
  {
    Log(undef, "warning: sinfo exit status $? ($!)");
    return;
  }
  # chomp (not chop): strip only the line terminator.
  chomp @sinfo;

  foreach my $node_state (@sinfo)
  {
    # Node states are strings, so they must be compared with "ne".
    # The numeric "!=" operator converts every state name to 0, which
    # made this test always false and the whole check a no-op.
    if ($node_state ne "alloc" && $node_state ne "alloc*") {
      $main::please_freeze = 1;
    }
  }
}
1445
# Give the slurm allocation back by running scancel on our own slurm
# job.  Does nothing when slurm is not in use.
sub release_allocation
{
  return unless $have_slurm;

  Log (undef, "release job allocation");
  my $scancel_command = "scancel $ENV{SLURM_JOB_ID}";
  system $scancel_command;
}
1454
1455
# Read whatever is currently available from the children's pipes.
#
# Every handle in %reader carries a jobstep's stderr; a jobstep may
# also have a {stdout_r} handle whose data is accumulated verbatim in
# {stdout_captured}.  Stderr data is timestamped, appended to the
# jobstep's {stderr} buffer and passed through preprocess_stderr().
#
# Returns 1 if any data was read, 0 otherwise.
sub readfrompipes
{
  my $gotsome = 0;
  my %fd_job;                   # stringified handle => jobstep index
  my $sel = IO::Select->new();
  for my $jobstepidx (keys %reader)
  {
    my @handles = ($reader{$jobstepidx});
    if (my $stdout_fd = $jobstep[$jobstepidx]->{stdout_r}) {
      push @handles, $stdout_fd;
    }
    for my $handle (@handles) {
      $sel->add($handle);
      $fd_job{$handle} = $jobstepidx;
    }
  }

  # Wait up to 0.1s for any of the handles to become readable.
  for my $fd ($sel->can_read(0.1))
  {
    my $chunk;
    next unless 0 < sysread ($fd, $chunk, 65536);

    $gotsome = 1;
    print STDERR $chunk if $ENV{CRUNCH_DEBUG};

    my $jobstepidx = $fd_job{$fd};
    my $step = $jobstep[$jobstepidx];
    if ($step->{stdout_r} == $fd) {
      # Captured stdout is stored as-is, with no line processing.
      $step->{stdout_captured} .= $chunk;
      next;
    }

    $step->{stderr_at} = time;
    $step->{stderr} .= $chunk;

    # Log and strip all complete lines; a partial line stays buffered.
    preprocess_stderr ($jobstepidx);

    # A long run of stderr without a newline must not grow the buffer
    # without bound: once it exceeds 16384 characters, keep only the
    # last 8192.
    my $buffered = length ($step->{stderr});
    if ($buffered > 16384) {
      substr ($step->{stderr}, 0, $buffered - 8192, "");
    }
  }
  return $gotsome;
}
1505
1506
# Take every complete (newline-terminated) line off the front of a
# jobstep's buffered stderr, log it, and react to known srun and Keep
# error messages.  Text after the last newline is left in
# $jobstep[$jobstepidx]->{stderr} for a later call.
sub preprocess_stderr
{
  my ($jobstepidx) = @_;
  # Only children running Arvados job tasks have a slotindex; it is
  # undefined for setup srun calls and the like, so guard its use.
  my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};

  while ((my $eol = index ($jobstep[$jobstepidx]->{stderr}, "\n")) >= 0) {
    # Split off the first line and drop it, with its newline, from
    # the buffer.
    my $line = substr ($jobstep[$jobstepidx]->{stderr}, 0, $eol);
    substr ($jobstep[$jobstepidx]->{stderr}, 0, $eol + 1) = "";
    Log ($jobstepidx, "stderr $line");

    if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/i) {
      # The allocation has been revoked, so nothing can continue.
      # Marking every node as failed makes the overall exit code
      # EX_RETRY_UNLOCKED rather than failure, so that crunch_dispatch
      # can re-run this job.
      $main::please_freeze = 1;
      $_->{node}->{fail_count}++ for @slot;
      next;
    }

    if ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b)/i) {
      # Node-level failure: temporary for the task, counted against
      # the node.
      $jobstep[$jobstepidx]->{tempfail} = 1;
      if (defined($job_slot_index)) {
        $slot[$job_slot_index]->{node}->{fail_count}++;
        ban_node_by_slot($job_slot_index);
      }
      next;
    }

    if ($line =~ /srun: error: (Unable to create job step|.*?: Communication connection failure)/i) {
      # Temporary for the task; ban the node without raising its
      # fail count.
      $jobstep[$jobstepidx]->{tempfail} = 1;
      ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
      next;
    }

    if ($line =~ /\bKeep(Read|Write|Request)Error:/) {
      $jobstep[$jobstepidx]->{tempfail} = 1;
    }
  }
}
1547
1548
# Flush a jobstep's stderr buffer: process all complete lines as usual,
# then log whatever remains (text with no trailing newline) and empty
# the buffer.
sub process_stderr_final
{
  my ($jobstepidx) = @_;
  preprocess_stderr ($jobstepidx);

  for my $leftover (split ("\n", $jobstep[$jobstepidx]->{stderr})) {
    Log ($jobstepidx, "stderr $leftover");
  }
  $jobstep[$jobstepidx]->{stderr} = '';
}
1559
# Fetch the data stored under the given Keep locator by running
# `arv-get` and reading its standard output.
#
# Returns the data as a string, or undef if arv-get could not be
# started, a read failed, or arv-get exited with a nonzero status
# (each case is logged).
sub fetch_block
{
  my ($hash) = @_;

  my $keep;
  unless (open($keep, "-|", "arv-get", "--retries", retry_count(), $hash)) {
    Log(undef, "fetch_block run error from arv-get $hash: $!");
    return undef;
  }

  my $output_block = "";
  for (;;) {
    my $bytes = sysread($keep, my $buf, 1024 * 1024);
    unless (defined $bytes) {
      Log(undef, "fetch_block read error from arv-get: $!");
      undef $output_block;
      last;
    }
    # sysread returns 0 once the pipe reaches end-of-file.
    last if $bytes == 0;
    $output_block .= $buf;
  }

  # Closing a piped handle waits for the child and sets $?.
  close $keep;
  if ($?) {
    Log(undef, "fetch_block arv-get exited " . exit_status_s($?));
    undef $output_block;
  }
  return $output_block;
}
1591
# Create a collection by concatenating the output of all tasks (each
# task's output is either a manifest fragment, a locator for a
# manifest fragment stored in Keep, or nothing at all). Return the
# portable_data_hash of the new collection.
#
# The manifest text is piped to a python helper that creates the
# collection through the Arvados API and prints its
# portable_data_hash.  Returns undef if the helper exits nonzero or
# produces no output within 120 seconds.  If any task's output cannot
# be retrieved, $main::success is set to 0 and collation continues
# with the remaining tasks.
sub create_output_collection
{
  Log (undef, "collate");

  my ($child_out, $child_in);
  my $pid = open2($child_out, $child_in, 'python', '-c', q{
import arvados
import sys
print (arvados.api("v1").collections().
       create(body={"manifest_text": sys.stdin.read(),
                    "owner_uuid": sys.argv[2]}).
       execute(num_retries=int(sys.argv[1]))["portable_data_hash"])
}, retry_count(), $Job->{owner_uuid});

  my $task_idx = -1;
  my $manifest_size = 0;
  for (@jobstep)
  {
    ++$task_idx;
    my $output = $_->{'arvados_task'}->{output};
    next if (!defined($output));
    my $next_write;
    # An output that looks like a Keep locator (md5 hash plus optional
    # +hints) refers to a manifest fragment stored in Keep; anything
    # else is taken to be the manifest text itself.
    if ($output =~ /^[0-9a-f]{32}(\+\S+)*$/) {
      $next_write = fetch_block($output);
    } else {
      $next_write = $output;
    }
    if (defined($next_write)) {
      if (!defined(syswrite($child_in, $next_write))) {
        # There's been an error writing.  Stop the loop.
        # We'll log details about the exit code later.
        last;
      } else {
        $manifest_size += length($next_write);
      }
    } else {
      my $uuid = $_->{'arvados_task'}->{'uuid'};
      Log (undef, "Error retrieving '$output' output by task $task_idx ($uuid)");
      $main::success = 0;
    }
  }
  # Closing the helper's stdin signals end of manifest text.
  close($child_in);
  Log(undef, "collated output manifest text to send to API server is $manifest_size bytes with access tokens");

  my $joboutput;
  my $s = IO::Select->new($child_out);
  if ($s->can_read(120)) {
    sysread($child_out, $joboutput, 1024 * 1024);
    waitpid($pid, 0);
    if ($?) {
      Log(undef, "output collection creation exited " . exit_status_s($?));
      $joboutput = undef;
    } else {
      chomp($joboutput);
    }
  } else {
    Log (undef, "timed out while creating output collection");
    # Escalate from SIGINT through SIGTERM to SIGKILL, one signal per
    # second, until the helper is gone.
    foreach my $signal (2, 2, 2, 15, 15, 9) {
      kill($signal, $pid);
      # waitpid returns the pid once the child has been reaped and -1
      # if there is no such child; only 0 means it is still running.
      # Stop in both former cases so we never sleep or signal again
      # after the helper has already exited.
      last if waitpid($pid, WNOHANG) != 0;
      sleep(1);
    }
  }
  close($child_out);

  return $joboutput;
}
1663
# Collate the job's output with create_output_collection and log the
# outcome.  On success the result is also stored as the 'output'
# attribute of the job record.  Returns whatever
# create_output_collection returned (a false value on failure).
sub save_output_collection {
  my $output_pdh = create_output_collection();

  if ($output_pdh) {
    Log(undef, "job output $output_pdh");
    $Job->update_attributes('output' => $output_pdh);
  }
  else {
    Log(undef, "Failed to write output collection");
  }
  return $output_pdh;
}
1678
# Signal each of the given child pids (keys of %proc), escalating over
# repeated calls: SIGINT first, SIGTERM once SIGINT was sent more than
# 4 seconds ago, SIGKILL once SIGTERM was sent more than 4 seconds
# ago.  Each signal level is sent at most once per process (SIGINT is
# delivered twice in a row); the send time is recorded in
# $proc{$pid}->{"sent_<signal>"} and the process's age at that moment
# in $proc{$pid}->{killedafter}.
sub killem
{
  foreach my $pid (@_)
  {
    # Sample the clock once so both escalation checks see the same
    # time.  Evaluating time() separately for each check allowed a
    # second boundary between them to turn "SIGINT not yet stale" into
    # "SIGINT stale" on the re-check, skipping SIGTERM and going
    # straight to SIGKILL.
    my $now = time;

    my $sig = 2;                # SIGINT first
    foreach my $stronger_sig (15, 9)    # then SIGTERM, then SIGKILL
    {
      # Escalate only if the current signal was already sent and has
      # had more than 4 seconds to take effect.
      last if !exists $proc{$pid}->{"sent_$sig"};
      last if $now - $proc{$pid}->{"sent_$sig"} <= 4;
      $sig = $stronger_sig;
    }

    if (!exists $proc{$pid}->{"sent_$sig"})
    {
      Log ($proc{$pid}->{jobstepidx}, "sending 2x signal $sig to pid $pid");
      kill $sig, $pid;
      # Brief pause (4-arg select used as a 0.1s sleep).
      select (undef, undef, undef, 0.1);
      if ($sig == 2)
      {
        kill $sig, $pid;     # srun wants two SIGINT to really interrupt
      }
      $proc{$pid}->{"sent_$sig"} = time;
      $proc{$pid}->{"killedafter"} = time - $proc{$pid}->{"time"};
    }
  }
}
1708
1709
# Build a select()-style bit vector with one bit set for the file
# descriptor number of each filehandle passed in.  Returns undef when
# called with no handles.
sub fhbits
{
  my $mask;
  vec($mask, fileno($_), 1) = 1 foreach @_;
  return $mask;
}
1718
1719
1720 # Send log output to Keep via arv-put.
1721 #
1722 # $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
1723 # $log_pipe_out_buf is a string containing all output read from arv-put so far.
1724 # $log_pipe_out_select is an IO::Select object around $log_pipe_out.
1725 # $log_pipe_pid is the pid of the arv-put subprocess.
1726 #
1727 # The only functions that should access these variables directly are:
1728 #
1729 # log_writer_start($logfilename)
1730 #     Starts an arv-put pipe, reading data on stdin and writing it to
1731 #     a $logfilename file in an output collection.
1732 #
1733 # log_writer_read_output([$timeout])
1734 #     Read output from $log_pipe_out and append it to $log_pipe_out_buf.
1735 #     Passes $timeout to the select() call, with a default of 0.01.
1736 #     Returns the result of the last read() call on $log_pipe_out, or
1737 #     -1 if read() wasn't called because select() timed out.
1738 #     Only other log_writer_* functions should need to call this.
1739 #
1740 # log_writer_send($txt)
1741 #     Writes $txt to the output log collection.
1742 #
1743 # log_writer_finish()
1744 #     Closes the arv-put pipe and returns the output that it produces.
1745 #
1746 # log_writer_is_active()
1747 #     Returns a true value if there is currently a live arv-put
1748 #     process, false otherwise.
1749 #
# These five variables hold the state of the current arv-put pipe.  A
# fresh "my" leaves them undef, and log_writer_finish() resets them all
# to undef once the pipe has been closed.
my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
    $log_pipe_pid);
1752
# Start "arv-put --stream" as a child process that reads log text on its
# stdin and stores it under $logfilename.  Initializes the shared
# $log_pipe_* state: the child's pid, both pipe ends, an empty output
# buffer and an IO::Select wrapper around the child's stdout.
sub log_writer_start($)
{
  my ($logfilename) = @_;
  my @arv_put_cmd = ('arv-put',
                     '--stream',
                     '--retries', '3',
                     '--filename', $logfilename,
                     '-');
  $log_pipe_pid = open2($log_pipe_out, $log_pipe_in, @arv_put_cmd);
  $log_pipe_out_buf = "";
  $log_pipe_out_select = IO::Select->new($log_pipe_out);
}
1765
# Append whatever arv-put has written so far to $log_pipe_out_buf.
#
# $timeout (default 0.01) is how long each select() waits for the pipe
# to become readable.  Reading continues until read() returns 0 or
# undef, or until select() times out.
#
# Returns the result of the last read() call, or -1 if read() was never
# called.  A read() error (undef) is logged before returning.
sub log_writer_read_output {
  my ($timeout) = @_;
  $timeout ||= 0.01;

  my $last_read = -1;
  while ($last_read) {
    last unless $log_pipe_out_select->can_read($timeout);
    $last_read = read($log_pipe_out, $log_pipe_out_buf, 65536,
                      length($log_pipe_out_buf));
  }

  unless (defined($last_read)) {
    Log(undef, "error reading log manifest from arv-put: $!");
  }
  return $last_read;
}
1778
# Write $txt to arv-put's stdin, then drain any output arv-put has
# produced in the meantime into $log_pipe_out_buf.
sub log_writer_send($)
{
  my ($txt) = @_;
  print {$log_pipe_in} $txt;
  return log_writer_read_output();
}
1785
# Close the arv-put pipe, wait for arv-put to exit, and return
# everything it printed (accumulated in $log_pipe_out_buf).
#
# Returns nothing if no arv-put process is active.  Returns undef if
# the output could not be read completely (read error, timeout after
# 600 seconds, or no EOF seen) or if arv-put exited non-zero.  In every
# case the shared $log_pipe_* state is reset to undef before returning.
sub log_writer_finish()
{
  return unless $log_pipe_pid;

  # Closing arv-put's stdin tells it that the log is complete.
  close($log_pipe_in);

  my $logger_failed = 0;
  my $read_result = log_writer_read_output(600);
  if (!defined($read_result)) {
    # read() itself failed; log_writer_read_output() has already logged
    # the error.  Without this check undef would compare equal to 0 and
    # be mistaken for a clean EOF, returning a possibly truncated
    # manifest as if it were complete.
    $logger_failed = -3;
  } elsif ($read_result == -1) {
    $logger_failed = -1;
    Log (undef, "timed out reading from 'arv-put'");
  } elsif ($read_result != 0) {
    # The last read() returned data and then select() timed out, so we
    # never saw EOF.
    $logger_failed = -2;
    Log(undef, "failed to read arv-put log manifest to EOF");
  }

  waitpid($log_pipe_pid, 0);
  if ($?) {
    $logger_failed ||= $?;
    Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?))
  }

  close($log_pipe_out);
  my $arv_put_output = $logger_failed ? undef : $log_pipe_out_buf;
  $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
      $log_pipe_out_select = undef;

  return $arv_put_output;
}
1815
# True (the pid of arv-put) while a log writer process is registered in
# $log_pipe_pid, false otherwise.
sub log_writer_is_active() {
  my $active_pid = $log_pipe_pid;
  return $active_pid;
}
1819
# Log($jobstepidx, $logmessage)
#
# Write one log line per line of $logmessage to STDERR and, when a log
# writer is active, to the arv-put log pipe.  Each line has the form
# "<job id> <pid> <task index or empty> <message>", with every
# character outside the range space..~ replaced by a backslash and its
# three-digit octal code.  A UTC timestamp is prepended on the log pipe,
# and on STDERR only when STDERR is a terminal.
sub Log                         # ($jobstepidx, $logmessage)
{
  my ($jobstepidx, $logmessage) = @_;

  # Multi-line messages are logged one line at a time.
  if ($logmessage =~ /\n/) {
    foreach my $single_line (split (/\n/, $logmessage)) {
      Log ($jobstepidx, $single_line);
    }
    return;
  }

  # Turn on autoflush for STDERR without changing the selected handle.
  my $selected_fh = select(STDERR);
  $| = 1;
  select($selected_fh);

  # Only mention the task index if this jobstep has an Arvados task.
  my $task_qseq = '';
  if (defined($jobstepidx) && exists($jobstep[$jobstepidx]->{arvados_task})) {
    $task_qseq = $jobstepidx;
  }

  my $message = sprintf ("%s %d %s %s", $job_id, $$, $task_qseq, $logmessage);
  $message =~ s{([^ -\176])}{sprintf ("\\%03o", ord($1))}ge;
  $message .= "\n";

  my $writer_active = log_writer_is_active();
  my $stderr_is_tty = -t STDERR;

  my $datetime;
  if ($writer_active || $stderr_is_tty) {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime;
    $datetime = sprintf ("%04d-%02d-%02d_%02d:%02d:%02d",
                         $year+1900, $mon+1, $mday, $hour, $min, $sec);
  }

  if ($stderr_is_tty) {
    print STDERR ($datetime." ".$message);
  } else {
    print STDERR $message;
  }

  if ($writer_active) {
    log_writer_send($datetime . " " . $message);
  }
}
1849
1850
# Fatal error handler: log the message together with the caller's file
# and line, release the node allocation, freeze and collate output if
# job steps are still outstanding, mark the job as finished/failed,
# save the log, and die.
sub croak
{
  my (undef, $file, $line) = caller;
  Log (undef, "@_ at $file line $line\n");
  release_allocation();
  if (@jobstep_todo) {
    freeze();
    create_output_collection();
  }
  cleanup();
  save_meta();
  die;
}
1863
1864
# Record the end of an unsuccessful run in the job record: a job whose
# state is 'Cancelled' gets its 'finished_at' set to the current UTC
# time, any other job is moved to state 'Failed'.  Does nothing when
# there is no $Job.
sub cleanup
{
  return unless $Job;
  my @final_attrs = ($Job->{'state'} eq 'Cancelled')
      ? ('finished_at' => scalar gmtime)
      : ('state' => 'Failed');
  $Job->update_attributes(@final_attrs);
}
1874
1875
# Finish the log writer and store the job log as a collection.
#
# Called with a true argument ("just a checkpoint") this is a no-op:
# checkpointing is not relevant post-Warehouse.pm.  Otherwise, if a log
# writer is active and finishes successfully, its manifest (appended to
# the manifest of any log collection already recorded in the job) is
# saved as a new collection, whose portable data hash is logged and
# stored in the job's 'log' attribute.
sub save_meta
{
  my ($justcheckpoint) = @_;    # false if this will be the last meta saved
  return if $justcheckpoint;
  return unless log_writer_is_active();

  my $log_manifest = log_writer_finish();
  return unless defined($log_manifest);

  if ($Job->{log}) {
    my $prev_log_coll = api_call("collections/get", uuid => $Job->{log});
    $log_manifest = $prev_log_coll->{manifest_text} . $log_manifest;
  }

  my %new_collection = (
    manifest_text => $log_manifest,
    owner_uuid => $Job->{owner_uuid},
    name => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
  );
  my $log_coll = api_call("collections/create",
                          ensure_unique_name => 1,
                          collection => \%new_collection);
  my $log_pdh = $log_coll->{portable_data_hash};
  Log(undef, "log collection is " . $log_pdh);
  $Job->update_attributes('log' => $log_pdh);
}
1898
1899
# If a freeze has been requested ($main::please_freeze), shut the job
# down and exit with status 1; otherwise do nothing.
#
# Any pids passed in replace their %proc entries with empty ones and are
# killed via killem() until waitpid() has reaped every pid in %proc.
# NOTE(review): the reap loop only ends once %proc is empty; it looks
# like it would spin if a listed pid can never be reaped -- confirm.
sub freeze_if_want_freeze
{
  return unless $main::please_freeze;

  release_allocation();
  if (@_)
  {
    # kill some srun procs before freeze+stop
    $proc{$_} = {} foreach @_;
    while (%proc)
    {
      killem (keys %proc);
      select (undef, undef, undef, 0.1);
      while ((my $reaped = waitpid (-1, WNOHANG)) > 0)
      {
        delete $proc{$reaped};
      }
    }
  }
  freeze();
  create_output_collection();
  cleanup();
  save_meta();
  exit 1;
}
1927
1928
# Placeholder: freezing is not implemented, so this only logs that fact.
sub freeze
{
  my $notice = "Freeze not implemented";
  Log (undef, $notice);
  return;
}
1934
1935
# Placeholder: thawing is not implemented, so calling this is fatal.
sub thaw
{
  my $complaint = "Thaw not implemented";
  croak ($complaint);
}
1940
1941
# Escape a string so that it fits on one line: every backslash is
# doubled and every newline becomes the two characters backslash + "n".
sub freezequote
{
  my ($text) = @_;
  $text =~ s{([\\\n])}{$1 eq "\n" ? "\\n" : "\\\\"}ge;
  return $text;
}
1949
1950
# Undo freezequote(): a backslash followed by "n" becomes a newline, a
# backslash followed by any other (non-newline) character becomes that
# character.
sub freezeunquote
{
  my ($text) = @_;
  my %unescaped = ("n" => "\n");
  $text =~ s{\\(.)}{exists $unescaped{$1} ? $unescaped{$1} : $1}ge;
  return $text;
}
1957
# Run a command through srun() in a forked child and wait for it to
# finish, capturing its stdout and stderr.
#
# Arguments: $srunargs and $execargs (array refs handed to srun()), an
# optional $opts hash ref (a 'label' entry overrides the name used in
# log messages) and optional $stdin data passed through to srun().
#
# While waiting, the child is registered in @jobstep, %proc and %reader
# so it is handled like a job task process.
#
# Returns ($exit_status, $stdout_captured, $stderr_captured).  A zero
# exit status is replaced by 255 if a freeze was requested or the
# temporary jobstep entry was flagged as tempfail.
sub srun_sync
{
  my $srunargs = shift;
  my $execargs = shift;
  my $opts = shift || {};
  my $stdin = shift;

  my $label = exists $opts->{label} ? $opts->{label} : "@$execargs";
  Log (undef, "$label: start");

  my ($stderr_r, $stderr_w);
  pipe $stderr_r, $stderr_w or croak("pipe() failed: $!");

  my ($stdout_r, $stdout_w);
  pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");

  my $srunpid = fork();
  if (!defined($srunpid))
  {
    # fork() returns undef on failure.  Without this check, undef == 0
    # would send the parent down the child branch below.
    croak("fork() failed: $!");
  }
  if ($srunpid == 0)
  {
    # Child: route our stdout/stderr into the pipes, then run the command.
    close($stderr_r);
    close($stdout_r);
    # NOTE(review): F_SETFL changes file status flags; the close-on-exec
    # flag is normally changed with F_SETFD -- confirm the intent here.
    fcntl($stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
    fcntl($stdout_w, F_SETFL, 0) or croak($!);
    open(STDERR, ">&", $stderr_w);
    open(STDOUT, ">&", $stdout_w);
    srun ($srunargs, $execargs, $opts, $stdin);
    exit (1);
  }
  close($stderr_w);
  close($stdout_w);

  set_nonblocking($stderr_r);
  set_nonblocking($stdout_r);

  # Add entries to @jobstep and %proc so check_squeue() and
  # freeze_if_want_freeze() can treat it like a job task process.
  push @jobstep, {
    stderr => '',
    stderr_at => 0,
    stderr_captured => '',
    stdout_r => $stdout_r,
    stdout_captured => '',
  };
  my $jobstepidx = $#jobstep;
  $proc{$srunpid} = {
    jobstepidx => $jobstepidx,
  };
  $reader{$jobstepidx} = $stderr_r;

  # Poll until the child has exited, draining its pipes and running the
  # periodic checks in the meantime.
  while ($srunpid != waitpid ($srunpid, WNOHANG)) {
    my $busy = readfrompipes();
    if (!$busy || ($latest_refresh + 2 < scalar time)) {
      check_refresh_wanted();
      check_squeue();
      check_sinfo();
    }
    if (!$busy) {
      select(undef, undef, undef, 0.1);
    }
    killem(keys %proc) if $main::please_freeze;
  }
  my $exited = $?;

  # Collect whatever output is still sitting in the pipes.
  1 while readfrompipes();
  process_stderr_final ($jobstepidx);

  Log (undef, "$label: exit ".exit_status_s($exited));

  close($stdout_r);
  close($stderr_r);
  delete $proc{$srunpid};
  delete $reader{$jobstepidx};

  my $j = pop @jobstep;
  # If the srun showed signs of tempfail, ensure the caller treats that as a
  # failure case.
  if ($main::please_freeze || $j->{tempfail}) {
    $exited ||= 255;
  }
  return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
}
2039
2040
# Run @$execargs, prefixed with @$srunargs when $have_slurm is set.
#
# With $opts->{fork} the command is run with system() and its result is
# returned; the command line is announced via Log().  Otherwise the
# current process is replaced with exec() and the command line is
# announced on stderr.  Anything after "TOKEN=" in the announced
# command line is replaced by "[...]".
#
# If $stdin is defined, a child process is forked whose only job is to
# feed that string to the command's standard input.
sub srun
{
  my ($srunargs, $execargs, $opts, $stdin) = @_;
  $opts ||= {};
  my $args = $have_slurm ? [@$srunargs, @$execargs] : $execargs;

  $Data::Dumper::Terse = 1;
  $Data::Dumper::Indent = 0;
  my $show_cmd = Dumper($args);
  $show_cmd =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
  $show_cmd =~ s/\n/ /g;

  my $announcement = "starting: $show_cmd";
  if ($opts->{fork}) {
    Log(undef, $announcement);
  } else {
    # This is a child process: parent is in charge of reading our
    # stderr and copying it to Log() if needed.
    warn "$announcement\n";
  }

  if (defined $stdin) {
    # Fork a feeder whose STDOUT is connected to our STDIN.
    my $feeder_pid = open STDIN, "-|";
    die "no fork: $!" unless defined $feeder_pid;
    if ($feeder_pid == 0) {
      print $stdin or die $!;
      close STDOUT or die $!;
      exit 0;
    }
  }

  if ($opts->{fork}) {
    return system (@$args);
  }

  exec @$args;
  warn "ENV size is ".length(join(" ",%ENV));
  die "exec failed: $!: @$args";
}
2078
2079
# Put the node behind slot $slotid on hold: no new jobsteps should be
# started on it for 60 seconds.  Also counts the hold in the node's
# hold_count and logs the back-off.
sub ban_node_by_slot {
  my ($slotid) = @_;
  my $node = ($slot[$slotid]->{node} ||= {});
  $node->{hold_until} = 60 + scalar time;
  $node->{hold_count}++;
  Log (undef, "backing off node " . $node->{name} . " for 60 seconds");
}
2087
# Open $lockfile for writing and take an exclusive lock on it without
# blocking.  croak()s (mentioning $error_message) if the file cannot be
# opened or is already locked.  The lock lives on the global handle L.
sub must_lock_now
{
  my ($lockfile, $error_message) = @_;
  open(L, ">", $lockfile)
      or croak("$lockfile: $!");
  flock(L, LOCK_EX|LOCK_NB)
      or croak("Can't lock $lockfile: $error_message\n");
}
2096
# Given a Keep locator, check whether the collection holds exactly one
# file named "<64 hex digits>.tar", optionally prefixed with "sha256:".
# If so, return (stream name, image hash); in every other case return
# (undef, undef).
sub find_docker_image {
  my ($locator) = @_;
  my ($streamname, $filename);

  my $image = api_call("collections/get", uuid => $locator);
  if ($image) {
    foreach my $manifest_line (split(/\n/, $image->{manifest_text})) {
      my ($stream, @tokens) = split(/\s+/, $manifest_line);
      next unless defined($stream);
      $streamname = $stream;
      # File tokens look like "position:size:name".
      foreach my $filedata (grep { /^\d+:\d+:/ } @tokens) {
        # More than one file in the Collection.
        return (undef, undef) if defined($filename);
        $filename = (split(/:/, $filedata, 3))[2];
      }
    }
  }

  return (undef, undef) unless defined($filename);
  if ($filename =~ /^((?:sha256:)?[0-9A-Fa-f]{64})\.tar$/) {
    return ($streamname, $1);
  }
  return (undef, undef);
}
2124
# Work out how many times an operation should be retried, assuming
# exponential backoff: roughly log2 of the time tasks have been running
# (measured from the first jobstep's starttime and the last jobstep's
# finishtime), but never fewer than 3 retries.
sub retry_count {
  my ($starttime, $endtime);
  if (@jobstep) {
    $starttime = $jobstep[0]->{starttime};
    $endtime = $jobstep[-1]->{finishtime};
  }

  my $timediff = !defined($starttime) ? 0
               : !defined($endtime)   ? (time - $starttime)
               :                        ($endtime - $starttime) - (time - $endtime);

  # A non-positive time span falls back to the minimum.
  my $retries = ($timediff > 0) ? int(log($timediff) / log(2)) : 1;
  return 3 unless $retries > 3;
  return $retries;
}
2148
# retry_op($operation, $retry_callback, @args)
#
# Call $operation with @args.  If it dies, retry it with exponential
# backoff (1, 2, 4, ... seconds) until it succeeds or retry_count()
# retries have been used up.  After each failure that will be retried,
# $retry_callback is called with the current try count (0-based), the
# time of the next try, and the error message.  Returns the operation's
# result, or dies with the last error once the retries are exhausted.
sub retry_op {
  my $operation = shift;
  my $retry_callback = shift;
  my $max_retries = retry_count();
  my $last_error;

  foreach my $attempt (0..$max_retries) {
    my $retry_at = time + (2 ** $attempt);
    my $result = eval { $operation->(@_); };
    $last_error = $@;
    return $result if !$last_error;
    last if $attempt >= $max_retries;

    $retry_callback->($attempt, $retry_at, $last_error);
    my $delay = $retry_at - time;
    sleep($delay) if ($delay > 0);
  }

  # Ensure the error message ends in a newline, so Perl doesn't add
  # retry_op's line number to it.
  chomp($last_error);
  die($last_error . "\n");
}
2175
# api_call($method_name, @args)
#
# Look up the API method named by the /-separated $method_name under
# $arv and execute it with @args through retry_op().  Every failure
# that will be retried is logged with the time of the next attempt.
sub api_call {
  my $method_name = shift;

  my $log_api_retry = sub {
    my ($try_count, $next_try_at, $errmsg) = @_;
    # Strip the "at <script> line N" suffix and flatten whitespace.
    $errmsg =~ s/\s*\bat \Q$0\E line \d+\.?\s*//;
    $errmsg =~ s/\s/ /g;
    $errmsg =~ s/\s+$//;
    my $retry_msg = "Retrying.";
    unless ($next_try_at < time) {
      my $next_try_fmt = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($next_try_at);
      $retry_msg = "Retrying at $next_try_fmt.";
    }
    Log(undef, "API method $method_name failed: $errmsg. $retry_msg");
  };

  # Walk down from the API client to the requested method object.
  my $method = $arv;
  $method = $method->{$_} foreach split(/\//, $method_name);

  return retry_op(sub { $method->execute(@_); }, $log_api_retry, @_);
}
2201
sub exit_status_s {
  # Given a $?, return a human-readable exit code string like "0" or
  # "1" or "0 with signal 1" or "1 with signal 11", with " with core
  # dump" appended when the core dump bit is set.
  #
  # A negative status (system() and $? use -1 when the command could
  # not be started at all) is not a wait status, so it is reported
  # as-is instead of being decoded into a bogus exit code and signal.
  my $exitcode = shift;
  if (defined($exitcode) && $exitcode < 0) {
    return "$exitcode (command could not be executed)";
  }
  my $s = $exitcode >> 8;
  if ($exitcode & 0x7f) {
    $s .= " with signal " . ($exitcode & 0x7f);
  }
  if ($exitcode & 0x80) {
    $s .= " with core dump";
  }
  return $s;
}
2215
# Takes a (glob reference to a) file handle and returns everything that
# can still be read from it as a single string.
sub handle_readall {
  my ($fh) = @_;
  local $/;                     # slurp mode for this call only
  return <$fh>;
}
2223
# Path of the $n-th saved git archive for this job:
# "$CRUNCH_TMP/git.<job id>.<n>.tar".
sub tar_filename_n {
  my ($n) = @_;
  my $template = "%s/git.%s.%d.tar";
  return sprintf($template, $ENV{CRUNCH_TMP}, $job_id, $n);
}
2228
# Takes a git archive command as a string or list, a la system(), runs
# it, and saves its output as the next numbered tar file (see
# tar_filename_n) so that it can be included in the archive sent to the
# build script.  croak()s if the file cannot be created or the command
# exits non-zero.
sub add_git_archive {
  $git_tar_count++;
  my $tar_path = tar_filename_n($git_tar_count);
  open(GIT_ARCHIVE, ">", $tar_path)
      or croak("Failed to save git archive: $!");

  # The command writes straight into GIT_ARCHIVE; we send it no input.
  my $git_input;
  my $git_pid = open2(">&GIT_ARCHIVE", $git_input, @_);
  close($git_input);
  waitpid($git_pid, 0);
  my $git_status = $?;
  close(GIT_ARCHIVE);

  if ($git_status) {
    croak("Failed to save git archive: git exited " . exit_status_s($git_status));
  }
}
2246
# Concatenate all saved tar archives into the first one with "tar -A",
# then return the contents of that combined archive as a string.
# Returns undef if no archives have been saved; croak()s if tar fails or
# the combined archive cannot be opened.
sub combined_git_archive {
  return undef if ($git_tar_count < 1);

  my $base_tar_name = tar_filename_n(1);
  foreach my $n (2..$git_tar_count) {
    my $tar_exit = system("tar", "-Af", $base_tar_name, tar_filename_n($n));
    next if $tar_exit == 0;
    croak("Error preparing build archive: tar -A exited " .
          exit_status_s($tar_exit));
  }

  open(GIT_TAR, "<", $base_tar_name)
      or croak("Could not open build archive: $!");
  my $tar_contents = handle_readall(\*GIT_TAR);
  close(GIT_TAR);
  return $tar_contents;
}
2268
# Add O_NONBLOCK to the file status flags of filehandle $fh, keeping
# the flags that are already set.  croak()s if fcntl() fails.
sub set_nonblocking {
  my ($fh) = @_;
  my $current_flags = fcntl ($fh, F_GETFL, 0);
  croak ($!) unless $current_flags;
  fcntl ($fh, F_SETFL, $current_flags | O_NONBLOCK) or croak ($!);
}
2274
2275 __DATA__
2276 #!/usr/bin/env perl
2277 #
2278 # This is crunch-job's internal dispatch script.  crunch-job running on the API
2279 # server invokes this script on individual compute nodes, or localhost if we're
2280 # running a job locally.  It gets called in two modes:
2281 #
2282 # * No arguments: Installation mode.  Read a tar archive from the DATA
2283 #   file handle; it includes the Crunch script's source code, and
2284 #   maybe SDKs as well.  Those should be installed in the proper
2285 #   locations.  This runs outside of any Docker container, so don't try to
2286 #   introspect Crunch's runtime environment.
2287 #
2288 # * With arguments: Crunch script run mode.  This script should set up the
2289 #   environment, then run the command specified in the arguments.  This runs
2290 #   inside any Docker container.
2291
2292 use Fcntl ':flock';
2293 use File::Path qw( make_path remove_tree );
2294 use POSIX qw(getcwd);
2295
2296 use constant TASK_TEMPFAIL => 111;
2297
# SDK subdirectory (below the install directory) => name of the path
# environment variable that the subdirectory gets added to.
my %SDK_ENVVARS = (
  "perl/lib" => "PERLLIB",
  "ruby/lib" => "RUBYLIB",
);

# Settings handed to us through the environment.
my ($destdir, $archive_hash, $repo) =
    @ENV{qw(CRUNCH_SRC CRUNCH_GIT_ARCHIVE_HASH CRUNCH_SRC_URL)};
my $install_dir = $ENV{"CRUNCH_INSTALL"} || (getcwd() . "/opt");
my ($job_work, $task_work) = @ENV{qw(JOB_WORK TASK_WORK)};

# Keep duplicates of the original STDOUT and STDERR around.
open(STDOUT_ORIG, ">&", STDOUT);
open(STDERR_ORIG, ">&", STDERR);

# Make sure every configured working directory exists.
foreach my $dir (grep { $_ } ($destdir, $job_work, $task_work)) {
  make_path $dir;
  -e $dir or die "Failed to create temporary directory ($dir): $!";
}

# Empty the task work directory, but keep the directory itself.
remove_tree($task_work, {keep_root => 1}) if $task_work;
2321
### Crunch script run mode
# With arguments: optionally build and activate a Python SDK virtualenv,
# add the installed SDKs to the matching path environment variables,
# then exec the command given in @ARGV.
if (@ARGV) {
  # We want to do routine logging during task 0 only.  This gives the user
  # the information they need, but avoids repeating the information for every
  # task.
  my $Log;
  if ($ENV{TASK_SEQUENCE} eq "0") {
    # $Log->($format, @args): the first argument is a printf format.
    $Log = sub {
      my $msg = shift;
      printf STDERR_ORIG "[Crunch] $msg\n", @_;
    };
  } else {
    $Log = sub { };
  }

  # Quote a string so /bin/sh reads it back as exactly one literal word.
  # quotemeta() is not a shell quoting function: it turns an empty
  # argument into nothing at all, and backslash-newline is a line
  # continuation in sh, so newlines inside arguments would be lost.
  my $sh_quote = sub {
    my $word = shift;
    $word =~ s/'/'\\''/g;
    return "'$word'";
  };

  my $python_src = "$install_dir/python";
  my $venv_dir = "$job_work/.arvados.venv";
  my $venv_built = -e "$venv_dir/bin/activate";
  if ((!$venv_built) and (-d $python_src) and can_run("virtualenv")) {
    shell_or_die(undef, "virtualenv", "--quiet", "--system-site-packages",
                 "--python=python2.7", $venv_dir);
    shell_or_die(TASK_TEMPFAIL, "$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src);
    $venv_built = 1;
    $Log->("Built Python SDK virtualenv");
  }

  my @pysdk_version_cmd = ("python", "-c",
    "from pkg_resources import get_distribution as get; print get('arvados-python-client').version");
  if ($venv_built) {
    $Log->("Running in Python SDK virtualenv");
    @pysdk_version_cmd = ();
    # Re-run the original command through sh so that the virtualenv's
    # activate script is sourced first.
    my $orig_argv = join(" ", map { $sh_quote->($_); } @ARGV);
    @ARGV = ("/bin/sh", "-ec",
             ". " . $sh_quote->("$venv_dir/bin/activate") . "; exec $orig_argv");
  } elsif (-d $python_src) {
    $Log->("Warning: virtualenv not found inside Docker container default " .
           "\$PATH. Can't install Python SDK.");
  }

  if (@pysdk_version_cmd) {
    my $pysdk_version;
    my $pysdk_found = 0;
    # Only trust $? if the probe could actually be started; otherwise a
    # stale status from an earlier command could be mistaken for success.
    if (open(my $pysdk_version_pipe, "-|", @pysdk_version_cmd)) {
      $pysdk_version = <$pysdk_version_pipe>;
      close($pysdk_version_pipe);
      $pysdk_found = ($? == 0);
    }
    if ($pysdk_found) {
      chomp($pysdk_version);
      # Pass the version as a printf argument so that a "%" in it is
      # not interpreted as a format directive.
      $Log->("Using Arvados SDK version %s", $pysdk_version);
    } else {
      # A lot could've gone wrong here, but pretty much all of it means that
      # Python won't be able to load the Arvados SDK.
      $Log->("Warning: Arvados SDK not found");
    }
  }

  # Prepend each installed SDK directory to its path environment variable.
  while (my ($sdk_dir, $sdk_envkey) = each(%SDK_ENVVARS)) {
    my $sdk_path = "$install_dir/$sdk_dir";
    if (-d $sdk_path) {
      if ($ENV{$sdk_envkey}) {
        $ENV{$sdk_envkey} = "$sdk_path:" . $ENV{$sdk_envkey};
      } else {
        $ENV{$sdk_envkey} = $sdk_path;
      }
      $Log->("Arvados SDK added to %s", $sdk_envkey);
    }
  }

  exec(@ARGV);
  die "Cannot exec `@ARGV`: $!";
}
2390
### Installation mode
# Take an exclusive lock on "$destdir.lock" (via the global handle L)
# before touching $destdir.
open L, ">", "$destdir.lock" or die "$destdir.lock: $!";
flock L, LOCK_EX;

# The "$destdir.archive_hash" symlink records which archive is installed.
# Both values must be defined before they are compared: with no symlink
# and no CRUNCH_GIT_ARCHIVE_HASH, undef eq undef would be true and the
# installation would be skipped even though nothing is installed.
my $installed_hash = readlink ("$destdir.archive_hash");
if (defined($installed_hash) && defined($archive_hash)
    && $installed_hash eq $archive_hash && -d $destdir) {
  # This exact git archive (source + arvados sdk) is already installed
  # here, so there's no need to reinstall it.

  # We must consume our DATA section, though: otherwise the process
  # feeding it to us will get SIGPIPE.
  my $buf;
  while (read(DATA, $buf, 65536)) { }

  exit(0);
}

# Drop the old marker first so that an interrupted install is never
# mistaken for a complete one.
unlink "$destdir.archive_hash";
mkdir $destdir;
2408
# Unpack the tar archive from our DATA section into $destdir.
do {
  # Ignore SIGPIPE: we check retval of close() instead. See perlipc(1).
  local $SIG{PIPE} = "IGNORE";
  warn "Extracting archive: $archive_hash\n";
  # --ignore-zeros is necessary sometimes: depending on how much NUL
  # padding tar -A put on our combined archive (which in turn depends
  # on the length of the component archives) tar without
  # --ignore-zeros will exit before consuming stdin and cause close()
  # to fail on the resulting SIGPIPE.
  open(TARX, "|-", "tar", "--ignore-zeros", "-xC", $destdir)
      or die "Error launching 'tar -xC $destdir': $!";
  # If we send too much data to tar in one write (> 4-5 MiB), it stops, and we
  # get SIGPIPE.  We must feed it data incrementally.
  my $chunk;
  print TARX $chunk while read(DATA, $chunk, 65536);
  close(TARX)
      or die "'tar -xC $destdir' exited $?: $!";
};
2431
mkdir $install_dir;

# Move each SDK shipped in the archive (python, plus the top-level
# directory of every %SDK_ENVVARS key) into the install directory.
my $sdk_root = "$destdir/.arvados.sdk/sdk";
if (-d $sdk_root) {
  my @sdk_langs = ("python",
                   map { (split /\//, $_, 2)[0]; } keys(%SDK_ENVVARS));
  foreach my $sdk_lang (@sdk_langs) {
    next unless -d "$sdk_root/$sdk_lang";
    rename("$sdk_root/$sdk_lang", "$install_dir/$sdk_lang")
        or die "Failed to install $sdk_lang SDK: $!";
  }
}
2445
# Try "setup.py egg_info" on the Python SDK, collecting only its error
# output.  If it fails and the last error line mentions git or
# "[Errno 2]", append an empty tag_build setting to setup.cfg; any other
# failure is reported and ends the installation.
my $python_dir = "$install_dir/python";
if ((-d $python_dir) and can_run("python2.7")) {
  open(my $egg_info_pipe, "-|",
       "python2.7 \Q$python_dir/setup.py\E egg_info 2>&1 >/dev/null");
  my @egg_info_errors = <$egg_info_pipe>;
  close($egg_info_pipe);

  if ($?) {
    my $egg_info_exit = $? >> 8;
    my $missing_git = (@egg_info_errors and
                       $egg_info_errors[-1] =~ /\bgit\b|\[Errno 2\]/);
    if ($missing_git) {
      # egg_info apparently failed because it couldn't ask git for a build tag.
      # Specify no build tag.
      open(my $pysdk_cfg, ">>", "$python_dir/setup.cfg");
      print $pysdk_cfg "\n[egg_info]\ntag_build =\n";
      close($pysdk_cfg);
    } else {
      warn $_ foreach @egg_info_errors;
      warn "python setup.py egg_info failed: exit $egg_info_exit";
      exit ($egg_info_exit || 1);
    }
  }
}
2470
# Send all further output to $destdir.log instead of the caller.  If the
# install script fails, shell_or_die shows that log.
open(STDOUT, ">>", "$destdir.log");
open(STDERR, ">&", STDOUT);

# Run the first install script that exists: the job's
# crunch_scripts/install, then ./install.sh, then the old-style
# ./tests/autotests.sh.
if (-e "$destdir/crunch_scripts/install") {
    shell_or_die (undef, "$destdir/crunch_scripts/install", $install_dir);
} elsif (-e "./install.sh") {
    shell_or_die (undef, "./install.sh", $install_dir);
} elsif (-e "./tests/autotests.sh") {
    # Old version
    shell_or_die (undef, "./tests/autotests.sh", $install_dir);
}
2484
# Record which archive is installed now: create the symlink under a
# temporary name, then rename it into place.
if ($archive_hash) {
    my $hash_link = "$destdir.archive_hash";
    my $pending_link = "$hash_link.new";
    unlink $pending_link;
    symlink ($archive_hash, $pending_link) or die "$pending_link: $!";
    rename ($pending_link, $hash_link) or die "$hash_link: $!";
}

# Closing the lock file handle ends the installation.
close L;
2492
# Returns true if `which $command_name` exits successfully, i.e. the
# command can be found in $PATH, and false otherwise (including when
# `which` itself cannot be started).
sub can_run {
  my ($command_name) = @_;
  # If the pipe cannot be opened, $? would still hold the status of some
  # earlier command, so bail out instead of trusting it.
  open(my $which, "-|", "which", $command_name) or return 0;
  # Discard the output without touching the caller's $_.
  while (defined(my $discarded = <$which>)) { }
  close($which);
  return ($? == 0);
}
2500
# shell_or_die($exitcode, @command)
#
# Run @command with system().  If it fails, restore the original STDERR,
# copy the contents of "$destdir.log" to it, report the failure, and
# exit: with $exitcode if that is defined, otherwise with the command's
# own exit code (or 1 if that is 0, e.g. when it died from a signal).
# With $ENV{DEBUG} set, the command line is printed to STDERR first.
sub shell_or_die
{
  my $exitcode = shift;

  if ($ENV{"DEBUG"}) {
    print STDERR "@_\n";
  }
  if (system (@_) != 0) {
    my $err = $!;
    my $code = $?;
    # $? is -1 when the command could not be started at all; that is
    # not a wait status and must not be decoded into exit/signal parts.
    my $not_started = ($code == -1);
    my $exitstatus = $not_started
        ? "could not be started"
        : sprintf("exit %d signal %d", $code >> 8, $code & 0x7f);
    open STDERR, ">&STDERR_ORIG";
    # Copy the log ourselves rather than through `cat` in a shell, so
    # that special characters in $destdir cannot change the command.
    if (open(my $install_log, "<", "$destdir.log")) {
      while (defined(my $log_line = <$install_log>)) {
        print STDERR $log_line;
      }
      close($install_log);
    }
    warn "@_ failed ($err): $exitstatus";
    if (defined($exitcode)) {
      exit $exitcode;
    }
    else {
      my $own_status = $not_started ? 255 : ($code >> 8);
      exit ($own_status || 1);
    }
  }
}
2523
2524 __DATA__