sdk/cli/bin/crunch-job

   1 #!/usr/bin/env perl
   2 # -*- mode: perl; perl-indent-level: 2; indent-tabs-mode: nil; -*-
   3 # Copyright (C) The Arvados Authors. All rights reserved.
   4 #
   5 # SPDX-License-Identifier: AGPL-3.0
   6
   7 =head1 NAME
   8
   9 crunch-job: Execute job steps, save snapshots as requested, collate output.
  10
  11 =head1 SYNOPSIS
  12
  13 Obtain job details from Arvados, run tasks on compute nodes (typically
  14 invoked by scheduler on controller):
  15
  16  crunch-job --job x-y-z --git-dir /path/to/repo/.git
  17
  18 Obtain job details from command line, run tasks on local machine
  19 (typically invoked by application or developer on VM):
  20
  21  crunch-job --job '{"script_version":"/path/to/working/tree","script":"scriptname",...}'
  22
  23  crunch-job --job '{"repository":"https://github.com/curoverse/arvados.git","script_version":"master","script":"scriptname",...}'
  24
  25 =head1 OPTIONS
  26
  27 =over
  28
  29 =item --force-unlock
  30
  31 If the job is already locked, steal the lock and run it anyway.
  32
  33 =item --git-dir
  34
  35 Path to a .git directory (or a git URL) where the commit given in the
  36 job's C<script_version> attribute is to be found. If this is I<not>
  37 given, the job's C<repository> attribute will be used.
  38
  39 =item --job-api-token
  40
  41 Arvados API authorization token to use during the course of the job.
  42
  43 =item --no-clear-tmp
  44
  45 Do not clear per-job/task temporary directories during initial job
  46 setup. This can speed up development and debugging when running jobs
  47 locally.
  48
  49 =item --job
  50
  51 UUID of the job to run, or a JSON-encoded job resource without a
  52 UUID. If the latter is given, a new job object will be created.
  53
  54 =back
  55
  56 =head1 RUNNING JOBS LOCALLY
  57
  58 crunch-job's log messages appear on stderr along with the job tasks'
  59 stderr streams. The log is saved in Keep at each checkpoint and when
  60 the job finishes.
  61
  62 If the job succeeds, the job's output locator is printed on stdout.
  63
  64 While the job is running, the following signals are accepted:
  65
  66 =over
  67
  68 =item control-C, SIGINT, SIGQUIT
  69
  70 Save a checkpoint, terminate any job tasks that are running, and stop.
  71
  72 =item SIGALRM
  73
  74 Save a checkpoint and continue.
  75
  76 =item SIGHUP
  77
  78 Refresh node allocation (i.e., check whether any nodes have been added
  79 or unallocated) and attributes of the Job record that should affect
  80 behavior (e.g., cancel job if cancelled_at becomes non-nil).
  81
  82 =back
  83
  84 =cut
  85
  86
  87 use strict;
  88 use POSIX ':sys_wait_h';
  89 use POSIX qw(strftime);
  90 use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK);
  91 use Arvados;
  92 use Cwd qw(realpath);
  93 use Data::Dumper;
  94 use Digest::MD5 qw(md5_hex);
  95 use Getopt::Long;
  96 use IPC::Open2;
  97 use IO::Select;
  98 use File::Temp;
  99 use Fcntl ':flock';
 100 use File::Path qw( make_path remove_tree );
 101
 102 use constant TASK_TEMPFAIL => 111;  # NOTE(review): not referenced in this part of the file; presumably a task exit status meaning "temporary failure" -- confirm
 103 use constant EX_TEMPFAIL => 75;  # exit status used below when the sanity check or the job lock fails
 104 use constant EX_RETRY_UNLOCKED => 93;  # NOTE(review): presumably the exit status produced by exit_retry_unlocked() -- confirm
 105 # Scratch directory layout: everything lives under $CRUNCH_TMP, which defaults to $TMPDIR/crunch-job (TMPDIR itself defaults to /tmp).
 106 $ENV{"TMPDIR"} ||= "/tmp";
 107 unless (defined $ENV{"CRUNCH_TMP"}) {
 108   $ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
 109   if ($ENV{"USER"} ne "crunch" && $< != 0) {
 110     # Neither the "crunch" user nor real uid 0: use a tmp dir unique for my uid
 111     $ENV{"CRUNCH_TMP"} .= "-$<";
 112   }
 113 }
 114
 115 # Create the tmp directory if it does not exist
 116 if ( ! -d $ENV{"CRUNCH_TMP"} ) {
 117   make_path $ENV{"CRUNCH_TMP"} or die "Failed to create temporary working directory: " . $ENV{"CRUNCH_TMP"};
 118 }
 119 # Export the derived locations: the per-job work directory and the install prefix.
 120 $ENV{"JOB_WORK"} = $ENV{"CRUNCH_TMP"} . "/work";
 121 $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
 122 $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
 123 mkdir ($ENV{"JOB_WORK"});  # return value is ignored, so an already-existing directory is not treated as an error
 124
 125 my %proc;
 126 my $force_unlock;  # --force-unlock: when set, the jobs/lock call below is skipped
 127 my $git_dir;  # --git-dir: repository path or URL; takes precedence over the job's "repository" attribute
 128 my $jobspec;  # --job: a job UUID or a JSON-encoded job resource
 129 my $job_api_token;  # --job-api-token: exported as ARVADOS_API_TOKEN below
 130 my $no_clear_tmp;  # --no-clear-tmp: when given, the work directories are not cleaned during setup
 131 my $resume_stash;  # --resume-stash: when set, the required-attribute check for local jobs is skipped
 132 my $cgroup_root = "/sys/fs/cgroup";  # --cgroup-root: handed to crunchstat as -cgroup-root
 133 my $docker_bin = "docker.io";  # --docker-bin: name of the Docker client executable
 134 my $docker_run_args = "";  # --docker-run-args: extra arguments spliced into every "docker run" command line
 135 my $srun_sync_timeout = 15*60;  # --srun-sync-timeout: NOTE(review): presumably seconds, consumed by srun_sync() -- confirm
 136 GetOptions('force-unlock' => \$force_unlock,
 137            'git-dir=s' => \$git_dir,
 138            'job=s' => \$jobspec,
 139            'job-api-token=s' => \$job_api_token,
 140            'no-clear-tmp' => \$no_clear_tmp,
 141            'resume-stash=s' => \$resume_stash,
 142            'cgroup-root=s' => \$cgroup_root,
 143            'docker-bin=s' => \$docker_bin,
 144            'docker-run-args=s' => \$docker_run_args,
 145            'srun-sync-timeout=i' => \$srun_sync_timeout,
 146     );
 147 # The return value of GetOptions is not checked, so a failed parse does not stop the script here.
 148 if (defined $job_api_token) {
 149   $ENV{ARVADOS_API_TOKEN} = $job_api_token;
 150 }
 151
 152 my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
 153 # ($have_slurm is true only when both SLURM variables are present in the environment.)
 154 # SIGUSR1 / SIGUSR2 switch the CRUNCH_DEBUG environment variable on (1) / off (0) at runtime.
 155 $SIG{'USR1'} = sub
 156 {
 157   $main::ENV{CRUNCH_DEBUG} = 1;
 158 };
 159 $SIG{'USR2'} = sub
 160 {
 161   $main::ENV{CRUNCH_DEBUG} = 0;
 162 };
 163
 164 my $arv = Arvados->new('apiVersion' => 'v1');
 165 # Job state shared by the rest of the script.
 166 my $Job;
 167 my $job_id;
 168 my $dbh;
 169 my $sth;
 170 my @jobstep;
 171 # $local_job ends up as 0 when --job named an existing job, or as the decoded job spec (a true value) otherwise.
 172 my $local_job;
 173 if ($jobspec =~ /^[-a-z\d]+$/)
 174 {
 175   # $jobspec is an Arvados UUID, not a JSON job specification
 176   $Job = api_call("jobs/get", uuid => $jobspec);
 177   $local_job = 0;
 178 }
 179 else
 180 {
 181   $local_job = JSON::decode_json($jobspec);  # anything that does not look like a UUID (including a missing --job) is handed to the JSON decoder
 182 }
 183
 184
 185 # Make sure our workers (our slurm nodes, localhost, or whatever) are
 186 # at least able to run basic commands: they aren't down or severely
 187 # misconfigured.
 188 my $cmd = ['true'];  # default probe: a no-op command
 189 if (($Job || $local_job)->{docker_image_locator}) {  # jobs that name a Docker image probe the Docker client instead
 190   $cmd = [$docker_bin, 'ps', '-q'];
 191 }
 192 Log(undef, "Sanity check is `@$cmd`");
 193 my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
 194   ["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],
 195   $cmd,
 196   {label => "sanity check"});
 197 if ($exited != 0) {  # any non-zero status aborts the run with EX_TEMPFAIL
 198   Log(undef, "Sanity check failed: ".exit_status_s($exited));
 199   exit EX_TEMPFAIL;
 200 }
 201 Log(undef, "Sanity check OK");
 202
 203
 204 my $User = api_call("users/current");
 205 # Existing job: claim it (unless --force-unlock was given). Local job spec: validate it and create a new job record.
 206 if (!$local_job) {
 207   if (!$force_unlock) {
 208     # Claim this job, and make sure nobody else does
 209     eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
 210     if ($@) {
 211       Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
 212       exit EX_TEMPFAIL;
 213     };
 214   }
 215 }
 216 else
 217 {
 218   if (!$resume_stash)
 219   {
 220     $local_job->{$_} or croak ("No $_ specified")
 221       foreach (qw(script script_version script_parameters));
 222   }
 223   # Mark the new job as running and locked by the current user before creating it.
 224   $local_job->{'is_locked_by_uuid'} = $User->{'uuid'};
 225   $local_job->{'started_at'} = gmtime;
 226   $local_job->{'state'} = 'Running';
 227
 228   $Job = api_call("jobs/create", job => $local_job);
 229 }
 230 $job_id = $Job->{'uuid'};
 231 # The log file is named after the job UUID.
 232 my $keep_logfile = $job_id . '.log.txt';
 233 log_writer_start($keep_logfile);
 234
 235 $Job->{'runtime_constraints'} ||= {};
 236 $Job->{'runtime_constraints'}->{'max_tasks_per_node'} ||= 0;
 237 my $max_ncpus = $Job->{'runtime_constraints'}->{'max_tasks_per_node'};  # 0 means "no cap" when slots are counted below
 238 # Describe the installed arvados-cli gem version(s), if `gem list` succeeds, for the "running from" log line.
 239 my $gem_versions = `gem list --quiet arvados-cli 2>/dev/null`;
 240 if ($? == 0) {
 241   $gem_versions =~ s/^arvados-cli \(/ with arvados-cli Gem version(s) /;
 242   chomp($gem_versions);
 243   chop($gem_versions);  # Closing parentheses
 244 } else {
 245   $gem_versions = "";
 246 }
 247 Log(undef,
 248     "running from " . ((-e $0) ? realpath($0) : "stdin") . $gem_versions);
 249
 250 Log (undef, "check slurm allocation");
 251 my @slot;  # one entry per (node, cpu) pair: { node => <node record>, cpu => 1..ncpus }
 252 my @node;  # names of all nodes found
 253 # Should use $ENV{SLURM_TASKS_PER_NODE} instead of sinfo? (eg. "4(x3),2,4(x2)")
 254 my @sinfo;  # lines of the form "<ncpus> <nodelist>"
 255 if (!$have_slurm)
 256 {
 257   my $localcpus = 0 + `grep -cw ^processor /proc/cpuinfo` || 1;  # fall back to 1 when the count comes out as 0
 258   push @sinfo, "$localcpus localhost";
 259 }
 260 if (exists $ENV{SLURM_NODELIST})
 261 {
 262   push @sinfo, `sinfo -h --format='%c %N' --nodes=\Q$ENV{SLURM_NODELIST}\E`;
 263 }
 264 foreach (@sinfo)
 265 {
 266   my ($ncpus, $slurm_nodelist) = split;
 267   $ncpus = $max_ncpus if $max_ncpus && $ncpus > $max_ncpus;  # cap slots per node at max_tasks_per_node when that is non-zero
 268   # Expand the compressed node list (e.g. "node[1-3,7],other") into individual node names.
 269   my @nodelist;
 270   while ($slurm_nodelist =~ s/^([^\[,]+?(\[.*?\])?)(,|$)//)  # peel off one comma-separated item, with its optional [...] suffix
 271   {
 272     my $nodelist = $1;
 273     if ($nodelist =~ /\[((\d+)(-(\d+))?(,(\d+)(-(\d+))?)*)\]/)
 274     {
 275       my $ranges = $1;
 276       foreach (split (",", $ranges))
 277       {
 278         my ($a, $b);
 279         if (/(\d+)-(\d+)/)
 280         {
 281           $a = $1;
 282           $b = $2;
 283         }
 284         else
 285         {
 286           $a = $_;
 287           $b = $_;
 288         }
 289         push @nodelist, map {  # substitute each value of the range for the bracket expression
 290           my $n = $nodelist;
 291           $n =~ s/\[[-,\d]+\]/$_/;
 292           $n;
 293         } ($a..$b);
 294       }
 295     }
 296     else
 297     {
 298       push @nodelist, $nodelist;
 299     }
 300   }
 301   foreach my $nodename (@nodelist)
 302   {
 303     Log (undef, "node $nodename - $ncpus slots");
 304     my $node = { name => $nodename,
 305                  ncpus => $ncpus,
 306                  # The number of consecutive times a task has been dispatched
 307                  # to this node and failed.
 308                  losing_streak => 0,
 309                  # The number of consecutive times that SLURM has reported
 310                  # a node failure since the last successful task.
 311                  fail_count => 0,
 312                  # Don't dispatch work to this node until this time
 313                  # (in seconds since the epoch) has passed.
 314                  hold_until => 0 };
 315     foreach my $cpu (1..$ncpus)  # every slot on a node shares the same node record
 316     {
 317       push @slot, { node => $node,
 318                     cpu => $cpu };
 319     }
 320   }
 321   push @node, @nodelist;
 322 }
 323
 324
 325
 326 # Ensure that we get one jobstep running on each allocated node before
 327 # we start overloading nodes with concurrent steps
 328 # (sorting by cpu number puts every node's cpu 1 slot ahead of any cpu 2 slot, and so on).
 329 @slot = sort { $a->{cpu} <=> $b->{cpu} } @slot;
 330
 331
 332 $Job->update_attributes(
 333   'tasks_summary' => { 'failed' => 0,
 334                        'todo' => 1,
 335                        'running' => 0,
 336                        'done' => 0 });
 337
 338 Log (undef, "start");
 339 $main::please_freeze = 0;  # the request flags are initialised BEFORE the handlers are installed,
 340 $main::please_info = 0;    # so a signal delivered right after installation is not wiped out
 341 $main::please_continue = 0;
 342 $main::please_refresh = 0;
 343 # Each handler only records a request in a $main::please_* flag; TERM goes straight to croak.
 344 $SIG{'INT'} = sub { $main::please_freeze = 1; };
 345 $SIG{'QUIT'} = sub { $main::please_freeze = 1; };
 346 $SIG{'TERM'} = \&croak;
 347 $SIG{'TSTP'} = sub { $main::please_freeze = 1; };
 348 $SIG{'ALRM'} = sub { $main::please_info = 1; };
 349 $SIG{'CONT'} = sub { $main::please_continue = 1; };
 350 $SIG{'HUP'} = sub { $main::please_refresh = 1; };
 351 my $jobsteps_must_output_keys = 0;      # becomes 1 when any task outputs a key
 352 # Export every NOCACHE*=value line of the job's knobs into the environment.
 353 foreach (split ("\n", $$Job{knobs})) { $ENV{$1} = $2 if /^(NOCACHE.*?)=(.*)/ }
 354 $ENV{"CRUNCH_JOB_UUID"} = $job_id;
 355 $ENV{"JOB_UUID"} = $job_id;
 356
 357 # Bookkeeping for job steps: the todo/done lists hold indexes into @jobstep.
 358 my @jobstep_todo = ();
 359 my @jobstep_done = ();
 360 my @jobstep_tomerge = ();
 361 my $jobstep_tomerge_level = 0;
 362 my $squeue_checked = 0;
 363 my $sinfo_checked = 0;
 364 my $latest_refresh = scalar time;
 365
 366
 367
 368 if (defined $Job->{thawedfromkey})  # resume from saved state when the job record carries a thawedfromkey
 369 {
 370   thaw ($Job->{thawedfromkey});
 371 }
 372 else
 373 {
 374   my $first_task = api_call("job_tasks/create", job_task => {  # otherwise seed the job with a single level-0 task
 375     'job_uuid' => $Job->{'uuid'},
 376     'sequence' => 0,
 377     'qsequence' => 0,
 378     'parameters' => {},
 379   });
 380   push @jobstep, { 'level' => 0,
 381                    'failures' => 0,
 382                    'arvados_task' => $first_task,
 383                  };
 384   push @jobstep_todo, 0;
 385 }
 386
 387 # Without SLURM, take a lock file under $CRUNCH_TMP first.
 388 if (!$have_slurm)
 389 {
 390   must_lock_now("$ENV{CRUNCH_TMP}/.lock", "a job is already running here.");
 391 }
 392
 393 my $build_script = handle_readall(\*DATA);  # read from this script's DATA handle; sent to the workers ahead of the git archive in the install phase
 394 my $nodelist = join(",", @node);
 395 my $git_tar_count = 0;
 396
 397 if (!defined $no_clear_tmp) {  # skipped entirely when --no-clear-tmp was given
 398   # Find FUSE mounts under $CRUNCH_TMP and unmount them.  Then clean
 399   # up work directories crunch_tmp/work, crunch_tmp/opt,
 400   # crunch_tmp/src*.
 401   my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
 402     ["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
 403     ['bash', '-ec', q{
 404 arv-mount --unmount-timeout 10 --unmount-all ${CRUNCH_TMP}
 405 rm -rf ${JOB_WORK} ${CRUNCH_INSTALL} ${CRUNCH_TMP}/task ${CRUNCH_TMP}/src* ${CRUNCH_TMP}/*.cid
 406     }],
 407     {label => "clean work dirs"});
 408   if ($exited != 0) {  # a failed cleanup goes through exit_retry_unlocked() rather than continuing
 409     exit_retry_unlocked();
 410   }
 411 }
 412
 413 # If this job requires a Docker image, install that.
 414 my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem, $dockeruserarg);
 415 if ($docker_locator = $Job->{docker_image_locator}) {
 416   Log (undef, "Install docker image $docker_locator");
 417   ($docker_stream, $docker_hash) = find_docker_image($docker_locator);
 418   if (!$docker_hash)
 419   {
 420     croak("No Docker image hash found from locator $docker_locator");
 421   }
 422   Log (undef, "docker image hash is $docker_hash");
 423   $docker_stream =~ s/^\.//;  # drop the leading "." of the stream name before building the arv-get path
 424   my $docker_install_script = qq{
 425 loaded() {
 426   id=\$($docker_bin inspect --format="{{.ID}}" \Q$docker_hash\E) || return 1
 427   echo "image ID is \$id"
 428   [[ \${id} = \Q$docker_hash\E ]]
 429 }
 430 if loaded >&2 2>/dev/null; then
 431   echo >&2 "image is already present"
 432   exit 0
 433 fi
 434 echo >&2 "docker image is not present; loading"
 435 arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
 436 if ! loaded >&2; then
 437   echo >&2 "\\`docker load\\` exited 0, but image is not found (!)"
 438   exit 1
 439 fi
 440 echo >&2 "image loaded successfully"
 441 };
 442   # In the script above the backticks around "docker load" are backslash-escaped: inside a double-quoted shell string, bare backticks are command substitution and the shell would run the command instead of printing it.
 443   my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
 444     ["srun", "--nodelist=" . join(',', @node)],
 445     ["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script],
 446     {label => "load docker image"});
 447   if ($exited != 0)
 448   {
 449     exit_retry_unlocked();
 450   }
 451
 452   # Determine whether this version of Docker supports memory+swap limits.
 453   ($exited, $stdout, $stderr, $tempfail) = srun_sync(
 454     ["srun", "--nodes=1"],
 455     [$docker_bin, 'run', '--help'],
 456     {label => "check --memory-swap feature"});
 457   if ($tempfail) {
 458     exit_retry_unlocked();
 459   }
 460   $docker_limitmem = ($stdout =~ /--memory-swap/);  # true when the help text mentions --memory-swap
 461
 462   # Find a non-root Docker user to use.
 463   # Tries the default user for the container, then 'crunch', then 'nobody',
 464   # testing for whether the actual user id is non-zero.  This defends against
 465   # mistakes but not malice, but we intend to harden the security in the future
 466   # so we don't want anyone getting used to their jobs running as root in their
 467   # Docker containers.
 468   my @tryusers = ("", "crunch", "nobody");
 469   foreach my $try_user (@tryusers) {
 470     my $label;
 471     my $try_user_arg;
 472     if ($try_user eq "") {
 473       $label = "check whether default user is UID 0";
 474       $try_user_arg = "";
 475     } else {
 476       $label = "check whether user '$try_user' is UID 0";
 477       $try_user_arg = "--user=$try_user";
 478     }
 479     my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
 480       ["srun", "--nodes=1"],
 481       ["/bin/sh", "-ec",
 482        "$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
 483       {label => $label});
 484     chomp($stdout);
 485     if ($exited == 0 && $stdout =~ /^\d+$/ && $stdout > 0) {  # accept the first candidate whose numeric uid is non-zero
 486       $dockeruserarg = $try_user_arg;
 487       if ($try_user eq "") {
 488         Log(undef, "Container will run with default user");
 489       } else {
 490         Log(undef, "Container will run with $dockeruserarg");
 491       }
 492       last;
 493     } elsif ($tempfail) {
 494       exit_retry_unlocked();
 495     }
 496   }
 497   # $dockeruserarg stays undefined only when no candidate qualified ("" is a valid result for the default user).
 498   if (!defined $dockeruserarg) {
 499     croak("Could not find a user in container that is not UID 0 (tried default user, @tryusers) or there was a problem running 'id' in the container.");
 500   }
 501
 502   if ($Job->{arvados_sdk_version}) {
 503     # The job also specifies an Arvados SDK version.  Add the SDKs to the
 504     # tar file for the build script to install.
 505     Log(undef, sprintf("Packing Arvados SDK version %s for installation",
 506                        $Job->{arvados_sdk_version}));
 507     add_git_archive("git", "--git-dir=$git_dir", "archive",
 508                     "--prefix=.arvados.sdk/",
 509                     $Job->{arvados_sdk_version}, "sdk");
 510   }
 511 }
 512
 513 if (!defined $git_dir && $Job->{'script_version'} =~ m{^/}) {
 514   # If script_version looks like an absolute path, *and* the --git-dir
 515   # argument was not given -- which implies we were not invoked by
 516   # crunch-dispatch -- we will use the given path as a working
 517   # directory instead of resolving script_version to a git commit (or
 518   # doing anything else with git).
 519   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{'script_version'};
 520   $ENV{"CRUNCH_SRC"} = $Job->{'script_version'};
 521 }
 522 else {
 523   # Resolve the given script_version to a git commit sha1. Also, if
 524   # the repository is remote, clone it into our local filesystem: this
 525   # ensures "git archive" will work, and is necessary to reliably
 526   # resolve a symbolic script_version like "master^".
 527   $ENV{"CRUNCH_SRC"} = "$ENV{CRUNCH_TMP}/src";
 528
 529   Log (undef, "Looking for version ".$Job->{script_version}." from repository ".$Job->{repository});
 530
 531   $ENV{"CRUNCH_SRC_COMMIT"} = $Job->{script_version};  # provisional value; replaced by the resolved sha1 further down
 532
 533   # If we're running under crunch-dispatch, it will have already
 534   # pulled the appropriate source tree into its own repository, and
 535   # given us that repo's path as $git_dir.
 536   #
 537   # If we're running a "local" job, we might have to fetch content
 538   # from a remote repository.
 539   #
 540   # (Currently crunch-dispatch gives a local path with --git-dir, but
 541   # we might as well accept URLs there too in case it changes its
 542   # mind.)
 543   my $repo = $git_dir || $Job->{'repository'};
 544
 545   # Repository can be remote or local. If remote, we'll need to fetch it
 546   # to a local dir before doing `git log` et al.
 547   my $repo_location;  # set to 'remote' or 'local' by the classification below
 548
 549   if ($repo =~ m{://|^[^/]*:}) {
 550     # $repo is a git url we can clone, like git:// or https:// or
 551     # file:/// or [user@]host:repo.git. Note "user/name@host:foo" is
 552     # not recognized here because distinguishing that from a local
 553     # path is too fragile. If you really need something strange here,
 554     # use the ssh:// form.
 555     $repo_location = 'remote';
 556   } elsif ($repo =~ m{^\.*/}) {
 557     # $repo is a local path to a git index. We'll also resolve ../foo
 558     # to ../foo/.git if the latter is a directory. To help
 559     # disambiguate local paths from named hosted repositories, this
 560     # form must be given as ./ or ../ if it's a relative path.
 561     if (-d "$repo/.git") {
 562       $repo = "$repo/.git";
 563     }
 564     $repo_location = 'local';
 565   } else {
 566     # $repo is none of the above. It must be the name of a hosted
 567     # repository.
 568     my $arv_repo_list = api_call("repositories/list",
 569                                  'filters' => [['name','=',$repo]]);
 570     my @repos_found = @{$arv_repo_list->{'items'}};
 571     my $n_found = $arv_repo_list->{'serverResponse'}->{'items_available'};
 572     if ($n_found > 0) {
 573       Log(undef, "Repository '$repo' -> "
 574           . join(", ", map { $_->{'uuid'} } @repos_found));
 575     }
 576     if ($n_found != 1) {  # the name must match exactly one repository
 577       croak("Error: Found $n_found repositories with name '$repo'.");
 578     }
 579     $repo = $repos_found[0]->{'fetch_url'};
 580     $repo_location = 'remote';
 581   }
 582   Log(undef, "Using $repo_location repository '$repo'");
 583   $ENV{"CRUNCH_SRC_URL"} = $repo;
 584
 585   # Resolve given script_version (we'll call that $treeish here) to a
 586   # commit sha1 ($commit).
 587   my $treeish = $Job->{'script_version'};
 588   my $commit;
 589   if ($repo_location eq 'remote') {
 590     # We minimize excess object-fetching by re-using the same bare
 591     # repository in CRUNCH_TMP/.git for multiple crunch-jobs -- we
 592     # just keep adding remotes to it as needed.
 593     my $local_repo = $ENV{'CRUNCH_TMP'}."/.git";
 594     my $gitcmd = "git --git-dir=\Q$local_repo\E";
 595
 596     # Set up our local repo for caching remote objects, making
 597     # archives, etc.
 598     if (!-d $local_repo) {
 599       make_path($local_repo) or croak("Error: could not create $local_repo");
 600     }
 601     # This works (exits 0 and doesn't delete fetched objects) even
 602     # if $local_repo is already initialized:
 603     `$gitcmd init --bare`;
 604     if ($?) {
 605       croak("Error: $gitcmd init --bare exited ".exit_status_s($?));
 606     }
 607
 608     # If $treeish looks like a hash (or abbrev hash) we look it up in
 609     # our local cache first, since that's cheaper. (We don't want to
 610     # do that with tags/branches though -- those change over time, so
 611     # they should always be resolved by the remote repo.)
 612     if ($treeish =~ /^[0-9a-f]{7,40}$/s) {
 613       # Hide stderr because it's normal for this to fail:
 614       my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E 2>/dev/null`;
 615       if ($? == 0 &&
 616           # Careful not to resolve a branch named abcdeff to commit 1234567:
 617           $sha1 =~ /^$treeish/ &&
 618           $sha1 =~ /^([0-9a-f]{40})$/s) {
 619         $commit = $1;
 620         Log(undef, "Commit $commit already present in $local_repo");
 621       }
 622     }
 623
 624     if (!defined $commit) {
 625       # If $treeish isn't just a hash or abbrev hash, or isn't here
 626       # yet, we need to fetch the remote to resolve it correctly.
 627
 628       # First, remove all local heads. This prevents a name that does
 629       # not exist on the remote from resolving to (or colliding with)
 630       # a previously fetched branch or tag (possibly from a different
 631       # remote).
 632       remove_tree("$local_repo/refs/heads", {keep_root => 1});
 633
 634       Log(undef, "Fetching objects from $repo to $local_repo");
 635       `$gitcmd fetch --no-progress --tags ''\Q$repo\E \Q+refs/heads/*:refs/heads/*\E`;
 636       if ($?) {
 637         croak("Error: `$gitcmd fetch` exited ".exit_status_s($?));
 638       }
 639     }
 640
 641     # Now that the data is all here, we will use our local repo for
 642     # the rest of our git activities.
 643     $repo = $local_repo;
 644   }
 645   # Final resolution always runs against $repo, which by now is a local git directory in both the local and remote cases.
 646   my $gitcmd = "git --git-dir=\Q$repo\E";
 647   my $sha1 = `$gitcmd rev-list -n1 ''\Q$treeish\E`;
 648   unless ($? == 0 && $sha1 =~ /^([0-9a-f]{40})$/) {
 649     croak("`$gitcmd rev-list` exited "
 650           .exit_status_s($?)
 651           .", '$treeish' not found, giving up");
 652   }
 653   $commit = $1;
 654   Log(undef, "Version $treeish is commit $commit");
 655
 656   if ($commit ne $Job->{'script_version'}) {
 657     # Record the real commit id in the database, frozentokey, logs,
 658     # etc. -- instead of an abbreviation or a branch name which can
 659     # become ambiguous or point to a different commit in the future.
 660     if (!$Job->update_attributes('script_version' => $commit)) {
 661       croak("Error: failed to update job's script_version attribute");
 662     }
 663   }
 664
 665   $ENV{"CRUNCH_SRC_COMMIT"} = $commit;
 666   add_git_archive("$gitcmd archive ''\Q$commit\E");  # queue an archive of the resolved commit for the install phase
 667 }
 668
 669 my $git_archive = combined_git_archive();
 670 if (!defined $git_archive) {
 671   Log(undef, "Skip install phase (no git archive)");
 672   if ($have_slurm) {
 673     Log(undef, "Warning: This probably means workers have no source tree!");
 674   }
 675 }
 676 else {
 677   my $exited;
 678   # Pipe the build script plus the git archive to perl on every worker; up to 3 attempts (rationale at the end of the loop).
 679   for (my $attempts = 0; $attempts < 3; $attempts++) {
 680     my @srunargs = ("srun",
 681                     "--nodelist=$nodelist",
 682                     "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
 683     my @execargs = ("sh", "-c",
 684                     "mkdir -p \Q$ENV{CRUNCH_INSTALL}\E && cd \Q$ENV{CRUNCH_TMP}\E && perl -");
 685     # The paths above are shell-quoted with \Q...\E because they are spliced into an "sh -c" command string.
 686     $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
 687     my ($stdout, $stderr, $tempfail);
 688     ($exited, $stdout, $stderr, $tempfail) = srun_sync(
 689       \@srunargs, \@execargs,
 690       {label => "run install script on all workers"},
 691         $build_script . $git_archive);
 692     if ($tempfail) {
 693       exit_retry_unlocked();
 694     }
 695     # Did stderr contain anything other than "srun: error: " or "starting: [" lines?
 696     my $stderr_anything_from_script = 0;
 697     for my $line (split(/\n/, $stderr)) {
 698       if ($line !~ /^(srun: error: |starting: \[)/) {
 699         $stderr_anything_from_script = 1;
 700       }
 701     }
 702
 703     last if $exited == 0 || $main::please_freeze;
 704
 705     # If the install script fails but doesn't print an error message,
 706     # the next thing anyone is likely to do is just run it again in
 707     # case it was a transient problem like "slurm communication fails
 708     # because the network isn't reliable enough". So we'll just do
 709     # that ourselves (up to 3 attempts in total). OTOH, if there is an
 710     # error message, the problem is more likely to have a real fix and
 711     # we should fail the job so the fixing process can start, instead
 712     # of doing 2 more attempts.
 713     last if $stderr_anything_from_script;
 714   }
 715   # Remove the temporary tar files, whether or not the install succeeded.
 716   foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
 717     unlink($tar_filename);
 718   }
 719
 720   if ($exited != 0) {
 721     croak("Giving up");
 722   }
 723 }
 724
 725 foreach (qw (script script_version script_parameters runtime_constraints))  # log the key job attributes; references are JSON-encoded
 726 {
 727   Log (undef,
 728        "$_ " .
 729        (ref($Job->{$_}) ? JSON::encode_json($Job->{$_}) : $Job->{$_}));
 730 }
 731 foreach (split (/\n/, $Job->{knobs}))  # one log line per knob
 732 {
 733   Log (undef, "knob " . $_);
 734 }
 735 my $resp = api_call(  # look up the node records whose hostname matches one of our nodes
 736   'nodes/list',
 737   'filters' => [['hostname', 'in', \@node]],
 738   'order' => 'hostname',
 739   'limit' => scalar(@node),
 740     );
 741 for my $n (@{$resp->{items}}) {
 742   Log(undef, "$n->{hostname} $n->{uuid} ".JSON::encode_json($n->{properties}));
 743 }
 744
 745
 746
 747 $main::success = undef;  # stays undef until the job's final result is known (the dispatch loop below checks defined())
 748
 749 # From the ONELEVEL label on: prepare one round of task dispatch for the lowest level present in @jobstep_todo.
 750 # NOTE(review): presumably re-entered via a jump to ONELEVEL later in the file -- confirm.
 751 ONELEVEL:
 752
 753 my $thisround_succeeded = 0;
 754 my $thisround_failed = 0;
 755 my $thisround_failed_multiple = 0;
 756 my $working_slot_count = scalar(@slot);
 757 # Order pending steps by level, then by step index; the first entry then determines the level of this round.
 758 @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
 759                        or $a <=> $b } @jobstep_todo;
 760 my $level = $jobstep[$jobstep_todo[0]]->{level};
 761
 762 my $initial_tasks_this_level = 0;
 763 foreach my $id (@jobstep_todo) {
 764   $initial_tasks_this_level++ if ($jobstep[$id]->{level} == $level);
 765 }
 766
 767 # If the number of tasks scheduled at this level #T is smaller than the number
 768 # of slots available #S, only use the first #T slots, or the first slot on
 769 # each node, whichever number is greater.
 770 #
 771 # When we dispatch tasks later, we'll allocate whole-node resources like RAM
 772 # based on these numbers.  Using fewer slots makes more resources available
 773 # to each individual task, which should normally be a better strategy when
 774 # there are fewer of them running with less parallelism.
 775 #
 776 # Note that this calculation is not redone if the initial tasks at
 777 # this level queue more tasks at the same level.  This may harm
 778 # overall task throughput for that level.
 779 my @freeslot;  # indexes into @slot
 780 if ($initial_tasks_this_level < @node) {
 781   @freeslot = (0..$#node);
 782 } elsif ($initial_tasks_this_level < @slot) {
 783   @freeslot = (0..$initial_tasks_this_level - 1);
 784 } else {
 785   @freeslot = (0..$#slot);
 786 }
 787 my $round_num_freeslots = scalar(@freeslot);
 788 print STDERR "crunch-job have ${round_num_freeslots} free slots for ${initial_tasks_this_level} initial tasks at this level, ".scalar(@node)." nodes, and ".scalar(@slot)." slots\n";
 789 # For each node, record the cpu number of its last slot in @freeslot; scanning backwards with ||= keeps the first value seen per node.
 790 my %round_max_slots = ();
 791 for (my $ii = $#freeslot; $ii >= 0; $ii--) {
 792   my $this_slot = $slot[$freeslot[$ii]];
 793   my $node_name = $this_slot->{node}->{name};
 794   $round_max_slots{$node_name} ||= $this_slot->{cpu};
 795   last if (scalar(keys(%round_max_slots)) >= @node);  # stop once every node has an entry
 796 }
 797
 798 Log(undef, "start level $level with $round_num_freeslots slots");
 799 my @holdslot;
 800 my %reader;  # step id => read end of that task's output pipe (filled in by the dispatch loop)
 801 my $progress_is_dirty = 1;
 802 my $progress_stats_updated = 0;
 803
 804 update_progress_stats();
 805
 806
 807 THISROUND:
 808 for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
 809 {
 810   # Don't create new tasks if we already know the job's final result.
 811   last if defined($main::success);
 812
 813   my $id = $jobstep_todo[$todo_ptr];
 814   my $Jobstep = $jobstep[$id];
 815   if ($Jobstep->{level} != $level)
 816   {
 817     next;
 818   }
 819
 820   pipe $reader{$id}, "writer" or croak("pipe() failed: $!");
 821   set_nonblocking($reader{$id});
 822
 823   my $childslot = $freeslot[0];
 824   my $childnode = $slot[$childslot]->{node};
 825   my $childslotname = join (".",
 826                             $slot[$childslot]->{node}->{name},
 827                             $slot[$childslot]->{cpu});
 828
 829   my $childpid = fork();
 830   if ($childpid == 0)
 831   {
 832     $SIG{'INT'} = 'DEFAULT';
 833     $SIG{'QUIT'} = 'DEFAULT';
 834     $SIG{'TERM'} = 'DEFAULT';
 835
 836     foreach (values (%reader))
 837     {
 838       close($_);
 839     }
 840     fcntl ("writer", F_SETFL, 0) or croak ($!); # no close-on-exec
 841     open(STDOUT,">&writer") or croak ($!);
 842     open(STDERR,">&writer") or croak ($!);
 843
 844     undef $dbh;
 845     undef $sth;
 846
 847     delete $ENV{"GNUPGHOME"};
 848     $ENV{"TASK_UUID"} = $Jobstep->{'arvados_task'}->{'uuid'};
 849     $ENV{"TASK_QSEQUENCE"} = $id;
 850     $ENV{"TASK_SEQUENCE"} = $level;
 851     $ENV{"JOB_SCRIPT"} = $Job->{script};
 852     while (my ($param, $value) = each %{$Job->{script_parameters}}) {
 853       $param =~ tr/a-z/A-Z/;
 854       $ENV{"JOB_PARAMETER_$param"} = $value;
 855     }
 856     $ENV{"TASK_SLOT_NODE"} = $slot[$childslot]->{node}->{name};
 857     $ENV{"TASK_SLOT_NUMBER"} = $slot[$childslot]->{cpu};
 858     $ENV{"TASK_WORK"} = $ENV{"CRUNCH_TMP"}."/task/$childslotname";
 859     $ENV{"HOME"} = $ENV{"TASK_WORK"};
 860     $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 861     $ENV{"CRUNCH_NODE_SLOTS"} = $round_max_slots{$ENV{TASK_SLOT_NODE}};
 862     $ENV{"PATH"} = $ENV{"CRUNCH_INSTALL"} . "/bin:" . $ENV{"PATH"};
 863
 864     my $keep_mnt = $ENV{"TASK_WORK"}.".keep";
 865
 866     $ENV{"GZIP"} = "-n";
 867
 868     my @srunargs = (
 869       "srun",
 870       "--nodelist=".$childnode->{name},
 871       qw(-n1 -c1 -N1 -D), $ENV{'TMPDIR'},
 872       "--job-name=$job_id.$id.$$",
 873         );
 874
 875     my $stdbuf = " stdbuf --output=0 --error=0 ";
 876
 877     my $arv_file_cache = "";
 878     if (defined($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'})) {
 879       $arv_file_cache = "--file-cache=" . ($Job->{'runtime_constraints'}->{'keep_cache_mb_per_task'} * 1024 * 1024);
 880     }
 881
 882     my $command =
 883         "if [ -e \Q$ENV{TASK_WORK}\E ]; then rm -rf \Q$ENV{TASK_WORK}\E; fi; "
 884         ."mkdir -p \Q$ENV{CRUNCH_TMP}\E \Q$ENV{JOB_WORK}\E \Q$ENV{TASK_WORK}\E \Q$keep_mnt\E "
 885         ."&& cd \Q$ENV{CRUNCH_TMP}\E "
 886         # These environment variables get used explicitly later in
 887         # $command.  No tool is expected to read these values directly.
 888         .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
 889         .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
 890         ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
 891         ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
 892         .q{&& declare -a VOLUMES=() }
 893         .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner:ro") ; fi }
 894         .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt:ro") ; }
 895         .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt:ro") ; fi };
 896
 897     $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
 898     $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
 899     $ENV{TASK_KEEPMOUNT_TMP} = "$keep_mnt/tmp";
 900
 901     if ($docker_hash)
 902     {
 903       my $containername = "$Jobstep->{arvados_task}->{uuid}-$Jobstep->{failures}";
 904       my $cidfile = "$ENV{CRUNCH_TMP}/$containername.cid";
 905       $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -cgroup-parent=docker -cgroup-cid=$cidfile -poll=10000 ";
 906       $command .= "$docker_bin run $docker_run_args --name=$containername --attach=stdout --attach=stderr --attach=stdin -i \Q$dockeruserarg\E --cidfile=$cidfile --sig-proxy ";
 907       # We only set memory limits if Docker lets us limit both memory and swap.
 908       # Memory limits alone have been supported longer, but subprocesses tend
 909       # to get SIGKILL if they exceed that without any swap limit set.
 910       # See #5642 for additional background.
 911       if ($docker_limitmem) {
 912         $command .= "--memory=\${MEMLIMIT}k --memory-swap=\${SWAPLIMIT}k ";
 913       }
 914
 915       # The source tree and $destdir directory (which we have
 916       # installed on the worker host) are available in the container,
 917       # under the same path.
 918       $command .= "--volume=\Q$ENV{CRUNCH_SRC}:$ENV{CRUNCH_SRC}:ro\E ";
 919       $command .= "--volume=\Q$ENV{CRUNCH_INSTALL}:$ENV{CRUNCH_INSTALL}:ro\E ";
 920
 921       # Currently, we make the "by_pdh" directory in arv-mount's mount
 922       # point appear at /keep inside the container (instead of using
 923       # the same path as the host like we do with CRUNCH_SRC and
 924       # CRUNCH_INSTALL). However, crunch scripts and utilities must
 925       # not rely on this. They must use $TASK_KEEPMOUNT.
 926       $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT}:/keep:ro\E ";
 927       $ENV{TASK_KEEPMOUNT} = "/keep";
 928
 929       # Ditto TASK_KEEPMOUNT_TMP, as /keep_tmp.
 930       $command .= "--volume=\Q$ENV{TASK_KEEPMOUNT_TMP}:/keep_tmp\E ";
 931       $ENV{TASK_KEEPMOUNT_TMP} = "/keep_tmp";
 932
 933       # TASK_WORK is almost exactly like a docker data volume: it
 934       # starts out empty, is writable, and persists until no
 935       # containers use it any more. We don't use --volumes-from to
 936       # share it with other containers: it is only accessible to this
 937       # task, and it goes away when this task stops.
 938       #
 939       # However, a docker data volume is writable only by root unless
 940       # the mount point already happens to exist in the container with
 941       # different permissions. Therefore, we [1] assume /tmp already
 942       # exists in the image and is writable by the crunch user; [2]
 943       # avoid putting TASK_WORK inside CRUNCH_TMP (which won't be
 944       # writable if they are created by docker while setting up the
 945       # other --volumes); and [3] create $TASK_WORK inside the
 946       # container using $build_script.
 947       $command .= "--volume=/tmp ";
 948       $ENV{"TASK_WORK"} = "/tmp/crunch-job-task-work/$childslotname";
 949       $ENV{"HOME"} = $ENV{"TASK_WORK"};
 950       $ENV{"TASK_TMPDIR"} = $ENV{"TASK_WORK"}; # deprecated
 951
 952       # TODO: Share a single JOB_WORK volume across all task
 953       # containers on a given worker node, and delete it when the job
 954       # ends (and, in case that doesn't work, when the next job
 955       # starts).
 956       #
 957       # For now, use the same approach as TASK_WORK above.
 958       $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
 959
 960       # Bind mount the crunchrunner binary and host TLS certificates file into
 961       # the container.
 962       $command .= '"${VOLUMES[@]}" ';
 963
 964       while (my ($env_key, $env_val) = each %ENV)
 965       {
 966         if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
 967           $command .= "--env=\Q$env_key=$env_val\E ";
 968         }
 969       }
 970       $command .= "--env=\QHOME=$ENV{HOME}\E ";
 971       $command .= "\Q$docker_hash\E ";
 972
 973       if ($Job->{arvados_sdk_version}) {
 974         $command .= $stdbuf;
 975         $command .= "perl - \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E";
 976       } else {
 977         $command .= "/bin/sh -c \'python -c " .
 978             '"from pkg_resources import get_distribution as get; print \"Using Arvados SDK version\", get(\"arvados-python-client\").version"' .
 979             ">&2 2>/dev/null; " .
 980             "mkdir -p \"$ENV{JOB_WORK}\" \"$ENV{TASK_WORK}\" && " .
 981             "if which stdbuf >/dev/null ; then " .
 982             "  exec $stdbuf \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
 983             " else " .
 984             "  exec \Q$ENV{CRUNCH_SRC}/crunch_scripts/$Job->{script}\E ;" .
 985             " fi\'";
 986       }
 987     } else {
 988       # Non-docker run
 989       $command .= "crunchstat -cgroup-root=\Q$cgroup_root\E -poll=10000 ";
 990       $command .= $stdbuf;
 991       $command .= "perl - $ENV{CRUNCH_SRC}/crunch_scripts/" . $Job->{"script"};
 992     }
 993
 994     my @execargs = ('bash', '-c', $command);
 995     srun (\@srunargs, \@execargs, undef, $build_script);
 996     # exec() failed, we assume nothing happened.
 997     die "srun() failed on build script\n";
 998   }
 999   close("writer");
1000   if (!defined $childpid)
1001   {
1002     close $reader{$id};
1003     delete $reader{$id};
1004     next;
1005   }
1006   shift @freeslot;
1007   $proc{$childpid} = {
1008     jobstepidx => $id,
1009     time => time,
1010     slot => $childslot,
1011     jobstepname => "$job_id.$id.$childpid",
1012   };
1013   croak ("assert failed: \$slot[$childslot]->{'pid'} exists") if exists $slot[$childslot]->{pid};
1014   $slot[$childslot]->{pid} = $childpid;
1015
1016   Log ($id, "job_task ".$Jobstep->{'arvados_task'}->{'uuid'});
1017   Log ($id, "child $childpid started on $childslotname");
1018   $Jobstep->{starttime} = time;
1019   $Jobstep->{node} = $childnode->{name};
1020   $Jobstep->{slotindex} = $childslot;
1021   delete $Jobstep->{stderr};
1022   delete $Jobstep->{finishtime};
1023   delete $Jobstep->{tempfail};
1024
1025   $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
1026   retry_op(sub { $Jobstep->{'arvados_task'}->save; }, "job_tasks.update API");
1027
1028   splice @jobstep_todo, $todo_ptr, 1;
1029   --$todo_ptr;
1030
1031   $progress_is_dirty = 1;
1032
1033   while (!@freeslot
1034          ||
1035          ($round_num_freeslots > @freeslot && $todo_ptr+1 > $#jobstep_todo))
1036   {
1037     last THISROUND if $main::please_freeze;
1038     if ($main::please_info)
1039     {
1040       $main::please_info = 0;
1041       freeze();
1042       create_output_collection();
1043       save_meta(1);
1044       update_progress_stats();
1045     }
1046     my $gotsome
1047         = readfrompipes ()
1048         + reapchildren ();
1049     if (!$gotsome || ($latest_refresh + 2 < scalar time))
1050     {
1051       check_refresh_wanted();
1052       check_squeue();
1053       update_progress_stats();
1054     }
1055     elsif (time - $progress_stats_updated >= 30 || $progress_is_dirty)
1056     {
1057       update_progress_stats();
1058     }
1059     if (!$gotsome) {
1060       select (undef, undef, undef, 0.1);
1061     }
1062     $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
1063                                         $_->{node}->{hold_count} < 4 } @slot);
1064     if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
1065         ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
1066     {
1067       my $message = "Repeated failure rate too high ($thisround_failed_multiple/"
1068           .($thisround_failed+$thisround_succeeded)
1069           .") -- giving up on this round";
1070       Log (undef, $message);
1071       last THISROUND;
1072     }
1073
1074     # move slots from freeslot to holdslot (or back to freeslot) if necessary
1075     for (my $i=$#freeslot; $i>=0; $i--) {
1076       if ($slot[$freeslot[$i]]->{node}->{hold_until} > scalar time) {
1077         push @holdslot, (splice @freeslot, $i, 1);
1078       }
1079     }
1080     for (my $i=$#holdslot; $i>=0; $i--) {
1081       if ($slot[$holdslot[$i]]->{node}->{hold_until} <= scalar time) {
1082         push @freeslot, (splice @holdslot, $i, 1);
1083       }
1084     }
1085
1086     # give up if no nodes are succeeding
1087     if ($working_slot_count < 1) {
1088       Log(undef, "Every node has failed -- giving up");
1089       last THISROUND;
1090     }
1091   }
1092 }
1093
1094
# Slots that were on hold go back on the free list, and every free
# slot's node starts the next round with a clean losing streak.
push @freeslot, splice @holdslot;
foreach my $free_slot_index (@freeslot) {
  $slot[$free_slot_index]->{node}->{losing_streak} = 0;
}


# Drain the children that are still running from this round.
Log (undef, "wait for last ".(scalar keys %proc)." children to finish");
while (%proc)
{
  if ($main::please_continue) {
    # Resume handing out tasks instead of just waiting.
    $main::please_continue = 0;
    goto THISROUND;
  }
  if ($main::please_info) {
    $main::please_info = 0;
    freeze();
    create_output_collection();
    save_meta(1);
  }
  readfrompipes ();
  next if reapchildren();

  # No child exited on this pass: do the periodic housekeeping, pause
  # briefly, and signal the remaining children if a freeze was requested.
  check_refresh_wanted();
  check_squeue();
  update_progress_stats();
  select (undef, undef, undef, 0.1);
  killem (keys %proc) if $main::please_freeze;
}

update_progress_stats();
freeze_if_want_freeze();


# Decide whether the job is finished.  $main::success stays undefined
# when another pass over the remaining tasks is warranted.
if (!defined $main::success)
{
  my $round_made_no_progress =
      ($thisround_succeeded == 0 &&
       ($thisround_failed == 0 || $thisround_failed > 4));

  if (!@jobstep_todo) {
    $main::success = 1;
  } elsif ($working_slot_count < 1) {
    save_output_collection();
    save_meta();
    exit_retry_unlocked();
  } elsif ($round_made_no_progress) {
    Log (undef, "stop because $thisround_failed tasks failed and none succeeded");
    $main::success = 0;
  }
}

goto ONELEVEL if !defined $main::success;


# The job is over: release resources, save output and log, then record
# the final state and exit accordingly.
release_allocation();
freeze();
my $collated_output = save_output_collection();
Log (undef, "finish");

my $final_log = save_meta();

# 'Complete' requires a saved output, a saved log, and overall success.
my $final_state = ($collated_output && $final_log && $main::success)
    ? 'Complete'
    : 'Failed';
$Job->update_attributes('state' => $final_state);

exit ($final_state eq 'Complete' ? 0 : 1);
1157
1158
1159
# Record the todo/done/running task counts in the job's tasks_summary
# attribute and log a one-line status.  The "last updated" timestamp is
# refreshed on every call, but the job record is only written when
# $progress_is_dirty is set; the flag is cleared afterwards.
sub update_progress_stats
{
  $progress_stats_updated = time;
  unless ($progress_is_dirty) {
    return;
  }

  my $todo = @jobstep_todo;
  my $done = @jobstep_done;
  my $running = keys %proc;

  my $summary = ($Job->{'tasks_summary'} ||= {});
  $summary->{'todo'} = $todo;
  $summary->{'done'} = $done;
  $summary->{'running'} = $running;
  $Job->update_attributes('tasks_summary' => $summary);

  Log (undef, "status: $done done, $running running, $todo todo");
  $progress_is_dirty = 0;
}
1175
1176
1177
# Collect every child process that has already exited (non-blocking).
#
# For each one: drain its pipes, reload its task record, classify the
# outcome as success / temporary failure / permanent failure, update
# node bookkeeping, save the task record, and return its slot to
# @freeslot.  Failed tasks are pushed back onto @jobstep_todo.  If any
# task succeeded, tasks created by the successful tasks are fetched
# from the API server and queued.
#
# Returns the number of children reaped.
sub reapchildren
{
  my $children_reaped = 0;
  my @successful_task_uuids = ();

  while ((my $pid = waitpid (-1, WNOHANG)) > 0)
  {
    my $childstatus = $?;

    my $slotidx = $proc{$pid}->{slot};
    my $node = $slot[$slotidx]->{node};
    my $whatslot = join(".", $node->{name}, $slot[$slotidx]->{cpu});
    my $jobstepidx = $proc{$pid}->{jobstepidx};

    readfrompipes_after_exit ($jobstepidx);

    $children_reaped++;
    my $elapsed = time - $proc{$pid}->{time};
    my $Jobstep = $jobstep[$jobstepidx];
    my $task = $Jobstep->{'arvados_task'};

    my $exitvalue = $childstatus >> 8;
    my $exitinfo = "exit ".exit_status_s($childstatus);
    $task->reload;
    my $task_success = $task->{success};

    Log ($jobstepidx, "child $pid on $whatslot $exitinfo success=$task_success");

    unless (defined $task_success) {
      # The task never said whether it succeeded: treat that as failure
      # and record it in the task record.
      Log($jobstepidx, sprintf(
            "ERROR: Task process exited %s, but never updated its task record to indicate success and record its output.",
            exit_status_s($childstatus)));
      $task->{success} = 0;
      retry_op(sub { $task->save; }, "job_tasks.update API");
      $task_success = 0;
    }

    if ($task_success)
    {
      push @successful_task_uuids, $task->{uuid};
      ++$thisround_succeeded;
      # A success clears any suspicion about the node.
      $node->{losing_streak} = 0;
      $node->{hold_until} = 0;
      $node->{fail_count} = 0;
      push @jobstep_done, $jobstepidx;
      Log ($jobstepidx, "success in $elapsed seconds");
    }
    else
    {
      my $temporary_fail =
          $Jobstep->{tempfail} || ($exitvalue == TASK_TEMPFAIL);

      ++$thisround_failed;
      if ($Jobstep->{'failures'} >= 1) {
        ++$thisround_failed_multiple;
      }

      # Check for signs of a failed or misconfigured node
      if (++$node->{losing_streak} >= 2 + $node->{ncpus}) {
        # A quick srun exit on a node that is already suspect is not
        # counted against the jobstep's own failure thresholds.
        if ($node->{hold_until} && $elapsed < 5) {
          Log ($jobstepidx, "blaming failure on suspect node " .
               $node->{name});
          $temporary_fail ||= 1;
        }
        ban_node_by_slot($slotidx);
      }

      my $failure_count = ++$Jobstep->{'failures'};
      Log ($jobstepidx, sprintf('failure (#%d, %s) after %d seconds',
                                $failure_count,
                                $temporary_fail ? 'temporary' : 'permanent',
                                $elapsed));

      if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
        # Give up on this task, and the whole job
        $main::success = 0;
      }
      # The task goes back on the todo queue either way.
      push @jobstep_todo, $jobstepidx;
      $Job->{'tasks_summary'}->{'failed'}++;
    }

    $Jobstep->{exitcode} = $childstatus;
    $Jobstep->{finishtime} = time;
    $task->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
    retry_op(sub { $task->save; }, "job_tasks.update API");
    Log ($jobstepidx, sprintf("task output (%d bytes): %s",
                              length($task->{output}),
                              $task->{output}));

    # Release everything that belonged to this child.
    close $reader{$jobstepidx};
    delete $reader{$jobstepidx};
    delete $slot[$slotidx]->{pid};
    push @freeslot, $slotidx;
    delete $proc{$pid};

    $progress_is_dirty = 1;
  }

  if (@successful_task_uuids)
  {
    Log (undef, sprintf("%d tasks exited (%d succeeded), checking for new tasks from API server.", $children_reaped, scalar(@successful_task_uuids)));

    # Page through the tasks created by the tasks that just succeeded,
    # stopping at the first empty page.
    my @new_tasks;
    while (1) {
      my $page = api_call(
        "job_tasks/list",
        'filters' => [["created_by_job_task_uuid","in",\@successful_task_uuids]],
        'order' => 'qsequence',
        'offset' => scalar(@new_tasks),
          );
      push @new_tasks, @{$page->{items}};
      last if !@{$page->{items}};
    }
    Log (undef, sprintf("Got %d new tasks from API server.", scalar(@new_tasks)));

    foreach my $arvados_task (@new_tasks) {
      push @jobstep, {
        'level' => $arvados_task->{'sequence'},
        'failures' => 0,
        'arvados_task' => $arvados_task
      };
      push @jobstep_todo, $#jobstep;
    }
  }

  return $children_reaped;
}
1307
# If the file named by $ENV{CRUNCH_REFRESH_TRIGGER} has been modified
# since the last refresh, re-fetch the job record and copy over its
# cancellation fields and state.  When the job is no longer "Running",
# log why, mark the job as failed and request a freeze.
sub check_refresh_wanted
{
  my @trigger_stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
  return if !@trigger_stat;

  # Element 9 of stat() is the modification time.
  return if !($trigger_stat[9] > $latest_refresh);

  # Only refresh once we have actually locked the job record.
  return if !($job_id eq $Job->{'uuid'});

  $latest_refresh = scalar time;
  my $fresh_job = api_call("jobs/get", uuid => $jobspec);
  foreach my $field (qw(cancelled_at
                        cancelled_by_user_uuid
                        cancelled_by_client_uuid
                        state)) {
    $Job->{$field} = $fresh_job->{$field};
  }

  return if $Job->{'state'} eq "Running";

  if ($Job->{'state'} eq "Cancelled") {
    Log (undef, "Job cancelled at " . $Job->{'cancelled_at'} . " by user " . $Job->{'cancelled_by_user_uuid'});
  } else {
    Log (undef, "Job state unexpectedly changed to " . $Job->{'state'});
  }
  $main::success = 0;
  $main::please_freeze = 1;
}
1334
# Find srun child processes whose job step no longer appears in the
# slurm queue and arrange for them to be killed.
#
# The check is rate-limited to once every 15 seconds and is skipped
# entirely when every child has produced stderr since the previous
# check.  A child that is more than 60 seconds old and missing from
# `squeue` gets a killtime 30 seconds in the future; on a later call,
# children whose killtime has passed (and that are still silent) are
# handed to killem().
sub check_squeue
{
  my $last_squeue_check = $squeue_checked;

  # Do not call `squeue` or check the kill list more than once every
  # 15 seconds.
  return if $last_squeue_check > time - 15;
  $squeue_checked = time;

  # Look for children from which we haven't received stderr data since
  # the last squeue check. If no such children exist, all procs are
  # alive and there's no need to even look at squeue.
  #
  # As long as the crunchstat poll interval (10s) is shorter than the
  # squeue check interval (15s) this should make the squeue check an
  # infrequent event.
  my $silent_procs = 0;
  for my $js (map {$jobstep[$_->{jobstepidx}]} values %proc)
  {
    if (!exists($js->{stderr_at}))
    {
      $js->{stderr_at} = 0;
    }
    if ($js->{stderr_at} < $last_squeue_check)
    {
      $silent_procs++;
    }
  }
  return if $silent_procs == 0;

  # use killem() on procs whose killtime is reached
  while (my ($pid, $procinfo) = each %proc)
  {
    my $js = $jobstep[$procinfo->{jobstepidx}];
    if (exists $procinfo->{killtime}
        && $procinfo->{killtime} <= time
        && $js->{stderr_at} < $last_squeue_check)
    {
      my $sincewhen = "";
      if ($js->{stderr_at}) {
        $sincewhen = " in last " . (time - $js->{stderr_at}) . "s";
      }
      Log($procinfo->{jobstepidx}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
      killem ($pid);
    }
  }

  if (!$have_slurm)
  {
    # here is an opportunity to check for mysterious problems with local procs
    return;
  }

  # Get a list of steps still running.  Note: squeue(1) says --steps
  # selects a format (which we override anyway) and allows us to
  # specify which steps we're interested in (which we don't).
  # Importantly, it also changes the meaning of %j from "job name" to
  # "step name" and (although this isn't mentioned explicitly in the
  # docs) switches from "one line per job" mode to "one line per step"
  # mode. Without it, we'd just get a list of one job, instead of a
  # list of N steps.
  my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%j' --noheader`;
  if ($? != 0)
  {
    Log(undef, "warning: squeue exit status $? ($!)");
    return;
  }
  # chomp (not chop): only strip a trailing newline, so a final line
  # without one does not lose the last character of its step name.
  chomp @squeue;

  # which of my jobsteps are running, according to squeue?
  my %ok;
  for my $jobstepname (@squeue)
  {
    $ok{$jobstepname} = 1;
  }

  # Check for child procs >60s old and not mentioned by squeue.
  while (my ($pid, $procinfo) = each %proc)
  {
    if ($procinfo->{time} < time - 60
        && $procinfo->{jobstepname}
        && !exists $ok{$procinfo->{jobstepname}}
        && !exists $procinfo->{killtime})
    {
      # According to slurm, this task has ended (successfully or not)
      # -- but our srun child hasn't exited. First we must wait (30
      # seconds) in case this is just a race between communication
      # channels. Then, if our srun child process still hasn't
      # terminated, we'll conclude some slurm communication
      # error/delay has caused the task to die without notifying srun,
      # and we'll kill srun ourselves.
      $procinfo->{killtime} = time + 30;
      Log($procinfo->{jobstepidx}, "notice: task is not in slurm queue but srun process $pid has not exited");
    }
  }
}
1431
# Ask `sinfo` for the state of the nodes allocated to this job and set
# $main::please_freeze if any of them is not in the "alloc" state.
# Rate-limited to one `sinfo` call every 15 seconds; a failing `sinfo`
# is logged as a warning and otherwise ignored.
sub check_sinfo
{
  # If a node fails in a multi-node "srun" call during job setup, the call
  # may hang instead of exiting with a nonzero code.  This function checks
  # "sinfo" for the health of the nodes that were allocated and ensures that
  # they are all still in the "alloc" state.  If a node that is allocated to
  # this job is not in "alloc" state, then set please_freeze.
  #
  # This is only called from srun_sync() for node configuration.  If a
  # node fails doing actual work, there are other recovery mechanisms.

  # Do not call `sinfo` more than once every 15 seconds.
  return if $sinfo_checked > time - 15;
  $sinfo_checked = time;

  # The output format "%t" means output node states.
  my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
  if ($? != 0)
  {
    Log(undef, "warning: sinfo exit status $? ($!)");
    return;
  }
  # Strip only the trailing newline from each state.
  chomp @sinfo;

  foreach my $node_state (@sinfo)
  {
    # String comparison is required here: with the numeric "!="
    # operator both sides evaluate to 0, so the test could never be
    # true and an unhealthy node was never detected.
    if ($node_state ne "alloc" && $node_state ne "alloc*") {
      $main::please_freeze = 1;
    }
  }
}
1463
# Give the slurm allocation back by cancelling our slurm job.  Does
# nothing when not running under slurm.
sub release_allocation
{
  if ($have_slurm)
  {
    Log (undef, "release job allocation");
    # List form: the job id is passed to scancel as a single argument
    # and is never interpreted by a shell.
    system ("scancel", $ENV{SLURM_JOB_ID});
  }
}
1472
1473
# Poll the stderr pipe of every running child (and its stdout pipe,
# when the jobstep has one) for up to 0.1 seconds and read whatever is
# available.  Stdout data is appended to the jobstep's stdout_captured
# buffer; stderr data is timestamped, buffered and passed through
# preprocess_stderr().
#
# Returns 1 if any data was read, 0 otherwise.
sub readfrompipes
{
  my $gotsome = 0;
  my %jobstep_for_fd;
  my $selector = IO::Select->new();

  for my $idx (keys %reader)
  {
    my $stderr_fd = $reader{$idx};
    $selector->add($stderr_fd);
    $jobstep_for_fd{$stderr_fd} = $idx;

    my $stdout_fd = $jobstep[$idx]->{stdout_r};
    if ($stdout_fd) {
      $selector->add($stdout_fd);
      $jobstep_for_fd{$stdout_fd} = $idx;
    }
  }

  # select on all reader fds with 0.1s timeout
  for my $fd ($selector->can_read(0.1))
  {
    my $buf;
    next unless 0 < sysread ($fd, $buf, 65536);

    $gotsome = 1;
    print STDERR $buf if $ENV{CRUNCH_DEBUG};

    my $jobstepidx = $jobstep_for_fd{$fd};
    my $step = $jobstep[$jobstepidx];

    if ($step->{stdout_r} == $fd) {
      # Captured stdout is only accumulated, never parsed.
      $step->{stdout_captured} .= $buf;
      next;
    }

    $step->{stderr_at} = time;
    $step->{stderr} .= $buf;

    # Consume everything up to the last \n
    preprocess_stderr ($jobstepidx);

    # A long run of stderr without a newline must not make the buffer
    # grow without bound: keep only the trailing 8192 bytes.
    my $pending = length ($step->{stderr});
    if ($pending > 16384)
    {
      substr ($step->{stderr}, 0, $pending - 8192, "");
    }
  }
  return $gotsome;
}
1523
1524
# Log and interpret every complete line buffered in a jobstep's stderr.
# Whatever follows the last newline is left in
# $jobstep[$jobstepidx]->{stderr} for a later call.
#
# Lines are also scanned for known srun and Keep error messages: these
# can request a freeze, bump node failure counts, ban the node, or mark
# the jobstep as a temporary failure.
sub preprocess_stderr
{
  my $jobstepidx = shift;
  my $step = $jobstep[$jobstepidx];
  # slotindex is only defined for children running Arvados job tasks.
  # Be prepared to handle the undef case (for setup srun calls, etc.).
  my $job_slot_index = $step->{slotindex};
  my $have_slot = defined($job_slot_index);

  while ($step->{stderr} =~ /^(.*?)\n/)
  {
    my $line = $1;
    # Drop the line and its newline from the front of the buffer.
    substr ($step->{stderr}, 0, 1+length($line)) = "";
    Log ($jobstepidx, "stderr $line");

    if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/i)
    {
      # If the allocation is revoked, we can't possibly continue, so mark all
      # nodes as failed.  This will cause the overall exit code to be
      # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
      # this job.
      $main::please_freeze = 1;
      $_->{node}->{fail_count}++ foreach @slot;
    }
    elsif ($line =~ /srun: error: .*?\b(Node failure on|Aborting, .*?\bio error\b|cannot communicate with node .* aborting job)/i)
    {
      $step->{tempfail} = 1;
      if ($have_slot) {
        $slot[$job_slot_index]->{node}->{fail_count}++;
        ban_node_by_slot($job_slot_index);
      }
    }
    elsif ($line =~ /srun: error: (Unable to create job step|.*?: Communication connection failure)/i)
    {
      $step->{tempfail} = 1;
      if ($have_slot) {
        ban_node_by_slot($job_slot_index);
      }
    }
    elsif ($line =~ /\bKeep(Read|Write|Request)Error:/)
    {
      $step->{tempfail} = 1;
    }
  }
}
1565
1566
# Drain what is left on a child's stderr and stdout pipes once that
# child has exited, then log all of its remaining stderr -- including a
# final line with no trailing newline -- and empty the stderr buffer.
sub readfrompipes_after_exit
{
  my $jobstepidx = shift;
  my $step = $jobstep[$jobstepidx];

  # The fact that the child has exited allows some convenient
  # simplifications: (1) all data must have already been written, so
  # there's no need to wait for more once sysread returns 0; (2) the
  # total amount of data available is bounded by the pipe buffer size,
  # so it's safe to read everything into one string.
  my $chunk;
  while (sysread ($reader{$jobstepidx}, $chunk, 65536) > 0) {
    $step->{stderr_at} = time;
    $step->{stderr} .= $chunk;
  }
  if ($step->{stdout_r}) {
    while (sysread ($step->{stdout_r}, $chunk, 65536) > 0) {
      $step->{stdout_captured} .= $chunk;
    }
  }

  # Complete lines first (they get the usual error scanning) ...
  preprocess_stderr ($jobstepidx);

  # ... then whatever remains is logged as-is.
  foreach my $leftover (split ("\n", $step->{stderr})) {
    Log ($jobstepidx, "stderr $leftover");
  }
  $step->{stderr} = '';
}
1595
# Fetch the data for a Keep locator by running arv-get and reading its
# standard output.
#
# Returns the data as a string, or undef if arv-get could not be
# started, a read failed, or arv-get exited with a non-zero status
# (each case is logged).
sub fetch_block
{
  my $hash = shift;

  my $arv_get;
  unless (open($arv_get, "-|", "arv-get", "--retries", retry_count(), $hash)) {
    Log(undef, "fetch_block run error from arv-get $hash: $!");
    return undef;
  }

  my $output_block = "";
  my ($bytes, $chunk);
  # sysread returns 0 at the end of the pipe and undef on error; both
  # end the loop.
  while ($bytes = sysread($arv_get, $chunk, 1024 * 1024)) {
    $output_block .= $chunk;
  }
  if (!defined $bytes) {
    Log(undef, "fetch_block read error from arv-get: $!");
    $output_block = undef;
  }

  # Closing the pipe sets $? to arv-get's exit status.
  close $arv_get;
  if ($?) {
    Log(undef, "fetch_block arv-get exited " . exit_status_s($?));
    $output_block = undef;
  }
  return $output_block;
}
1627
# Create a collection by concatenating the output of all tasks (each
# task's output is either a manifest fragment, a locator for a
# manifest fragment stored in Keep, or nothing at all). Return the
# portable_data_hash of the new collection.
#
# The manifest text is piped to a Python helper that creates the
# collection through the Arvados API.  Returns undef if the helper
# exits non-zero or produces no answer within 120 seconds.  A task
# output that cannot be fetched is logged and sets $main::success to 0.
sub create_output_collection
{
  Log (undef, "collate");

  my ($child_out, $child_in);
  # This depends on the python-arvados-python-client package, which needs to be installed
  # on the machine running crunch-dispatch (typically, the API server).
  my $pid = open2($child_out, $child_in, '/usr/share/python2.7/dist/python-arvados-python-client/bin/python', '-c', q{
import arvados
import sys
print (arvados.api("v1").collections().
       create(body={"manifest_text": sys.stdin.read(),
                    "owner_uuid": sys.argv[2]}).
       execute(num_retries=int(sys.argv[1]))["portable_data_hash"])
}, retry_count(), $Job->{owner_uuid});

  my $task_idx = -1;
  my $manifest_size = 0;
  for (@jobstep)
  {
    ++$task_idx;
    my $output = $_->{'arvados_task'}->{output};
    next if (!defined($output));
    my $next_write;
    if ($output =~ /^[0-9a-f]{32}(\+\S+)*$/) {
      # The output is a Keep locator: the manifest fragment has to be
      # fetched from Keep.
      $next_write = fetch_block($output);
    } else {
      # The output is the manifest fragment itself.
      $next_write = $output;
    }
    if (defined($next_write)) {
      if (!defined(syswrite($child_in, $next_write))) {
        # There's been an error writing.  Stop the loop.
        # We'll log details about the exit code later.
        last;
      } else {
        $manifest_size += length($next_write);
      }
    } else {
      my $uuid = $_->{'arvados_task'}->{'uuid'};
      Log (undef, "Error retrieving '$output' output by task $task_idx ($uuid)");
      $main::success = 0;
    }
  }
  # Closing the helper's stdin tells it the manifest is complete.
  close($child_in);
  Log(undef, "collated output manifest text to send to API server is $manifest_size bytes with access tokens");

  my $joboutput;
  my $s = IO::Select->new($child_out);
  if ($s->can_read(120)) {
    sysread($child_out, $joboutput, 1024 * 1024);
    waitpid($pid, 0);
    if ($?) {
      Log(undef, "output collection creation exited " . exit_status_s($?));
      $joboutput = undef;
    } else {
      chomp($joboutput);
    }
  } else {
    Log (undef, "timed out while creating output collection");
    # Escalate from SIGINT through SIGTERM to SIGKILL, one signal per
    # second, until the helper is gone.
    foreach my $signal (2, 2, 2, 15, 15, 9) {
      kill($signal, $pid);
      # waitpid returns 0 while the child is still running, the pid
      # once it has been reaped, and -1 when there is no such child.
      # Any non-zero result means there is nothing left to signal;
      # testing only for -1 would send one more signal to a pid that
      # has already been reaped.
      last if waitpid($pid, WNOHANG) != 0;
      sleep(1);
    }
  }
  close($child_out);

  return $joboutput;
}
1701
# Build the job's output collection with create_output_collection()
# and log the outcome.  When a collection was produced, also record it
# as the 'output' attribute of the job record.
#
# Returns whatever create_output_collection() returned (a false value
# when the collection could not be written).
sub save_output_collection {
  my $output_pdh = create_output_collection();

  if ($output_pdh) {
    Log(undef, "job output $output_pdh");
    $Job->update_attributes('output' => $output_pdh);
    return $output_pdh;
  }

  Log(undef, "Failed to write output collection");
  return $output_pdh;
}
1716
# Ask each of the given processes to stop, escalating over repeated calls.
#
# Arguments: a list of process IDs (keys of %proc).
#
# A pid is first sent SIGINT (twice, 0.1s apart: srun wants two SIGINTs
# to really interrupt).  Once SIGINT was sent more than 4 seconds ago,
# SIGTERM is used instead; once SIGTERM was sent more than 4 seconds
# ago, SIGKILL is used.  Each signal is sent at most once per pid: the
# time it was sent is stored in $proc{$pid}->{"sent_$sig"}, so callers
# must call killem() repeatedly for the escalation to happen.
#
# Also records $proc{$pid}->{killedafter}, the number of seconds between
# $proc{$pid}->{time} and the most recent signal.
sub killem
{
  foreach my $pid (@_)
  {
    # Use a single clock reading for both escalation tests.  Reading
    # the clock separately lets a tick between the two tests skip
    # SIGTERM and escalate from SIGINT directly to SIGKILL.
    my $now = time;

    my $sig = 2;                # SIGINT first
    if (exists $proc{$pid}->{"sent_$sig"} &&
        $now - $proc{$pid}->{"sent_$sig"} > 4)
    {
      $sig = 15;                # SIGTERM if SIGINT doesn't work
    }
    if (exists $proc{$pid}->{"sent_$sig"} &&
        $now - $proc{$pid}->{"sent_$sig"} > 4)
    {
      $sig = 9;                 # SIGKILL if SIGTERM doesn't work
    }
    if (!exists $proc{$pid}->{"sent_$sig"})
    {
      # Only SIGINT is delivered twice; say so accurately in the log.
      my $times = ($sig == 2) ? "2x " : "";
      Log ($proc{$pid}->{jobstepidx}, "sending ${times}signal $sig to pid $pid");
      kill $sig, $pid;
      select (undef, undef, undef, 0.1);
      if ($sig == 2)
      {
        kill $sig, $pid;     # srun wants two SIGINT to really interrupt
      }
      $proc{$pid}->{"sent_$sig"} = time;
      $proc{$pid}->{"killedafter"} = time - $proc{$pid}->{"time"};
    }
  }
}
1746
1747
# Build a bit string with one bit set for the file descriptor of each
# filehandle passed in (the bitmask format that 4-argument select()
# takes).  Returns undef when called without any filehandles.
sub fhbits
{
  my $mask;
  foreach my $handle (@_) {
    my $fd = fileno($handle);
    vec($mask, $fd, 1) = 1;
  }
  return $mask;
}
1756
1757
1758 # Send log output to Keep via arv-put.
1759 #
1760 # $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
1761 # $log_pipe_out_buf is a string containing all output read from arv-put so far.
1762 # $log_pipe_out_select is an IO::Select object around $log_pipe_out.
1763 # $log_pipe_pid is the pid of the arv-put subprocess.
1764 #
1765 # The only functions that should access these variables directly are:
1766 #
1767 # log_writer_start($logfilename)
1768 #     Starts an arv-put pipe, reading data on stdin and writing it to
1769 #     a $logfilename file in an output collection.
1770 #
1771 # log_writer_read_output([$timeout])
1772 #     Read output from $log_pipe_out and append it to $log_pipe_out_buf.
1773 #     Passes $timeout to the select() call, with a default of 0.01.
1774 #     Returns the result of the last read() call on $log_pipe_out, or
1775 #     -1 if read() wasn't called because select() timed out.
1776 #     Only other log_writer_* functions should need to call this.
1777 #
1778 # log_writer_send($txt)
1779 #     Writes $txt to the output log collection.
1780 #
1781 # log_writer_finish()
1782 #     Closes the arv-put pipe and returns the output that it produces.
1783 #
1784 # log_writer_is_active()
1785 #     Returns a true value if there is currently a live arv-put
1786 #     process, false otherwise.
1787 #
# Shared state of the arv-put log pipe.  All five are undef until
# log_writer_start() runs, and are reset to undef by log_writer_finish().
my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
    $log_pipe_pid);
1790
# Start the arv-put coprocess that stores the job log.
#
# $logfilename is the name arv-put gives the log file in the resulting
# collection.  arv-put reads the log text from its stdin ('-') and
# runs with --stream and up to 6 retries.  Initializes the shared
# $log_pipe_* state: the child's pid, both pipe ends, an empty output
# buffer, and an IO::Select object watching the child's stdout.
sub log_writer_start($)
{
  my ($logfilename) = @_;
  my @arv_put_cmd = ('arv-put',
                     '--stream',
                     '--retries', '6',
                     '--filename', $logfilename,
                     '-');
  $log_pipe_pid = open2($log_pipe_out, $log_pipe_in, @arv_put_cmd);
  $log_pipe_out_buf = "";
  $log_pipe_out_select = IO::Select->new($log_pipe_out);
}
1803
# Read whatever arv-put has written to its stdout so far and append it
# to $log_pipe_out_buf.
#
# Argument: optional select() timeout in seconds (default 0.01; a
# timeout of 0 is also replaced by the default).
#
# Returns the result of the last read on $log_pipe_out:
#   a positive byte count if data was read and no more is ready yet,
#   0     at end of file,
#   undef after a read error (which is also logged),
#   -1    if nothing was read because select() timed out.
sub log_writer_read_output {
  my $timeout = shift || 0.01;
  my $read = -1;
  while ($read && $log_pipe_out_select->can_read($timeout)) {
    # Use sysread, not buffered read(): select() only promises that
    # *some* data is ready, while read() keeps waiting until it has the
    # full 64 KiB or reaches EOF, so it could block here indefinitely.
    $read = sysread($log_pipe_out, $log_pipe_out_buf, 65536,
                    length($log_pipe_out_buf));
    # An interrupted system call is not a real failure: poll again.
    $read = -1 if (!defined($read) && $!{EINTR});
  }
  if (!defined($read)) {
    Log(undef, "error reading log manifest from arv-put: $!");
  }
  return $read;
}
1816
# Write $txt to arv-put's stdin, then collect any output arv-put has
# produced in the meantime.  Returns the result of
# log_writer_read_output().
sub log_writer_send($)
{
  my ($txt) = @_;
  print {$log_pipe_in} $txt;
  return log_writer_read_output();
}
1823
# Shut down the arv-put log pipe and return what arv-put printed.
#
# Closes arv-put's stdin, waits up to 600 seconds per read for its
# remaining output, reaps the process, and resets all $log_pipe_*
# state to undef.
#
# Returns nothing if no arv-put process is active.  Otherwise returns
# the accumulated arv-put output, or undef if reading timed out, failed,
# stopped before EOF, or arv-put exited with a non-zero status.
sub log_writer_finish()
{
  return unless $log_pipe_pid;

  close($log_pipe_in);

  my $logger_failed = 0;
  my $read_result = log_writer_read_output(600);
  if (!defined($read_result)) {
    # A read error yields undef, which would pass both numeric tests
    # below and let a truncated manifest be returned as if complete.
    $logger_failed = -2;
    Log(undef, "failed to read arv-put log manifest to EOF");
  } elsif ($read_result == -1) {
    $logger_failed = -1;
    Log (undef, "timed out reading from 'arv-put'");
  } elsif ($read_result != 0) {
    $logger_failed = -2;
    Log(undef, "failed to read arv-put log manifest to EOF");
  }

  waitpid($log_pipe_pid, 0);
  if ($?) {
    # Keep the first failure reason if one was already recorded.
    $logger_failed ||= $?;
    Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?))
  }

  close($log_pipe_out);
  my $arv_put_output = $logger_failed ? undef : $log_pipe_out_buf;
  $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
      $log_pipe_out_select = undef;

  return $arv_put_output;
}
1853
# Report whether an arv-put log writer is running: returns its pid
# (true) while one is active, and undef otherwise.
sub log_writer_is_active() {
  my $writer_pid = $log_pipe_pid;
  return $writer_pid;
}
1857
# Write one log message to STDERR and, while the arv-put log writer is
# active, to the job log as well.
#
# $jobstepidx is an index into @jobstep, or undef for messages that do
# not belong to a task; it is shown in the log line only when that
# jobstep has an arvados_task.  $logmessage is the text to log; a
# message containing newlines is logged as one entry per line.
#
# Each line has the form "<job id> <pid> <task index> <message>", with
# every character outside the printable ASCII range replaced by a
# backslash and its three-digit octal code.  A UTC timestamp is
# prepended on a terminal and in the job log.
sub Log                         # ($jobstepidx, $logmessage)
{
  my ($jobstepidx, $logmessage) = @_;

  if ($logmessage =~ /\n/) {
    foreach my $single_line (split (/\n/, $logmessage)) {
      Log ($jobstepidx, $single_line);
    }
    return;
  }

  # Turn on autoflush for STDERR without changing the selected handle.
  my $previous_fh = select STDERR;
  $| = 1;
  select $previous_fh;

  my $task_qseq = '';
  if (defined($jobstepidx) && exists($jobstep[$jobstepidx]->{arvados_task})) {
    $task_qseq = $jobstepidx;
  }

  my $message = sprintf ("%s %d %s %s", $job_id, $$, $task_qseq, $logmessage);
  $message =~ s{([^ -\176])}{"\\" . sprintf ("%03o", ord($1))}ge;
  $message .= "\n";

  my $stderr_is_tty = -t STDERR;
  my $timestamp;
  if (log_writer_is_active() || $stderr_is_tty) {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime;
    $timestamp = sprintf ("%04d-%02d-%02d_%02d:%02d:%02d",
                          $year+1900, $mon+1, $mday, $hour, $min, $sec);
  }

  if ($stderr_is_tty) {
    print STDERR ($timestamp." ".$message);
  } else {
    print STDERR $message;
  }

  if (log_writer_is_active()) {
    log_writer_send($timestamp . " " . $message);
  }
}
1887
1888
# Fatal-error exit path.  Logs the given message together with the
# caller's file and line, releases the node allocation, freezes and
# collates output if job steps are still pending, marks the job
# finished via cleanup(), saves the log, and finally dies.
sub croak
{
  my (undef, $file, $line) = caller;
  Log (undef, "@_ at $file line $line\n");
  release_allocation();
  if (@jobstep_todo) {
    freeze();
  }
  if (@jobstep_todo) {
    create_output_collection();
  }
  cleanup();
  save_meta();
  die;
}
1901
1902
# Record the end of an unsuccessful job in its job record.  Does
# nothing when there is no $Job.  A job whose state is 'Cancelled'
# only gets a finished_at timestamp; any other job is marked 'Failed'.
sub cleanup
{
  return if !$Job;
  my $was_cancelled = ($Job->{'state'} eq 'Cancelled');
  if (!$was_cancelled) {
    $Job->update_attributes('state' => 'Failed');
  } else {
    $Job->update_attributes('finished_at' => scalar gmtime);
  }
}
1912
1913
# Finish the log writer and store the job log as a collection.
#
# $justcheckpoint: when true, nothing is done (checkpointing is not
# relevant post-Warehouse.pm); pass a false value for the final save.
#
# If the job already has a log collection, its manifest text is
# prepended to the new log manifest.  The resulting collection is
# created under the job owner's UUID and its portable data hash is
# stored in the job's 'log' attribute.
#
# Returns the portable data hash of the log collection, or nothing if
# no log writer was active or the log manifest could not be obtained.
sub save_meta
{
  my ($justcheckpoint) = @_;
  return if $justcheckpoint;
  return if !log_writer_is_active();

  my $new_manifest = log_writer_finish();
  return if !defined($new_manifest);

  my $full_manifest = $new_manifest;
  if ($Job->{log}) {
    my $earlier_log = api_call("collections/get", uuid => $Job->{log});
    $full_manifest = $earlier_log->{manifest_text} . $new_manifest;
  }

  my $collection_name =
      sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid});
  my $log_coll = api_call(
    "collections/create", ensure_unique_name => 1, collection => {
      manifest_text => $full_manifest,
      owner_uuid => $Job->{owner_uuid},
      name => $collection_name,
    });
  Log(undef, "log collection is " . $log_coll->{portable_data_hash});
  $Job->update_attributes('log' => $log_coll->{portable_data_hash});

  return $log_coll->{portable_data_hash};
}
1938
1939
# If a freeze has been requested ($main::please_freeze), stop the job:
# release the allocation, kill and reap the processes whose pids were
# passed in, then freeze, collate output, clean up, save the log and
# exit with status 1.  Does nothing when no freeze is pending.
#
# Arguments: optional list of pids to kill before stopping.  Their
# %proc entries are replaced with empty records first, so signal
# escalation in killem() starts from scratch.
sub freeze_if_want_freeze
{
  if ($main::please_freeze)
  {
    release_allocation();
    if (@_)
    {
      # kill some srun procs before freeze+stop
      foreach my $pid (@_)
      {
        $proc{$pid} = {};
      }
      while (%proc)
      {
        killem (keys %proc);
        select (undef, undef, undef, 0.1);
        # Reap every child that has exited so far.
        while (1)
        {
          my $reaped = waitpid (-1, WNOHANG);
          last if ($reaped <= 0);
          delete $proc{$reaped};
        }
      }
    }
    freeze();
    create_output_collection();
    cleanup();
    save_meta();
    exit 1;
  }
}
1967
1968
# Placeholder: freezing job state is not implemented, so this only
# logs that fact and returns nothing.
sub freeze
{
  my $notice = "Freeze not implemented";
  Log (undef, $notice);
  return;
}
1974
1975
# Placeholder: thawing job state is not implemented, so calling this
# is a fatal error reported through croak().
sub thaw
{
  my $complaint = "Thaw not implemented";
  croak ($complaint);
}
1980
1981
# Escape a string so that it fits on a single line: each backslash
# becomes two backslashes and each newline becomes the two characters
# backslash-n.  freezeunquote() reverses this.
sub freezequote
{
  my ($text) = @_;
  my %escape_for = ("\\" => "\\\\", "\n" => "\\n");
  $text =~ s/([\\\n])/$escape_for{$1}/g;
  return $text;
}
1989
1990
# Undo freezequote(): a backslash followed by "n" becomes a newline,
# and a backslash followed by any other character (except a newline)
# becomes that character.
sub freezeunquote
{
  my ($text) = @_;
  my %unescaped = ("n" => "\n");
  $text =~ s{\\(.)}{exists $unescaped{$1} ? $unescaped{$1} : $1}ge;
  return $text;
}
1997
# Run a command through srun() in a child process and wait for it.
#
# Arguments:
#   $srunargs  array ref of srun arguments (passed on to srun())
#   $execargs  array ref with the command to execute
#   $opts      optional hash ref; $opts->{label} replaces the command
#              line as the label used in log messages
#   $stdin     optional string passed on to srun() as the command's stdin
#
# While the child runs, its pipes are drained with readfrompipes() and
# the usual periodic checks (refresh, squeue, sinfo) are performed.  If
# the child runs longer than $srun_sync_timeout seconds, a freeze is
# requested and the child is killed.
#
# Returns a list: ($exit_status, $stdout_captured, $stderr_captured,
# $tempfail).  $exit_status is the child's $?, forced to 255 if it was
# 0 but a freeze was requested or the jobstep was flagged tempfail.
sub srun_sync
{
  my $srunargs = shift;
  my $execargs = shift;
  my $opts = shift || {};
  my $stdin = shift;

  my $label = exists $opts->{label} ? $opts->{label} : "@$execargs";
  Log (undef, "$label: start");

  my ($stderr_r, $stderr_w);
  pipe $stderr_r, $stderr_w or croak("pipe() failed: $!");

  my ($stdout_r, $stdout_w);
  pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");

  my $started_srun = scalar time;

  my $srunpid = fork();
  if (!defined($srunpid))
  {
    # fork() returns undef on failure.  Without this check undef == 0
    # is true, and the parent itself would run the child branch below
    # and exec the command in place of crunch-job.
    croak("fork() failed: $!");
  }
  if ($srunpid == 0)
  {
    close($stderr_r);
    close($stdout_r);
    fcntl($stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
    fcntl($stdout_w, F_SETFL, 0) or croak($!);
    open(STDERR, ">&", $stderr_w) or croak ($!);
    open(STDOUT, ">&", $stdout_w) or croak ($!);
    srun ($srunargs, $execargs, $opts, $stdin);
    exit (1);
  }
  close($stderr_w);
  close($stdout_w);

  set_nonblocking($stderr_r);
  set_nonblocking($stdout_r);

  # Add entries to @jobstep and %proc so check_squeue() and
  # freeze_if_want_freeze() can treat it like a job task process.
  push @jobstep, {
    stderr => '',
    stderr_at => 0,
    stderr_captured => '',
    stdout_r => $stdout_r,
    stdout_captured => '',
  };
  my $jobstepidx = $#jobstep;
  $proc{$srunpid} = {
    jobstepidx => $jobstepidx,
  };
  $reader{$jobstepidx} = $stderr_r;

  while ($srunpid != waitpid ($srunpid, WNOHANG)) {
    my $busy = readfrompipes();
    if (!$busy || ($latest_refresh + 2 < scalar time)) {
      check_refresh_wanted();
      check_squeue();
      check_sinfo();
    }
    if (!$busy) {
      select(undef, undef, undef, 0.1);
    }
    if (($started_srun + $srun_sync_timeout) < scalar time) {
      # Exceeded general timeout for "srun_sync" operations, likely
      # means something got stuck on the remote node.  Log this only
      # when first requesting the freeze, not on every loop iteration
      # while the child is being killed.
      if (!$main::please_freeze) {
        Log(undef, "srun_sync exceeded timeout, will fail.");
      }
      $main::please_freeze = 1;
    }
    killem(keys %proc) if $main::please_freeze;
  }
  my $exited = $?;

  readfrompipes_after_exit ($jobstepidx);

  Log (undef, "$label: exit ".exit_status_s($exited));

  close($stdout_r);
  close($stderr_r);
  delete $proc{$srunpid};
  delete $reader{$jobstepidx};

  my $j = pop @jobstep;
  # If the srun showed signs of tempfail, ensure the caller treats that as a
  # failure case.
  if ($main::please_freeze || $j->{tempfail}) {
    $exited ||= 255;
  }
  return ($exited, $j->{stdout_captured}, $j->{stderr_captured}, $j->{tempfail});
}
2086
2087
# Run a command, through srun when SLURM is in use.
#
# Arguments:
#   $srunargs  array ref of srun arguments, used only if $have_slurm
#   $execargs  array ref with the command to execute
#   $opts      optional hash ref; with $opts->{fork} set the command is
#              run with system() and its result is returned, otherwise
#              the current process is replaced with exec
#   $stdin     optional string; if defined, a forked child feeds it to
#              the command's standard input
#
# The command line is announced first, with the value after any
# "TOKEN=" replaced by "[...]".  Without $opts->{fork} this only
# returns by dying, when exec fails.
sub srun
{
  my ($srunargs, $execargs, $opts, $stdin) = @_;
  $opts ||= {};

  my $args = $execargs;
  if ($have_slurm) {
    $args = [@$srunargs, @$execargs];
  }

  $Data::Dumper::Terse = 1;
  $Data::Dumper::Indent = 0;
  my $show_cmd = Dumper($args);
  $show_cmd =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
  $show_cmd =~ s/\n/ /g;

  my $run_and_wait = $opts->{fork};
  if (!$run_and_wait) {
    # This is a child process: parent is in charge of reading our
    # stderr and copying it to Log() if needed.
    warn "starting: $show_cmd\n";
  } else {
    Log(undef, "starting: $show_cmd");
  }

  if (defined $stdin) {
    # Fork a helper whose STDOUT is connected to our STDIN; the helper
    # prints $stdin and exits.
    my $feeder = open STDIN, "-|";
    defined $feeder or die "no fork: $!";
    if ($feeder == 0) {
      print $stdin or die $!;
      close STDOUT or die $!;
      exit 0;
    }
  }

  if ($run_and_wait) {
    return system (@$args);
  }

  exec @$args;
  warn "ENV size is ".length(join(" ",%ENV));
  die "exec failed: $!: @$args";
}
2125
2126
# Put the node behind slot $slotid on hold: set its hold_until time to
# 60 seconds from now, increment its hold_count, and log the back-off.
sub ban_node_by_slot {
  # Don't start any new jobsteps on this node for 60 seconds
  my ($slotid) = @_;
  my $node = ($slot[$slotid]->{node} ||= {});
  $node->{hold_until} = 60 + scalar time;
  $node->{hold_count}++;
  Log (undef, "backing off node " . $node->{name} . " for 60 seconds");
}
2134
# Take an exclusive, non-blocking lock on $lockfile, creating or
# truncating the file.  Failure to open or lock it is fatal (croak);
# $error_message is appended to the lock failure message.  The lock is
# held on the global filehandle L.
sub must_lock_now
{
  my ($lockfile, $error_message) = @_;
  if (!open(L, ">", $lockfile)) {
    croak("$lockfile: $!");
  }
  if (!flock(L, LOCK_EX|LOCK_NB)) {
    croak("Can't lock $lockfile: $error_message\n");
  }
}
2143
# Given a Keep locator, check whether the collection holds a Docker
# image: exactly one file, named "<64 hex digits>.tar", optionally
# with a "sha256:" prefix.
#
# Returns ($streamname, $docker_hash) for such a collection, where the
# hash is the file name without ".tar".  Returns (undef, undef) if the
# collection has no file, more than one file, or a differently named
# file.
sub find_docker_image {
  my ($locator) = @_;
  my ($streamname, $filename);
  my $image = api_call("collections/get", uuid => $locator);

  my @manifest_lines = ();
  if ($image) {
    @manifest_lines = split(/\n/, $image->{manifest_text});
  }

  foreach my $manifest_line (@manifest_lines) {
    my @tokens = split(/\s+/, $manifest_line);
    next if (!@tokens);
    $streamname = shift(@tokens);
    # File tokens have the form "position:size:name".
    foreach my $filedata (grep(/^\d+:\d+:/, @tokens)) {
      # More than one file in the Collection.
      return (undef, undef) if (defined($filename));
      $filename = (split(/:/, $filedata, 3))[2];
      # Decode backslash-octal escapes in the file name.
      $filename =~ s/\\([0-3][0-7][0-7])/chr(oct($1))/ge;
    }
  }

  if (!defined($filename)) {
    return (undef, undef);
  }
  if ($filename =~ /^((?:sha256:)?[0-9A-Fa-f]{64})\.tar$/) {
    return ($streamname, $1);
  }
  return (undef, undef);
}
2172
# Log that a transient failure occurred while holding the job lock,
# then exit with EX_RETRY_UNLOCKED to ask for re-dispatch.
sub exit_retry_unlocked {
  my $status = EX_RETRY_UNLOCKED;
  Log(undef, "Transient failure with lock acquired; asking for re-dispatch by exiting ".$status);
  exit($status);
}
2177
# Calculate the number of times an operation should be retried,
# assuming exponential backoff, and that we're willing to retry as
# long as tasks have been running.  Enforce a minimum of 3 retries.
#
# The time budget is measured from the first jobstep's starttime.  If
# the last jobstep has no finishtime yet, the budget is the time
# elapsed since that start; otherwise it is the span from start to
# finish, minus the time that has passed since the finish.  The result
# is the base-2 logarithm of the budget in seconds, rounded down, and
# never less than 3.
sub retry_count {
  my ($first_start, $last_finish);
  if (@jobstep) {
    $first_start = $jobstep[0]->{starttime};
    $last_finish = $jobstep[-1]->{finishtime};
  }

  my $timediff = 0;
  if (defined($first_start)) {
    if (defined($last_finish)) {
      $timediff = ($last_finish - $first_start) - (time - $last_finish);
    } else {
      $timediff = time - $first_start;
    }
  }

  my $retries = 1;  # Use the minimum.
  if ($timediff > 0) {
    $retries = int(log($timediff) / log(2));
  }
  return 3 if ($retries <= 3);
  return $retries;
}
2201
# Run an operation, retrying with exponential backoff if it dies.
#
# Arguments:
#   $operation  code reference to call
#   $op_text    description of the operation, used in log messages
#   remaining arguments are passed to $operation on every attempt
#
# The operation is attempted up to retry_count() + 1 times.  After
# attempt number N (0-based) fails, the next attempt starts no earlier
# than 2**N seconds after attempt N began, and the failure is logged.
#
# Returns the operation's result (evaluated in scalar context).  If
# the last attempt also fails, dies with that error message.
sub retry_op {
  my $operation = shift;
  my $op_text = shift;
  my $max_retries = retry_count();

  # Log one failed attempt on a single line, without the "at <script>
  # line N" suffix, saying when the next attempt will happen.
  my $log_failure = sub {
    my ($attempt, $retry_at, $error) = @_;
    $error =~ s/\s*\bat \Q$0\E line \d+\.?\s*//;
    $error =~ s/\s/ /g;
    $error =~ s/\s+$//;
    my $retry_msg = "Retrying.";
    if ($retry_at >= time) {
      my $retry_at_fmt = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($retry_at);
      $retry_msg = "Retrying at $retry_at_fmt.";
    }
    Log(undef, "$op_text failed: $error. $retry_msg");
  };

  my $attempt = 0;
  while ($attempt <= $max_retries) {
    my $retry_at = time + (2 ** $attempt);
    my $result = eval { $operation->(@_); };
    return $result if (!$@);
    if ($attempt < $max_retries) {
      $log_failure->($attempt, $retry_at, $@);
      my $pause = $retry_at - time;
      sleep($pause) if ($pause > 0);
    }
    $attempt++;
  }

  # Ensure the error message ends in a newline, so Perl doesn't add
  # retry_op's line number to it.
  chomp($@);
  die($@ . "\n");
}
2242
# Call an Arvados API method, retrying as needed.
#
# The first argument is a /-separated API method name such as
# "collections/get"; each component is looked up as a key, starting
# from $arv.  The remaining arguments are passed to the method's
# execute().  Retrying and failure logging are handled by retry_op().
sub api_call {
  my $method_name = shift;
  my @path = split(/\//, $method_name);
  my $method = $arv;
  while (@path) {
    my $key = shift(@path);
    $method = $method->{$key};
  }
  my $call = sub { $method->execute(@_); };
  return retry_op($call, "API method $method_name", @_);
}
2254
# Given a $?, return a human-readable exit code string like "0" or
# "1" or "0 with signal 1" or "1 with signal 11", with " with core
# dump" appended when the core-dump bit is set.
#
# A $? of -1 means there is no wait status to decode (for example the
# command could not be started), so it is reported as such.  Decoding
# it bit by bit would print a huge bogus exit code, because >> treats
# a negative number as unsigned.
sub exit_status_s {
  my $exitcode = shift;
  if ($exitcode == -1) {
    return "-1 (no exit status available)";
  }
  my $s = $exitcode >> 8;
  if ($exitcode & 0x7f) {
    $s .= " with signal " . ($exitcode & 0x7f);
  }
  if ($exitcode & 0x80) {
    $s .= " with core dump";
  }
  return $s;
}
2268
# Read everything that remains on a filehandle and return it as one
# string.  The argument is a glob reference to an open filehandle.
sub handle_readall {
  my ($fh_glob_ref) = @_;
  # With the input record separator undefined, one readline returns
  # the rest of the file.
  local $/;
  return readline($fh_glob_ref);
}
2276
# Return the path of the $n-th saved git archive for this job:
# "$ENV{CRUNCH_TMP}/git.<job id>.<n>.tar".
sub tar_filename_n {
  my ($n) = @_;
  my $tmpdir = $ENV{CRUNCH_TMP};
  return sprintf("%s/git.%s.%d.tar", $tmpdir, $job_id, $n);
}
2281
# Pass in a git archive command as a string or list, a la system().
# The command's output is saved to the next numbered tar file (see
# tar_filename_n) so that it can be included in the archive sent to the
# build script.  Failing to create the file, or a non-zero exit from
# the command, is fatal (croak).
sub add_git_archive {
  $git_tar_count++;
  my $tar_path = tar_filename_n($git_tar_count);
  if (!open(GIT_ARCHIVE, ">", $tar_path)) {
    croak("Failed to save git archive: $!");
  }

  # The command writes straight into GIT_ARCHIVE; it gets no input.
  my $git_input;
  my $git_pid = open2(">&GIT_ARCHIVE", $git_input, @_);
  close($git_input);
  waitpid($git_pid, 0);
  my $git_status = $?;
  close(GIT_ARCHIVE);

  if ($git_status) {
    croak("Failed to save git archive: git exited " . exit_status_s($git_status));
  }
}
2299
# Combine all saved tar archives into a single archive, then return its
# contents in a string.  Return undef if no archives have been saved.
#
# Archives 2..$git_tar_count are appended to the first one with
# "tar -A".  A failing tar, or failing to open the result, is fatal
# (croak).
sub combined_git_archive {
  return undef if ($git_tar_count < 1);

  my $base_tar_name = tar_filename_n(1);
  for my $n (2..$git_tar_count) {
    my $tar_to_append = tar_filename_n($n);
    my $tar_exit = system("tar", "-Af", $base_tar_name, $tar_to_append);
    next if ($tar_exit == 0);
    croak("Error preparing build archive: tar -A exited " .
          exit_status_s($tar_exit));
  }

  open(GIT_TAR, "<", $base_tar_name)
      or croak("Could not open build archive: $!");
  my $tar_contents = handle_readall(\*GIT_TAR);
  close(GIT_TAR);
  return $tar_contents;
}
2321
# Add O_NONBLOCK to the status flags of filehandle $fh.  Failure to
# read or change the flags is fatal (croak).
sub set_nonblocking {
  my ($fh) = @_;
  my $flags = fcntl ($fh, F_GETFL, 0);
  croak ($!) if !$flags;
  fcntl ($fh, F_SETFL, $flags | O_NONBLOCK) or croak ($!);
}
2327
2328 __DATA__
2329 #!/usr/bin/env perl
2330 #
2331 # This is crunch-job's internal dispatch script.  crunch-job running on the API
2332 # server invokes this script on individual compute nodes, or localhost if we're
2333 # running a job locally.  It gets called in two modes:
2334 #
2335 # * No arguments: Installation mode.  Read a tar archive from the DATA
2336 #   file handle; it includes the Crunch script's source code, and
2337 #   maybe SDKs as well.  Those should be installed in the proper
2338 #   locations.  This runs outside of any Docker container, so don't try to
2339 #   introspect Crunch's runtime environment.
2340 #
2341 # * With arguments: Crunch script run mode.  This script should set up the
2342 #   environment, then run the command specified in the arguments.  This runs
2343 #   inside any Docker container.
2344
2345 use Fcntl ':flock';
2346 use File::Path qw( make_path remove_tree );
2347 use POSIX qw(getcwd);
2348
# Exit code passed to shell_or_die() for the "pip install" step below,
# so that a failure there makes this script exit with status 111.
use constant TASK_TEMPFAIL => 111;

# Map SDK subdirectories to the path environments they belong to.
my %SDK_ENVVARS = ("perl/lib" => "PERLLIB", "ruby/lib" => "RUBYLIB");

# Configuration taken from the environment.  $install_dir defaults to
# "opt" under the current working directory.
my $destdir = $ENV{"CRUNCH_SRC"};
my $archive_hash = $ENV{"CRUNCH_GIT_ARCHIVE_HASH"};
my $repo = $ENV{"CRUNCH_SRC_URL"};
my $install_dir = $ENV{"CRUNCH_INSTALL"} || (getcwd() . "/opt");
my $job_work = $ENV{"JOB_WORK"};
my $task_work = $ENV{"TASK_WORK"};

# Keep duplicates of the original STDOUT and STDERR: installation mode
# later redirects both to a log file, and shell_or_die() restores
# STDERR from STDERR_ORIG to report failures.
open(STDOUT_ORIG, ">&", STDOUT);
open(STDERR_ORIG, ">&", STDERR);

# Make sure each configured working directory exists.
for my $dir ($destdir, $job_work, $task_work) {
  if ($dir) {
    make_path $dir;
    -e $dir or die "Failed to create temporary directory ($dir): $!";
  }
}

# Empty the task work directory, keeping the directory itself.
if ($task_work) {
  remove_tree($task_work, {keep_root => 1});
}
2374
### Crunch script run mode
# With arguments: prepare the environment (Python SDK virtualenv, SDK
# path variables), then exec the command given in @ARGV.  This branch
# never falls through to installation mode: it ends in exec or die.
if (@ARGV) {
  # We want to do routine logging during task 0 only.  This gives the user
  # the information they need, but avoids repeating the information for every
  # task.
  #
  # $Log takes a printf format (without the "[Crunch] " prefix or the
  # newline) followed by its arguments.
  my $Log;
  if ($ENV{TASK_SEQUENCE} eq "0") {
    $Log = sub {
      my $msg = shift;
      printf STDERR_ORIG "[Crunch] $msg\n", @_;
    };
  } else {
    $Log = sub { };
  }

  # Build a virtualenv containing the Python SDK, unless one already
  # exists in the job work directory.
  my $python_src = "$install_dir/python";
  my $venv_dir = "$job_work/.arvados.venv";
  my $venv_built = -e "$venv_dir/bin/activate";
  if ((!$venv_built) and (-d $python_src) and can_run("virtualenv")) {
    shell_or_die(undef, "virtualenv", "--quiet", "--system-site-packages",
                 "--python=python2.7", $venv_dir);
    shell_or_die(TASK_TEMPFAIL, "$venv_dir/bin/pip", "--quiet", "install", "-I", $python_src);
    $venv_built = 1;
    $Log->("Built Python SDK virtualenv");
  }

  my @pysdk_version_cmd = ("python", "-c",
    "from pkg_resources import get_distribution as get; print get('arvados-python-client').version");
  if ($venv_built) {
    $Log->("Running in Python SDK virtualenv");
    @pysdk_version_cmd = ();
    # Run the task command through a shell that activates the
    # virtualenv first.
    my $orig_argv = join(" ", map { quotemeta($_); } @ARGV);
    @ARGV = ("/bin/sh", "-ec",
             ". \Q$venv_dir/bin/activate\E; exec $orig_argv");
  } elsif (-d $python_src) {
    $Log->("Warning: virtualenv not found inside Docker container default " .
           "\$PATH. Can't install Python SDK.");
  }

  if (@pysdk_version_cmd) {
    # Only trust $? if the pipe was actually opened.  If python cannot
    # be started, open() fails and $? still holds a stale value
    # (possibly 0), which would wrongly report an SDK as found.
    my $pysdk_found = 0;
    my $pysdk_version;
    if (open(my $pysdk_version_pipe, "-|", @pysdk_version_cmd)) {
      $pysdk_version = <$pysdk_version_pipe>;
      close($pysdk_version_pipe);
      $pysdk_found = ($? == 0);
    }
    if ($pysdk_found) {
      chomp($pysdk_version);
      $Log->("Using Arvados SDK version $pysdk_version");
    } else {
      # A lot could've gone wrong here, but pretty much all of it means that
      # Python won't be able to load the Arvados SDK.
      $Log->("Warning: Arvados SDK not found");
    }
  }

  # Prepend each installed SDK library directory to its path variable.
  while (my ($sdk_dir, $sdk_envkey) = each(%SDK_ENVVARS)) {
    my $sdk_path = "$install_dir/$sdk_dir";
    if (-d $sdk_path) {
      if ($ENV{$sdk_envkey}) {
        $ENV{$sdk_envkey} = "$sdk_path:" . $ENV{$sdk_envkey};
      } else {
        $ENV{$sdk_envkey} = $sdk_path;
      }
      $Log->("Arvados SDK added to %s", $sdk_envkey);
    }
  }

  exec(@ARGV);
  die "Cannot exec `@ARGV`: $!";
}
2443
### Installation mode
# Without arguments: unpack the tar archive supplied on the DATA handle
# into $destdir, move bundled SDKs into $install_dir, and run the
# source tree's install script, if there is one.
#
# The filehandle L holds an exclusive lock on "$destdir.lock" until the
# explicit close at the end of this section (or until the process exits).
open L, ">", "$destdir.lock" or die "$destdir.lock: $!";
flock L, LOCK_EX;
# NOTE(review): if CRUNCH_GIT_ARCHIVE_HASH is unset and the
# "$destdir.archive_hash" symlink does not exist, both sides of this
# comparison are undef and compare equal as strings -- confirm that
# skipping the install in that case is intended.
if (readlink ("$destdir.archive_hash") eq $archive_hash && -d $destdir) {
  # This exact git archive (source + arvados sdk) is already installed
  # here, so there's no need to reinstall it.

  # We must consume our DATA section, though: otherwise the process
  # feeding it to us will get SIGPIPE.
  my $buf;
  while (read(DATA, $buf, 65536)) { }

  exit(0);
}

# Remove the old marker first; a new one is created only after the
# whole installation below has succeeded.
unlink "$destdir.archive_hash";
mkdir $destdir;

do {
  # Ignore SIGPIPE: we check retval of close() instead. See perlipc(1).
  local $SIG{PIPE} = "IGNORE";
  warn "Extracting archive: $archive_hash\n";
  # --ignore-zeros is necessary sometimes: depending on how much NUL
  # padding tar -A put on our combined archive (which in turn depends
  # on the length of the component archives) tar without
  # --ignore-zeros will exit before consuming stdin and cause close()
  # to fail on the resulting SIGPIPE.
  if (!open(TARX, "|-", "tar", "--ignore-zeros", "-xC", $destdir)) {
    die "Error launching 'tar -xC $destdir': $!";
  }
  # If we send too much data to tar in one write (> 4-5 MiB), it stops, and we
  # get SIGPIPE.  We must feed it data incrementally.
  my $tar_input;
  while (read(DATA, $tar_input, 65536)) {
    print TARX $tar_input;
  }
  if(!close(TARX)) {
    die "'tar -xC $destdir' exited $?: $!";
  }
};

mkdir $install_dir;

# Move each SDK bundled in the archive (python, plus the languages
# listed in %SDK_ENVVARS) from the source tree into $install_dir.
my $sdk_root = "$destdir/.arvados.sdk/sdk";
if (-d $sdk_root) {
  foreach my $sdk_lang (("python",
                         map { (split /\//, $_, 2)[0]; } keys(%SDK_ENVVARS))) {
    if (-d "$sdk_root/$sdk_lang") {
      if (!rename("$sdk_root/$sdk_lang", "$install_dir/$sdk_lang")) {
        die "Failed to install $sdk_lang SDK: $!";
      }
    }
  }
}

# Check that the Python SDK's "setup.py egg_info" works.  Only its
# stderr is captured; stdout is discarded.
my $python_dir = "$install_dir/python";
if ((-d $python_dir) and can_run("python2.7")) {
  open(my $egg_info_pipe, "-|",
       "python2.7 \Q$python_dir/setup.py\E egg_info 2>&1 >/dev/null");
  my @egg_info_errors = <$egg_info_pipe>;
  close($egg_info_pipe);

  if ($?) {
    if (@egg_info_errors and (($egg_info_errors[-1] =~ /\bgit\b/) or ($egg_info_errors[-1] =~ /\[Errno 2\]/))) {
      # egg_info apparently failed because it couldn't ask git for a build tag.
      # Specify no build tag.
      open(my $pysdk_cfg, ">>", "$python_dir/setup.cfg");
      print $pysdk_cfg "\n[egg_info]\ntag_build =\n";
      close($pysdk_cfg);
    } else {
      # Any other failure is fatal: show the errors and exit with
      # egg_info's exit code (or 1 if that was 0).
      my $egg_info_exit = $? >> 8;
      foreach my $errline (@egg_info_errors) {
        warn $errline;
      }
      warn "python setup.py egg_info failed: exit $egg_info_exit";
      exit ($egg_info_exit || 1);
    }
  }
}

# Hide messages from the install script (unless it fails: shell_or_die
# will show $destdir.log in that case).
open(STDOUT, ">>", "$destdir.log") or die ($!);
open(STDERR, ">&", STDOUT) or die ($!);

# Run the first install script found, passing it $install_dir.  The
# last two candidates are looked up relative to the current directory.
if (-e "$destdir/crunch_scripts/install") {
    shell_or_die (undef, "$destdir/crunch_scripts/install", $install_dir);
} elsif (!-e "./install.sh" && -e "./tests/autotests.sh") {
    # Old version
    shell_or_die (undef, "./tests/autotests.sh", $install_dir);
} elsif (-e "./install.sh") {
    shell_or_die (undef, "./install.sh", $install_dir);
}

# Record which archive is now installed.  The symlink is created under
# a temporary name and renamed into place.
if ($archive_hash) {
    unlink "$destdir.archive_hash.new";
    symlink ($archive_hash, "$destdir.archive_hash.new") or die "$destdir.archive_hash.new: $!";
    rename ("$destdir.archive_hash.new", "$destdir.archive_hash") or die "$destdir.archive_hash: $!";
}

# Closing L releases the installation lock.
close L;
2545
# Report whether a command can be found: runs "which $command_name"
# and returns true if it exits with status 0.  Dies if "which" itself
# cannot be started.
sub can_run {
  my $command_name = shift;
  open(my $which, "-|", "which", $command_name) or die ($!);
  # Drain the output into a lexical: a bare "while (<$which>)" assigns
  # to the global $_ and would clobber the caller's value.
  while (defined(my $line = <$which>)) { }
  close($which);
  return ($? == 0);
}
2553
# Run a command with system(), and exit the whole script if it fails.
#
# Arguments:
#   $exitcode  exit status to use when the command fails, or undef to
#              exit with the command's own exit code (1 if that is 0)
#   remaining arguments are the command and its arguments, a la system()
#
# With $ENV{DEBUG} set, the command is printed to STDERR first.  On
# failure, STDERR is restored from STDERR_ORIG, the contents of
# "$destdir.log" are copied to it, and a message describing the failure
# is printed before exiting.  Returns normally only if the command
# exits with status 0.
sub shell_or_die
{
  my $exitcode = shift;

  if ($ENV{"DEBUG"}) {
    print STDERR "@_\n";
  }
  if (system (@_) != 0) {
    my $err = $!;
    my $code = $?;
    my $exitstatus = sprintf("exit %d signal %d", $code >> 8, $code & 0x7f);
    open STDERR, ">&STDERR_ORIG";
    # Quote the log path: $destdir comes from the environment, and
    # spaces or shell metacharacters in it would otherwise be
    # interpreted by the shell.
    system ("cat \Q$destdir.log\E >&2");
    warn "@_ failed ($err): $exitstatus";
    if (defined($exitcode)) {
      exit $exitcode;
    }
    else {
      exit (($code >> 8) || 1);
    }
  }
}
2576
2577 __DATA__