Merge branch '8893-crunch-job-crunchrunner-quoting-wip'

[arvados.git] / sdk / cli / bin / crunch-job
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job

index 1a9dac30081ef03f55d5c1ce49c48898c63224c3..6423c1cf79af5892b19e0dd73c91a46b4784bbdd 100755 (executable)
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -432,7 +432,7 @@ fi
  
    # Determine whether this version of Docker supports memory+swap limits.
    ($exited, $stdout, $stderr) = srun_sync(
-    ["srun", "--nodelist=" . $node[0]],
+    ["srun", "--nodes=1"],
      [$docker_bin, 'run', '--help'],
      {label => "check --memory-swap feature"});
    $docker_limitmem = ($stdout =~ /--memory-swap/);
@@ -455,7 +455,7 @@ fi
        $try_user_arg = "--user=$try_user";
      }
      my ($exited, $stdout, $stderr) = srun_sync(
-      ["srun", "--nodelist=" . $node[0]],
+      ["srun", "--nodes=1"],
        ["/bin/sh", "-ec",
         "$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
        {label => $label});
@@ -852,7 +852,13 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
          .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
          .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
          ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
-        ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
+        ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
+        # $VOLUME_CRUNCHRUNNER and $VOLUME_CERTS will be passed unquoted as
+        # arguments to `docker run`.  They must contain their own quoting.
+        .q{&& VOLUME_CRUNCHRUNNER="" VOLUME_CERTS="" }
+        .q{&& if which crunchrunner >/dev/null ; then VOLUME_CRUNCHRUNNER=\\"--volume=$(which crunchrunner):/usr/local/bin/crunchrunner\\" ; fi }
+        .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUME_CERTS=\\"--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt\\" ; }
+        .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUME_CERTS=\\"--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt\\" ; fi };
  
      $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
      $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
@@ -917,6 +923,10 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
        # For now, use the same approach as TASK_WORK above.
        $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
  
+      # Bind mount the crunchrunner binary and host TLS certificates file into
+      # the container.
+      $command .= "\$VOLUME_CRUNCHRUNNER \$VOLUME_CERTS ";
+
        while (my ($env_key, $env_val) = each %ENV)
        {
          if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
@@ -1134,7 +1144,9 @@ sub update_progress_stats
  sub reapchildren
  {
    my $children_reaped = 0;
-  while ((my $pid = waitpid (-1, WNOHANG)) > 0)
+  my @successful_task_uuids = ();
+
+  while((my $pid = waitpid (-1, WNOHANG)) > 0)
    {
      my $childstatus = $?;
  
@@ -1143,13 +1155,6 @@ sub reapchildren
                      . $slot[$proc{$pid}->{slot}]->{cpu});
      my $jobstepidx = $proc{$pid}->{jobstepidx};
  
-    if (!WIFEXITED($childstatus))
-    {
-      # child did not exit (may be temporarily stopped)
-      Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now.");
-      next;
-    }
-
      $children_reaped++;
      my $elapsed = time - $proc{$pid}->{time};
      my $Jobstep = $jobstep[$jobstepidx];
@@ -1207,8 +1212,9 @@ sub reapchildren
        push @jobstep_todo, $jobstepidx;
        $Job->{'tasks_summary'}->{'failed'}++;
      }
-    else
+    else # task_success
      {
+      push @successful_task_uuids, $Jobstep->{'arvados_task'}->{uuid};
        ++$thisround_succeeded;
        $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
        $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
@@ -1231,34 +1237,36 @@ sub reapchildren
      push @freeslot, $proc{$pid}->{slot};
      delete $proc{$pid};
  
-    if ($task_success) {
-      # Load new tasks
-      my $newtask_list = [];
-      my $newtask_results;
-      do {
-        $newtask_results = api_call(
-          "job_tasks/list",
-          'where' => {
-            'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
-          },
-          'order' => 'qsequence',
-          'offset' => scalar(@$newtask_list),
-            );
-        push(@$newtask_list, @{$newtask_results->{items}});
-      } while (@{$newtask_results->{items}});
-      foreach my $arvados_task (@$newtask_list) {
-        my $jobstep = {
-          'level' => $arvados_task->{'sequence'},
-          'failures' => 0,
-          'arvados_task' => $arvados_task
-        };
-        push @jobstep, $jobstep;
-        push @jobstep_todo, $#jobstep;
-      }
-    }
      $progress_is_dirty = 1;
    }
  
+  if (scalar(@successful_task_uuids) > 0)
+  {
+    Log (undef, sprintf("%d tasks exited (%d succeeded), checking for new tasks from API server.", $children_reaped, scalar(@successful_task_uuids)));
+    # Load new tasks
+    my $newtask_list = [];
+    my $newtask_results;
+    do {
+      $newtask_results = api_call(
+        "job_tasks/list",
+        'filters' => [["created_by_job_task_uuid","in",\@successful_task_uuids]],
+        'order' => 'qsequence',
+        'offset' => scalar(@$newtask_list),
+          );
+      push(@$newtask_list, @{$newtask_results->{items}});
+    } while (@{$newtask_results->{items}});
+    Log (undef, sprintf("Got %d new tasks from API server.", scalar(@$newtask_list)));
+    foreach my $arvados_task (@$newtask_list) {
+      my $jobstep = {
+        'level' => $arvados_task->{'sequence'},
+        'failures' => 0,
+        'arvados_task' => $arvados_task
+      };
+      push @jobstep, $jobstep;
+      push @jobstep_todo, $#jobstep;
+    }
+  }
+
    return $children_reaped;
  }
  
@@ -1454,6 +1462,9 @@ sub readfrompipes
  sub preprocess_stderr
  {
    my $jobstepidx = shift;
+  # slotindex is only defined for children running Arvados job tasks.
+  # Be prepared to handle the undef case (for setup srun calls, etc.).
+  my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
  
    while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
      my $line = $1;
@@ -1463,19 +1474,16 @@ sub preprocess_stderr
        # whoa.
        $main::please_freeze = 1;
      }
-    elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
-      # Skip the following tempfail checks if this srun proc isn't
-      # attached to a particular worker slot.
-    }
      elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
-      my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
-      $slot[$job_slot_index]->{node}->{fail_count}++;
        $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($job_slot_index);
+      if (defined($job_slot_index)) {
+        $slot[$job_slot_index]->{node}->{fail_count}++;
+        ban_node_by_slot($job_slot_index);
+      }
      }
      elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
        $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+      ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
      }
      elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
        $jobstep[$jobstepidx]->{tempfail} = 1;
@@ -1965,6 +1973,11 @@ sub srun_sync
    delete $reader{$jobstepidx};
  
    my $j = pop @jobstep;
+  # If the srun showed signs of tempfail, ensure the caller treats that as a
+  # failure case.
+  if ($main::please_freeze || $j->{tempfail}) {
+    $exited ||= 255;
+  }
    return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
  }