Merge branch '10004-check-sinfo' closes #10004

[arvados.git] / sdk / cli / bin / crunch-job
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job

index 1a9dac30081ef03f55d5c1ce49c48898c63224c3..e0aff312cce8ec4737f0285e5f768d4cb5028b50 100755 (executable)
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -355,6 +355,7 @@ my @jobstep_done = ();
  my @jobstep_tomerge = ();
  my $jobstep_tomerge_level = 0;
  my $squeue_checked = 0;
+my $sinfo_checked = 0;
  my $latest_refresh = scalar time;
  
  
@@ -416,8 +417,17 @@ if ($docker_locator = $Job->{docker_image_locator}) {
    Log (undef, "docker image hash is $docker_hash");
    $docker_stream =~ s/^\.//;
    my $docker_install_script = qq{
-if ! $docker_bin images -q --no-trunc --all | grep -qxF \Q$docker_hash\E; then
-    arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
+if $docker_bin images -q --no-trunc --all | grep -xF \Q$docker_hash\E >/dev/null; then
+    exit 0
+fi
+declare -a exit_codes=("\${PIPESTATUS[@]}")
+if [ 0 != "\${exit_codes[0]}" ]; then
+   exit "\${exit_codes[0]}"  # `docker images` failed
+elif [ 1 != "\${exit_codes[1]}" ]; then
+   exit "\${exit_codes[1]}"  # `grep` encountered an error
+else
+   # Everything worked fine, but grep didn't find the image on this host.
+   arv-get \Q$docker_locator$docker_stream/$docker_hash.tar\E | $docker_bin load
  fi
  };
  
@@ -432,7 +442,7 @@ fi
  
    # Determine whether this version of Docker supports memory+swap limits.
    ($exited, $stdout, $stderr) = srun_sync(
-    ["srun", "--nodelist=" . $node[0]],
+    ["srun", "--nodes=1"],
      [$docker_bin, 'run', '--help'],
      {label => "check --memory-swap feature"});
    $docker_limitmem = ($stdout =~ /--memory-swap/);
@@ -455,7 +465,7 @@ fi
        $try_user_arg = "--user=$try_user";
      }
      my ($exited, $stdout, $stderr) = srun_sync(
-      ["srun", "--nodelist=" . $node[0]],
+      ["srun", "--nodes=1"],
        ["/bin/sh", "-ec",
         "$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
        {label => $label});
@@ -852,7 +862,11 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
          .q{&& MEM=$(awk '($1 == "MemTotal:"){print $2}' </proc/meminfo) }
          .q{&& SWAP=$(awk '($1 == "SwapTotal:"){print $2}' </proc/meminfo) }
          ."&& MEMLIMIT=\$(( (\$MEM * 95) / ($ENV{CRUNCH_NODE_SLOTS} * 100) )) "
-        ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP ";
+        ."&& let SWAPLIMIT=\$MEMLIMIT+\$SWAP "
+        .q{&& declare -a VOLUMES=() }
+        .q{&& if which crunchrunner >/dev/null ; then VOLUMES+=("--volume=$(which crunchrunner):/usr/local/bin/crunchrunner") ; fi }
+        .q{&& if test -f /etc/ssl/certs/ca-certificates.crt ; then VOLUMES+=("--volume=/etc/ssl/certs/ca-certificates.crt:/etc/arvados/ca-certificates.crt") ; }
+        .q{elif test -f /etc/pki/tls/certs/ca-bundle.crt ; then VOLUMES+=("--volume=/etc/pki/tls/certs/ca-bundle.crt:/etc/arvados/ca-certificates.crt") ; fi };
  
      $command .= "&& exec arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other $arv_file_cache \Q$keep_mnt\E --exec ";
      $ENV{TASK_KEEPMOUNT} = "$keep_mnt/by_pdh";
@@ -917,6 +931,10 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
        # For now, use the same approach as TASK_WORK above.
        $ENV{"JOB_WORK"} = "/tmp/crunch-job-work";
  
+      # Bind mount the crunchrunner binary and host TLS certificates file into
+      # the container.
+      $command .= '"${VOLUMES[@]}" ';
+
        while (my ($env_key, $env_val) = each %ENV)
        {
          if ($env_key =~ /^(ARVADOS|CRUNCH|JOB|TASK)_/) {
@@ -1134,7 +1152,9 @@ sub update_progress_stats
  sub reapchildren
  {
    my $children_reaped = 0;
-  while ((my $pid = waitpid (-1, WNOHANG)) > 0)
+  my @successful_task_uuids = ();
+
+  while((my $pid = waitpid (-1, WNOHANG)) > 0)
    {
      my $childstatus = $?;
  
@@ -1143,13 +1163,6 @@ sub reapchildren
                      . $slot[$proc{$pid}->{slot}]->{cpu});
      my $jobstepidx = $proc{$pid}->{jobstepidx};
  
-    if (!WIFEXITED($childstatus))
-    {
-      # child did not exit (may be temporarily stopped)
-      Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now.");
-      next;
-    }
-
      $children_reaped++;
      my $elapsed = time - $proc{$pid}->{time};
      my $Jobstep = $jobstep[$jobstepidx];
@@ -1207,8 +1220,9 @@ sub reapchildren
        push @jobstep_todo, $jobstepidx;
        $Job->{'tasks_summary'}->{'failed'}++;
      }
-    else
+    else # task_success
      {
+      push @successful_task_uuids, $Jobstep->{'arvados_task'}->{uuid};
        ++$thisround_succeeded;
        $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
        $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
@@ -1231,34 +1245,36 @@ sub reapchildren
      push @freeslot, $proc{$pid}->{slot};
      delete $proc{$pid};
  
-    if ($task_success) {
-      # Load new tasks
-      my $newtask_list = [];
-      my $newtask_results;
-      do {
-        $newtask_results = api_call(
-          "job_tasks/list",
-          'where' => {
-            'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
-          },
-          'order' => 'qsequence',
-          'offset' => scalar(@$newtask_list),
-            );
-        push(@$newtask_list, @{$newtask_results->{items}});
-      } while (@{$newtask_results->{items}});
-      foreach my $arvados_task (@$newtask_list) {
-        my $jobstep = {
-          'level' => $arvados_task->{'sequence'},
-          'failures' => 0,
-          'arvados_task' => $arvados_task
-        };
-        push @jobstep, $jobstep;
-        push @jobstep_todo, $#jobstep;
-      }
-    }
      $progress_is_dirty = 1;
    }
  
+  if (scalar(@successful_task_uuids) > 0)
+  {
+    Log (undef, sprintf("%d tasks exited (%d succeeded), checking for new tasks from API server.", $children_reaped, scalar(@successful_task_uuids)));
+    # Load new tasks
+    my $newtask_list = [];
+    my $newtask_results;
+    do {
+      $newtask_results = api_call(
+        "job_tasks/list",
+        'filters' => [["created_by_job_task_uuid","in",\@successful_task_uuids]],
+        'order' => 'qsequence',
+        'offset' => scalar(@$newtask_list),
+          );
+      push(@$newtask_list, @{$newtask_results->{items}});
+    } while (@{$newtask_results->{items}});
+    Log (undef, sprintf("Got %d new tasks from API server.", scalar(@$newtask_list)));
+    foreach my $arvados_task (@$newtask_list) {
+      my $jobstep = {
+        'level' => $arvados_task->{'sequence'},
+        'failures' => 0,
+        'arvados_task' => $arvados_task
+      };
+      push @jobstep, $jobstep;
+      push @jobstep_todo, $#jobstep;
+    }
+  }
+
    return $children_reaped;
  }
  
@@ -1386,6 +1402,37 @@ sub check_squeue
    }
  }
  
+sub check_sinfo
+{
+  # If a node fails in a multi-node "srun" call during job setup, the call
+  # may hang instead of exiting with a nonzero code.  This function checks
+  # "sinfo" for the health of the nodes that were allocated and ensures that
+  # they are all still in the "alloc" state.  If a node that is allocated to
+  # this job is not in "alloc" state, then set please_freeze.
+  #
+  # This is only called from srun_sync() for node configuration.  If a
+  # node fails doing actual work, there are other recovery mechanisms.
+
+  # Do not call `sinfo` more than once every 15 seconds.
+  return if $sinfo_checked > time - 15;
+  $sinfo_checked = time;
+
+  # The output format "%t" means output node states.
+  my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
+  if ($? != 0)
+  {
+    Log(undef, "warning: sinfo exit status $? ($!)");
+    return;
+  }
+  chomp @sinfo;
+
+  # Node states are strings: compare with "ne".  Numeric "!=" would treat
+  # every non-numeric state as 0 and never detect an unhealthy node.
+  foreach my $state (@sinfo)
+  {
+    $main::please_freeze = 1 if ($state ne "alloc" && $state ne "alloc*");
+  }
+}
  
  sub release_allocation
  {
@@ -1454,6 +1501,9 @@ sub readfrompipes
  sub preprocess_stderr
  {
    my $jobstepidx = shift;
+  # slotindex is only defined for children running Arvados job tasks.
+  # Be prepared to handle the undef case (for setup srun calls, etc.).
+  my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
  
    while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
      my $line = $1;
@@ -1463,19 +1513,16 @@ sub preprocess_stderr
        # whoa.
        $main::please_freeze = 1;
      }
-    elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
-      # Skip the following tempfail checks if this srun proc isn't
-      # attached to a particular worker slot.
-    }
      elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
-      my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
-      $slot[$job_slot_index]->{node}->{fail_count}++;
        $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($job_slot_index);
+      if (defined($job_slot_index)) {
+        $slot[$job_slot_index]->{node}->{fail_count}++;
+        ban_node_by_slot($job_slot_index);
+      }
      }
      elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
        $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+      ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
      }
      elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
        $jobstep[$jobstepidx]->{tempfail} = 1;
@@ -1891,7 +1938,6 @@ sub freezeunquote
    return $s;
  }
  
-
  sub srun_sync
  {
    my $srunargs = shift;
@@ -1946,6 +1992,7 @@ sub srun_sync
      if (!$busy || ($latest_refresh + 2 < scalar time)) {
        check_refresh_wanted();
        check_squeue();
+      check_sinfo();
      }
      if (!$busy) {
        select(undef, undef, undef, 0.1);
@@ -1965,6 +2012,11 @@ sub srun_sync
    delete $reader{$jobstepidx};
  
    my $j = pop @jobstep;
+  # If the srun showed signs of tempfail, ensure the caller treats that as a
+  # failure case.
+  if ($main::please_freeze || $j->{tempfail}) {
+    $exited ||= 255;
+  }
    return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
  }