closes #7399

[arvados.git] / services / api / lib / crunch_dispatch.rb
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb

index 131386d765f66b66d958d244a6590131d0189769..ce94f737a2467f855a7156ba76873db57cd183ee 100644 (file)
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -24,6 +24,7 @@ class CrunchDispatch
  
      @docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
      @docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
+    @cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
  
      @arvados_internal = Rails.configuration.git_internal_dir
      if not File.exists? @arvados_internal
@@ -384,6 +385,10 @@ class CrunchDispatch
                     '--job', job.uuid,
                     '--git-dir', @arvados_internal]
  
+      if @cgroup_root
+        cmd_args += ['--cgroup-root', @cgroup_root]
+      end
+
        if @docker_bin
          cmd_args += ['--docker-bin', @docker_bin]
        end
@@ -429,6 +434,8 @@ class CrunchDispatch
          log_throttle_bytes_so_far: 0,
          log_throttle_lines_so_far: 0,
          log_throttle_bytes_skipped: 0,
+        log_throttle_partial_line_last_at: Time.new(0),
+        log_throttle_first_partial_line: true,
        }
        i.close
        @todo_job_retries.delete(job.uuid)
@@ -443,9 +450,23 @@ class CrunchDispatch
      message = false
      linesize = line.size
      if running_job[:log_throttle_is_open]
-      running_job[:log_throttle_lines_so_far] += 1
-      running_job[:log_throttle_bytes_so_far] += linesize
-      running_job[:bytes_logged] += linesize
+      partial_line = false
+      skip_counts = false
+      matches = line.match(/^\S+ \S+ \d+ \d+ stderr (.*)/)
+      if matches and matches[1] and matches[1].start_with?('[...]') and matches[1].end_with?('[...]')
+        partial_line = true
+        if Time.now > running_job[:log_throttle_partial_line_last_at] + Rails.configuration.crunch_log_partial_line_throttle_period
+          running_job[:log_throttle_partial_line_last_at] = Time.now
+        else
+          skip_counts = true
+        end
+      end
+
+      if !skip_counts
+        running_job[:log_throttle_lines_so_far] += 1
+        running_job[:log_throttle_bytes_so_far] += linesize
+        running_job[:bytes_logged] += linesize
+      end
  
        if (running_job[:bytes_logged] >
            Rails.configuration.crunch_limit_log_bytes_per_job)
@@ -456,14 +477,18 @@ class CrunchDispatch
        elsif (running_job[:log_throttle_bytes_so_far] >
               Rails.configuration.crunch_log_throttle_bytes)
          remaining_time = running_job[:log_throttle_reset_time] - Time.now
-        message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_bytes} bytes per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_bytes). Logging will be silenced for the next #{remaining_time.round} seconds.\n"
+        message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_bytes} bytes per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_bytes). Logging will be silenced for the next #{remaining_time.round} seconds."
          running_job[:log_throttle_is_open] = false
  
        elsif (running_job[:log_throttle_lines_so_far] >
               Rails.configuration.crunch_log_throttle_lines)
          remaining_time = running_job[:log_throttle_reset_time] - Time.now
-        message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_lines} lines per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_lines), logging will be silenced for the next #{remaining_time.round} seconds.\n"
+        message = "Exceeded rate #{Rails.configuration.crunch_log_throttle_lines} lines per #{Rails.configuration.crunch_log_throttle_period} seconds (crunch_log_throttle_lines), logging will be silenced for the next #{remaining_time.round} seconds."
          running_job[:log_throttle_is_open] = false
+
+      elsif partial_line and running_job[:log_throttle_first_partial_line]
+        running_job[:log_throttle_first_partial_line] = false
+        message = "Rate-limiting partial segments of long lines to one every #{Rails.configuration.crunch_log_partial_line_throttle_period} seconds."
        end
      end
  
@@ -475,8 +500,11 @@ class CrunchDispatch
      if message
        # Yes, write to logs, but use our "rate exceeded" message
        # instead of the log message that exceeded the limit.
+      message += " A complete log is still being written to Keep, and will be available when the job finishes.\n"
        line.replace message
        true
+    elsif partial_line
+      false
      else
        running_job[:log_throttle_is_open]
      end
@@ -501,6 +529,8 @@ class CrunchDispatch
          j[:log_throttle_lines_so_far] = 0
          j[:log_throttle_bytes_skipped] = 0
          j[:log_throttle_is_open] = true
+        j[:log_throttle_partial_line_last_at] = Time.new(0)
+        j[:log_throttle_first_partial_line] = true
        end
  
        j[:buf].each do |stream, streambuf|
@@ -638,8 +668,7 @@ class CrunchDispatch
      jobrecord = Job.find_by_uuid(job_done.uuid)
  
      if exit_status == EXIT_RETRY_UNLOCKED or (exit_tempfail and @job_retry_counts.include? jobrecord.uuid)
-      # The job failed because all of the nodes allocated to it
-      # failed.  Only this crunch-dispatch process can retry the job:
+      # Only this crunch-dispatch process can retry the job:
        # it's already locked, and there's no way to put it back in the
        # Queued state.  Put it in our internal todo list unless the job
        # has failed this way excessively.