10813: Merge branch 'master' into 10813-arv-put-six-threads
[arvados.git] / services / api / lib / crunch_dispatch.rb
index 1017895b25218cea5927fd568851be8047554ffb..1b1df360133a04a247ed58dbb04e67525f76045c 100644 (file)
@@ -24,9 +24,10 @@ class CrunchDispatch
 
     @docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
     @docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
+    @cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
 
     @arvados_internal = Rails.configuration.git_internal_dir
-    if not File.exists? @arvados_internal
+    if not File.exist? @arvados_internal
       $stderr.puts `mkdir -p #{@arvados_internal.shellescape} && git init --bare #{@arvados_internal.shellescape}`
       raise "No internal git repository available" unless ($? == 0)
     end
@@ -72,7 +73,7 @@ class CrunchDispatch
       # into multiple rows with one hostname each.
       `#{cmd} --noheader -o '%N:#{outfmt}'`.each_line do |line|
         tokens = line.chomp.split(":", max_fields)
-        if (re = tokens[0].match /^(.*?)\[([-,\d]+)\]$/)
+        if (re = tokens[0].match(/^(.*?)\[([-,\d]+)\]$/))
           tokens.shift
           re[2].split(",").each do |range|
             range = range.split("-").collect(&:to_i)
@@ -104,7 +105,7 @@ class CrunchDispatch
   end
 
   def update_node_status
-    return unless Server::Application.config.crunch_job_wrapper.to_s.match /^slurm/
+    return unless Server::Application.config.crunch_job_wrapper.to_s.match(/^slurm/)
     slurm_status.each_pair do |hostname, slurmdata|
       next if @node_state[hostname] == slurmdata
       begin
@@ -168,7 +169,25 @@ class CrunchDispatch
       end
       usable_nodes << node
       if usable_nodes.count >= min_node_count
-        return usable_nodes.map { |node| node.hostname }
+        hostnames = usable_nodes.map(&:hostname)
+        log_nodes = usable_nodes.map do |n|
+          "#{n.hostname} #{n.uuid} #{n.properties.to_json}"
+        end
+        log_job = "#{job.uuid} #{job.runtime_constraints}"
+        log_text = "dispatching job #{log_job} to #{log_nodes.join(", ")}"
+        $stderr.puts log_text
+        begin
+          act_as_system_user do
+            Log.new(object_uuid: job.uuid,
+                    event_type: 'dispatch',
+                    owner_uuid: system_user_uuid,
+                    summary: "dispatching to #{hostnames.join(", ")}",
+                    properties: {'text' => log_text}).save!
+          end
+        rescue => e
+          $stderr.puts "dispatch: log.create failed: #{e}"
+        end
+        return hostnames
       end
     end
     nil
@@ -203,8 +222,8 @@ class CrunchDispatch
               owner_uuid: job.owner_uuid,
               summary: message,
               properties: {"text" => message}).save!
-    rescue
-      $stderr.puts "dispatch: log.create failed"
+    rescue => e
+      $stderr.puts "dispatch: log.create failed: #{e}"
     end
 
     if not skip_lock and not have_job_lock?(job)
@@ -384,6 +403,10 @@ class CrunchDispatch
                    '--job', job.uuid,
                    '--git-dir', @arvados_internal]
 
+      if @cgroup_root
+        cmd_args += ['--cgroup-root', @cgroup_root]
+      end
+
       if @docker_bin
         cmd_args += ['--docker-bin', @docker_bin]
       end
@@ -447,8 +470,8 @@ class CrunchDispatch
     if running_job[:log_throttle_is_open]
       partial_line = false
       skip_counts = false
-      line_splits = line.split('stderr ')
-      if line_splits[1].andand.start_with?('[...]') and line_splits[1].end_with?('[...]')
+      matches = line.match(/^\S+ \S+ \d+ \d+ stderr (.*)/)
+      if matches and matches[1] and matches[1].start_with?('[...]') and matches[1].end_with?('[...]')
         partial_line = true
         if Time.now > running_job[:log_throttle_partial_line_last_at] + Rails.configuration.crunch_log_partial_line_throttle_period
           running_job[:log_throttle_partial_line_last_at] = Time.now
@@ -507,8 +530,6 @@ class CrunchDispatch
 
   def read_pipes
     @running.each do |job_uuid, j|
-      job = j[:job]
-
       now = Time.now
       if now > j[:log_throttle_reset_time]
         # It has been more than throttle_period seconds since the last