require 'log_reuse_info'
require 'whitelist_update'
require 'safe_json'
-require 'update_priority'
class Container < ArvadosModel
include ArvadosModelUpdates
after_save :update_cr_logs
after_save :handle_completed
after_save :propagate_priority
- after_commit { UpdatePriority.run_update_thread }
has_many :container_requests, :foreign_key => :container_uuid, :class_name => 'ContainerRequest', :primary_key => :uuid
belongs_to :auth, :class_name => 'ApiClientAuthorization', :foreign_key => :auth_uuid, :primary_key => :uuid
t.add :interactive_session_started
t.add :output_storage_classes
t.add :output_properties
+ t.add :cost
+ t.add :subrequests_cost
end
# Supported states for a container
def update_priority!
return if ![Queued, Locked, Running].include?(state)
p = ContainerRequest.
- where('container_uuid=? and priority>0', uuid).
- includes(:requesting_container).
- lock(true).
- map do |cr|
- if cr.requesting_container
- cr.requesting_container.priority
+ where('container_uuid=? and priority>0 and state=?', uuid, ContainerRequest::Committed).
+ select("priority, requesting_container_uuid, created_at").
+ lock(true).
+ map do |cr|
+ if cr.requesting_container_uuid
+ Container.where(uuid: cr.requesting_container_uuid).pluck(:priority).first
else
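+ # No requesting container: the request was submitted directly by a
+ # user. Derive a priority that sorts first by request priority
+ # (shifted into the high bits) and breaks ties in favor of older
+ # requests.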
(cr.priority << 50) - (cr.created_at.to_time.to_f * 1000).to_i
end
# Update the priority of child container requests to match the new
# priority of the parent container (ignoring requests with no
# container assigned, because their priority doesn't matter).
ContainerRequest.
- where(requesting_container_uuid: self.uuid,
- state: ContainerRequest::Committed).
- where('container_uuid is not null').
- includes(:container).
- map(&:container).
- map(&:update_priority!)
+ where('requesting_container_uuid = ? and state = ? and container_uuid is not null',
+ self.uuid, ContainerRequest::Committed).
+ pluck(:container_uuid).each do |container_uuid|
+ Container.find_by_uuid(container_uuid).update_priority!
+ end
end
end
if rc['keep_cache_ram'] == 0
rc['keep_cache_ram'] = Rails.configuration.Containers.DefaultKeepCacheRAM
end
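+ # If no RAM cache is in effect and no disk cache was requested,
+ # default the disk cache size from the requested RAM, clamped by
+ # bound_keep_cache_disk.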
+ if rc['keep_cache_disk'] == 0 and rc['keep_cache_ram'] == 0
+ rc['keep_cache_disk'] = bound_keep_cache_disk(rc['ram'])
+ end
rc
end
candidates = candidates.where('secret_mounts_md5 = ?', secret_mounts_md5)
log_reuse_info(candidates) { "after filtering on secret_mounts_md5 #{secret_mounts_md5.inspect}" }
- if attrs[:runtime_constraints]['cuda'].nil?
- attrs[:runtime_constraints]['cuda'] = {
- 'device_count' => 0,
- 'driver_version' => '',
- 'hardware_capability' => '',
- }
- end
- resolved_runtime_constraints = [resolve_runtime_constraints(attrs[:runtime_constraints])]
- if resolved_runtime_constraints[0]['cuda']['device_count'] == 0
- # If no CUDA requested, extend search to include older container
- # records that don't have a 'cuda' section in runtime_constraints
- resolved_runtime_constraints << resolved_runtime_constraints[0].except('cuda')
- end
-
- candidates = candidates.where_serialized(:runtime_constraints, resolved_runtime_constraints, md5: true, multivalue: true)
+ resolved_runtime_constraints = resolve_runtime_constraints(attrs[:runtime_constraints])
+ # Ideally we would ignore Keep cache constraints entirely when considering
+ # reuse, but our database structure makes that impractical.
+ # The best we can do is generate a search that matches on all likely values.
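+ # (Each variation below is merged over the remaining resolved
+ # constraints; a nil entry yields a search hash without that key at
+ # all, which matches records that predate the constraint.)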
+ runtime_constraint_variations = {
+ keep_cache_disk: [
+ # Check for constraints without keep_cache_disk
+ # (containers that predate the constraint)
+ nil,
+ # Containers that use keep_cache_ram instead
+ 0,
+ # The default value
+ bound_keep_cache_disk(resolved_runtime_constraints['ram']),
+ # The minimum default bound
+ bound_keep_cache_disk(0),
+ # The maximum default bound
+ bound_keep_cache_disk(1 << 60),
+ # The requested value
+ resolved_runtime_constraints.delete('keep_cache_disk'),
+ ].uniq,
+ keep_cache_ram: [
+ # Containers that use keep_cache_disk instead
+ 0,
+ # The default value
+ Rails.configuration.Containers.DefaultKeepCacheRAM,
+ # The requested value
+ resolved_runtime_constraints.delete('keep_cache_ram'),
+ ].uniq,
+ }
+ resolved_cuda = resolved_runtime_constraints['cuda']
+ if resolved_cuda.nil? or resolved_cuda['device_count'] == 0
+ runtime_constraint_variations[:cuda] = [
+ # Check for constraints without cuda
+ # (containers that predate the constraint)
+ nil,
+ # The default "don't need CUDA" value
+ {
+ 'device_count' => 0,
+ 'driver_version' => '',
+ 'hardware_capability' => '',
+ },
+ # The requested value
+ resolved_runtime_constraints.delete('cuda')
+ ].uniq
+ end
+ reusable_runtime_constraints = hash_product(runtime_constraint_variations)
+ .map { |v| resolved_runtime_constraints.merge(v) }
+
+ candidates = candidates.where_serialized(:runtime_constraints, reusable_runtime_constraints, md5: true, multivalue: true)
log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" }
log_reuse_info { "checking for state=Complete with readable output and log..." }
protected
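+ # Clamp a keep_cache_disk request (in bytes) to the supported default
+ # bounds: at least 2 GiB, at most 32 GiB. A nil value is treated as 0
+ # and therefore raised to the minimum.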
+ def self.bound_keep_cache_disk(value)
+ value ||= 0
+ min_value = 2 << 30
+ max_value = 32 << 30
+ if value < min_value
+ min_value
+ elsif value > max_value
+ max_value
+ else
+ value
+ end
+ end
+
+ def self.hash_product(**kwargs)
+ # kwargs is a hash that maps parameters to an array of values.
+ # This function enumerates every possible hash where each key has one of
+ # the values from its array.
+ # The output keys are strings since that's what container hash attributes
+ # want.
+ # A nil value yields a hash without that key.
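+ # For example:
+ #   hash_product(a: [1, nil], b: [2])
+ #   # => [{"a"=>1, "b"=>2}, {"b"=>2}]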
+ [[:_, nil]].product(
+ *kwargs.map { |(key, values)| [key.to_s].product(values) },
+ ).map { |param_pairs| Hash[param_pairs].compact }
+ end
+
def fill_field_defaults
self.state ||= Queued
self.environment ||= {}
def validate_change
permitted = [:state]
- progress_attrs = [:progress, :runtime_status, :log, :output, :output_properties]
- final_attrs = [:exit_code, :finished_at]
+ final_attrs = [:finished_at]
+ progress_attrs = [:progress, :runtime_status, :subrequests_cost, :cost,
+ :log, :output, :output_properties, :exit_code]
if self.new_record?
permitted.push(:owner_uuid, :command, :container_image, :cwd,
permitted.push :priority
when Running
- permitted.push :priority, :output_properties, *progress_attrs
+ permitted.push :priority, :output_properties, :gateway_address, *progress_attrs
if self.state_changed?
- permitted.push :started_at, :gateway_address
+ permitted.push :started_at
end
if !self.interactive_session_started_was
permitted.push :interactive_session_started
when Running
permitted.push :finished_at, *progress_attrs
when Queued, Locked
- permitted.push :finished_at, :log, :runtime_status
+ permitted.push :finished_at, :log, :runtime_status, :cost
end
else
# If self.final?, this log update is superfluous: handle_completed
# will take care of the final log/output update when it finalizes
# each requesting CR.
return if self.final? || !saved_change_to_log?
leave_modified_by_user_alone do
- ContainerRequest.where(container_uuid: self.uuid).each do |cr|
+ ContainerRequest.where(container_uuid: self.uuid, state: ContainerRequest::Committed).each do |cr|
cr.update_collections(container: self, collections: ['log'])
cr.save!
end
end
if retryable_requests.any?
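+ # Merge the retryable requests' scheduling parameters into a single
+ # set for the replacement container, using the per-key rules below.
+ # For example, partitions ["A"] and ["A","B"] merge to ["A","B"],
+ # preemptible true and false merge to false, and max_run_time 3600
+ # and 0 merge to 0 (unlimited).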
+ scheduling_parameters = {
+ # partitions: empty if any are empty, else the union of all requests' partitions
+ "partitions": retryable_requests
+ .map { |req| req.scheduling_parameters["partitions"] || [] }
+ .reduce { |cur, new| (cur.empty? or new.empty?) ? [] : (cur | new) },
+
+ # preemptible: true if all are true, else false
+ "preemptible": retryable_requests
+ .map { |req| req.scheduling_parameters["preemptible"] }
+ .all?,
+
+ # supervisor: true if any are true, else false
+ "supervisor": retryable_requests
+ .map { |req| req.scheduling_parameters["supervisor"] }
+ .any?,
+
+ # max_run_time: 0 if any are 0 (unlimited), else the maximum
+ "max_run_time": retryable_requests
+ .map { |req| req.scheduling_parameters["max_run_time"] || 0 }
+ .reduce do |cur, new|
+ if cur == 0 or new == 0
+ 0
+ elsif new > cur
+ new
+ else
+ cur
+ end
+ end,
+ }
+
c_attrs = {
command: self.command,
cwd: self.cwd,
container_image: self.container_image,
mounts: self.mounts,
runtime_constraints: self.runtime_constraints,
- scheduling_parameters: self.scheduling_parameters,
+ scheduling_parameters: scheduling_parameters,
secret_mounts: prev_secret_mounts,
runtime_token: prev_runtime_token,
runtime_user_uuid: self.runtime_user_uuid,
cr.with_lock do
leave_modified_by_user_alone do
# Use row locking because this increments container_count
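+ # Add the finished attempt's cost (including its subrequests) to the
+ # request's running total before assigning the replacement container.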
+ cr.cumulative_cost += self.cost + self.subrequests_cost
cr.container_uuid = c.uuid
cr.save!
end
# Cancel outstanding container requests made by this container.
ContainerRequest.
- includes(:container).
where(requesting_container_uuid: uuid,
- state: ContainerRequest::Committed).each do |cr|
+ state: ContainerRequest::Committed).
+ in_batches(of: 15).each_record do |cr|
leave_modified_by_user_alone do
- cr.update_attributes!(priority: 0)
- cr.container.reload
- if cr.container.state == Container::Queued || cr.container.state == Container::Locked
+ cr.set_priority_zero
+ container_state = Container.where(uuid: cr.container_uuid).pluck(:state).first
+ if container_state == Container::Queued || container_state == Container::Locked
# If the child container hasn't started yet, finalize the
# child CR now instead of leaving it "on hold", i.e.,
# Queued with priority 0. (OTOH, if the child is already