services/api/app/models/container.rb

   1 # Copyright (C) The Arvados Authors. All rights reserved.
   2 #
   3 # SPDX-License-Identifier: AGPL-3.0
   4
   5 require 'log_reuse_info'
   6 require 'whitelist_update'
   7 require 'safe_json'
   8
   9 class Container < ArvadosModel
  # Behavior mixed into instances.
  include ArvadosModelUpdates
  include HasUuid
  include KindAndEtag
  include CommonApiTemplate
  include WhitelistUpdate
  # Helpers made available as class-level methods.
  extend CurrentApiClient
  extend DbCurrentTime
  extend LogReuseInfo

  # Structured columns and the Ruby type each one deserializes to.
  serialize :environment, Hash
  serialize :mounts, Hash
  serialize :runtime_constraints, Hash
  serialize :command, Array
  serialize :scheduling_parameters, Hash
  serialize :secret_mounts, Hash

  # Validation phase. Defaults are only filled in for new records.
  before_validation :fill_field_defaults, if: :new_record?
  before_validation :set_timestamps
  validates :command, :container_image, :output_path, :cwd, :priority, presence: true
  validates :priority, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
  validate :validate_state_change
  validate :validate_change
  validate :validate_lock
  validate :validate_output
  after_validation :assign_auth

  # Save phase. update_secret_mounts_md5 is declared before
  # scrub_secret_mounts so the digest is computed from the secrets
  # before they are cleared.
  before_save :sort_serialized_attrs
  before_save :update_secret_mounts_md5
  before_save :scrub_secret_mounts
  after_save :handle_completed
  after_save :propagate_priority

  # Requests satisfied by this container, and the token (if any)
  # currently assigned to it; both are joined on uuid columns.
  has_many :container_requests, foreign_key: :container_uuid, class_name: 'ContainerRequest', primary_key: :uuid
  belongs_to :auth, class_name: 'ApiClientAuthorization', foreign_key: :auth_uuid, primary_key: :uuid
  43
  # Fields exposed by the :user API template (in addition to whatever
  # the :common template provides). secret_mounts is not listed here.
  api_accessible :user, extend: :common do |t|
    %i[
      command
      container_image
      cwd
      environment
      exit_code
      finished_at
      locked_by_uuid
      log
      mounts
      output
      output_path
      priority
      progress
      runtime_constraints
      started_at
      state
      auth_uuid
      scheduling_parameters
    ].each do |field|
      t.add field
    end
  end
  64
  # Supported states for a container. Each state has its own constant;
  # States lists all of them.
  Queued = 'Queued'
  Locked = 'Locked'
  Running = 'Running'
  Complete = 'Complete'
  Cancelled = 'Cancelled'
  States = [Queued, Locked, Running, Complete, Cancelled]

  # Permitted state changes: current state => states it may move to.
  # The nil key covers a record that has no state yet. Complete and
  # Cancelled have no entry.
  State_transitions = {
    nil => [Queued],
    Queued => [Locked, Cancelled],
    Locked => [Queued, Running, Cancelled],
    Running => [Complete, Cancelled]
  }
  81
  # Returns the column names reported for limit_index_columns_read:
  # just "mounts".
  def self.limit_index_columns_read
    %w[mounts]
  end
  85
  # Returns the inherited full-text-searchable column list with the
  # secret_mounts and secret_mounts_md5 columns removed.
  def self.full_text_searchable_columns
    hidden = %w[secret_mounts secret_mounts_md5]
    super - hidden
  end
  89
  # Returns the inherited searchable column list (all arguments are
  # forwarded to the parent implementation) with secret_mounts_md5
  # removed.
  def self.searchable_columns(*args)
    hidden = %w[secret_mounts_md5]
    super - hidden
  end
  93
  # Returns the inherited logged attributes without the
  # 'secret_mounts' entry.
  def logged_attributes
    attrs = super
    attrs.except('secret_mounts')
  end
  97
  # Instance-level accessor for the State_transitions table (a hash
  # mapping each state to the list of states it may change to).
  def state_transitions
    State_transitions
  end
 101
 102   # Container priority is the highest "computed priority" of any
 103   # matching request. The computed priority of a container-submitted
 104   # request is the priority of the submitting container. The computed
 105   # priority of a user-submitted request is a function of
 106   # user-assigned priority and request creation time.
 107   def update_priority!
 108     return if ![Queued, Locked, Running].include?(state)
 109     p = ContainerRequest.
 110         where('container_uuid=? and priority>0', uuid).
 111         includes(:requesting_container).
 112         lock(true).
 113         map do |cr|
 114       if cr.requesting_container
 115         cr.requesting_container.priority
 116       else
 117         (cr.priority << 50) - (cr.created_at.to_time.to_f * 1000).to_i
 118       end
 119     end.max || 0
 120     update_attributes!(priority: p)
 121   end
 122
 123   def propagate_priority
 124     return true unless priority_changed?
 125     act_as_system_user do
 126       # Update the priority of child container requests to match new
 127       # priority of the parent container (ignoring requests with no
 128       # container assigned, because their priority doesn't matter).
 129       ContainerRequest.
 130         where(requesting_container_uuid: self.uuid,
 131               state: ContainerRequest::Committed).
 132         where('container_uuid is not null').
 133         includes(:container).
 134         map(&:container).
 135         map(&:update_priority!)
 136     end
 137   end
 138
 139   # Create a new container (or find an existing one) to satisfy the
 140   # given container request.
 141   def self.resolve(req)
 142     c_attrs = {
 143       command: req.command,
 144       cwd: req.cwd,
 145       environment: req.environment,
 146       output_path: req.output_path,
 147       container_image: resolve_container_image(req.container_image),
 148       mounts: resolve_mounts(req.mounts),
 149       runtime_constraints: resolve_runtime_constraints(req.runtime_constraints),
 150       scheduling_parameters: req.scheduling_parameters,
 151       secret_mounts: req.secret_mounts,
 152     }
 153     act_as_system_user do
 154       if req.use_existing && (reusable = find_reusable(c_attrs))
 155         reusable
 156       else
 157         Container.create!(c_attrs)
 158       end
 159     end
 160   end
 161
 162   # Return a runtime_constraints hash that complies with requested but
 163   # is suitable for saving in a container record, i.e., has specific
 164   # values instead of ranges.
 165   #
 166   # Doing this as a step separate from other resolutions, like "git
 167   # revision range to commit hash", makes sense only when there is no
 168   # opportunity to reuse an existing container (e.g., container reuse
 169   # is not implemented yet, or we have already found that no existing
 170   # containers are suitable).
 171   def self.resolve_runtime_constraints(runtime_constraints)
 172     rc = {}
 173     defaults = {
 174       'keep_cache_ram' =>
 175       Rails.configuration.container_default_keep_cache_ram,
 176     }
 177     defaults.merge(runtime_constraints).each do |k, v|
 178       if v.is_a? Array
 179         rc[k] = v[0]
 180       else
 181         rc[k] = v
 182       end
 183     end
 184     rc
 185   end
 186
 187   # Return a mounts hash suitable for a Container, i.e., with every
 188   # readonly collection UUID resolved to a PDH.
 189   def self.resolve_mounts(mounts)
 190     c_mounts = {}
 191     mounts.each do |k, mount|
 192       mount = mount.dup
 193       c_mounts[k] = mount
 194       if mount['kind'] != 'collection'
 195         next
 196       end
 197       if (uuid = mount.delete 'uuid')
 198         c = Collection.
 199           readable_by(current_user).
 200           where(uuid: uuid).
 201           select(:portable_data_hash).
 202           first
 203         if !c
 204           raise ArvadosModel::UnresolvableContainerError.new "cannot mount collection #{uuid.inspect}: not found"
 205         end
 206         if mount['portable_data_hash'].nil?
 207           # PDH not supplied by client
 208           mount['portable_data_hash'] = c.portable_data_hash
 209         elsif mount['portable_data_hash'] != c.portable_data_hash
 210           # UUID and PDH supplied by client, but they don't agree
 211           raise ArgumentError.new "cannot mount collection #{uuid.inspect}: current portable_data_hash #{c.portable_data_hash.inspect} does not match #{c['portable_data_hash'].inspect} in request"
 212         end
 213       end
 214     end
 215     return c_mounts
 216   end
 217
 218   # Return a container_image PDH suitable for a Container.
 219   def self.resolve_container_image(container_image)
 220     coll = Collection.for_latest_docker_image(container_image)
 221     if !coll
 222       raise ArvadosModel::UnresolvableContainerError.new "docker image #{container_image.inspect} not found"
 223     end
 224     coll.portable_data_hash
 225   end
 226
 227   def self.find_reusable(attrs)
 228     log_reuse_info { "starting with #{Container.all.count} container records in database" }
 229     candidates = Container.where_serialized(:command, attrs[:command])
 230     log_reuse_info(candidates) { "after filtering on command #{attrs[:command].inspect}" }
 231
 232     candidates = candidates.where('cwd = ?', attrs[:cwd])
 233     log_reuse_info(candidates) { "after filtering on cwd #{attrs[:cwd].inspect}" }
 234
 235     candidates = candidates.where_serialized(:environment, attrs[:environment])
 236     log_reuse_info(candidates) { "after filtering on environment #{attrs[:environment].inspect}" }
 237
 238     candidates = candidates.where('output_path = ?', attrs[:output_path])
 239     log_reuse_info(candidates) { "after filtering on output_path #{attrs[:output_path].inspect}" }
 240
 241     image = resolve_container_image(attrs[:container_image])
 242     candidates = candidates.where('container_image = ?', image)
 243     log_reuse_info(candidates) { "after filtering on container_image #{image.inspect} (resolved from #{attrs[:container_image].inspect})" }
 244
 245     candidates = candidates.where_serialized(:mounts, resolve_mounts(attrs[:mounts]))
 246     log_reuse_info(candidates) { "after filtering on mounts #{attrs[:mounts].inspect}" }
 247
 248     candidates = candidates.where('secret_mounts_md5 = ?', Digest::MD5.hexdigest(SafeJSON.dump(self.deep_sort_hash(attrs[:secret_mounts]))))
 249     log_reuse_info(candidates) { "after filtering on mounts #{attrs[:mounts].inspect}" }
 250
 251     candidates = candidates.where_serialized(:runtime_constraints, resolve_runtime_constraints(attrs[:runtime_constraints]))
 252     log_reuse_info(candidates) { "after filtering on runtime_constraints #{attrs[:runtime_constraints].inspect}" }
 253
 254     log_reuse_info { "checking for state=Complete with readable output and log..." }
 255
 256     select_readable_pdh = Collection.
 257       readable_by(current_user).
 258       select(:portable_data_hash).
 259       to_sql
 260
 261     usable = candidates.where(state: Complete, exit_code: 0)
 262     log_reuse_info(usable) { "with state=Complete, exit_code=0" }
 263
 264     usable = usable.where("log IN (#{select_readable_pdh})")
 265     log_reuse_info(usable) { "with readable log" }
 266
 267     usable = usable.where("output IN (#{select_readable_pdh})")
 268     log_reuse_info(usable) { "with readable output" }
 269
 270     usable = usable.order('finished_at ASC').limit(1).first
 271     if usable
 272       log_reuse_info { "done, reusing container #{usable.uuid} with state=Complete" }
 273       return usable
 274     end
 275
 276     # Check for Running candidates and return the most likely to finish sooner.
 277     log_reuse_info { "checking for state=Running..." }
 278     running = candidates.where(state: Running).
 279               order('progress desc, started_at asc').
 280               limit(1).first
 281     if running
 282       log_reuse_info { "done, reusing container #{running.uuid} with state=Running" }
 283       return running
 284     else
 285       log_reuse_info { "have no containers in Running state" }
 286     end
 287
 288     # Check for Locked or Queued ones and return the most likely to start first.
 289     locked_or_queued = candidates.
 290                        where("state IN (?)", [Locked, Queued]).
 291                        order('state asc, priority desc, created_at asc').
 292                        limit(1).first
 293     if locked_or_queued
 294       log_reuse_info { "done, reusing container #{locked_or_queued.uuid} with state=#{locked_or_queued.state}" }
 295       return locked_or_queued
 296     else
 297       log_reuse_info { "have no containers in Locked or Queued state" }
 298     end
 299
 300     log_reuse_info { "done, no reusable container found" }
 301     nil
 302   end
 303
 304   def check_lock_fail
 305     if self.state != Queued
 306       raise LockFailedError.new("cannot lock when #{self.state}")
 307     elsif self.priority <= 0
 308       raise LockFailedError.new("cannot lock when priority<=0")
 309     end
 310   end
 311
 312   def lock
 313     # Check invalid state transitions once before getting the lock
 314     # (because it's cheaper that way) and once after getting the lock
 315     # (because state might have changed while acquiring the lock).
 316     check_lock_fail
 317     transaction do
 318       begin
 319         reload(lock: 'FOR UPDATE NOWAIT')
 320       rescue
 321         raise LockFailedError.new("cannot lock: other transaction in progress")
 322       end
 323       check_lock_fail
 324       update_attributes!(state: Locked)
 325     end
 326   end
 327
 328   def check_unlock_fail
 329     if self.state != Locked
 330       raise InvalidStateTransitionError.new("cannot unlock when #{self.state}")
 331     elsif self.locked_by_uuid != current_api_client_authorization.uuid
 332       raise InvalidStateTransitionError.new("locked by a different token")
 333     end
 334   end
 335
 336   def unlock
 337     # Check invalid state transitions twice (see lock)
 338     check_unlock_fail
 339     transaction do
 340       reload(lock: 'FOR UPDATE')
 341       check_unlock_fail
 342       update_attributes!(state: Queued)
 343     end
 344   end
 345
 346   def self.readable_by(*users_list)
 347     # Load optional keyword arguments, if they exist.
 348     if users_list.last.is_a? Hash
 349       kwargs = users_list.pop
 350     else
 351       kwargs = {}
 352     end
 353     Container.where(ContainerRequest.readable_by(*users_list).where("containers.uuid = container_requests.container_uuid").exists)
 354   end
 355
 356   def final?
 357     [Complete, Cancelled].include?(self.state)
 358   end
 359
 360   protected
 361
 362   def fill_field_defaults
 363     self.state ||= Queued
 364     self.environment ||= {}
 365     self.runtime_constraints ||= {}
 366     self.mounts ||= {}
 367     self.cwd ||= "."
 368     self.priority ||= 0
 369     self.scheduling_parameters ||= {}
 370   end
 371
 372   def permission_to_create
 373     current_user.andand.is_admin
 374   end
 375
 376   def permission_to_update
 377     # Override base permission check to allow auth_uuid to set progress and
 378     # output (only).  Whether it is legal to set progress and output in the current
 379     # state has already been checked in validate_change.
 380     current_user.andand.is_admin ||
 381       (!current_api_client_authorization.nil? and
 382        [self.auth_uuid, self.locked_by_uuid].include? current_api_client_authorization.uuid)
 383   end
 384
 385   def ensure_owner_uuid_is_permitted
 386     # Override base permission check to allow auth_uuid to set progress and
 387     # output (only).  Whether it is legal to set progress and output in the current
 388     # state has already been checked in validate_change.
 389     if !current_api_client_authorization.nil? and self.auth_uuid == current_api_client_authorization.uuid
 390       check_update_whitelist [:progress, :output]
 391     else
 392       super
 393     end
 394   end
 395
 396   def set_timestamps
 397     if self.state_changed? and self.state == Running
 398       self.started_at ||= db_current_time
 399     end
 400
 401     if self.state_changed? and [Complete, Cancelled].include? self.state
 402       self.finished_at ||= db_current_time
 403     end
 404   end
 405
 406   def validate_change
 407     permitted = [:state]
 408
 409     if self.new_record?
 410       permitted.push(:owner_uuid, :command, :container_image, :cwd,
 411                      :environment, :mounts, :output_path, :priority,
 412                      :runtime_constraints, :scheduling_parameters,
 413                      :secret_mounts)
 414     end
 415
 416     case self.state
 417     when Queued, Locked
 418       permitted.push :priority
 419
 420     when Running
 421       permitted.push :priority, :progress, :output
 422       if self.state_changed?
 423         permitted.push :started_at
 424       end
 425
 426     when Complete
 427       if self.state_was == Running
 428         permitted.push :finished_at, :output, :log, :exit_code
 429       end
 430
 431     when Cancelled
 432       case self.state_was
 433       when Running
 434         permitted.push :finished_at, :output, :log
 435       when Queued, Locked
 436         permitted.push :finished_at, :log
 437       end
 438
 439     else
 440       # The state_transitions check will add an error message for this
 441       return false
 442     end
 443
 444     check_update_whitelist permitted
 445   end
 446
 447   def validate_lock
 448     if [Locked, Running].include? self.state
 449       # If the Container was already locked, locked_by_uuid must not
 450       # changes. Otherwise, the current auth gets the lock.
 451       need_lock = locked_by_uuid_was || current_api_client_authorization.andand.uuid
 452     else
 453       need_lock = nil
 454     end
 455
 456     # The caller can provide a new value for locked_by_uuid, but only
 457     # if it's exactly what we expect. This allows a caller to perform
 458     # an update like {"state":"Unlocked","locked_by_uuid":null}.
 459     if self.locked_by_uuid_changed?
 460       if self.locked_by_uuid != need_lock
 461         return errors.add :locked_by_uuid, "can only change to #{need_lock}"
 462       end
 463     end
 464     self.locked_by_uuid = need_lock
 465   end
 466
 467   def validate_output
 468     # Output must exist and be readable by the current user.  This is so
 469     # that a container cannot "claim" a collection that it doesn't otherwise
 470     # have access to just by setting the output field to the collection PDH.
 471     if output_changed?
 472       c = Collection.
 473             readable_by(current_user, {include_trash: true}).
 474             where(portable_data_hash: self.output).
 475             first
 476       if !c
 477         errors.add :output, "collection must exist and be readable by current user."
 478       end
 479     end
 480   end
 481
 482   def assign_auth
 483     if self.auth_uuid_changed?
 484       return errors.add :auth_uuid, 'is readonly'
 485     end
 486     if not [Locked, Running].include? self.state
 487       # don't need one
 488       self.auth.andand.update_attributes(expires_at: db_current_time)
 489       self.auth = nil
 490       return
 491     elsif self.auth
 492       # already have one
 493       return
 494     end
 495     cr = ContainerRequest.
 496       where('container_uuid=? and priority>0', self.uuid).
 497       order('priority desc').
 498       first
 499     if !cr
 500       return errors.add :auth_uuid, "cannot be assigned because priority <= 0"
 501     end
 502     self.auth = ApiClientAuthorization.
 503       create!(user_id: User.find_by_uuid(cr.modified_by_user_uuid).id,
 504               api_client_id: 0)
 505   end
 506
 507   def sort_serialized_attrs
 508     if self.environment_changed?
 509       self.environment = self.class.deep_sort_hash(self.environment)
 510     end
 511     if self.mounts_changed?
 512       self.mounts = self.class.deep_sort_hash(self.mounts)
 513     end
 514     if self.runtime_constraints_changed?
 515       self.runtime_constraints = self.class.deep_sort_hash(self.runtime_constraints)
 516     end
 517     if self.scheduling_parameters_changed?
 518       self.scheduling_parameters = self.class.deep_sort_hash(self.scheduling_parameters)
 519     end
 520   end
 521
 522   def update_secret_mounts_md5
 523     if self.secret_mounts_changed?
 524       self.secret_mounts_md5 = Digest::MD5.hexdigest(
 525         SafeJSON.dump(self.class.deep_sort_hash(self.secret_mounts)))
 526     end
 527   end
 528
 529   def scrub_secret_mounts
 530     # this runs after update_secret_mounts_md5, so the
 531     # secret_mounts_md5 will still reflect the secrets that are being
 532     # scrubbed here.
 533     if self.state_changed? && self.final?
 534       self.secret_mounts = {}
 535     end
 536   end
 537
 538   def handle_completed
 539     # This container is finished so finalize any associated container requests
 540     # that are associated with this container.
 541     if self.state_changed? and self.final?
 542       act_as_system_user do
 543
 544         if self.state == Cancelled
 545           retryable_requests = ContainerRequest.where("container_uuid = ? and priority > 0 and state = 'Committed' and container_count < container_count_max", uuid)
 546         else
 547           retryable_requests = []
 548         end
 549
 550         if retryable_requests.any?
 551           c_attrs = {
 552             command: self.command,
 553             cwd: self.cwd,
 554             environment: self.environment,
 555             output_path: self.output_path,
 556             container_image: self.container_image,
 557             mounts: self.mounts,
 558             runtime_constraints: self.runtime_constraints,
 559             scheduling_parameters: self.scheduling_parameters
 560           }
 561           c = Container.create! c_attrs
 562           retryable_requests.each do |cr|
 563             cr.with_lock do
 564               leave_modified_by_user_alone do
 565                 # Use row locking because this increments container_count
 566                 cr.container_uuid = c.uuid
 567                 cr.save!
 568               end
 569             end
 570           end
 571         end
 572
 573         # Notify container requests associated with this container
 574         ContainerRequest.where(container_uuid: uuid,
 575                                state: ContainerRequest::Committed).each do |cr|
 576           leave_modified_by_user_alone do
 577             cr.finalize!
 578           end
 579         end
 580
 581         # Cancel outstanding container requests made by this container.
 582         ContainerRequest.
 583           includes(:container).
 584           where(requesting_container_uuid: uuid,
 585                 state: ContainerRequest::Committed).each do |cr|
 586           leave_modified_by_user_alone do
 587             cr.update_attributes!(priority: 0)
 588             cr.container.reload
 589             if cr.container.state == Container::Queued || cr.container.state == Container::Locked
 590               # If the child container hasn't started yet, finalize the
 591               # child CR now instead of leaving it "on hold", i.e.,
 592               # Queued with priority 0.  (OTOH, if the child is already
 593               # running, leave it alone so it can get cancelled the
 594               # usual way, get a copy of the log collection, etc.)
 595               cr.update_attributes!(state: ContainerRequest::Final)
 596             end
 597           end
 598         end
 599       end
 600     end
 601   end
 602 end