15164: Add locking between container completion and reuse
[arvados.git] / services / api / app / models / container_request.rb
index e1ae2b54891adbc85e8fc6b48e17e9faaeac9b59..45db4ee916f202352b658a695e0bae2184693e46 100644 (file)
@@ -19,13 +19,16 @@ class ContainerRequest < ArvadosModel
                primary_key: :uuid,
              }
 
-  serialize :properties, Hash
+  # Posgresql JSONB columns should NOT be declared as serialized, Rails 5
+  # already know how to properly treat them.
+  attribute :properties, :jsonbHash, default: {}
+  attribute :secret_mounts, :jsonbHash, default: {}
+
   serialize :environment, Hash
   serialize :mounts, Hash
   serialize :runtime_constraints, Hash
   serialize :command, Array
   serialize :scheduling_parameters, Hash
-  serialize :secret_mounts, Hash
 
   before_validation :fill_field_defaults, :if => :new_record?
   before_validation :validate_runtime_constraints
@@ -116,13 +119,34 @@ class ContainerRequest < ArvadosModel
   end
 
   def finalize_if_needed
-    if state == Committed && Container.find_by_uuid(container_uuid).final?
-      reload
-      act_as_system_user do
-        leave_modified_by_user_alone do
-          finalize!
+    return if state != Committed
+    while true
+      # get container lock first, then lock current container request
+      # (same order as Container#handle_completed). Locking always
+      # reloads the Container and ContainerRequest records.
+      c = Container.find_by_uuid(container_uuid)
+      c.lock!
+      self.lock!
+
+      if container_uuid != c.uuid
+        # After locking, we've noticed a race, the container_uuid is
+        # different than the container record we just loaded.  This
+        # can happen if Container#handle_completed scheduled a new
+        # container for retry and set container_uuid while we were
+        # waiting on the container lock.  Restart the loop and get the
+        # new container.
+        redo
+      end
+
+      if state == Committed && c.final?
+        # The current container is
+        act_as_system_user do
+          leave_modified_by_user_alone do
+            finalize!
+          end
         end
       end
+      return true
     end
   end
 
@@ -150,7 +174,7 @@ class ContainerRequest < ArvadosModel
       manifest = Collection.where(portable_data_hash: pdh).first.manifest_text
 
       coll_uuid = self.send(out_type + '_uuid')
-      coll = coll_uuid.nil? ? nil : Collection.find_by_uuid(coll_uuid)
+      coll = coll_uuid.nil? ? nil : Collection.where(uuid: coll_uuid).first
       if !coll
         coll = Collection.new(
           owner_uuid: self.owner_uuid,
@@ -166,7 +190,7 @@ class ContainerRequest < ArvadosModel
         src = Arv::Collection.new(manifest)
         dst = Arv::Collection.new(coll.manifest_text)
         dst.cp_r("./", ".", src)
-        dst.cp_r("./", "container #{container.uuid}", src)
+        dst.cp_r("./", "log for container #{container.uuid}", src)
         manifest = dst.manifest_text
       end
 
@@ -191,8 +215,9 @@ class ContainerRequest < ArvadosModel
     self.environment ||= {}
     self.runtime_constraints ||= {}
     self.mounts ||= {}
+    self.secret_mounts ||= {}
     self.cwd ||= "."
-    self.container_count_max ||= Rails.configuration.container_count_max
+    self.container_count_max ||= Rails.configuration.Containers.MaxComputeVMs
     self.scheduling_parameters ||= {}
     self.output_ttl ||= 0
     self.priority ||= 0
@@ -206,7 +231,18 @@ class ContainerRequest < ArvadosModel
       return false
     end
     if state_changed? and state == Committed and container_uuid.nil?
-      self.container_uuid = Container.resolve(self).uuid
+      while true
+        c = Container.resolve(self)
+        c.lock!
+        if c.state == Container::Cancelled
+          # Lost a race, we have a lock on the container but the
+          # container was cancelled in a different request, restart
+          # the loop and resolve request to a new container.
+          redo
+        end
+        self.container_uuid = c.uuid
+        break
+      end
     end
     if self.container_uuid != self.container_uuid_was
       if self.container_count_changed?
@@ -215,14 +251,29 @@ class ContainerRequest < ArvadosModel
       else
         self.container_count += 1
         if self.container_uuid_was
-          # old_container = Container.find_by_uuid()
-          # # copy logs from old container into CR's log collection
-          # #
-          #   src = Arv::Collection.new(manifest)
-          #   tgt = Arv::Collection.new(coll.manifest_text)
-          #   tgt.cp_r("./", "./", src)
-          #   tgt.cp_r("./", "container #{container.uuid}", src)
-          #   manifest = tgt.manifest_text
+          old_container = Container.find_by_uuid(self.container_uuid_was)
+          old_logs = Collection.where(portable_data_hash: old_container.log).first
+          if old_logs
+            log_coll = self.log_uuid.nil? ? nil : Collection.where(uuid: self.log_uuid).first
+            if self.log_uuid.nil?
+              log_coll = Collection.new(
+                owner_uuid: self.owner_uuid,
+                name: coll_name = "Container log for request #{uuid}",
+                manifest_text: "")
+            end
+
+            # copy logs from old container into CR's log collection
+            src = Arv::Collection.new(old_logs.manifest_text)
+            dst = Arv::Collection.new(log_coll.manifest_text)
+            dst.cp_r("./", "log for container #{old_container.uuid}", src)
+            manifest = dst.manifest_text
+
+            log_coll.assign_attributes(
+              portable_data_hash: Digest::MD5.hexdigest(manifest) + '+' + manifest.bytesize.to_s,
+              manifest_text: manifest)
+            log_coll.save_with_unique_name!
+            self.log_uuid = log_coll.uuid
+          end
         end
       end
     end
@@ -233,7 +284,7 @@ class ContainerRequest < ArvadosModel
     if self.state == Committed
       # If preemptible instances (eg: AWS Spot Instances) are allowed,
       # ask them on child containers by default.
-      if Rails.configuration.preemptible_instances and !c.nil? and
+      if Rails.configuration.Containers.UsePreemptibleInstances and !c.nil? and
         self.scheduling_parameters['preemptible'].nil?
           self.scheduling_parameters['preemptible'] = true
       end
@@ -303,7 +354,7 @@ class ContainerRequest < ArvadosModel
             scheduling_parameters['partitions'].size)
             errors.add :scheduling_parameters, "partitions must be an array of strings"
       end
-      if !Rails.configuration.preemptible_instances and scheduling_parameters['preemptible']
+      if !Rails.configuration.Containers.UsePreemptibleInstances and scheduling_parameters['preemptible']
         errors.add :scheduling_parameters, "preemptible instances are not allowed"
       end
       if scheduling_parameters.include? 'max_run_time' and