14484: Simplifies pdh transaction grouping, keeps SQL in the migration
authorEric Biagiotti <ebiagiotti@veritasgenetics.com>
Wed, 27 Mar 2019 20:35:14 +0000 (16:35 -0400)
committerEric Biagiotti <ebiagiotti@veritasgenetics.com>
Wed, 27 Mar 2019 20:35:14 +0000 (16:35 -0400)
Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti@veritasgenetics.com>

services/api/app/models/container.rb
services/api/db/migrate/20190322174136_add_file_info_to_collection.rb

index 694aa5a0d74301b59338726e3e99a98d3ace6753..f3da800827b16468137b31bf8c4be4b57754fdd1 100644 (file)
@@ -411,58 +411,33 @@ class Container < ArvadosModel
   #
   # Correctly groups pdhs to use for batch database updates. Helps avoid
   # updating too many database rows in a single transaction.
-  def self.group_pdhs_for_multiple_transactions(log_prefix)
+  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, log_prefix)
     batch_size_max = 1 << 28 # 256 MiB
+    batch_size = 0
+    batch_pdhs = {}
     last_pdh = '0'
     done = 0
     any = true
 
-    total = ActiveRecord::Base.connection.exec_query(
-      "SELECT DISTINCT portable_data_hash FROM collections"
-    ).rows.count
-
     while any
       any = false
-      pdhs_res = ActiveRecord::Base.connection.exec_query(
-        "SELECT DISTINCT portable_data_hash FROM collections "\
-        "WHERE portable_data_hash > '#{last_pdh}' "\
-        "GROUP BY portable_data_hash LIMIT 1000"
-      )
-      break if pdhs_res.rows.count.zero?
-
-      pdhs = pdhs_res.rows.collect { |r| r[0] }
-      Container.group_pdhs_by_manifest_size(pdhs, batch_size_max) do |grouped_pdhs|
+      distinct_ordered_pdhs.call(last_pdh) do |pdh|
         any = true
-        yield grouped_pdhs
-        done += grouped_pdhs.size
-        last_pdh = pdhs[-1]
-        Rails.logger.info(log_prefix + ": #{done}/#{total}")
-      end
-    end
-    Rails.logger.info(log_prefix + ": finished")
-  end
-
-  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
-  #
-  # Change with caution!
-  #
-  # Given an array of pdhs, yield a subset array of pdhs when the total
-  # size of all manifest_texts is no more than batch_size_max. Pdhs whose manifest_text 
-  # is bigger than batch_size_max are yielded by themselves
-  def self.group_pdhs_by_manifest_size(pdhs, batch_size_max)
-    batch_size = 0
-    batch_pdhs = {}
-    pdhs.each do |pdh|
-      manifest_size = pdh.split('+')[1].to_i
-      if batch_size > 0 && batch_size + manifest_size > batch_size_max
-        yield batch_pdhs.keys
-        batch_pdhs = {}
-        batch_size = 0
+        last_pdh = pdh
+        manifest_size = pdh.split('+')[1].to_i
+        if batch_size > 0 && batch_size + manifest_size > batch_size_max
+          yield batch_pdhs.keys
+          done += batch_pdhs.size
+          Rails.logger.info(log_prefix + ": #{done}/#{distinct_pdh_count}")
+          batch_pdhs = {}
+          batch_size = 0
+        end
+        batch_pdhs[pdh] = true
+        batch_size += manifest_size
       end
-      batch_pdhs[pdh] = true
-      batch_size += manifest_size
     end
     yield batch_pdhs.keys
+    Rails.logger.info(log_prefix + ": finished")
   end
 
   protected
index c1c336247f70942993940063d2cb3d13590aac35..99db8133d47e807323908a5b92a0948e18cba19d 100755 (executable)
@@ -30,7 +30,23 @@ class AddFileInfoToCollection < ActiveRecord::Migration
     add_column :collections, :file_count, :integer, default: 0, null: false
     add_column :collections, :file_size_total, :integer, default: 0, null: false
 
-    Container.group_pdhs_for_multiple_transactions("AddFileInfoToCollection") do |pdhs|
+    distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash FROM collections"
+    ).rows.count
+
+    # Generator that queries for up to 1000 distinct pdhs greater than last_pdh, in ascending order
+    ordered_pdh_query = lambda { |last_pdh, &block|
+      pdhs = ActiveRecord::Base.connection.exec_query(
+        "SELECT DISTINCT portable_data_hash FROM collections "\
+        "WHERE portable_data_hash > '#{last_pdh}' "\
+        "ORDER BY portable_data_hash LIMIT 1000"
+      )
+      pdhs.rows.each do |row|
+        block.call(row[0])
+      end
+    }
+
+    Container.group_pdhs_for_multiple_transactions(ordered_pdh_query, distinct_pdh_count, "AddFileInfoToCollection") do |pdhs|
       do_batch(pdhs)
     end
   end