13752: Migrate file_names column using multiple transactions.
author: Tom Clegg <tclegg@veritasgenetics.com>
Mon, 17 Sep 2018 21:50:13 +0000 (17:50 -0400)
committer: Tom Clegg <tclegg@veritasgenetics.com>
Mon, 17 Sep 2018 21:50:13 +0000 (17:50 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

services/api/db/migrate/20180917205609_recompute_file_names_index.rb [new file with mode: 0644]
services/api/db/structure.sql

diff --git a/services/api/db/migrate/20180917205609_recompute_file_names_index.rb b/services/api/db/migrate/20180917205609_recompute_file_names_index.rb
new file mode 100644 (file)
index 0000000..3dbc0ec
--- /dev/null
@@ -0,0 +1,51 @@
+class RecomputeFileNamesIndex < ActiveRecord::Migration
+  def do_batch(pdhs:)
+    ActiveRecord::Base.connection.exec_query('BEGIN')
+    Collection.select(:portable_data_hash, :manifest_text).where(portable_data_hash: pdhs).distinct(:portable_data_hash).each do |c|
+      ActiveRecord::Base.connection.exec_query("update collections set file_names=$1 where portable_data_hash=$2",
+                                               "update file_names index",
+                                               [[nil, c.manifest_files], [nil, c.portable_data_hash]])
+    end
+    ActiveRecord::Base.connection.exec_query('COMMIT')
+  end
+  def up
+    # Process collections in multiple transactions, where the total
+    # size of all manifest_texts processed in a transaction is no more
+    # than batch_size_max.  Collections whose manifest_text is bigger
+    # than batch_size_max are updated in their own individual
+    # transactions.
+    batch_size_max = 1 << 28    # 256 MiB
+    batch_size = 0
+    batch_pdhs = {}
+    last_pdh = '0'
+    total = Collection.distinct.count(:portable_data_hash)
+    done = 0
+    any = true
+    while any
+      any = false
+      Collection.
+        unscoped.
+        select(:portable_data_hash).distinct.
+        order(:portable_data_hash).
+        where('portable_data_hash > ?', last_pdh).
+        limit(1000).each do |c|
+        any = true
+        last_pdh = c.portable_data_hash
+        manifest_size = c.portable_data_hash.split('+')[1].to_i
+        if batch_size > 0 && batch_size + manifest_size > batch_size_max
+          do_batch(pdhs: batch_pdhs.keys)
+          done += batch_pdhs.size
+          Rails.logger.info("RecomputeFileNamesIndex: #{done}/#{total}")
+          batch_pdhs = {}
+          batch_size = 0
+        end
+        batch_pdhs[c.portable_data_hash] = true
+        batch_size += manifest_size
+      end
+    end
+    do_batch(pdhs: batch_pdhs.keys)
+    Rails.logger.info("RecomputeFileNamesIndex: finished")
+  end
+  def down
+  end
+end
index 427c9afb561f20eb4916517d766731de6c126b84..f8d9b3f35d1a68a4715256ff4dc299edd4e16b90 100644 (file)
@@ -3169,3 +3169,5 @@ INSERT INTO schema_migrations (version) VALUES ('20180824155207');
 
 INSERT INTO schema_migrations (version) VALUES ('20180904110712');
 
+INSERT INTO schema_migrations (version) VALUES ('20180917205609');
+