# Origin: Arvados commit "13752: Migrate file_names column using multiple
# transactions." (Tom Clegg, 2018-09-17). That commit adds this migration as
# services/api/db/migrate/20180917205609_recompute_file_names_index.rb and
# records version '20180917205609' in services/api/db/structure.sql.

# Recomputes collections.file_names for every distinct portable_data_hash,
# committing the work in several transactions instead of a single one.
class RecomputeFileNamesIndex < ActiveRecord::Migration
  # Upper bound on the combined manifest size processed in one transaction.
  BATCH_SIZE_MAX = 1 << 28 # 256 MiB

  # Number of distinct portable_data_hash values fetched per paging query.
  PAGE_SIZE = 1000

  # Rewrites file_names for all collections whose portable_data_hash is in
  # +pdhs+, inside one explicit database transaction.
  #
  # NOTE(review): on databases with transactional DDL, Rails may already wrap
  # the migration in its own transaction; confirm the explicit BEGIN/COMMIT
  # interacts with that as intended (see disable_ddl_transaction!).
  #
  # @param pdhs [Array<String>] portable data hashes to process
  # @raise [StandardError] re-raises any failure after rolling back the batch
  def do_batch(pdhs:)
    # Nothing to update, so do not open a transaction for an empty batch.
    return if pdhs.empty?

    conn = ActiveRecord::Base.connection
    conn.exec_query('BEGIN')
    begin
      # Plain `distinct`: the method takes a boolean, not a column name, so
      # DISTINCT applies to the whole selected row either way.
      Collection.
        select(:portable_data_hash, :manifest_text).
        where(portable_data_hash: pdhs).
        distinct.
        each do |c|
        conn.exec_query("update collections set file_names=$1 where portable_data_hash=$2",
                        "update file_names index",
                        [[nil, c.manifest_files], [nil, c.portable_data_hash]])
      end
      conn.exec_query('COMMIT')
    rescue StandardError
      # Do not leave the connection inside a half-applied transaction.
      conn.exec_query('ROLLBACK')
      raise
    end
  end

  # Process collections in multiple transactions, where the total size of
  # all manifest_texts processed in a transaction is no more than
  # BATCH_SIZE_MAX. Collections whose manifest_text is bigger than
  # BATCH_SIZE_MAX are updated in their own individual transactions.
  def up
    batch_pdhs = []
    batch_size = 0
    done = 0
    # Paging cursor; assumes every portable_data_hash sorts after '0'.
    last_pdh = '0'
    # Count through the same unscoped relation that is iterated below, so the
    # progress denominator matches the set actually enumerated.
    total = Collection.unscoped.distinct.count(:portable_data_hash)

    loop do
      page = Collection.
             unscoped.
             select(:portable_data_hash).distinct.
             order(:portable_data_hash).
             where('portable_data_hash > ?', last_pdh).
             limit(PAGE_SIZE).
             to_a
      break if page.empty?

      page.each do |c|
        pdh = c.portable_data_hash
        # The integer after the first '+' in the hash is used as the size.
        manifest_size = pdh.split('+')[1].to_i
        # Flush before adding an entry that would push the batch over the cap.
        if batch_size > 0 && batch_size + manifest_size > BATCH_SIZE_MAX
          do_batch(pdhs: batch_pdhs)
          done += batch_pdhs.size
          Rails.logger.info("RecomputeFileNamesIndex: #{done}/#{total}")
          batch_pdhs = []
          batch_size = 0
        end
        batch_pdhs << pdh
        batch_size += manifest_size
      end
      last_pdh = page.last.portable_data_hash
    end

    # Flush whatever is left over after the last page.
    do_batch(pdhs: batch_pdhs)
    Rails.logger.info("RecomputeFileNamesIndex: finished")
  end

  # Intentionally a no-op: rolling back leaves file_names as recomputed.
  def down
  end
end