#
# SPDX-License-Identifier: AGPL-3.0
-class AddFileInfoToCollection < ActiveRecord::Migration
+require "arvados/keep"
+require "group_pdhs"
+
+class AddFileInfoToCollection < ActiveRecord::Migration[4.2]
# Recomputes file_count and file_size_total for every collection whose
# portable_data_hash appears in pdhs.
#
# pdhs - enumerable of portable data hash strings. Each value is wrapped in
#        single quotes and interpolated into the SQL text without escaping.
#        NOTE(review): presumably acceptable because the hashes originate
#        from the collections table itself -- confirm against the caller.
#
# Every (portable_data_hash, manifest_text) row returned by the SELECT is
# written back inside its own explicit BEGIN/COMMIT pair.
def do_batch(pdhs)
# Build the comma-separated, single-quoted list used in the IN (...) clause.
pdhs_str = ''
pdhs.each do |pdh|
- pdhs_str << "'" << pdh << "'" << ','
+ pdhs_str << "'" << pdh << "'" << ","
end
# pdhs_str[0..-2] drops the trailing comma left by the loop above.
# NOTE(review): an empty pdhs would produce "IN ()" -- presumably the caller
# never passes an empty batch; confirm.
collections = ActiveRecord::Base.connection.exec_query(
- 'SELECT DISTINCT portable_data_hash, manifest_text FROM collections '\
+ "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
"WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
)
# Column order follows the SELECT: row[0] is the portable_data_hash and
# row[1] is the manifest_text.
collections.rows.each do |row|
- file_count = 0
- file_size_total = 0
- row[1].scan(/\S+/) do |token|
- is_file = token.match(/^[[:digit:]]+:[[:digit:]]+:/)
- if is_file
- _, filesize, filename = token.split(':', 3)
-
- # Avoid counting empty dir placeholders
- break if filename == '.' && filesize.zero?
-
- file_count += 1
- file_size_total += filesize.to_i
- end
- end
- ActiveRecord::Base.connection.exec_query('BEGIN')
- ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{file_count}, "\
- "file_size_total=#{file_size_total} "\
# The manifest text is handed to Keep::Manifest (loaded via "arvados/keep");
# its files_count and files_size supply the two values written below.
# NOTE(review): the exact counting rules live in that library and are not
# visible here -- confirm they match the intended semantics.
# The UPDATE is keyed on portable_data_hash and bracketed by BEGIN/COMMIT
# statements issued as raw SQL.
+ manifest = Keep::Manifest.new(row[1])
+ ActiveRecord::Base.connection.exec_query("BEGIN")
+ ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
+ "file_size_total=#{manifest.files_size} "\
"WHERE portable_data_hash='#{row[0]}'")
- ActiveRecord::Base.connection.exec_query('COMMIT')
+ ActiveRecord::Base.connection.exec_query("COMMIT")
end
end
# Adds the file_count and file_size_total columns to collections (both
# NOT NULL, default 0) and then fills them in for existing rows by passing
# batches of portable data hashes to do_batch.
def up
add_column :collections, :file_count, :integer, default: 0, null: false
# limit: 8 requests an 8-byte integer column for the total size.
- add_column :collections, :file_size_total, :integer, default: 0, null: false
+ add_column :collections, :file_size_total, :integer, limit: 8, default: 0, null: false
+
# Number of distinct portable data hashes. The count is taken in Ruby over
# the rows the query returns, not with SQL COUNT.
+ distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+ "SELECT DISTINCT portable_data_hash FROM collections"
+ ).rows.count
+
# The lambda below takes the last hash already seen plus a block, and calls
# the block once per hash on the next page (at most 1000 rows, ascending).
# last_pdh is interpolated into the SQL text without escaping.
+ # Generator that queries for all the distinct pdhs greater than last_pdh
+ ordered_pdh_query = lambda { |last_pdh, &block|
+ pdhs = ActiveRecord::Base.connection.exec_query(
+ "SELECT DISTINCT portable_data_hash FROM collections "\
+ "WHERE portable_data_hash > '#{last_pdh}' "\
+ "ORDER BY portable_data_hash LIMIT 1000"
+ )
+ pdhs.rows.each do |row|
+ block.call(row[0])
+ end
+ }
# 1 << 28 == 268_435_456.
# NOTE(review): how GroupPdhs interprets this limit and the log label, and
# the shape of the pdhs value it yields, are defined in "group_pdhs" and not
# visible here -- presumably it yields groups of hashes sized to stay under
# the limit; confirm.
- Container.group_pdhs_for_multiple_transactions('AddFileInfoToCollection') do |pdhs|
+ batch_size_max = 1 << 28 # 256 MiB
+ GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+ distinct_pdh_count,
+ batch_size_max,
+ "AddFileInfoToCollection") do |pdhs|
do_batch(pdhs)
end
end