From: Eric Biagiotti
Date: Fri, 29 Mar 2019 18:46:30 +0000 (-0400)
Subject: 14484: Moves pdh grouping into a lib module
X-Git-Tag: 1.4.0~77^2~7
X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/59a1fc872723c0bafa9764b95756723f54419631

14484: Moves pdh grouping into a lib module

Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti
---

diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index 0f48a75011..abcfdbd296 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -405,40 +405,6 @@ class Container < ArvadosModel
     end
   end
 
-  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
-  #
-  # Change with caution!
-  #
-  # Correctly groups pdhs to use for batch database updates. Helps avoid
-  # updating too many database rows in a single transaction.
-  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, batch_size_max, log_prefix)
-    batch_size = 0
-    batch_pdhs = {}
-    last_pdh = '0'
-    done = 0
-    any = true
-
-    while any
-      any = false
-      distinct_ordered_pdhs.call(last_pdh) do |pdh|
-        any = true
-        last_pdh = pdh
-        manifest_size = pdh.split('+')[1].to_i
-        if batch_size > 0 && batch_size + manifest_size > batch_size_max
-          yield batch_pdhs.keys
-          done += batch_pdhs.size
-          Rails.logger.info(log_prefix + ": #{done}/#{distinct_pdh_count}")
-          batch_pdhs = {}
-          batch_size = 0
-        end
-        batch_pdhs[pdh] = true
-        batch_size += manifest_size
-      end
-    end
-    yield batch_pdhs.keys
-    Rails.logger.info(log_prefix + ": finished")
-  end
-
   protected
 
   def fill_field_defaults
diff --git a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
index 47f5398265..146e105afa 100755
--- a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
+++ b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: AGPL-3.0
 
 require "arvados/keep"
+require "group_pdhs"
 
 class AddFileInfoToCollection < ActiveRecord::Migration
   def do_batch(pdhs)
@@ -34,7 +35,7 @@ class AddFileInfoToCollection < ActiveRecord::Migration
       "SELECT DISTINCT portable_data_hash FROM collections"
     ).rows.count
 
-    # Generator that queries for all the distince pdhs greater than last_pdh
+    # Generator that queries for all the distinct pdhs greater than last_pdh
     ordered_pdh_query = lambda { |last_pdh, &block|
       pdhs = ActiveRecord::Base.connection.exec_query(
         "SELECT DISTINCT portable_data_hash FROM collections "\
@@ -47,7 +48,7 @@ class AddFileInfoToCollection < ActiveRecord::Migration
     }
 
     batch_size_max = 1 << 28 # 256 MiB
-    Container.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+    GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
                                                    distinct_pdh_count,
                                                    batch_size_max,
                                                    "AddFileInfoToCollection") do |pdhs|
diff --git a/services/api/lib/group_pdhs.rb b/services/api/lib/group_pdhs.rb
new file mode 100644
index 0000000000..0630ef8b5e
--- /dev/null
+++ b/services/api/lib/group_pdhs.rb
@@ -0,0 +1,39 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+module GroupPdhs
+  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
+  #
+  # Change with caution!
+  #
+  # Correctly groups pdhs to use for batch database updates. Helps avoid
+  # updating too many database rows in a single transaction.
+  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, batch_size_max, log_prefix)
+    batch_size = 0
+    batch_pdhs = {}
+    last_pdh = '0'
+    done = 0
+    any = true
+
+    while any
+      any = false
+      distinct_ordered_pdhs.call(last_pdh) do |pdh|
+        any = true
+        last_pdh = pdh
+        manifest_size = pdh.split('+')[1].to_i
+        if batch_size > 0 && batch_size + manifest_size > batch_size_max
+          yield batch_pdhs.keys
+          done += batch_pdhs.size
+          Rails.logger.info(log_prefix + ": #{done}/#{distinct_pdh_count}")
+          batch_pdhs = {}
+          batch_size = 0
+        end
+        batch_pdhs[pdh] = true
+        batch_size += manifest_size
+      end
+    end
+    yield batch_pdhs.keys
+    Rails.logger.info(log_prefix + ": finished")
+  end
+end