From 8c82f404b48a159797bd0e96e3d0098f0cf3ba16 Mon Sep 17 00:00:00 2001
From: Eric Biagiotti
Date: Mon, 25 Mar 2019 18:30:40 -0400
Subject: [PATCH] 14484: Adds functionality and test for pdh grouping in the container model

Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti
---
 services/api/app/models/container.rb     | 67 ++++++++++++++++++++++++
 services/api/test/unit/container_test.rb | 21 ++++++++
 2 files changed, 88 insertions(+)

diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb
index abcfdbd296..b3328d9c76 100644
--- a/services/api/app/models/container.rb
+++ b/services/api/app/models/container.rb
@@ -405,6 +405,73 @@ class Container < ArvadosModel
     end
   end
 
+  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
+  #
+  # Change with caution!
+  #
+  # Walks every distinct portable_data_hash in the collections table in
+  # ascending order, 1000 at a time, and yields them to the caller's block in
+  # batches built by group_pdhs_by_manifest_size. Helps avoid updating too
+  # many database rows in a single transaction.
+  #
+  # log_prefix is prepended to the progress messages sent to Rails.logger.
+  # A block is required; each call receives an array of pdh strings.
+  def self.group_pdhs_for_multiple_transactions(log_prefix)
+    batch_size_max = 1 << 28 # 256 MiB
+    conn = ActiveRecord::Base.connection
+    last_pdh = '0'
+    done = 0
+
+    total = conn.exec_query(
+      'SELECT DISTINCT portable_data_hash FROM collections'
+    ).rows.count
+
+    loop do
+      # ORDER BY is what makes "portable_data_hash > last_pdh" a usable
+      # cursor: without it LIMIT may return any 1000 matching rows, so pdhs
+      # could be skipped.
+      pdhs = conn.exec_query(
+        'SELECT DISTINCT portable_data_hash FROM collections '\
+        "WHERE portable_data_hash > #{conn.quote(last_pdh)} "\
+        'ORDER BY portable_data_hash LIMIT 1000'
+      ).rows.map(&:first) # unwrap single-column rows into plain strings
+      break if pdhs.empty?
+
+      Container.group_pdhs_by_manifest_size(pdhs, batch_size_max) do |grouped_pdhs|
+        yield grouped_pdhs
+        done += grouped_pdhs.size
+        Rails.logger.info("#{log_prefix}: #{done}/#{total}")
+      end
+      last_pdh = pdhs.last
+    end
+    Rails.logger.info("#{log_prefix}: finished")
+  end
+
+  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
+  #
+  # Change with caution!
+  #
+  # Given an array of pdhs, yield successive arrays of pdhs such that the total
+  # size of all manifest_texts in each array is no more than batch_size_max.
+  # Pdhs whose manifest_text is bigger than batch_size_max are yielded by
+  # themselves. The size is the number after the first '+' in the pdh.
+  # Nothing is yielded when pdhs is empty.
+  def self.group_pdhs_by_manifest_size(pdhs, batch_size_max)
+    batch_size = 0
+    batch_pdhs = {} # hash keys drop duplicate pdhs within a batch
+    pdhs.each do |pdh|
+      manifest_size = pdh.split('+')[1].to_i
+      if batch_size > 0 && batch_size + manifest_size > batch_size_max
+        yield batch_pdhs.keys
+        batch_pdhs = {}
+        batch_size = 0
+      end
+      batch_pdhs[pdh] = true
+      batch_size += manifest_size
+    end
+    yield batch_pdhs.keys unless batch_pdhs.empty?
+  end
+
   protected
 
   def fill_field_defaults
diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb
index 1a53df7dab..2b7fda8d7f 100644
--- a/services/api/test/unit/container_test.rb
+++ b/services/api/test/unit/container_test.rb
@@ -956,4 +956,25 @@ class ContainerTest < ActiveSupport::TestCase
       assert_no_secrets_logged
     end
   end
+
+  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb
+  # relies on this test. Change with caution!
+  test "pdh_grouping_by_manifest_size" do
+    batch_size_max = 200
+    pdhs_in = ['x1+30', 'x2+30', 'x3+201', 'x4+100', 'x5+100']
+    batched_pdhs = []
+    Container.group_pdhs_by_manifest_size(pdhs_in, batch_size_max) do |pdhs|
+      batched_pdhs << pdhs
+    end
+    expected = [['x1+30', 'x2+30'], ['x3+201'], ['x4+100', 'x5+100']]
+    assert_equal(expected, batched_pdhs)
+  end
+
+  test "pdh_grouping_by_manifest_size yields nothing for empty input" do
+    batched_pdhs = []
+    Container.group_pdhs_by_manifest_size([], 200) do |pdhs|
+      batched_pdhs << pdhs
+    end
+    assert_equal([], batched_pdhs)
+  end
 end
-- 
2.30.2