2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: AGPL-3.0
6 # Arvados version 1.4.0 introduces two new columns on the collections table named
10 # The database migration that adds these columns does not populate them with data;
11 # it only initializes them to zero.
13 # This script will populate the columns wherever file_count is zero. It will ignore
14 # collections that have invalid manifests, but it will spit out details for those
20 # RAILS_ENV=production bundle exec populate-file-info-columns-in-collections.rb
23 ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
24 require File.dirname(__FILE__) + '/../config/boot'
25 require File.dirname(__FILE__) + '/../config/environment'
27 require "arvados/keep"
33 pdhs_str << "'" << pdh << "'" << ","
36 collections = ActiveRecord::Base.connection.exec_query(
37 "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
38 "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
40 collections.rows.each do |row|
42 manifest = Keep::Manifest.new(row[1])
43 ActiveRecord::Base.connection.exec_query("BEGIN")
44 ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
45 "file_size_total=#{manifest.files_size} "\
46 "WHERE portable_data_hash='#{row[0]}'")
47 ActiveRecord::Base.connection.exec_query("COMMIT")
48 rescue ArgumentError => detail
51 puts "*************** Row detail ***************"
55 puts "************ Collection detail ***********"
57 pp Collection.find_by_portable_data_hash(row[0])
59 puts "************** Error detail **************"
63 puts "Skipping this collection, continuing!"
72 distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
73 "SELECT DISTINCT portable_data_hash FROM collections where file_count=0"
76 # Generator that queries, in ascending order, for up to 1000 distinct pdhs with file_count=0 greater than last_pdh
77 ordered_pdh_query = lambda { |last_pdh, &block|
78 pdhs = ActiveRecord::Base.connection.exec_query(
79 "SELECT DISTINCT portable_data_hash FROM collections "\
80 "WHERE file_count=0 and portable_data_hash > '#{last_pdh}' "\
81 "ORDER BY portable_data_hash LIMIT 1000"
83 pdhs.rows.each do |row|
88 batch_size_max = 1 << 28 # 256 MiB
89 GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
92 "AddFileInfoToCollection") do |pdhs|