Merge branch 'master' into 14930-arvput-trash-at
[arvados.git] / services / api / script / populate-file-info-columns-in-collections.rb
1 #!/usr/bin/env ruby
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 # Arvados version 1.4.0 introduces two new columns on the collections table named
7 #   file_count
8 #   file_size_total
9 #
10 # The database migration that adds these columns does not populate them with data;
11 # it only initializes them to zero.
12 #
13 # This script will populate the columns, if file_count is zero. It will ignore
14 # collections that have invalid manifests, but it will spit out details for those
15 # collections.
16 #
17 # Run the script as
18 #
19 # cd script
20 # RAILS_ENV=production bundle exec populate-file-info-columns-in-collections.rb
21 #
22
# Settle on the Rails environment before the application is booted: the first
# command-line argument wins, then an already-exported RAILS_ENV, and
# "development" is the fallback when neither is present.
ENV["RAILS_ENV"] = ARGV.first || ENV.fetch("RAILS_ENV", "development")
24 require File.dirname(__FILE__) + '/../config/boot'
25 require File.dirname(__FILE__) + '/../config/environment'
26
27 require "arvados/keep"
28 require "group_pdhs"
29
# Back-fill file_count and file_size_total for every collection whose
# portable data hash appears in +pdhs+.
#
# Each distinct (portable_data_hash, manifest_text) pair is parsed with
# Keep::Manifest and the resulting totals are written to all rows sharing
# that hash, one transaction per hash. A manifest that raises ArgumentError
# is reported on stdout and skipped; the remaining rows are still processed.
#
# pdhs:: collection of portable data hash strings; nil or empty is a no-op.
#
# Returns nil for an empty batch, otherwise the rows that were examined.
def do_batch(pdhs)
  # Nothing to look up; without this guard the query would contain "IN ()",
  # which is not valid SQL.
  return if pdhs.nil? || pdhs.empty?

  conn = ActiveRecord::Base.connection
  # Let the adapter quote each literal instead of hand-building 'pdh', lists.
  quoted_pdhs = pdhs.map { |pdh| conn.quote(pdh) }.join(",")

  collections = conn.exec_query(
    "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
    "WHERE portable_data_hash IN (#{quoted_pdhs}) "
  )
  collections.rows.each do |row|
    begin
      manifest = Keep::Manifest.new(row[1])
      # Derive both totals before any transaction is opened, so that a
      # manifest which fails to parse cannot leave a transaction dangling.
      file_count = manifest.files_count
      file_size_total = manifest.files_size
    rescue ArgumentError => detail
      report_skipped_collection(row, detail)
      next
    end

    # The transaction block commits on success and rolls back if the UPDATE
    # raises, replacing the hand-written BEGIN/COMMIT pair.
    ActiveRecord::Base.transaction do
      conn.exec_query("UPDATE collections SET file_count=#{file_count}, "\
                      "file_size_total=#{file_size_total} "\
                      "WHERE portable_data_hash=#{conn.quote(row[0])}")
    end
  end
end

# Print the raw query row, the matching Collection record and the error for
# a collection that do_batch is about to skip.
#
# row::    [portable_data_hash, manifest_text] as returned by the SELECT.
# detail:: the ArgumentError raised while processing the manifest.
def report_skipped_collection(row, detail)
  require 'pp' # loaded lazily: only this error path pretty-prints
  puts
  puts "*************** Row detail ***************"
  puts
  pp row
  puts
  puts "************ Collection detail ***********"
  puts
  pp Collection.find_by_portable_data_hash(row[0])
  puts
  puts "************** Error detail **************"
  puts
  pp detail
  puts
  puts "Skipping this collection, continuing!"
end
68
69
70 def main
71
72   distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
73     "SELECT DISTINCT portable_data_hash FROM collections where file_count=0"
74   ).rows.count
75
76   # Generator that queries for all the distinct pdhs greater than last_pdh
77   ordered_pdh_query = lambda { |last_pdh, &block|
78     pdhs = ActiveRecord::Base.connection.exec_query(
79       "SELECT DISTINCT portable_data_hash FROM collections "\
80       "WHERE file_count=0 and portable_data_hash > '#{last_pdh}' "\
81       "ORDER BY portable_data_hash LIMIT 1000"
82     )
83     pdhs.rows.each do |row|
84       block.call(row[0])
85     end
86   }
87
88   batch_size_max = 1 << 28 # 256 MiB
89   GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
90                                                  distinct_pdh_count,
91                                                  batch_size_max,
92                                                  "AddFileInfoToCollection") do |pdhs|
93     do_batch(pdhs)
94   end
95 end
96
# Run the back-fill as soon as this file is loaded; there is no
# "executed directly" guard around the call.
97 main