h3. current master branch
+h4. Populating the new file_count and file_size_total columns on the collections table
+
+As part of story "#14484":https://dev.arvados.org/issues/14484, two new columns were added to the collections table in a database migration. These columns are initialized with a zero value. In order to populate them, it is necessary to run a script called <code class="userinput">populate-file-info-columns-in-collections.rb</code> from the scripts directory of the API server. This can be done out of band, ideally directly after the API server has been upgraded to v1.4.0.
+
h4. Stricter collection manifest validation on the API server
As a consequence of "#14482":https://dev.arvados.org/issues/14482, the Ruby SDK does a more rigorous collection manifest validation. Collections created after 2015-05 are unlikely to be invalid; however, you may check for invalid manifests using the script below.
#
# SPDX-License-Identifier: AGPL-3.0
-require "arvados/keep"
-require "group_pdhs"
-
class AddFileInfoToCollection < ActiveRecord::Migration[4.2]
- def do_batch(pdhs)
- pdhs_str = ''
- pdhs.each do |pdh|
- pdhs_str << "'" << pdh << "'" << ","
- end
-
- collections = ActiveRecord::Base.connection.exec_query(
- "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
- "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
- )
-
- collections.rows.each do |row|
- manifest = Keep::Manifest.new(row[1])
- ActiveRecord::Base.connection.exec_query("BEGIN")
- ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
- "file_size_total=#{manifest.files_size} "\
- "WHERE portable_data_hash='#{row[0]}'")
- ActiveRecord::Base.connection.exec_query("COMMIT")
- end
- end
-
def up
add_column :collections, :file_count, :integer, default: 0, null: false
add_column :collections, :file_size_total, :integer, limit: 8, default: 0, null: false
- distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
- "SELECT DISTINCT portable_data_hash FROM collections"
- ).rows.count
-
- # Generator that queries for all the distinct pdhs greater than last_pdh
- ordered_pdh_query = lambda { |last_pdh, &block|
- pdhs = ActiveRecord::Base.connection.exec_query(
- "SELECT DISTINCT portable_data_hash FROM collections "\
- "WHERE portable_data_hash > '#{last_pdh}' "\
- "ORDER BY portable_data_hash LIMIT 1000"
- )
- pdhs.rows.each do |row|
- block.call(row[0])
- end
- }
-
- batch_size_max = 1 << 28 # 256 MiB
- GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
- distinct_pdh_count,
- batch_size_max,
- "AddFileInfoToCollection") do |pdhs|
- do_batch(pdhs)
- end
+ puts "Collections now have two new columns, file_count and file_size_total."
+ puts "They were initialized with a zero value. If you are upgrading an Arvados"
+ puts "installation, please run the populate-file-info-columns-in-collections.rb"
+ puts "script to populate the columns. If this is a new installation, that is not"
+ puts "necessary."
end
def down
--- /dev/null
+#!/usr/bin/env ruby
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+# Arvados version 1.4.0 introduces two new columns on the collections table named
+# file_count
+# file_size_total
+#
+# The database migration that adds these columns does not populate them with
+# data; it only initializes them to zero.
+#
+# This script populates the columns for collections whose file_count is zero.
+# It skips collections that have invalid manifests, but it prints the details
+# of each skipped collection.
+#
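+# Run the script as
+#
+# cd scripts
+# RAILS_ENV=production bundle exec populate-file-info-columns-in-collections.rb
+#
+# The Rails environment may also be given as the first command-line argument,
+# which takes precedence over RAILS_ENV. If neither is set, "development" is used.
+#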
+
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+
+require "arvados/keep"
+require "group_pdhs"
+
+ def do_batch(pdhs)
+ pdhs_str = ''
+ pdhs.each do |pdh|
+ pdhs_str << "'" << pdh << "'" << ","
+ end
+
+ collections = ActiveRecord::Base.connection.exec_query(
+ "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
+ "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
+ )
+ collections.rows.each do |row|
+ begin
+ manifest = Keep::Manifest.new(row[1])
+ ActiveRecord::Base.connection.exec_query("BEGIN")
+ ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
+ "file_size_total=#{manifest.files_size} "\
+ "WHERE portable_data_hash='#{row[0]}'")
+ ActiveRecord::Base.connection.exec_query("COMMIT")
+ rescue ArgumentError => detail
+ require 'pp'
+ puts
+ puts "*************** Row detail ***************"
+ puts
+ pp row
+ puts
+ puts "************ Collection detail ***********"
+ puts
+ pp Collection.find_by_portable_data_hash(row[0])
+ puts
+ puts "************** Error detail **************"
+ puts
+ pp detail
+ puts
+ puts "Skipping this collection, continuing!"
+ next
+ end
+ end
+ end
+
+
+def main
+
+ distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+ "SELECT DISTINCT portable_data_hash FROM collections"
+ ).rows.count
+
+ # Generator that queries for all the distinct pdhs greater than last_pdh
+ ordered_pdh_query = lambda { |last_pdh, &block|
+ pdhs = ActiveRecord::Base.connection.exec_query(
+ "SELECT DISTINCT portable_data_hash FROM collections "\
+ "WHERE file_count=0 and portable_data_hash > '#{last_pdh}' "\
+ "ORDER BY portable_data_hash LIMIT 1000"
+ )
+ pdhs.rows.each do |row|
+ block.call(row[0])
+ end
+ }
+
+ batch_size_max = 1 << 28 # 256 MiB
+ GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+ distinct_pdh_count,
+ batch_size_max,
+ "AddFileInfoToCollection") do |pdhs|
+ do_batch(pdhs)
+ end
+end
+
+main