Move the population of the new columns on the collections table to a standalone
author Ward Vandewege <wvandewege@veritasgenetics.com>
Fri, 31 May 2019 18:51:50 +0000 (14:51 -0400)
committer Ward Vandewege <wvandewege@veritasgenetics.com>
Fri, 31 May 2019 18:51:50 +0000 (14:51 -0400)
script that should be run separate from the migration. Add a note to the
upgrade documentation along those lines. Make the script not blow up on
collections with invalid manifests, but rather just skip them.

refs #15093
refs #14484

Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <wvandewege@veritasgenetics.com>

doc/admin/upgrading.html.textile.liquid
services/api/db/migrate/20190322174136_add_file_info_to_collection.rb [changed mode: 0755->0644]
services/api/script/populate-file-info-columns-in-collections.rb [new file with mode: 0755]

index 09bef2a62acd18c5f2d0b02ef022248e50033956..def8bed79ccafad72c894a5cd49d83a630f781f9 100644 (file)
@@ -32,6 +32,10 @@ TODO: extract this information based on git commit messages and generate changel
 
 h3. current master branch
 
+h4. Populating the new file_count and file_size_total columns on the collections table
+
+As part of story "#14484":https://dev.arvados.org/issues/14484, a database migration added two new columns, <code>file_count</code> and <code>file_size_total</code>, to the collections table. The migration initializes both columns to zero. To populate them, run the script <code class="userinput">populate-file-info-columns-in-collections.rb</code> from the <code>script</code> directory of the API server. This can be done out of band, ideally directly after the API server has been upgraded to v1.4.0.
+
 h4. Stricter collection manifest validation on the API server
 
 As a consequence of "#14482":https://dev.arvados.org/issues/14482, the Ruby SDK does a more rigorous collection manifest validation. Collections created after 2015-05 are unlikely to be invalid, however you may check for invalid manifests using the script below.
old mode 100755 (executable)
new mode 100644 (file)
index 61f9b2d..c0cd40d
@@ -2,58 +2,16 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-require "arvados/keep"
-require "group_pdhs"
-
 class AddFileInfoToCollection < ActiveRecord::Migration[4.2]
-  def do_batch(pdhs)
-    pdhs_str = ''
-    pdhs.each do |pdh|
-      pdhs_str << "'" << pdh << "'" << ","
-    end
-
-    collections = ActiveRecord::Base.connection.exec_query(
-      "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
-      "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
-    )
-
-    collections.rows.each do |row|
-      manifest = Keep::Manifest.new(row[1])
-      ActiveRecord::Base.connection.exec_query("BEGIN")
-      ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
-                                               "file_size_total=#{manifest.files_size} "\
-                                               "WHERE portable_data_hash='#{row[0]}'")
-      ActiveRecord::Base.connection.exec_query("COMMIT")
-    end
-  end
-
   def up
     add_column :collections, :file_count, :integer, default: 0, null: false
     add_column :collections, :file_size_total, :integer, limit: 8, default: 0, null: false
 
-    distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
-      "SELECT DISTINCT portable_data_hash FROM collections"
-    ).rows.count
-
-    # Generator that queries for all the distinct pdhs greater than last_pdh
-    ordered_pdh_query = lambda { |last_pdh, &block|
-      pdhs = ActiveRecord::Base.connection.exec_query(
-        "SELECT DISTINCT portable_data_hash FROM collections "\
-        "WHERE portable_data_hash > '#{last_pdh}' "\
-        "ORDER BY portable_data_hash LIMIT 1000"
-      )
-      pdhs.rows.each do |row|
-        block.call(row[0])
-      end
-    }
-
-    batch_size_max = 1 << 28 # 256 MiB
-    GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
-                                                   distinct_pdh_count,
-                                                   batch_size_max,
-                                                   "AddFileInfoToCollection") do |pdhs|
-      do_batch(pdhs)
-    end
+    puts "Collections now have two new columns, file_count and file_size_total."
+    puts "They were initialized with a zero value. If you are upgrading an Arvados"
+    puts "installation, please run the populate-file-info-columns-in-collections.rb"
+    puts "script to populate the columns. If this is a new installation, that is not"
+    puts "necessary."
   end
 
   def down
diff --git a/services/api/script/populate-file-info-columns-in-collections.rb b/services/api/script/populate-file-info-columns-in-collections.rb
new file mode 100755 (executable)
index 0000000..b0bc5a2
--- /dev/null
@@ -0,0 +1,97 @@
+#!/usr/bin/env ruby
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+# Arvados version 1.4.0 introduces two new columns on the collections table named
+#   file_count
+#   file_size_total
+#
+# The database migration that adds these columns does not populate them with data;
+# it only initializes them to zero.
+#
+# This script populates the columns for collections whose file_count is zero.
+# It skips collections that have invalid manifests, and prints the details of
+# each skipped collection.
+#
+# Run the script as
+#
+# cd script
+# RAILS_ENV=production bundle exec ./populate-file-info-columns-in-collections.rb
+#
+
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+
+require "arvados/keep"
+require "group_pdhs"
+
+  def do_batch(pdhs)
+    pdhs_str = ''
+    pdhs.each do |pdh|
+      pdhs_str << "'" << pdh << "'" << ","
+    end
+
+    collections = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
+      "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
+    )
+    collections.rows.each do |row|
+      begin
+        manifest = Keep::Manifest.new(row[1])
+        ActiveRecord::Base.connection.exec_query("BEGIN")
+        ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
+                                                 "file_size_total=#{manifest.files_size} "\
+                                                 "WHERE portable_data_hash='#{row[0]}'")
+        ActiveRecord::Base.connection.exec_query("COMMIT")
+      rescue ArgumentError => detail
+        require 'pp'
+        puts
+        puts "*************** Row detail ***************"
+        puts
+        pp row
+        puts
+        puts "************ Collection detail ***********"
+        puts
+        pp Collection.find_by_portable_data_hash(row[0])
+        puts
+        puts "************** Error detail **************"
+        puts
+        pp detail
+        puts
+        puts "Skipping this collection, continuing!"
+        next
+      end
+    end
+  end
+
+
+def main
+
+  distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+    "SELECT DISTINCT portable_data_hash FROM collections"
+  ).rows.count
+
+  # Generator that queries for all the distinct pdhs greater than last_pdh
+  ordered_pdh_query = lambda { |last_pdh, &block|
+    pdhs = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash FROM collections "\
+      "WHERE file_count=0 and portable_data_hash > '#{last_pdh}' "\
+      "ORDER BY portable_data_hash LIMIT 1000"
+    )
+    pdhs.rows.each do |row|
+      block.call(row[0])
+    end
+  }
+
+  batch_size_max = 1 << 28 # 256 MiB
+  GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+                                                 distinct_pdh_count,
+                                                 batch_size_max,
+                                                 "AddFileInfoToCollection") do |pdhs|
+    do_batch(pdhs)
+  end
+end
+
+main