14873: Adjusts latest API Server changes to be rails5 worthy.
[arvados.git] / services / api / db / migrate / 20190322174136_add_file_info_to_collection.rb
index e12e508be7ea10fd061f17007ec95a72aacdfe5b..61f9b2d8841d0f0e850c1c5720037cc898c70539 100755 (executable)
@@ -2,46 +2,56 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-class AddFileInfoToCollection < ActiveRecord::Migration
+require "arvados/keep"
+require "group_pdhs"
+
+class AddFileInfoToCollection < ActiveRecord::Migration[4.2]
   def do_batch(pdhs)
+    # Recomputes file_count and file_size_total for every collection whose
+    # portable_data_hash appears in pdhs (an enumerable of pdh strings).
+    #
+    # NOTE(review): each pdh is interpolated directly into the SQL text below;
+    # presumably the values originate from the collections table rather than
+    # from user input -- confirm against the caller.
+    # NOTE(review): an empty pdhs would produce "IN ()" in the SELECT;
+    # presumably the caller never passes an empty batch -- confirm.
+    #
+    # Build a single-quoted, comma-terminated list: 'pdh1','pdh2',
     pdhs_str = ''
     pdhs.each do |pdh|
-      pdhs_str << "'" << pdh << "'" << ','
+      pdhs_str << "'" << pdh << "'" << ","
     end
 
+    # One row per distinct (portable_data_hash, manifest_text) pair;
+    # pdhs_str[0..-2] drops the trailing comma from the list built above.
     collections = ActiveRecord::Base.connection.exec_query(
-      'SELECT DISTINCT portable_data_hash, manifest_text FROM collections '\
+      "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
       "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
     )
 
     collections.rows.each do |row|
-      file_count = 0
-      file_size_total = 0
-      row[1].scan(/\S+/) do |token|
-        is_file = token.match(/^[[:digit:]]+:[[:digit:]]+:/)
-        if is_file
-          _, filesize, filename = token.split(':', 3)
-
-          # Avoid counting empty dir placeholders
-          break if filename == '.' && filesize.zero?
-
-          file_count += 1
-          file_size_total += filesize.to_i
-        end
-      end
-      ActiveRecord::Base.connection.exec_query('BEGIN')
-      ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{file_count}, "\
-                                               "file_size_total=#{file_size_total} "\
+      # row[0] is the portable_data_hash, row[1] the manifest_text; the counts
+      # come from Keep::Manifest#files_count and Keep::Manifest#files_size.
+      manifest = Keep::Manifest.new(row[1])
+      # Each UPDATE is wrapped in its own explicit BEGIN/COMMIT and touches
+      # every collections row that shares this portable_data_hash.
+      ActiveRecord::Base.connection.exec_query("BEGIN")
+      ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
+                                               "file_size_total=#{manifest.files_size} "\
                                                "WHERE portable_data_hash='#{row[0]}'")
-      ActiveRecord::Base.connection.exec_query('COMMIT')
+      ActiveRecord::Base.connection.exec_query("COMMIT")
     end
   end
 
   def up
+    # Adds the file_count and file_size_total columns to collections, then
+    # fills them in for existing rows by handing groups of pdhs to do_batch.
     add_column :collections, :file_count, :integer, default: 0, null: false
-    add_column :collections, :file_size_total, :integer, default: 0, null: false
+    # limit: 8 requests an 8-byte integer column for the size total.
+    add_column :collections, :file_size_total, :integer, limit: 8, default: 0, null: false
+
+    # Number of distinct pdhs; note that the distinct rows are fetched and
+    # then counted in Ruby (.rows.count) rather than counted by the database.
+    distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash FROM collections"
+    ).rows.count
+
+    # Generator that queries for all the distinct pdhs greater than last_pdh
+    # (at most 1000 per call, in ascending pdh order) and passes each one to
+    # the supplied block.
+    ordered_pdh_query = lambda { |last_pdh, &block|
+      pdhs = ActiveRecord::Base.connection.exec_query(
+        "SELECT DISTINCT portable_data_hash FROM collections "\
+        "WHERE portable_data_hash > '#{last_pdh}' "\
+        "ORDER BY portable_data_hash LIMIT 1000"
+      )
+      pdhs.rows.each do |row|
+        block.call(row[0])
+      end
+    }
 
-    Container.group_pdhs_for_multiple_transactions('AddFileInfoToCollection') do |pdhs|
+    # NOTE(review): presumably an upper bound GroupPdhs applies when sizing
+    # each group of pdhs -- confirm in GroupPdhs.
+    batch_size_max = 1 << 28 # 256 MiB
+    # Each group of pdhs yielded by GroupPdhs is processed by do_batch; the
+    # final string argument is passed through to GroupPdhs unchanged.
+    GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+                                                   distinct_pdh_count,
+                                                   batch_size_max,
+                                                   "AddFileInfoToCollection") do |pdhs|
       do_batch(pdhs)
     end
   end