14484: Moves pdh grouping into a lib module
authorEric Biagiotti <ebiagiotti@veritasgenetics.com>
Fri, 29 Mar 2019 18:46:30 +0000 (14:46 -0400)
committerEric Biagiotti <ebiagiotti@veritasgenetics.com>
Fri, 29 Mar 2019 18:46:30 +0000 (14:46 -0400)
Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti@veritasgenetics.com>

services/api/app/models/container.rb
services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
services/api/lib/group_pdhs.rb [new file with mode: 0644]

index 0f48a750119164bb0790e1ff926e39bf6c3b0976..abcfdbd296b3ab71cf7e8466e7c9279076f2c93f 100644 (file)
@@ -405,40 +405,6 @@ class Container < ArvadosModel
     end
   end
 
-  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
-  #
-  # Change with caution!
-  #
-  # Correctly groups pdhs to use for batch database updates. Helps avoid
-  # updating too many database rows in a single transaction.
-  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, batch_size_max, log_prefix)
-    batch_size = 0
-    batch_pdhs = {}
-    last_pdh = '0'
-    done = 0
-    any = true
-
-    while any
-      any = false
-      distinct_ordered_pdhs.call(last_pdh) do |pdh|
-        any = true
-        last_pdh = pdh
-        manifest_size = pdh.split('+')[1].to_i
-        if batch_size > 0 && batch_size + manifest_size > batch_size_max
-          yield batch_pdhs.keys
-          done += batch_pdhs.size
-          Rails.logger.info(log_prefix + ": #{done}/#{distinct_pdh_count}")
-          batch_pdhs = {}
-          batch_size = 0
-        end
-        batch_pdhs[pdh] = true
-        batch_size += manifest_size
-      end
-    end
-    yield batch_pdhs.keys
-    Rails.logger.info(log_prefix + ": finished")
-  end
-
   protected
 
   def fill_field_defaults
index 47f539826515a9b7dce71b800b1663ad036891a0..146e105afaac9331dbd6fc2926b29198718c69d0 100755 (executable)
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: AGPL-3.0
 
 require "arvados/keep"
+require "group_pdhs"
 
 class AddFileInfoToCollection < ActiveRecord::Migration
   def do_batch(pdhs)
@@ -34,7 +35,7 @@ class AddFileInfoToCollection < ActiveRecord::Migration
       "SELECT DISTINCT portable_data_hash FROM collections"
     ).rows.count
 
-    # Generator that queries for all the distince pdhs greater than last_pdh
+    # Generator that queries for all the distinct pdhs greater than last_pdh
     ordered_pdh_query = lambda { |last_pdh, &block|
       pdhs = ActiveRecord::Base.connection.exec_query(
         "SELECT DISTINCT portable_data_hash FROM collections "\
@@ -47,7 +48,7 @@ class AddFileInfoToCollection < ActiveRecord::Migration
     }
 
     batch_size_max = 1 << 28 # 256 MiB
-    Container.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+    GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
                                                    distinct_pdh_count,
                                                    batch_size_max,
                                                    "AddFileInfoToCollection") do |pdhs|
diff --git a/services/api/lib/group_pdhs.rb b/services/api/lib/group_pdhs.rb
new file mode 100644 (file)
index 0000000..0630ef8
--- /dev/null
@@ -0,0 +1,39 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+module GroupPdhs
+  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function. Change with caution!
+  #
+  # Groups pdhs into batches for database updates. Helps avoid updating too many rows in a single transaction.
+  # distinct_ordered_pdhs is invoked as .call(last_pdh) { |pdh| ... }, repeatedly, until a call yields no pdh;
+  # the caller's block receives each batch as an Array of pdhs. distinct_pdh_count only appears in progress
+  # logs, batch_size_max caps the summed manifest sizes of a batch, and log_prefix starts every log message.
+  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, batch_size_max, log_prefix)
+    batch_size = 0 # sum of the manifest sizes in the current batch
+    batch_pdhs = {} # pdhs of the current batch, stored as hash keys
+    last_pdh = '0' # cursor passed to the generator; presumably '0' sorts before every real pdh -- confirm
+    done = 0 # number of pdhs already yielded, used for progress logging
+    any = true
+
+    while any # keep calling the generator until one call yields nothing
+      any = false
+      distinct_ordered_pdhs.call(last_pdh) do |pdh|
+        any = true
+        last_pdh = pdh
+        manifest_size = pdh.split('+')[1].to_i # integer after the first '+' of the pdh (0 when absent)
+        if batch_size > 0 && batch_size + manifest_size > batch_size_max # flush first; a lone oversized pdh still gets a batch
+          yield batch_pdhs.keys
+          done += batch_pdhs.size
+          Rails.logger.info(log_prefix + ": #{done}/#{distinct_pdh_count}")
+          batch_pdhs = {}
+          batch_size = 0
+        end
+        batch_pdhs[pdh] = true
+        batch_size += manifest_size
+      end
+    end
+    yield batch_pdhs.keys # final batch; this is an empty Array when the generator never yielded a pdh
+    Rails.logger.info(log_prefix + ": finished")
+  end
+end