Merge branch '8784-dir-listings'
[arvados.git] / services / api / db / migrate / 20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb
index 7f65450ee245dc3fdb6a762a818555b04f0510c9..31fc683e20085966c122f46e046dd3bd70722f3c 100644 (file)
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
 require 'has_uuid'
 require 'kind_and_etag'
 
@@ -55,14 +59,29 @@ class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration
   end
 
   def each_bad_collection
-    Collection.find_each do |coll|
-      next unless (coll.manifest_text =~ /\+[A-Z]/)
-      stripped_manifest = coll.manifest_text.
-        gsub(/( [0-9a-f]{32}(\+\d+)?)(\+\S+)/, '\1')
-      stripped_pdh = sprintf("%s+%i",
-                             Digest::MD5.hexdigest(stripped_manifest),
-                             stripped_manifest.bytesize)
-      yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh)
+    end_coll = Collection.order("id DESC").first
+    return if end_coll.nil?
+    seen_uuids = []  # UUIDs already visited under an earlier hint letter
+    ("A".."Z").each do |hint_char|
+      query = Collection.
+        where("id <= ? AND manifest_text LIKE '%+#{hint_char}%'", end_coll.id)
+      unless seen_uuids.empty?
+        query = query.where("uuid NOT IN (?)", seen_uuids)
+      end
+      # Keep the batch small enough that loading it cannot push the
+      # process into swap.  Worst case, every collection in a batch has
+      # a maximum-size manifest (64MiB); at a batch size of 50 that is
+      # about 3GiB, or roughly 4GiB once other ActiveRecord overhead is
+      # added -- a size we're comfortable with.
+      query.find_each(batch_size: 50) do |coll|
+        seen_uuids << coll.uuid
+        stripped_manifest = coll.manifest_text.
+          gsub(/( [0-9a-f]{32}(\+\d+)?)\+\S+/, '\1')  # keep hash and optional +size, drop trailing +hints
+        stripped_pdh = sprintf("%s+%i",
+                               Digest::MD5.hexdigest(stripped_manifest),
+                               stripped_manifest.bytesize)
+        yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh)
+      end
     end
   end