5319: Improve collection PDH fix performance with LIKE searches.
[arvados.git] / services / api / db / migrate / 20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb
index d983e7bbcfb7c120fbd1b7b542cadd8c89984c66..9bbe113562d7659800023745e3f62b241c53c383 100644 (file)
@@ -55,19 +55,26 @@ class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration
   end
 
   def each_bad_collection
-    # It's important to make sure that this line doesn't swap.  The
-    # worst case scenario is that it finds a batch of collections that
-    # all have maximum size manifests (64MiB).  With a batch size of
-    # 50, that's about 3GiB.  Figure it will end up being 4GiB after
-    # other ActiveRecord overhead.  That's a size we're comfortable with.
-    Collection.where("manifest_text ~ '\\+[A-Z]'").
-        find_each(batch_size: 50) do |coll|
-      stripped_manifest = coll.manifest_text.
-        gsub(/( [0-9a-f]{32}(\+\d+)?)(\+\S+)/, '\1')
-      stripped_pdh = sprintf("%s+%i",
-                             Digest::MD5.hexdigest(stripped_manifest),
-                             stripped_manifest.bytesize)
-      yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh)
+    seen_uuids = []
+    ("A".."Z").each do |hint_char|
+      query = Collection.where("manifest_text LIKE '%+#{hint_char}%'")
+      unless seen_uuids.empty?
+        query = query.where("uuid NOT IN (?)", seen_uuids)
+      end
+      # It's important to make sure that this line doesn't swap.  The
+      # worst case scenario is that it finds a batch of collections that
+      # all have maximum size manifests (64MiB).  With a batch size of
+      # 50, that's about 3GiB.  Figure it will end up being 4GiB after
+      # other ActiveRecord overhead.  That's a size we're comfortable with.
+      query.find_each(batch_size: 50) do |coll|
+        seen_uuids << coll.uuid
+        stripped_manifest = coll.manifest_text.
+          gsub(/( [0-9a-f]{32}(\+\d+)?)\+\S+/, '\1')
+        stripped_pdh = sprintf("%s+%i",
+                               Digest::MD5.hexdigest(stripped_manifest),
+                               stripped_manifest.bytesize)
+        yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh)
+      end
     end
   end