X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/4925686f6aa7214568ebd60be3acaa49dbf9dd1a..1d0aef8fb7bdad6d28976255919f58b719a4342b:/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb diff --git a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb index d983e7bbcf..9bbe113562 100644 --- a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb +++ b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb @@ -55,19 +55,26 @@ class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration end def each_bad_collection - # It's important to make sure that this line doesn't swap. The - # worst case scenario is that it finds a batch of collections that - # all have maximum size manifests (64MiB). With a batch size of - # 50, that's about 3GiB. Figure it will end up being 4GiB after - # other ActiveRecord overhead. That's a size we're comfortable with. - Collection.where("manifest_text ~ '\\+[A-Z]'"). - find_each(batch_size: 50) do |coll| - stripped_manifest = coll.manifest_text. - gsub(/( [0-9a-f]{32}(\+\d+)?)(\+\S+)/, '\1') - stripped_pdh = sprintf("%s+%i", - Digest::MD5.hexdigest(stripped_manifest), - stripped_manifest.bytesize) - yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh) + seen_uuids = [] + ("A".."Z").each do |hint_char| + query = Collection.where("manifest_text LIKE '%+#{hint_char}%'") + unless seen_uuids.empty? + query = query.where("uuid NOT IN (?)", seen_uuids) + end + # It's important to make sure that this line doesn't swap. The + # worst case scenario is that it finds a batch of collections that + # all have maximum size manifests (64MiB). With a batch size of + # 50, that's about 3GiB. Figure it will end up being 4GiB after + # other ActiveRecord overhead. That's a size we're comfortable with. + query.find_each(batch_size: 50) do |coll| + seen_uuids << coll.uuid + stripped_manifest = coll.manifest_text. + gsub(/( [0-9a-f]{32}(\+\d+)?)\+\S+/, '\1') + stripped_pdh = sprintf("%s+%i", + Digest::MD5.hexdigest(stripped_manifest), + stripped_manifest.bytesize) + yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh) + end end end