X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/aabde5c5f0cba198c90558a3f3bb100ea9a09b6d..442a871e7f3476938d0ebb3adbe3b9a7742f03ad:/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb diff --git a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb index d983e7bbcf..8814fc87d3 100644 --- a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb +++ b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb @@ -1,7 +1,11 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + require 'has_uuid' require 'kind_and_etag' -class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration +class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration[4.2] include CurrentApiClient class ArvadosModel < ActiveRecord::Base @@ -55,19 +59,29 @@ class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration end def each_bad_collection - # It's important to make sure that this line doesn't swap. The - # worst case scenario is that it finds a batch of collections that - # all have maximum size manifests (64MiB). With a batch size of - # 50, that's about 3GiB. Figure it will end up being 4GiB after - # other ActiveRecord overhead. That's a size we're comfortable with. - Collection.where("manifest_text ~ '\\+[A-Z]'"). - find_each(batch_size: 50) do |coll| - stripped_manifest = coll.manifest_text. - gsub(/( [0-9a-f]{32}(\+\d+)?)(\+\S+)/, '\1') - stripped_pdh = sprintf("%s+%i", - Digest::MD5.hexdigest(stripped_manifest), - stripped_manifest.bytesize) - yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh) + end_coll = Collection.order("id DESC").first + return if end_coll.nil? + seen_uuids = [] + ("A".."Z").each do |hint_char| + query = Collection. + where("id <= ? AND manifest_text LIKE '%+#{hint_char}%'", end_coll.id) + unless seen_uuids.empty? + query = query.where("uuid NOT IN (?)", seen_uuids) + end + # It's important to make sure that this line doesn't swap. The + # worst case scenario is that it finds a batch of collections that + # all have maximum size manifests (64MiB). With a batch size of + # 50, that's about 3GiB. Figure it will end up being 4GiB after + # other ActiveRecord overhead. That's a size we're comfortable with. + query.find_each(batch_size: 50) do |coll| + seen_uuids << coll.uuid + stripped_manifest = coll.manifest_text. + gsub(/( [0-9a-f]{32}(\+\d+)?)\+\S+/, '\1') + stripped_pdh = sprintf("%s+%i", + Digest::MD5.hexdigest(stripped_manifest), + stripped_manifest.bytesize) + yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh) + end end end