X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/2a610a9d5d7e9446935436ff37267b8ada25fdbd..a7631a1ccb6e2a6925d00a06562e171c4ce4ea2f:/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb diff --git a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb index 89d54a06b0..8814fc87d3 100644 --- a/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb +++ b/services/api/db/migrate/20150303210106_fix_collection_portable_data_hash_with_hinted_manifest.rb @@ -1,12 +1,17 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + require 'has_uuid' require 'kind_and_etag' -class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration +class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration[4.2] include CurrentApiClient class ArvadosModel < ActiveRecord::Base self.abstract_class = true extend HasUuid::ClassMethods + include CurrentApiClient include KindAndEtag before_create do |record| record.uuid ||= record.class.generate_uuid @@ -54,14 +59,29 @@ class FixCollectionPortableDataHashWithHintedManifest < ActiveRecord::Migration end def each_bad_collection - Collection.find_each do |coll| - next unless (coll.manifest_text =~ /\+[A-Z]/) - stripped_manifest = coll.manifest_text. - gsub(/( [0-9a-f]{32}(\+\d+)?)(\+\S+)/, '\1') - stripped_pdh = sprintf("%s+%i", - Digest::MD5.hexdigest(stripped_manifest), - stripped_manifest.bytesize) - yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh) + end_coll = Collection.order("id DESC").first + return if end_coll.nil? + seen_uuids = [] + ("A".."Z").each do |hint_char| + query = Collection. + where("id <= ? AND manifest_text LIKE '%+#{hint_char}%'", end_coll.id) + unless seen_uuids.empty? + query = query.where("uuid NOT IN (?)", seen_uuids) + end + # It's important to make sure that this line doesn't swap. The + # worst case scenario is that it finds a batch of collections that + # all have maximum size manifests (64MiB). With a batch size of + # 50, that's about 3GiB. Figure it will end up being 4GiB after + # other ActiveRecord overhead. That's a size we're comfortable with. + query.find_each(batch_size: 50) do |coll| + seen_uuids << coll.uuid + stripped_manifest = coll.manifest_text. + gsub(/( [0-9a-f]{32}(\+\d+)?)\+\S+/, '\1') + stripped_pdh = sprintf("%s+%i", + Digest::MD5.hexdigest(stripped_manifest), + stripped_manifest.bytesize) + yield [coll, stripped_pdh] if (coll.portable_data_hash != stripped_pdh) + end end end