# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

module SalvageCollection
  # Salvage a collection whose manifest_text is unusable.
  #
  # Takes two input parameters: a collection uuid and a reason.
  #
  # Steps performed by salvage_collection, in order:
  #   1. Get "src_collection" with the given uuid.
  #   2. Write its manifest_text to a temporary file and hand that file to
  #      "arv-put --as-stream --use-filename invalid_manifest_text.txt";
  #      the command's stdout is used as the start of the new manifest.
  #   3. Append one extra stream line listing every locator-like string
  #      found in the original manifest as a single file "salvaged_data".
  #   4. Save a new collection holding that manifest.
  #   5. Rename src_collection (appending the reason and the new uuid),
  #      empty its manifest_text, and set its portable_data_hash to
  #      "d41d8cd98f00b204e9800998ecf8427e+0" (MD5 of the empty string, size 0).
  #
  # Every failure is reported by raising; nothing is swallowed.

  require File.dirname(__FILE__) + '/../config/environment'
  include ApplicationHelper
  require 'tempfile'
  require 'shellwords'

  # Run the given shell command and return its stdout.
  #
  # cmd:: complete shell command line (already escaped by the caller).
  #
  # Raises RuntimeError if the command exits unsuccessfully; the message
  # includes the process status and the command itself.
  def salvage_collection_arv_put cmd
    new_manifest = %x(#{cmd})
    # Capture $? immediately so nothing else can overwrite it before it is
    # inspected and reported.
    status = $?
    return new_manifest if status.success?
    raise "Error during arv-put: #{status} (cmd was #{cmd.inspect})"
  end

  # Get all the locators (and perhaps other strings that look a lot
  # like locators) from the original manifest, even if they don't
  # appear in the correct positions with the correct space delimiters.
  #
  # manifest:: manifest text to scan (a String).
  #
  # Returns [locators, size]:
  # * locators is an Array of "hash" or "hash+size" strings with the hash
  #   part downcased and any hints after the size dropped. If nothing
  #   matched, it contains only the empty-data locator.
  # * size is the sum of all size hints that were found.
  def salvage_collection_locator_data manifest
    locators = []
    size = 0
    # A candidate is exactly 32 hex digits not preceded by another hex
    # digit, optionally followed by "+<digits>" as a size hint.
    manifest.scan(/(^|[^[:xdigit:]])([[:xdigit:]]{32})((\+\d+)(\+|\b))?/) do |_, hash, _, sizehint, _|
      if sizehint
        locators << hash.downcase + sizehint
        # sizehint looks like "+123"; to_i accepts the leading plus sign.
        size += sizehint.to_i
      else
        locators << hash.downcase
      end
    end
    locators << 'd41d8cd98f00b204e9800998ecf8427e+0' if locators.empty?
    [locators, size]
  end

  # Salvage the collection identified by uuid (see the module comment for
  # the individual steps).
  #
  # uuid::   uuid of the collection to salvage; required.
  # reason:: text appended to the source collection's name.
  #
  # Raises RuntimeError when the ARVADOS_API_TOKEN / ARVADOS_API_HOST
  # environment variables are missing, when uuid is blank, when no
  # collection has that uuid, or when arv-put fails. Errors raised by
  # save! propagate unchanged.
  def salvage_collection uuid, reason='salvaged - see #6277, #6859'
    # NOTE(review): act_as_system_user is not defined in this file; it
    # presumably comes from ApplicationHelper -- confirm.
    act_as_system_user do
      if ENV['ARVADOS_API_TOKEN'].blank? || ENV['ARVADOS_API_HOST'].blank?
        raise "ARVADOS environment variables missing. Please set your admin user credentials as ARVADOS environment variables."
      end

      if uuid.blank?
        raise "Collection UUID is required."
      end

      src_collection = Collection.find_by_uuid uuid
      if !src_collection
        raise "No collection found for #{uuid}."
      end

      src_manifest = src_collection.manifest_text || ''

      # create new collection using 'arv-put' with original manifest_text as the data
      temp_file = Tempfile.new('temp')
      begin
        temp_file.write(src_manifest)
        # Close before invoking arv-put so the data is flushed to disk.
        temp_file.close
        new_manifest = salvage_collection_arv_put "arv-put --as-stream --use-filename invalid_manifest_text.txt #{Shellwords::shellescape(temp_file.path)}"
      ensure
        # Always remove the temp copy of the manifest, including when the
        # write or arv-put raised.
        temp_file.close!
      end

      # Get the locator data in the format [[locators], size] from the original manifest
      locators, total_size = salvage_collection_locator_data src_manifest

      new_manifest += (". #{locators.join(' ')} 0:#{total_size}:salvaged_data\n")

      new_collection = Collection.new
      new_collection.name = "salvaged from #{src_collection.uuid}, #{src_collection.portable_data_hash}"
      new_collection.manifest_text = new_manifest

      created = new_collection.save!
      raise "New collection creation failed." if !created

      $stderr.puts "Salvaged manifest and data for #{uuid} are in #{new_collection.uuid}."
      puts "Created new collection #{new_collection.uuid}"

      # update src_collection collection name, pdh, and manifest_text
      src_collection.name = (src_collection.name || '') + ' (' + (reason || '') + '; salvaged data at ' + new_collection.uuid + ')'
      src_collection.manifest_text = ''
      src_collection.portable_data_hash = 'd41d8cd98f00b204e9800998ecf8427e+0'
      src_collection.save!
      $stderr.puts "Collection #{uuid} emptied and renamed to #{src_collection.name.inspect}."
    end
  end
end