From a50fab63068c1e8d67ce1d477c6f2c2429464b5c Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Fri, 11 Oct 2019 01:11:45 -0400 Subject: [PATCH] 15699: Fix handling of streams with multiple refs to a block ID. The manifest normalization code in the Ruby SDK (and therefore Workbench) was based on an incorrect assumption that each block locator could only appear once in a given stream. If a manifest referenced the same block more than once, copying a file from that manifest into a new one would produce a new file with the correct size, but wrong data. The new code uses a different strategy that deduplicates block references in common cases, although not in all possible cases. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- sdk/ruby/lib/arvados/collection.rb | 80 +++++++++++++++--------------- sdk/ruby/test/test_collection.rb | 28 ++++++++++- 2 files changed, 65 insertions(+), 43 deletions(-) diff --git a/sdk/ruby/lib/arvados/collection.rb b/sdk/ruby/lib/arvados/collection.rb index f236ce83a3..796d1785ae 100644 --- a/sdk/ruby/lib/arvados/collection.rb +++ b/sdk/ruby/lib/arvados/collection.rb @@ -207,7 +207,7 @@ module Arv loop do ii = (lo + hi) / 2 range = @ranges[ii] - if range.include?(target) + if range.include?(target) && (target < range.end || ii == hi) return ii elsif ii == lo raise RangeError.new("%i not in segment" % target) @@ -481,14 +481,13 @@ module Arv def initialize(name) @name = name - @loc_ranges = {} + @loc_ranges = [] @loc_range_start = 0 @file_specs = [] end def add_file(coll_file) coll_file.each_segment do |segment| - extend_locator_ranges(segment.locators) extend_file_specs(coll_file.name, segment) end end @@ -498,48 +497,53 @@ module Arv "" else "%s %s %s\n" % [escape_name(@name), - @loc_ranges.keys.join(" "), + @loc_ranges.collect(&:locator).join(" "), @file_specs.join(" ")] end end private - def extend_locator_ranges(locators) - locators. - select { |loc_s| not @loc_ranges.include?(loc_s) }. - each do |loc_s| - @loc_ranges[loc_s] = LocatorRange.new(loc_s, @loc_range_start) - @loc_range_start = @loc_ranges[loc_s].end + def extend_file_specs(filename, segment) + found_overlap = false + # Find the longest prefix of segment.locators that's a suffix + # of the existing @loc_ranges. If we find one, drop those + # locators (they'll be added back below, when we're handling + # the normal/no-overlap case). + (1..segment.locators.length).each do |overlap| + if @loc_ranges.length >= overlap && @loc_ranges[-overlap..-1].collect(&:locator) == segment.locators[0..overlap-1] + (1..overlap).each do + discarded = @loc_ranges.pop + @loc_range_start -= (discarded.end - discarded.begin) + end + found_overlap = true + break + end end - end - def extend_file_specs(filename, segment) - # Given a filename and a LocatorSegment, add the smallest - # possible array of file spec strings to @file_specs that - # builds the file from available locators. - filename = escape_name(filename) - start_pos = segment.start_pos - length = segment.length - start_loc = segment.locators.first - prev_loc = start_loc - # Build a list of file specs by iterating through the segment's - # locators and preparing a file spec for each contiguous range. - segment.locators[1..-1].each do |loc_s| - range = @loc_ranges[loc_s] - if range.begin != @loc_ranges[prev_loc].end - range_start, range_length = - start_and_length_at(start_loc, prev_loc, start_pos, length) - @file_specs << "#{range_start}:#{range_length}:#{filename}" - start_pos = 0 - length -= range_length - start_loc = loc_s + # If there was no overlap at the end of our existing + # @loc_ranges, check whether the full set of segment.locators + # appears earlier in @loc_ranges. If so, use those instead of + # appending the same locators again. + if !found_overlap && segment.locators.length < @loc_ranges.length + segment_start = 0 + (0..@loc_ranges.length-1).each do |ri| + if @loc_ranges[ri..ri+segment.locators.length-1].collect(&:locator) == segment.locators + @file_specs << "#{segment.start_pos + @loc_ranges[ri].begin}:#{segment.length}:#{escape_name(filename)}" + return + end end - prev_loc = loc_s end - range_start, range_length = - start_and_length_at(start_loc, prev_loc, start_pos, length) - @file_specs << "#{range_start}:#{range_length}:#{filename}" + + segment_start = @loc_range_start + segment_end = segment_start + segment.locators.each do |loc_s| + r = LocatorRange.new(loc_s, @loc_range_start) + @loc_ranges << r + @loc_range_start = r.end + segment_end += (r.end - r.begin) + end + @file_specs << "#{segment.start_pos + segment_start}:#{segment.length}:#{escape_name(filename)}" end def escape_name(name) @@ -547,12 +551,6 @@ module Arv s.each_byte.map { |c| "\\%03o" % c }.join("") end end - - def start_and_length_at(start_key, end_key, start_pos, length) - range_begin = @loc_ranges[start_key].begin + start_pos - range_length = [@loc_ranges[end_key].end - range_begin, length].min - [range_begin, range_length] - end end end end diff --git a/sdk/ruby/test/test_collection.rb b/sdk/ruby/test/test_collection.rb index 288fd263fa..29ec6d418c 100644 --- a/sdk/ruby/test/test_collection.rb +++ b/sdk/ruby/test/test_collection.rb @@ -15,6 +15,10 @@ class CollectionTest < Minitest::Test "./s1 #{TWO_BY_TWO_BLOCKS.last} 0:5:f1 5:4:f3\n"] TWO_BY_TWO_MANIFEST_S = TWO_BY_TWO_MANIFEST_A.join("") + def abcde_blocks + ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+9", "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb+9", "cccccccccccccccccccccccccccccccc+9", "dddddddddddddddddddddddddddddddd+9", "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee+9"] + end + ### .new def test_empty_construction @@ -145,12 +149,12 @@ class CollectionTest < Minitest::Test test_normalization_file_spans_two_whole_blocks("2:3:f1 2:3:f1", 1) end - def test_normalization_dedups_locators + def test_normalization_handles_duplicate_locator blocks = random_blocks(2, 5) coll = Arv::Collection.new(". %s %s 1:8:f1 11:8:f1\n" % [blocks.join(" "), blocks.reverse.join(" ")]) coll.normalize - assert_equal(". #{blocks.join(' ')} 1:8:f1 6:4:f1 0:4:f1\n", + assert_equal(". #{blocks.join(' ')} #{blocks[0]} 1:8:f1 6:8:f1\n", coll.manifest_text) end @@ -395,6 +399,26 @@ class CollectionTest < Minitest::Test dst_coll.manifest_text) end + def test_copy_with_repeated_blocks + blocks = abcde_blocks + src_coll = Arv::Collection.new(". #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[3]} #{blocks[4]} 27:27:f1\n") + dst_coll = Arv::Collection.new() + dst_coll.cp_r("f1", "./", src_coll) + toks = dst_coll.manifest_text.split(" ") + assert_equal(". #{blocks[0]} #{blocks[1]} #{blocks[2]} 0:27:f1\n", dst_coll.manifest_text, "mangled by cp_r") + end + + def test_copy_with_repeated_split_blocks + blocks = abcde_blocks + src_coll = Arv::Collection.new(". #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} #{blocks[3]} #{blocks[4]} 20:27:f1\n") + dst_coll = Arv::Collection.new() + src_coll.normalize + assert_equal(". #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} 2:27:f1\n", src_coll.manifest_text, "mangled by normalize()") + dst_coll.cp_r("f1", "./", src_coll) + toks = dst_coll.manifest_text.split(" ") + assert_equal(". #{blocks[2]} #{blocks[0]} #{blocks[1]} #{blocks[2]} 2:27:f1\n", dst_coll.manifest_text, "mangled by cp_r") + end + def test_copy_empty_source_path_raises_ArgumentError(src="", dst="./s1") coll = Arv::Collection.new(SIMPLEST_MANIFEST) assert_raises(ArgumentError) do -- 2.30.2