X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/1bcfe8651af341c6e7cd01a19443c7c288efa932..09cbdc3074b3f1e69c9c537875146f6da0a6ed8f:/sdk/ruby/lib/arvados/keep.rb diff --git a/sdk/ruby/lib/arvados/keep.rb b/sdk/ruby/lib/arvados/keep.rb index ede40c3fc2..e391b7a6ca 100644 --- a/sdk/ruby/lib/arvados/keep.rb +++ b/sdk/ruby/lib/arvados/keep.rb @@ -1,3 +1,7 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + module Keep class Locator # A Locator is used to parse and manipulate Keep locator strings. @@ -18,7 +22,7 @@ module Keep # sign-timestamp ::= <8 lowercase hex digits> attr_reader :hash, :hints, :size - LOCATOR_REGEXP = /^([[:xdigit:]]{32})(\+([[:digit:]]+))?(\+([[:upper:]][[:alnum:]+@_-]*))?$/ + LOCATOR_REGEXP = /^([[:xdigit:]]{32})(\+([[:digit:]]+))?((\+([[:upper:]][[:alnum:]@_-]*))+)?\z/ def initialize(hasharg, sizearg, hintarg) @hash = hasharg @@ -35,7 +39,7 @@ module Keep def self.parse(tok) begin Locator.parse!(tok) - rescue ArgumentError => e + rescue ArgumentError nil end end @@ -47,19 +51,19 @@ module Keep raise ArgumentError.new "locator is nil or empty" end - m = LOCATOR_REGEXP.match(tok.strip) + m = LOCATOR_REGEXP.match(tok) unless m raise ArgumentError.new "not a valid locator #{tok}" end - tokhash, _, toksize, _, trailer = m[1..5] + tokhash, _, toksize, _, _, trailer = m[1..6] tokhints = [] if trailer trailer.split('+').each do |hint| - if hint =~ /^[[:upper:]][[:alnum:]@_-]+$/ + if hint =~ /^[[:upper:]][[:alnum:]@_-]*$/ tokhints.push(hint) else - raise ArgumentError.new "unknown hint #{hint}" + raise ArgumentError.new "invalid hint #{hint}" end end end @@ -97,6 +101,15 @@ module Keep end class Manifest + STREAM_TOKEN_REGEXP = /^([^\000-\040\\]|\\[0-3][0-7][0-7])+$/ + STREAM_NAME_REGEXP = /^(\.)(\/[^\/]+)*$/ + + EMPTY_DIR_TOKEN_REGEXP = /^0:0:\.$/ # The exception when a file can have '.' as a name + FILE_TOKEN_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\000-\040\\]|\\[0-3][0-7][0-7])+$/ + FILE_NAME_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\/]+(\/[^\/]+)*)$/ + + NON_8BIT_ENCODED_CHAR = /[^\\]\\[4-7][0-7][0-7]/ + # Class to parse a manifest text and provide common views of that data. def initialize(manifest_text) @text = manifest_text @@ -109,7 +122,7 @@ module Keep stream_name = nil block_tokens = [] file_tokens = [] - line.scan /\S+/ do |token| + line.scan(/\S+/) do |token| if stream_name.nil? stream_name = unescape token elsif file_tokens.empty? and Locator.valid? token @@ -124,7 +137,9 @@ module Keep end end - def unescape(s) + def self.unescape(s) + return nil if s.nil? + # Parse backslash escapes in a Keep manifest stream or file name. s.gsub(/\\(\\|[0-7]{3})/) do |_| case $1 @@ -136,6 +151,10 @@ module Keep end end + def unescape(s) + self.class.unescape(s) + end + def split_file_token token start_pos, filesize, filename = token.split(':', 3) if filename.nil? @@ -149,12 +168,21 @@ module Keep @text.each_line do |line| stream_name = nil in_file_tokens = false - line.scan /\S+/ do |token| + line.scan(/\S+/) do |token| if stream_name.nil? stream_name = unescape token elsif in_file_tokens or not Locator.valid? token in_file_tokens = true - yield [stream_name] + split_file_token(token) + + start_pos, file_size, file_name = split_file_token(token) + stream_name_adjuster = '' + if file_name.include?('/') # '/' in filename + dirname, sep, basename = file_name.rpartition('/') + stream_name_adjuster = sep + dirname # /dir_parts + file_name = basename + end + + yield [stream_name + stream_name_adjuster, start_pos, file_size, file_name] end end end @@ -181,16 +209,24 @@ module Keep # files. This can help you avoid parsing the entire manifest if you # just want to check if a small number of files are specified. if stop_after.nil? or not @files.nil? - return files.size + # Avoid counting empty dir placeholders + return files.reject{|_, name, size| name == '.' and size == 0}.size end seen_files = {} - each_file_spec do |streamname, _, _, filename| + each_file_spec do |streamname, _, filesize, filename| + # Avoid counting empty dir placeholders + next if filename == "." and filesize == 0 seen_files[[streamname, filename]] = true return stop_after if (seen_files.size >= stop_after) end seen_files.size end + def files_size + # Return the total size of all files in this manifest. + files.reduce(0) { |total, (_, _, size)| total + size } + end + def exact_file_count?(want_count) files_count(want_count + 1) == want_count end @@ -210,5 +246,67 @@ module Keep end false end + + # Verify that a given manifest is valid according to + # https://dev.arvados.org/projects/arvados/wiki/Keep_manifest_format + def self.validate! manifest + raise ArgumentError.new "No manifest found" if !manifest + + return true if manifest.empty? + + raise ArgumentError.new "Invalid manifest: does not end with newline" if !manifest.end_with?("\n") + line_count = 0 + manifest.each_line do |line| + line_count += 1 + + words = line[0..-2].split(/ /) + raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing stream name" if words.empty? + + count = 0 + + word = words.shift + raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on stream token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR + unescaped_word = unescape(word) + count += 1 if word =~ STREAM_TOKEN_REGEXP and unescaped_word =~ STREAM_NAME_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/ + raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1 + + count = 0 + word = words.shift + while word =~ Locator::LOCATOR_REGEXP + word = words.shift + count += 1 + end + raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0 + + count = 0 + raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR + while unescape(word) =~ EMPTY_DIR_TOKEN_REGEXP or + (word =~ FILE_TOKEN_REGEXP and unescape(word) =~ FILE_NAME_REGEXP and ($~[1].split('/') & ['..', '.']).empty?) + word = words.shift + count += 1 + end + + if word + raise ArgumentError.new "Manifest invalid for stream #{line_count}: invalid file token #{word.inspect}" + elsif count == 0 + raise ArgumentError.new "Manifest invalid for stream #{line_count}: no file tokens" + end + + # Ruby's split() method silently drops trailing empty tokens + # (which are not allowed by the manifest format) so we have to + # check trailing spaces manually. + raise ArgumentError.new "Manifest invalid for stream #{line_count}: trailing space" if line.end_with? " \n" + end + true + end + + def self.valid? manifest + begin + validate! manifest + true + rescue ArgumentError + false + end + end end end