end
class Manifest
- STRICT_STREAM_TOKEN_REGEXP = /^(\.)(\/[^\/\s]+)*$/
- STRICT_FILE_TOKEN_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\s\/]+(\/[^\s\/]+)*)$/
+ # NOTE: every validation pattern below is anchored with \A and \z, not ^ and $.
+ # In Ruby ^/$ match at any line boundary, and the *_NAME_/EMPTY_DIR patterns
+ # are applied to unescaped tokens, which may contain newlines (a \012 escape
+ # passes the token patterns). With line anchors a name such as "foo\n." would
+ # be accepted as a valid stream name.
+ #
+ # Raw (still escaped) stream token: no bytes \000-\040 and no bare backslash;
+ # a backslash is only allowed as part of a \000-\377 octal escape.
+ STREAM_TOKEN_REGEXP = /\A([^\000-\040\\]|\\[0-3][0-7][0-7])+\z/
+ # Unescaped stream name: "." optionally followed by /-separated components.
+ STREAM_NAME_REGEXP = /\A(\.)(\/[^\/]+)*\z/
+
+ # The one exception where a file may be named '.': the "0:0:." token, which
+ # is treated as an empty-directory placeholder.
+ EMPTY_DIR_TOKEN_REGEXP = /\A0:0:\.\z/
+ # Raw file token: "<digits>:<digits>:<escaped name>".
+ FILE_TOKEN_REGEXP = /\A[[:digit:]]+:[[:digit:]]+:([^\000-\040\\]|\\[0-3][0-7][0-7])+\z/
+ # Unescaped file token; capture group 1 is the /-separated file path.
+ FILE_NAME_REGEXP = /\A[[:digit:]]+:[[:digit:]]+:([^\/]+(\/[^\/]+)*)\z/
+
+ # An octal escape in the \400-\777 range (does not fit in 8 bits), preceded
+ # by a character that is not itself a backslash.
+ NON_8BIT_ENCODED_CHAR = /[^\\]\\[4-7][0-7][0-7]/
# Class to parse a manifest text and provide common views of that data.
def initialize(manifest_text)
end
end
- def unescape(s)
+ def self.unescape(s)
+ return nil if s.nil?
+
# Parse backslash escapes in a Keep manifest stream or file name.
s.gsub(/\\(\\|[0-7]{3})/) do |_|
case $1
end
end
+ # Instance-level shim for the class-level unescape, so instance code can
+ # keep calling unescape(s) now that the logic lives in a class method.
+ def unescape(s)
+ self.class.unescape(s)
+ end
+
def split_file_token token
start_pos, filesize, filename = token.split(':', 3)
if filename.nil?
# files. This can help you avoid parsing the entire manifest if you
# just want to check if a small number of files are specified.
if stop_after.nil? or not @files.nil?
- return files.size
+ # Avoid counting empty dir placeholders
+ return files.reject{|_, name, size| name == '.' and size == 0}.size
end
seen_files = {}
- each_file_spec do |streamname, _, _, filename|
+ each_file_spec do |streamname, _, filesize, filename|
+ # Avoid counting empty dir placeholders
+ next if filename == "." and filesize == 0
seen_files[[streamname, filename]] = true
return stop_after if (seen_files.size >= stop_after)
end
end
# Verify that a given manifest is valid according to
- # https://arvados.org/projects/arvados/wiki/Keep_manifest_format
+ # https://dev.arvados.org/projects/arvados/wiki/Keep_manifest_format
def self.validate! manifest
raise ArgumentError.new "No manifest found" if !manifest
count = 0
word = words.shift
- count += 1 if word =~ STRICT_STREAM_TOKEN_REGEXP and word !~ /\/\.\.?(\/|$)/
+ raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on stream token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
+ unescaped_word = unescape(word)
+ count += 1 if word =~ STREAM_TOKEN_REGEXP and unescaped_word =~ STREAM_NAME_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/
raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
count = 0
raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
count = 0
- while word =~ STRICT_FILE_TOKEN_REGEXP and ($~[1].split('/') & ['..','.']).empty?
+ raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
+ while unescape(word) =~ EMPTY_DIR_TOKEN_REGEXP or
+ (word =~ FILE_TOKEN_REGEXP and unescape(word) =~ FILE_NAME_REGEXP and ($~[1].split('/') & ['..', '.']).empty?)
word = words.shift
count += 1
end