1 # Copyright (C) The Arvados Authors. All rights reserved.
3 # SPDX-License-Identifier: Apache-2.0
7 # A Locator is used to parse and manipulate Keep locator strings.
9 # Locators obey the following syntax:
11 # locator ::= address hint*
12 # address ::= digest size-hint
13 # digest ::= <32 hexadecimal digits>
14 # size-hint ::= "+" [0-9]+
15 # hint ::= "+" hint-type hint-content
17 # hint-content ::= [A-Za-z0-9@_-]+
19 # Individual hints may have their own required format:
21 # sign-hint ::= "+A" <40 lowercase hex digits> "@" sign-timestamp
22 # sign-timestamp ::= <8 lowercase hex digits>
# Readers for the @hash, @hints and @size instance variables.
# NOTE(review): the :hash reader replaces Object#hash on instances, so
# calling #hash returns whatever @hash holds instead of an Integer hash
# code -- confirm instances are never used as Hash keys or Set members.
23 attr_reader :hash, :hints, :size
# Matches one complete Keep locator string: a 32-hex-digit digest, an
# optional "+<digits>" size hint, then zero or more "+<Hint>" fields, each
# starting with an uppercase letter followed by [A-Za-z0-9@_-]*.
#
# Capture groups: 1 = digest, 3 = size digits (without "+"), 4 = the whole
# run of trailing hints (with "+" separators), 6 = the last hint matched
# (without its "+").
#
# The pattern is anchored with \A and \z rather than ^, because ^ matches
# after any newline: with ^ a string such as "junk\n<locator>" would be
# accepted even though it is not a locator.
LOCATOR_REGEXP = /\A([[:xdigit:]]{32})(\+([[:digit:]]+))?((\+([[:upper:]][[:alnum:]@_-]*))+)?\z/
27 def initialize(hasharg, sizearg, hintarg)
34 !!(LOCATOR_REGEXP.match tok)
37 # Locator.parse returns a Locator object parsed from the string tok.
38 # Returns nil if tok could not be parsed as a valid locator.
47 # Locator.parse! returns a Locator object parsed from the string tok,
48 # raising an ArgumentError if tok cannot be parsed.
50 if tok.nil? or tok.empty?
51 raise ArgumentError.new "locator is nil or empty"
54 m = LOCATOR_REGEXP.match(tok)
56 raise ArgumentError.new "not a valid locator #{tok}"
59 tokhash, _, toksize, _, _, trailer = m[1..6]
62 trailer.split('+').each do |hint|
63 if hint =~ /^[[:upper:]][[:alnum:]@_-]*$/
66 raise ArgumentError.new "invalid hint #{hint}"
71 Locator.new(tokhash, toksize, tokhints)
74 # Returns the signature hint supplied with this locator,
75 # or nil if the locator was not signed.
77 @hints.grep(/^A/).first
80 # Returns an unsigned Locator.
82 Locator.new(@hash, @size, @hints.reject { |o| o.start_with?("A") })
86 Locator.new(@hash, @size, [])
96 [ @hash, @size, *@hints ].join('+')
98 [ @hash, *@hints ].join('+')
# Patterns used to check manifest tokens. All whole-token patterns are
# anchored with \A and \z, not ^ and $: the latter match at every line
# boundary, and an unescaped token can contain "\n" (decoded from \012),
# which would let a token pass when only one of its "lines" is well formed.

# A raw (still escaped) stream token: characters above \040 other than
# backslash, or three-digit octal escapes in the range \000-\377.
STREAM_TOKEN_REGEXP = /\A([^\000-\040\\]|\\[0-3][0-7][0-7])+\z/
# An unescaped stream name: "." optionally followed by "/"-separated,
# non-empty path components.
STREAM_NAME_REGEXP = /\A(\.)(\/[^\/]+)*\z/

EMPTY_DIR_TOKEN_REGEXP = /\A0:0:\.\z/ # The exception when a file can have '.' as a name
# A raw (still escaped) file token: "<digits>:<digits>:<escaped name>".
FILE_TOKEN_REGEXP = /\A[[:digit:]]+:[[:digit:]]+:([^\000-\040\\]|\\[0-3][0-7][0-7])+\z/
# An unescaped file token; capture group 1 is the "/"-separated file path,
# every component of which must be non-empty.
FILE_NAME_REGEXP = /\A[[:digit:]]+:[[:digit:]]+:([^\/]+(\/[^\/]+)*)\z/

# An octal escape of \400 or above (not representable in 8 bits) that is
# preceded by a character other than a backslash.
NON_8BIT_ENCODED_CHAR = /[^\\]\\[4-7][0-7][0-7]/
113 # Class to parse a manifest text and provide common views of that data.
114 def initialize(manifest_text)
115 @text = manifest_text
120 return to_enum(__method__) unless block_given?
121 @text.each_line do |line|
125 line.scan(/\S+/) do |token|
127 stream_name = unescape token
128 elsif file_tokens.empty? and Locator.valid? token
129 block_tokens << token
131 file_tokens << unescape(token)
135 next if stream_name.nil?
136 yield [stream_name, block_tokens, file_tokens]
143 # Parse backslash escapes in a Keep manifest stream or file name.
144 s.gsub(/\\(\\|[0-7]{3})/) do |_|
155 self.class.unescape(s)
158 def split_file_token token
159 start_pos, filesize, filename = token.split(':', 3)
161 raise ArgumentError.new "Invalid file token '#{token}'"
163 [start_pos.to_i, filesize.to_i, unescape(filename)]
167 return to_enum(__method__) unless block_given?
168 @text.each_line do |line|
170 in_file_tokens = false
171 line.scan(/\S+/) do |token|
173 stream_name = unescape token
174 elsif in_file_tokens or not Locator.valid? token
175 in_file_tokens = true
177 start_pos, file_size, file_name = split_file_token(token)
178 stream_name_adjuster = ''
179 if file_name.include?('/') # '/' in filename
180 dirname, sep, basename = file_name.rpartition('/')
181 stream_name_adjuster = sep + dirname # /dir_parts
185 yield [stream_name + stream_name_adjuster, start_pos, file_size, file_name]
194 file_sizes = Hash.new(0)
195 each_file_spec do |streamname, _, filesize, filename|
196 file_sizes[[streamname, filename]] += filesize
198 @files = file_sizes.each_pair.map do |(streamname, filename), size|
199 [streamname, filename, size]
205 def files_count(stop_after=nil)
206 # Return the number of files represented in this manifest.
207 # If stop_after is provided, files_count will read the manifest
208 # incrementally, and return immediately when it counts that number of
209 # files. This can help you avoid parsing the entire manifest if you
210 # just want to check if a small number of files are specified.
211 if stop_after.nil? or not @files.nil?
212 # Avoid counting empty dir placeholders
213 return files.reject{|_, name, size| name == '.' and size == 0}.size
216 each_file_spec do |streamname, _, filesize, filename|
217 # Avoid counting empty dir placeholders
218 next if filename == "." and filesize == 0
219 seen_files[[streamname, filename]] = true
220 return stop_after if (seen_files.size >= stop_after)
226 # Return the total size of all files in this manifest.
227 files.reduce(0) { |total, (_, _, size)| total + size }
230 def exact_file_count?(want_count)
231 files_count(want_count + 1) == want_count
234 def minimum_file_count?(want_count)
235 files_count(want_count) >= want_count
238 def has_file?(want_stream, want_file=nil)
240 want_stream, want_file = File.split(want_stream)
242 each_file_spec do |streamname, _, _, name|
243 if streamname == want_stream and name == want_file
250 # Verify that a given manifest is valid according to
251 # https://arvados.org/projects/arvados/wiki/Keep_manifest_format
252 def self.validate! manifest
253 raise ArgumentError.new "No manifest found" if !manifest
255 return true if manifest.empty?
257 raise ArgumentError.new "Invalid manifest: does not end with newline" if !manifest.end_with?("\n")
259 manifest.each_line do |line|
262 words = line[0..-2].split(/ /)
263 raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing stream name" if words.empty?
268 raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on stream token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
269 unescaped_word = unescape(word)
270 count += 1 if word =~ STREAM_TOKEN_REGEXP and unescaped_word =~ STREAM_NAME_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/
271 raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
275 while word =~ Locator::LOCATOR_REGEXP
279 raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
282 raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
283 while unescape(word) =~ EMPTY_DIR_TOKEN_REGEXP or
284 (word =~ FILE_TOKEN_REGEXP and unescape(word) =~ FILE_NAME_REGEXP and ($~[1].split('/') & ['..', '.']).empty?)
290 raise ArgumentError.new "Manifest invalid for stream #{line_count}: invalid file token #{word.inspect}"
292 raise ArgumentError.new "Manifest invalid for stream #{line_count}: no file tokens"
295 # Ruby's split() method silently drops trailing empty tokens
296 # (which are not allowed by the manifest format) so we have to
297 # check trailing spaces manually.
298 raise ArgumentError.new "Manifest invalid for stream #{line_count}: trailing space" if line.end_with? " \n"
303 def self.valid? manifest