sdk/ruby/lib/arvados/keep.rb

   1 # Copyright (C) The Arvados Authors. All rights reserved.
   2 #
   3 # SPDX-License-Identifier: Apache-2.0
   4
   5 module Keep
   6   class Locator
   7     # A Locator is used to parse and manipulate Keep locator strings.
   8     #
   9     # Locators obey the following syntax:
  10     #
  11     #   locator      ::= address hint*
  12     #   address      ::= digest size-hint
  13     #   digest       ::= <32 hexadecimal digits>
  14     #   size-hint    ::= "+" [0-9]+
  15     #   hint         ::= "+" hint-type hint-content
  16     #   hint-type    ::= [A-Z]
  17     #   hint-content ::= [A-Za-z0-9@_-]+
  18     #
  19     # Individual hints may have their own required format:
  20     #
  21     #   sign-hint      ::= "+A" <40 lowercase hex digits> "@" sign-timestamp
  22     #   sign-timestamp ::= <8 lowercase hex digits>
  23     attr_reader :hash, :hints, :size
  24
  25     LOCATOR_REGEXP = /^([[:xdigit:]]{32})(\+([[:digit:]]+))?((\+([[:upper:]][[:alnum:]@_-]*))+)?\z/
  26
  27     def initialize(hasharg, sizearg, hintarg)
  28       @hash = hasharg
  29       @size = sizearg
  30       @hints = hintarg
  31     end
  32
  33     def self.valid? tok
  34       !!(LOCATOR_REGEXP.match tok)
  35     end
  36
  37     # Locator.parse returns a Locator object parsed from the string tok.
  38     # Returns nil if tok could not be parsed as a valid locator.
  39     def self.parse(tok)
  40       begin
  41         Locator.parse!(tok)
  42       rescue ArgumentError
  43         nil
  44       end
  45     end
  46
  47     # Locator.parse! returns a Locator object parsed from the string tok,
  48     # raising an ArgumentError if tok cannot be parsed.
  49     def self.parse!(tok)
  50       if tok.nil? or tok.empty?
  51         raise ArgumentError.new "locator is nil or empty"
  52       end
  53
  54       m = LOCATOR_REGEXP.match(tok)
  55       unless m
  56         raise ArgumentError.new "not a valid locator #{tok}"
  57       end
  58
  59       tokhash, _, toksize, _, _, trailer = m[1..6]
  60       tokhints = []
  61       if trailer
  62         trailer.split('+').each do |hint|
  63           if hint =~ /^[[:upper:]][[:alnum:]@_-]*$/
  64             tokhints.push(hint)
  65           else
  66             raise ArgumentError.new "invalid hint #{hint}"
  67           end
  68         end
  69       end
  70
  71       Locator.new(tokhash, toksize, tokhints)
  72     end
  73
  74     # Returns the signature hint supplied with this locator,
  75     # or nil if the locator was not signed.
  76     def signature
  77       @hints.grep(/^A/).first
  78     end
  79
  80     # Returns an unsigned Locator.
  81     def without_signature
  82       Locator.new(@hash, @size, @hints.reject { |o| o.start_with?("A") })
  83     end
  84
  85     def strip_hints
  86       Locator.new(@hash, @size, [])
  87     end
  88
  89     def strip_hints!
  90       @hints = []
  91       self
  92     end
  93
  94     def to_s
  95       if @size
  96         [ @hash, @size, *@hints ].join('+')
  97       else
  98         [ @hash, *@hints ].join('+')
  99       end
 100     end
 101   end
 102
 103   class Manifest
 104     STREAM_TOKEN_REGEXP = /^([^\000-\040\\]|\\[0-3][0-7][0-7])+$/
 105     STREAM_NAME_REGEXP = /^(\.)(\/[^\/]+)*$/
 106
 107     EMPTY_DIR_TOKEN_REGEXP = /^0:0:\.$/ # The exception when a file can have '.' as a name
 108     FILE_TOKEN_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\000-\040\\]|\\[0-3][0-7][0-7])+$/
 109     FILE_NAME_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\/]+(\/[^\/]+)*)$/
 110
 111     NON_8BIT_ENCODED_CHAR = /[^\\]\\[4-7][0-7][0-7]/
 112
 113     # Class to parse a manifest text and provide common views of that data.
 114     def initialize(manifest_text)
 115       @text = manifest_text
 116       @files = nil
 117     end
 118
 119     def each_line
 120       return to_enum(__method__) unless block_given?
 121       @text.each_line do |line|
 122         stream_name = nil
 123         block_tokens = []
 124         file_tokens = []
 125         line.scan(/\S+/) do |token|
 126           if stream_name.nil?
 127             stream_name = unescape token
 128           elsif file_tokens.empty? and Locator.valid? token
 129             block_tokens << token
 130           else
 131             file_tokens << unescape(token)
 132           end
 133         end
 134         # Ignore blank lines
 135         next if stream_name.nil?
 136         yield [stream_name, block_tokens, file_tokens]
 137       end
 138     end
 139
 140     def self.unescape(s)
 141       return nil if s.nil?
 142
 143       # Parse backslash escapes in a Keep manifest stream or file name.
 144       s.gsub(/\\(\\|[0-7]{3})/) do |_|
 145         case $1
 146         when '\\'
 147           '\\'
 148         else
 149           $1.to_i(8).chr
 150         end
 151       end
 152     end
 153
 154     def unescape(s)
 155       self.class.unescape(s)
 156     end
 157
 158     def split_file_token token
 159       start_pos, filesize, filename = token.split(':', 3)
 160       if filename.nil?
 161         raise ArgumentError.new "Invalid file token '#{token}'"
 162       end
 163       [start_pos.to_i, filesize.to_i, unescape(filename)]
 164     end
 165
 166     def each_file_spec
 167       return to_enum(__method__) unless block_given?
 168       @text.each_line do |line|
 169         stream_name = nil
 170         in_file_tokens = false
 171         line.scan(/\S+/) do |token|
 172           if stream_name.nil?
 173             stream_name = unescape token
 174           elsif in_file_tokens or not Locator.valid? token
 175             in_file_tokens = true
 176
 177             start_pos, file_size, file_name = split_file_token(token)
 178             stream_name_adjuster = ''
 179             if file_name.include?('/')                # '/' in filename
 180               dirname, sep, basename = file_name.rpartition('/')
 181               stream_name_adjuster = sep + dirname   # /dir_parts
 182               file_name = basename
 183             end
 184
 185             yield [stream_name + stream_name_adjuster, start_pos, file_size, file_name]
 186           end
 187         end
 188       end
 189       true
 190     end
 191
 192     def files
 193       if @files.nil?
 194         file_sizes = Hash.new(0)
 195         each_file_spec do |streamname, _, filesize, filename|
 196           file_sizes[[streamname, filename]] += filesize
 197         end
 198         @files = file_sizes.each_pair.map do |(streamname, filename), size|
 199           [streamname, filename, size]
 200         end
 201       end
 202       @files
 203     end
 204
 205     def files_count(stop_after=nil)
 206       # Return the number of files represented in this manifest.
 207       # If stop_after is provided, files_count will read the manifest
 208       # incrementally, and return immediately when it counts that number of
 209       # files.  This can help you avoid parsing the entire manifest if you
 210       # just want to check if a small number of files are specified.
 211       if stop_after.nil? or not @files.nil?
 212         # Avoid counting empty dir placeholders
 213         return files.reject{|_, name, size| name == '.' and size == 0}.size
 214       end
 215       seen_files = {}
 216       each_file_spec do |streamname, _, filesize, filename|
 217         # Avoid counting empty dir placeholders
 218         next if filename == "." and filesize == 0
 219         seen_files[[streamname, filename]] = true
 220         return stop_after if (seen_files.size >= stop_after)
 221       end
 222       seen_files.size
 223     end
 224
 225     def files_size
 226       # Return the total size of all files in this manifest.
 227       files.reduce(0) { |total, (_, _, size)| total + size }
 228     end
 229
 230     def exact_file_count?(want_count)
 231       files_count(want_count + 1) == want_count
 232     end
 233
 234     def minimum_file_count?(want_count)
 235       files_count(want_count) >= want_count
 236     end
 237
 238     def has_file?(want_stream, want_file=nil)
 239       if want_file.nil?
 240         want_stream, want_file = File.split(want_stream)
 241       end
 242       each_file_spec do |streamname, _, _, name|
 243         if streamname == want_stream and name == want_file
 244           return true
 245         end
 246       end
 247       false
 248     end
 249
 250     # Verify that a given manifest is valid according to
 251     # https://arvados.org/projects/arvados/wiki/Keep_manifest_format
 252     def self.validate! manifest
 253       raise ArgumentError.new "No manifest found" if !manifest
 254
 255       return true if manifest.empty?
 256
 257       raise ArgumentError.new "Invalid manifest: does not end with newline" if !manifest.end_with?("\n")
 258       line_count = 0
 259       manifest.each_line do |line|
 260         line_count += 1
 261
 262         words = line[0..-2].split(/ /)
 263         raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing stream name" if words.empty?
 264
 265         count = 0
 266
 267         word = words.shift
 268         raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on stream token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
 269         unescaped_word = unescape(word)
 270         count += 1 if word =~ STREAM_TOKEN_REGEXP and unescaped_word =~ STREAM_NAME_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/
 271         raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
 272
 273         count = 0
 274         word = words.shift
 275         while word =~ Locator::LOCATOR_REGEXP
 276           word = words.shift
 277           count += 1
 278         end
 279         raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
 280
 281         count = 0
 282         raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
 283         while unescape(word) =~ EMPTY_DIR_TOKEN_REGEXP or
 284           (word =~ FILE_TOKEN_REGEXP and unescape(word) =~ FILE_NAME_REGEXP and ($~[1].split('/') & ['..', '.']).empty?)
 285           word = words.shift
 286           count += 1
 287         end
 288
 289         if word
 290           raise ArgumentError.new "Manifest invalid for stream #{line_count}: invalid file token #{word.inspect}"
 291         elsif count == 0
 292           raise ArgumentError.new "Manifest invalid for stream #{line_count}: no file tokens"
 293         end
 294
 295         # Ruby's split() method silently drops trailing empty tokens
 296         # (which are not allowed by the manifest format) so we have to
 297         # check trailing spaces manually.
 298         raise ArgumentError.new "Manifest invalid for stream #{line_count}: trailing space" if line.end_with? " \n"
 299       end
 300       true
 301     end
 302
 303     def self.valid? manifest
 304       begin
 305         validate! manifest
 306         true
 307       rescue ArgumentError
 308         false
 309       end
 310     end
 311   end
 312 end