6277: Add more restrictions to manifest format such as cannot start with '/' and...
[arvados.git] / sdk / ruby / lib / arvados / keep.rb
index 8afe13f84976f6dea1e7f319d0b27bcc3e15f990..bcab5fc69909623c33eebdbb6ca9ae333ed724e0 100644 (file)
@@ -18,12 +18,18 @@ module Keep
     #   sign-timestamp ::= <8 lowercase hex digits>
     attr_reader :hash, :hints, :size
 
+    LOCATOR_REGEXP = /\A([[:xdigit:]]{32})(\+([[:digit:]]+))?((\+([[:upper:]][[:alnum:]@_-]*))+)?\z/ # \A..\z: must match the whole string, not just one line of it
+
     def initialize(hasharg, sizearg, hintarg)
       @hash = hasharg
       @size = sizearg
       @hints = hintarg
     end
 
+    def self.valid?(tok)  # true when tok matches LOCATOR_REGEXP, false otherwise
+      !LOCATOR_REGEXP.match(tok).nil?
+    end
+
     # Locator.parse returns a Locator object parsed from the string tok.
     # Returns nil if tok could not be parsed as a valid locator.
     def self.parse(tok)
@@ -41,7 +47,7 @@ module Keep
         raise ArgumentError.new "locator is nil or empty"
       end
 
-      m = /^([[:xdigit:]]{32})(\+([[:digit:]]+))?(\+([[:upper:]][[:alnum:]+@_-]*))?$/.match(tok.strip)
+      m = LOCATOR_REGEXP.match(tok.strip)
       unless m
         raise ArgumentError.new "not a valid locator #{tok}"
       end
@@ -91,6 +97,9 @@ module Keep
   end
 
   class Manifest
+    STREAM_REGEXP = /\A(\.)((\/+.*[^\/])*)\z/  # whole token: "." then optional "/dir" parts, no trailing "/"
+    FILE_REGEXP = /\A[[:digit:]]+:[[:digit:]]+:(?!\/).*[^\/]\z/  # whole token: digits:digits:name, name not starting or ending with "/"
+
     # Class to parse a manifest text and provide common views of that data.
     def initialize(manifest_text)
       @text = manifest_text
@@ -100,14 +109,21 @@ module Keep
     def each_line
       return to_enum(__method__) unless block_given?
       @text.each_line do |line|
-        tokens = line.split
-        stream_name = unescape(tokens.shift)
-        blocks = []
-        while loc = Locator.parse(tokens.first)
-          blocks << loc
-          tokens.shift
+        stream_name = nil  # set from the first token of the line
+        block_tokens = []  # raw locator strings, in order of appearance
+        file_tokens = []  # unescaped file tokens
+        line.scan /\S+/ do |token|  # visit each run of non-whitespace characters
+          if stream_name.nil?
+            stream_name = unescape token  # first token names the stream
+          elsif file_tokens.empty? and Locator.valid? token
+            block_tokens << token  # locator-shaped token seen before any file token
+          else
+            file_tokens << unescape(token)  # the first non-locator token and everything after it
+          end
         end
-        yield [stream_name, blocks, tokens.map { |s| unescape(s) }]
+        # Ignore blank lines
+        next if stream_name.nil?
+        yield [stream_name, block_tokens, file_tokens]  # locators are yielded as strings here
       end
     end
 
@@ -123,21 +139,45 @@ module Keep
       end
     end
 
-    def each_file_spec(speclist)
-      return to_enum(__method__, speclist) unless block_given?
-      speclist.each do |filespec|
-        start_pos, filesize, filename = filespec.split(':', 3)
-        yield [start_pos.to_i, filesize.to_i, filename]
+    # Split a "start:size:name" file token into [Integer, Integer, String],
+    # unescaping the name. Raises ArgumentError if a field is missing.
+    def split_file_token(token)
+      offset, length, name = token.split(':', 3)
+      raise ArgumentError.new "Invalid file token '#{token}'" unless name
+      [offset.to_i, length.to_i, unescape(name)]
+    end
+
+    # Yield [stream_name, start_pos, file_size, file_name] for every file token
+    # in the manifest text. A '/' inside the unescaped file name moves its
+    # directory part onto the yielded stream name. Returns an Enumerator when
+    # called without a block, otherwise true.
+    def each_file_spec
+      return to_enum(__method__) unless block_given?
+      @text.each_line do |line|
+        stream = nil
+        past_locators = false
+        line.scan(/\S+/) do |word|
+          if stream.nil?
+            stream = unescape(word)
+            next
+          end
+          # Tokens shaped like locators are skipped until the first file token.
+          next if !past_locators && Locator.valid?(word)
+          past_locators = true
+          start_pos, file_size, path = split_file_token(word)
+          # rpartition gives ["", "", path] when path has no '/'.
+          dir, slash, base = path.rpartition('/')
+          yield [stream + slash + dir, start_pos, file_size, base]
+        end
       end
+      true
     end
 
     def files
       if @files.nil?
         file_sizes = Hash.new(0)
-        each_line do |streamname, blocklist, filelist|
-          each_file_spec(filelist) do |_, filesize, filename|
-            file_sizes[[streamname, filename]] += filesize
-          end
+        each_file_spec do |streamname, _, filesize, filename|
+          file_sizes[[streamname, filename]] += filesize
         end
         @files = file_sizes.each_pair.map do |(streamname, filename), size|
           [streamname, filename, size]
@@ -156,15 +196,18 @@ module Keep
         return files.size
       end
       seen_files = {}
-      each_line do |streamname, blocklist, filelist|
-        each_file_spec(filelist) do |_, _, filename|
-          seen_files[[streamname, filename]] = true
-          return stop_after if (seen_files.size >= stop_after)
-        end
+      each_file_spec do |streamname, _, _, filename|
+        seen_files[[streamname, filename]] = true
+        return stop_after if (seen_files.size >= stop_after)
       end
       seen_files.size
     end
 
+    # Return the total size of all files in this manifest.
+    def files_size
+      files.inject(0) { |sum, entry| sum + entry[2] }
+    end
+
     def exact_file_count?(want_count)
       files_count(want_count + 1) == want_count
     end
@@ -177,13 +220,52 @@ module Keep
       if want_file.nil?
         want_stream, want_file = File.split(want_stream)
       end
-      each_line do |stream_name, _, filelist|
-        if (stream_name == want_stream) and
-            each_file_spec(filelist).any? { |_, _, name| name == want_file }
+      each_file_spec do |streamname, _, _, name|
+        if streamname == want_stream and name == want_file
           return true
         end
       end
       false
     end
+
+    # Verify that a given manifest is valid as per the manifest format definition; raise ArgumentError if it is not.
+    # Valid format: stream name + one or more locators + one or more files for each stream in manifest.
+    # https://arvados.org/projects/arvados/wiki/Keep_manifest_format
+    def self.valid?(manifest)
+      raise ArgumentError.new "Invalid manifest: does not end with new line" if !manifest.end_with?("\n")
+      line_count = 0  # 1-based line number, reported in error messages
+      manifest.each_line do |line|
+        line_count += 1
+
+        words = line.split(/[[:space:]]/)  # a blank line yields no words; doubled separators yield empty words
+        raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing stream name" if words.empty?
+
+        count = 0
+
+        word = words.shift  # first word must be the stream name
+        count += 1 if word =~ STREAM_REGEXP and !word.include? '//'  # '//' is rejected separately from the regexp
+        raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
+
+        count = 0
+        word = words.shift
+        while word =~ Locator::LOCATOR_REGEXP  # consume the run of locator tokens
+          word = words.shift
+          count += 1
+        end
+        raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
+
+        count = 0
+        while(word =~ FILE_REGEXP and !word.include? '//')  # consume the run of file tokens; word is nil once the line is exhausted
+          word = words.shift
+          count += 1
+        end
+
+        if word  # a leftover word failed the file-token check
+          raise ArgumentError.new "Manifest invalid for stream #{line_count}: invalid file token #{word.inspect}"
+        elsif count == 0
+          raise ArgumentError.new "Manifest invalid for stream #{line_count}: no file tokens"
+        end
+      end
+    end
   end
 end