14482: Allow unescaped " " on stream and file token regexes (WIP)

[arvados.git] / sdk / ruby / lib / arvados / keep.rb
diff --git a/sdk/ruby/lib/arvados/keep.rb b/sdk/ruby/lib/arvados/keep.rb

index 00f4f36d2b25201c64e2cff8c7e68c765c6745d9..b8abf0f7cceb939c775cef64dd09ba59318bd81f 100644 (file)
--- a/sdk/ruby/lib/arvados/keep.rb
+++ b/sdk/ruby/lib/arvados/keep.rb
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
  module Keep
    class Locator
      # A Locator is used to parse and manipulate Keep locator strings.
  module Keep
    class Locator
      # A Locator is used to parse and manipulate Keep locator strings.
@@ -97,8 +101,9 @@ module Keep
    end
  
    class Manifest
    end
  
    class Manifest
-    STRICT_STREAM_TOKEN_REGEXP = /^(\.)(\/[^\/\s]+)*$/
-    STRICT_FILE_TOKEN_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\s\/]+(\/[^\s\/]+)*)$/
+    STRICT_STREAM_TOKEN_REGEXP = /^(\.)(\/[^\/\t\v\n\r]+)*$/
+    STRICT_FILE_TOKEN_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\t\v\n\r\/]+(\/[^\t\v\n\r\/]+)*)$/
+    EMPTY_DOT_FILE_TOKEN_REGEXP = /^0:0:\.$/
  
      # Class to parse a manifest text and provide common views of that data.
      def initialize(manifest_text)
  
      # Class to parse a manifest text and provide common views of that data.
      def initialize(manifest_text)
@@ -127,7 +132,9 @@ module Keep
        end
      end
  
        end
      end
  
-    def unescape(s)
+    def self.unescape(s)
+      return nil if s.nil?
+
        # Parse backslash escapes in a Keep manifest stream or file name.
        s.gsub(/\\(\\|[0-7]{3})/) do |_|
          case $1
        # Parse backslash escapes in a Keep manifest stream or file name.
        s.gsub(/\\(\\|[0-7]{3})/) do |_|
          case $1
@@ -139,6 +146,10 @@ module Keep
        end
      end
  
        end
      end
  
+    def unescape(s)
+      self.class.unescape(s)
+    end
+
      def split_file_token token
        start_pos, filesize, filename = token.split(':', 3)
        if filename.nil?
      def split_file_token token
        start_pos, filesize, filename = token.split(':', 3)
        if filename.nil?
@@ -158,15 +169,15 @@ module Keep
            elsif in_file_tokens or not Locator.valid? token
              in_file_tokens = true
  
            elsif in_file_tokens or not Locator.valid? token
              in_file_tokens = true
  
-            file_tokens = split_file_token(token)
+            start_pos, file_size, file_name = split_file_token(token)
              stream_name_adjuster = ''
              stream_name_adjuster = ''
-            if file_tokens[2].include?('/')                # '/' in filename
-              parts = file_tokens[2].rpartition('/')
-              stream_name_adjuster = parts[1] + parts[0]   # /dir_parts
-              file_tokens[2] = parts[2]
+            if file_name.include?('/')                # '/' in filename
+              dirname, sep, basename = file_name.rpartition('/')
+              stream_name_adjuster = sep + dirname   # /dir_parts
+              file_name = basename
              end
  
              end
  
-            yield [stream_name + stream_name_adjuster] + file_tokens
+            yield [stream_name + stream_name_adjuster, start_pos, file_size, file_name]
            end
          end
        end
            end
          end
        end
@@ -193,10 +204,13 @@ module Keep
        # files.  This can help you avoid parsing the entire manifest if you
        # just want to check if a small number of files are specified.
        if stop_after.nil? or not @files.nil?
        # files.  This can help you avoid parsing the entire manifest if you
        # just want to check if a small number of files are specified.
        if stop_after.nil? or not @files.nil?
-        return files.size
+        # Avoid counting empty dir placeholders
+        return files.reject{|_, name, size| name == '.' and size == 0}.size
        end
        seen_files = {}
        end
        seen_files = {}
-      each_file_spec do |streamname, _, _, filename|
+      each_file_spec do |streamname, _, filesize, filename|
+        # Avoid counting empty dir placeholders
+        next if filename == "." and filesize == 0
          seen_files[[streamname, filename]] = true
          return stop_after if (seen_files.size >= stop_after)
        end
          seen_files[[streamname, filename]] = true
          return stop_after if (seen_files.size >= stop_after)
        end
@@ -246,7 +260,8 @@ module Keep
          count = 0
  
          word = words.shift
          count = 0
  
          word = words.shift
-        count += 1 if word =~ STRICT_STREAM_TOKEN_REGEXP and word !~ /\/\.\.?(\/|$)/
+        unescaped_word = unescape(word)
+        count += 1 if unescaped_word =~ STRICT_STREAM_TOKEN_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/
          raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
  
          count = 0
          raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
  
          count = 0
@@ -258,7 +273,8 @@ module Keep
          raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
  
          count = 0
          raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
  
          count = 0
-        while word =~ STRICT_FILE_TOKEN_REGEXP and ($~[1].split('/') & ['..','.']).empty?
+        while unescape(word) =~ EMPTY_DOT_FILE_TOKEN_REGEXP or
+          (unescape(word) =~ STRICT_FILE_TOKEN_REGEXP and ($~[1].split('/') & ['..', '.']).empty?)
            word = words.shift
            count += 1
          end
            word = words.shift
            count += 1
          end