projects
/
arvados.git
/ blobdiff
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
14482: Allow unescaped " " on stream and file token regexes (WIP)
[arvados.git]
/
sdk
/
ruby
/
lib
/
arvados
/
keep.rb
diff --git a/sdk/ruby/lib/arvados/keep.rb b/sdk/ruby/lib/arvados/keep.rb
index 00f4f36d2b25201c64e2cff8c7e68c765c6745d9..b8abf0f7cceb939c775cef64dd09ba59318bd81f 100644
(file)
--- a/sdk/ruby/lib/arvados/keep.rb
+++ b/sdk/ruby/lib/arvados/keep.rb
@@ -1,3 +1,7 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
module Keep
class Locator
# A Locator is used to parse and manipulate Keep locator strings.
module Keep
class Locator
# A Locator is used to parse and manipulate Keep locator strings.
@@ -97,8 +101,9 @@ module Keep
end
class Manifest
end
class Manifest
- STRICT_STREAM_TOKEN_REGEXP = /^(\.)(\/[^\/\s]+)*$/
- STRICT_FILE_TOKEN_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\s\/]+(\/[^\s\/]+)*)$/
+ STRICT_STREAM_TOKEN_REGEXP = /^(\.)(\/[^\/\t\v\n\r]+)*$/
+ STRICT_FILE_TOKEN_REGEXP = /^[[:digit:]]+:[[:digit:]]+:([^\t\v\n\r\/]+(\/[^\t\v\n\r\/]+)*)$/
+ EMPTY_DOT_FILE_TOKEN_REGEXP = /^0:0:\.$/
# Class to parse a manifest text and provide common views of that data.
def initialize(manifest_text)
# Class to parse a manifest text and provide common views of that data.
def initialize(manifest_text)
@@ -127,7 +132,9 @@ module Keep
end
end
end
end
- def unescape(s)
+ def self.unescape(s)
+ return nil if s.nil?
+
# Parse backslash escapes in a Keep manifest stream or file name.
s.gsub(/\\(\\|[0-7]{3})/) do |_|
case $1
# Parse backslash escapes in a Keep manifest stream or file name.
s.gsub(/\\(\\|[0-7]{3})/) do |_|
case $1
@@ -139,6 +146,10 @@ module Keep
end
end
end
end
+ def unescape(s)
+ self.class.unescape(s)
+ end
+
def split_file_token token
start_pos, filesize, filename = token.split(':', 3)
if filename.nil?
def split_file_token token
start_pos, filesize, filename = token.split(':', 3)
if filename.nil?
@@ -158,15 +169,15 @@ module Keep
elsif in_file_tokens or not Locator.valid? token
in_file_tokens = true
elsif in_file_tokens or not Locator.valid? token
in_file_tokens = true
- file_tokens = split_file_token(token)
+ start_pos, file_size, file_name = split_file_token(token)
stream_name_adjuster = ''
stream_name_adjuster = ''
- if file_tokens[2].include?('/') # '/' in filename
- parts = file_tokens[2].rpartition('/')
- stream_name_adjuster = parts[1] + parts[0] # /dir_parts
- file_tokens[2] = parts[2]
+ if file_name.include?('/') # '/' in filename
+ dirname, sep, basename = file_name.rpartition('/')
+ stream_name_adjuster = sep + dirname # /dir_parts
+ file_name = basename
end
end
- yield [stream_name + stream_name_adjuster] + file_tokens
+ yield [stream_name + stream_name_adjuster, start_pos, file_size, file_name]
end
end
end
end
end
end
@@ -193,10 +204,13 @@ module Keep
# files. This can help you avoid parsing the entire manifest if you
# just want to check if a small number of files are specified.
if stop_after.nil? or not @files.nil?
# files. This can help you avoid parsing the entire manifest if you
# just want to check if a small number of files are specified.
if stop_after.nil? or not @files.nil?
- return files.size
+ # Avoid counting empty dir placeholders
+ return files.reject{|_, name, size| name == '.' and size == 0}.size
end
seen_files = {}
end
seen_files = {}
- each_file_spec do |streamname, _, _, filename|
+ each_file_spec do |streamname, _, filesize, filename|
+ # Avoid counting empty dir placeholders
+ next if filename == "." and filesize == 0
seen_files[[streamname, filename]] = true
return stop_after if (seen_files.size >= stop_after)
end
seen_files[[streamname, filename]] = true
return stop_after if (seen_files.size >= stop_after)
end
@@ -246,7 +260,8 @@ module Keep
count = 0
word = words.shift
count = 0
word = words.shift
- count += 1 if word =~ STRICT_STREAM_TOKEN_REGEXP and word !~ /\/\.\.?(\/|$)/
+ unescaped_word = unescape(word)
+ count += 1 if unescaped_word =~ STRICT_STREAM_TOKEN_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/
raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
count = 0
raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1
count = 0
@@ -258,7 +273,8 @@ module Keep
raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
count = 0
raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0
count = 0
- while word =~ STRICT_FILE_TOKEN_REGEXP and ($~[1].split('/') & ['..','.']).empty?
+ while unescape(word) =~ EMPTY_DOT_FILE_TOKEN_REGEXP or
+ (unescape(word) =~ STRICT_FILE_TOKEN_REGEXP and ($~[1].split('/') & ['..', '.']).empty?)
word = words.shift
count += 1
end
word = words.shift
count += 1
end