X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0c9dd4b566696e5de7bbe82d000997acc978dcfa..93380e6aec7e11607019cdce88419b6f708327d7:/sdk/python/arvados/_normalize_stream.py diff --git a/sdk/python/arvados/_normalize_stream.py b/sdk/python/arvados/_normalize_stream.py index 400a38e46b..485c757e7f 100644 --- a/sdk/python/arvados/_normalize_stream.py +++ b/sdk/python/arvados/_normalize_stream.py @@ -1,3 +1,17 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import absolute_import +from . import config + +import re + +def escape(path): + path = re.sub('\\\\', lambda m: '\\134', path) + path = re.sub('[:\000-\040]', lambda m: "\\%03o" % ord(m.group(0)), path) + return path + def normalize_stream(stream_name, stream): """Take manifest stream and return a list of tokens in normalized format. @@ -9,13 +23,13 @@ def normalize_stream(stream_name, stream): """ - stream_name = stream_name.replace(' ', '\\040') + stream_name = escape(stream_name) stream_tokens = [stream_name] sortedfiles = list(stream.keys()) sortedfiles.sort() blocks = {} - streamoffset = 0L + streamoffset = 0 # Go through each file and add each referenced block exactly once. for streamfile in sortedfiles: for segment in stream[streamfile]: @@ -31,7 +45,7 @@ def normalize_stream(stream_name, stream): for streamfile in sortedfiles: # Add in file segments current_span = None - fout = streamfile.replace(' ', '\\040') + fout = escape(streamfile) for segment in stream[streamfile]: # Collapse adjacent segments streamoffset = blocks[segment.locator] + segment.segment_offset @@ -41,13 +55,13 @@ def normalize_stream(stream_name, stream): if streamoffset == current_span[1]: current_span[1] += segment.segment_size else: - stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)) + stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)) current_span = [streamoffset, streamoffset + segment.segment_size] if current_span is not None: - stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)) + stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout)) if not stream[streamfile]: - stream_tokens.append("0:0:{0}".format(fout)) + stream_tokens.append(u"0:0:{0}".format(fout)) return stream_tokens