if segmentoffset == current_span[1]:
current_span[1] += segment[arvados.SEGMENTSIZE]
else:
- stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+ stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
if current_span is not None:
- stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+ stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
if not stream[f]:
- stream_tokens.append("0:0:{0}".format(fout))
+ stream_tokens.append(u"0:0:{0}".format(fout))
return stream_tokens
self._manifest_locator,
error_via_api,
error_via_keep))
- self._streams = [sline.split()
- for sline in self._manifest_text.split("\n")
- if sline]
+ if type(self._manifest_text) == unicode:
+ unicode_manifest = self._manifest_text
+ else:
+ unicode_manifest = self._manifest_text.decode('utf-8')
+ self._streams = [
+ sline.split()
+ for sline in unicode_manifest.split("\n")
+ if sline]
def normalize(self):
self._populate()
# Regenerate the manifest text based on the normalized streams
self._manifest_text = ''.join(
[StreamReader(stream, keep=self._my_keep()).manifest_text()
- for stream in self._streams])
+ for stream in self._streams]
+ ).encode('utf-8')
def open(self, streampath, filename=None):
"""open(streampath[, filename]) -> file-like object
# Older SDK provided a name() method.
# This class provides both, for maximum compatibility.
def __call__(self):
- return self
+ return self.decode('utf-8')
def __init__(self, stream, segments, name):
- super(StreamFileReader, self).__init__(self._NameAttribute(name), 'rb')
+ super(StreamFileReader, self).__init__(
+ self._NameAttribute(name.encode('utf-8')), 'rb')
self._stream = stream
self.segments = segments
self._filepos = 0L
manifest_text.append(m.group(0))
else:
manifest_text.extend([d[LOCATOR] for d in self._data_locators])
- manifest_text.extend([' '.join(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name().replace(' ', '\\040'))
+ manifest_text.extend([' '.join([u"{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name().replace(' ', '\\040'))
for seg in f.segments])
for f in self._files.values()])
return ' '.join(manifest_text) + '\n'
"""
self.assertEqual(arvados.CollectionReader(m8, self.api_client).manifest_text(normalize=True), m8)
+ m_utf8 = """./\xe2\x9b\xb5 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:\xf0\x9f\x98\xb1
+"""
+ self.assertEqual(arvados.CollectionReader(m_utf8, self.api_client).manifest_text(normalize=True), m_utf8)
+
def test_locators_and_ranges(self):
blocks2 = [['a', 10, 0],
['b', 10, 10],
def set_portable_data_hash
if (self.portable_data_hash.nil? or (self.portable_data_hash == "") or (manifest_text_changed? and !portable_data_hash_changed?))
- self.portable_data_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.length}"
+ self.portable_data_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.bytesize}"
elsif portable_data_hash_changed?
begin
loc = Keep::Locator.parse!(self.portable_data_hash)
if loc.size
self.portable_data_hash = loc.to_s
else
- self.portable_data_hash = "#{loc.hash}+#{self.manifest_text.length}"
+ self.portable_data_hash = "#{loc.hash}+#{self.manifest_text.bytesize}"
end
rescue ArgumentError => e
errors.add(:portable_data_hash, "#{e}")
def ensure_hash_matches_manifest_text
if manifest_text_changed? or portable_data_hash_changed?
- computed_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.length}"
+ computed_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.bytesize}"
unless computed_hash == portable_data_hash
logger.debug "(computed) '#{computed_hash}' != '#{portable_data_hash}' (provided)"
errors.add(:portable_data_hash, "does not match hash of manifest_text")
cw.start_new_file('x/x')
cw.write('x')
+ self._utf8 = ["\xe2\x9c\x8c", # victory sign
+ "\xe2\x9b\xb5", # sailboat
+ "\xf0\x9f\x98\xb1", # scream
+ ]
+ cw.start_new_stream('edgecases/utf8')
+ for f in self._utf8:
+ cw.start_new_file(f)
+ cw.write(f)
+
self.testcollection = cw.finish()
self.api.collections().create(body={"manifest_text":cw.manifest_text()}).execute()
self.assertDirContents('dir2', ['thing5.txt', 'thing6.txt', 'dir3'])
self.assertDirContents('dir2/dir3', ['thing7.txt', 'thing8.txt'])
self.assertDirContents('edgecases',
- "dirs/:/_/__/.../-/*/\x01\\/ ".split("/"))
+ "dirs/utf8/:/_/__/.../-/*/\x01\\/ ".split("/"))
self.assertDirContents('edgecases/dirs',
":/__/.../-/*/\x01\\/ ".split("/"))
+ self.assertDirContents('edgecases/utf8', self._utf8)
files = {'thing1.txt': 'data 1',
'thing2.txt': 'data 2',