4363: Do not blow up when manifests contain utf-8 characters.
authorTom Clegg <tom@curoverse.com>
Thu, 13 Nov 2014 19:28:57 +0000 (14:28 -0500)
committerTom Clegg <tom@curoverse.com>
Thu, 13 Nov 2014 19:28:57 +0000 (14:28 -0500)
sdk/python/arvados/collection.py
sdk/python/arvados/stream.py
sdk/python/tests/test_collections.py
services/api/app/models/collection.rb
services/fuse/tests/test_mount.py

index 0f49438b6cd0ed8a57c7f7e519eedde8d294909e..0fe7b3ee9ef448f7768af1983d3d3b67aae20e3c 100644 (file)
@@ -42,14 +42,14 @@ def normalize_stream(s, stream):
                 if segmentoffset == current_span[1]:
                     current_span[1] += segment[arvados.SEGMENTSIZE]
                 else:
-                    stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+                    stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
                     current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
 
         if current_span is not None:
-            stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
+            stream_tokens.append(u"{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
 
         if not stream[f]:
-            stream_tokens.append("0:0:{0}".format(fout))
+            stream_tokens.append(u"0:0:{0}".format(fout))
 
     return stream_tokens
 
@@ -186,9 +186,14 @@ class CollectionReader(CollectionBase):
                     self._manifest_locator,
                     error_via_api,
                     error_via_keep))
-        self._streams = [sline.split()
-                         for sline in self._manifest_text.split("\n")
-                         if sline]
+        if type(self._manifest_text) == unicode:
+            unicode_manifest = self._manifest_text
+        else:
+            unicode_manifest = self._manifest_text.decode('utf-8')
+        self._streams = [
+            sline.split()
+            for sline in unicode_manifest.split("\n")
+            if sline]
 
     def normalize(self):
         self._populate()
@@ -211,7 +216,8 @@ class CollectionReader(CollectionBase):
         # Regenerate the manifest text based on the normalized streams
         self._manifest_text = ''.join(
             [StreamReader(stream, keep=self._my_keep()).manifest_text()
-             for stream in self._streams])
+             for stream in self._streams]
+            ).encode('utf-8')
 
     def open(self, streampath, filename=None):
         """open(streampath[, filename]) -> file-like object
index c263dd871be9d663d4aceebbfaf18f07aecd71dc..cd8153ea96de98dcdf786ddc86b69363f0c92c80 100644 (file)
@@ -108,11 +108,12 @@ class StreamFileReader(ArvadosFileBase):
         # Older SDK provided a name() method.
         # This class provides both, for maximum compatibility.
         def __call__(self):
-            return self
+            return self.decode('utf-8')
 
 
     def __init__(self, stream, segments, name):
-        super(StreamFileReader, self).__init__(self._NameAttribute(name), 'rb')
+        super(StreamFileReader, self).__init__(
+            self._NameAttribute(name.encode('utf-8')), 'rb')
         self._stream = stream
         self.segments = segments
         self._filepos = 0L
@@ -327,7 +328,7 @@ class StreamReader(object):
                 manifest_text.append(m.group(0))
         else:
             manifest_text.extend([d[LOCATOR] for d in self._data_locators])
-        manifest_text.extend([' '.join(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name().replace(' ', '\\040'))
+        manifest_text.extend([' '.join([u"{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name().replace(' ', '\\040'))
                                         for seg in f.segments])
                               for f in self._files.values()])
         return ' '.join(manifest_text) + '\n'
index 254a29f313eac8a898a18eea3a3f8b9348cd810f..be62ad45d42e01452373a065332580ae6b5510d8 100644 (file)
@@ -209,6 +209,10 @@ class ArvadosCollectionsTest(run_test_server.TestCaseWithServers,
 """
         self.assertEqual(arvados.CollectionReader(m8, self.api_client).manifest_text(normalize=True), m8)
 
+        m_utf8 = """./\xe2\x9b\xb5 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:\xf0\x9f\x98\xb1
+"""
+        self.assertEqual(arvados.CollectionReader(m_utf8, self.api_client).manifest_text(normalize=True), m_utf8)
+
     def test_locators_and_ranges(self):
         blocks2 = [['a', 10, 0],
                   ['b', 10, 10],
index accd2cc62c7bc049518481efdcbf49db592f325a..28fc70e01b8b8d503461e042918c5b9678fb94a7 100644 (file)
@@ -76,7 +76,7 @@ class Collection < ArvadosModel
 
   def set_portable_data_hash
     if (self.portable_data_hash.nil? or (self.portable_data_hash == "") or (manifest_text_changed? and !portable_data_hash_changed?))
-      self.portable_data_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.length}"
+      self.portable_data_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.bytesize}"
     elsif portable_data_hash_changed?
       begin
         loc = Keep::Locator.parse!(self.portable_data_hash)
@@ -84,7 +84,7 @@ class Collection < ArvadosModel
         if loc.size
           self.portable_data_hash = loc.to_s
         else
-          self.portable_data_hash = "#{loc.hash}+#{self.manifest_text.length}"
+          self.portable_data_hash = "#{loc.hash}+#{self.manifest_text.bytesize}"
         end
       rescue ArgumentError => e
         errors.add(:portable_data_hash, "#{e}")
@@ -96,7 +96,7 @@ class Collection < ArvadosModel
 
   def ensure_hash_matches_manifest_text
     if manifest_text_changed? or portable_data_hash_changed?
-      computed_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.length}"
+      computed_hash = "#{Digest::MD5.hexdigest(manifest_text)}+#{manifest_text.bytesize}"
       unless computed_hash == portable_data_hash
         logger.debug "(computed) '#{computed_hash}' != '#{portable_data_hash}' (provided)"
         errors.add(:portable_data_hash, "does not match hash of manifest_text")
index 3a59084d38e078fb945d7c1c96369b1a0ec24fb1..7e5325db30022984ae7c3a4b5881a2f9b3c8f43b 100644 (file)
@@ -77,6 +77,15 @@ class FuseMountTest(MountTestBase):
             cw.start_new_file('x/x')
             cw.write('x')
 
+        self._utf8 = ["\xe2\x9c\x8c",     # victory sign
+                      "\xe2\x9b\xb5",     # sailboat
+                      "\xf0\x9f\x98\xb1", # scream
+                      ]
+        cw.start_new_stream('edgecases/utf8')
+        for f in self._utf8:
+            cw.start_new_file(f)
+            cw.write(f)
+
         self.testcollection = cw.finish()
         self.api.collections().create(body={"manifest_text":cw.manifest_text()}).execute()
 
@@ -105,9 +114,10 @@ class FuseMountTest(MountTestBase):
         self.assertDirContents('dir2', ['thing5.txt', 'thing6.txt', 'dir3'])
         self.assertDirContents('dir2/dir3', ['thing7.txt', 'thing8.txt'])
         self.assertDirContents('edgecases',
-                               "dirs/:/_/__/.../-/*/\x01\\/ ".split("/"))
+                               "dirs/utf8/:/_/__/.../-/*/\x01\\/ ".split("/"))
         self.assertDirContents('edgecases/dirs',
                                ":/__/.../-/*/\x01\\/ ".split("/"))
+        self.assertDirContents('edgecases/utf8', self._utf8)
 
         files = {'thing1.txt': 'data 1',
                  'thing2.txt': 'data 2',