4823: Working on method documentation and comments for arvfile
[arvados.git] / sdk / python / arvados / stream.py
index ecccefa8c4405095f881a93e39faa53b951dcb57..a7e3be38b727687d9dc0a7178e70c8605102e3af 100644 (file)
@@ -13,10 +13,15 @@ from keep import *
 import config
 import errors
 
+def locator_block_size(loc):
+    s = re.match(r'[0-9a-f]{32}\+(\d+)(\+\S+)*', loc)
+    return long(s.group(1))
+
 def normalize_stream(s, stream):
     '''
     s is the stream name
-    stream is a StreamReader object
+    stream is a dict mapping each filename to a list of segment objects, each with the attributes locator (block locator), segment_offset (offset from the beginning of the block) and segment_size
+    returns the stream as a list of tokens
     '''
     stream_tokens = [s]
     sortedfiles = list(stream.keys())
@@ -24,29 +29,33 @@ def normalize_stream(s, stream):
 
     blocks = {}
     streamoffset = 0L
+    # Go through each file and add each referenced block exactly once.
     for f in sortedfiles:
         for b in stream[f]:
-            if b[arvados.LOCATOR] not in blocks:
-                stream_tokens.append(b[arvados.LOCATOR])
-                blocks[b[arvados.LOCATOR]] = streamoffset
-                streamoffset += b[arvados.BLOCKSIZE]
+            if b.locator not in blocks:
+                stream_tokens.append(b.locator)
+                blocks[b.locator] = streamoffset
+                streamoffset += locator_block_size(b.locator)
 
+    # Add the empty block if the stream is otherwise empty.
     if len(stream_tokens) == 1:
         stream_tokens.append(config.EMPTY_BLOCK_LOCATOR)
 
     for f in sortedfiles:
+        # Add in file segments
         current_span = None
         fout = f.replace(' ', '\\040')
         for segment in stream[f]:
-            segmentoffset = blocks[segment[arvados.LOCATOR]] + segment[arvados.OFFSET]
+            # Collapse adjacent segments
+            streamoffset = blocks[segment.locator] + segment.segment_offset
             if current_span is None:
-                current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
+                current_span = [streamoffset, streamoffset + segment.segment_size]
             else:
-                if segmentoffset == current_span[1]:
-                    current_span[1] += segment[arvados.SEGMENTSIZE]
+                if streamoffset == current_span[1]:
+                    current_span[1] += segment.segment_size
                 else:
                     stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
-                    current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
+                    current_span = [streamoffset, streamoffset + segment.segment_size]
 
         if current_span is not None:
             stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
@@ -78,7 +87,7 @@ class StreamReader(object):
             s = re.match(r'^[0-9a-f]{32}\+(\d+)(\+\S+)*$', tok)
             if s:
                 blocksize = long(s.group(1))
-                self._data_locators.append([tok, blocksize, streamoffset])
+                self._data_locators.append(Range(tok, streamoffset, blocksize))
                 streamoffset += blocksize
                 continue
 
@@ -88,10 +97,10 @@ class StreamReader(object):
                 size = long(s.group(2))
                 name = s.group(3).replace('\\040', ' ')
                 if name not in self._files:
-                    self._files[name] = StreamFileReader(self, [[pos, size, 0]], name)
+                    self._files[name] = StreamFileReader(self, [Range(pos, 0, size)], name)
                 else:
                     filereader = self._files[name]
-                    filereader.segments.append([pos, size, filereader.size()])
+                    filereader.segments.append(Range(pos, filereader.size(), size))
                 continue
 
             raise errors.SyntaxError("Invalid manifest format")
@@ -107,7 +116,7 @@ class StreamReader(object):
 
     def _size(self):
         n = self._data_locators[-1]
-        return n[OFFSET] + n[BLOCKSIZE]
+        return n.range_start + n.range_size
 
     def size(self):
         return self._size()
@@ -131,41 +140,24 @@ class StreamReader(object):
         if self._keep is None:
             self._keep = KeepClient(num_retries=self.num_retries)
         data = []
-        for locator, blocksize, segmentoffset, segmentsize in locators_and_ranges(self._data_locators, start, size):
-            data.append(self._keepget(locator, num_retries=num_retries)[segmentoffset:segmentoffset+segmentsize])
+        for lr in locators_and_ranges(self._data_locators, start, size):
+            data.append(self._keepget(lr.locator, num_retries=num_retries)[lr.segment_offset:lr.segment_offset+lr.segment_size])
         return ''.join(data)
 
     def manifest_text(self, strip=False):
         manifest_text = [self.name().replace(' ', '\\040')]
         if strip:
             for d in self._data_locators:
-                m = re.match(r'^[0-9a-f]{32}\+\d+', d[LOCATOR])
+                m = re.match(r'^[0-9a-f]{32}\+\d+', d.locator)
                 manifest_text.append(m.group(0))
         else:
-            manifest_text.extend([d[LOCATOR] for d in self._data_locators])
-        manifest_text.extend([' '.join(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name.replace(' ', '\\040'))
+            manifest_text.extend([d.locator for d in self._data_locators])
+        manifest_text.extend([' '.join(["{}:{}:{}".format(seg.locator, seg.range_size, f.name.replace(' ', '\\040'))
                                         for seg in f.segments])
                               for f in self._files.values()])
         return ' '.join(manifest_text) + '\n'
 
 
-class BufferBlock(object):
-    def __init__(self, locator, streamoffset, starting_size=2**16):
-        self.locator = locator
-        self.buffer_block = bytearray(starting_size)
-        self.buffer_view = memoryview(self.buffer_block)
-        self.write_pointer = 0
-        self.locator_list_entry = [locator, 0, streamoffset]
-
-    def append(self, data):
-        while (self.write_pointer+len(data)) > len(self.buffer_block):
-            new_buffer_block = bytearray(len(self.buffer_block) * 2)
-            new_buffer_block[0:self.write_pointer] = self.buffer_block[0:self.write_pointer]
-            self.buffer_block = new_buffer_block
-            self.buffer_view = memoryview(self.buffer_block)
-        self.buffer_view[self.write_pointer:self.write_pointer+len(data)] = data
-        self.write_pointer += len(data)
-        self.locator_list_entry[1] = self.write_pointer
 
 
 # class StreamWriter(StreamReader):
@@ -204,8 +196,8 @@ class BufferBlock(object):
 
 #     def _init_bufferblock(self):
 #         last = self._data_locators[-1]
-#         streamoffset = last[OFFSET] + last[BLOCKSIZE]
-#         if last[BLOCKSIZE] == 0:
+#         streamoffset = last.range_start + last.range_size
+#         if last.range_size == 0:
 #             del self._data_locators[-1]
 #         self.current_bblock = BufferBlock("bufferblock%i" % len(self.bufferblocks), streamoffset)
 #         self.bufferblocks[self.current_bblock.locator] = self.current_bblock
@@ -225,34 +217,34 @@ class BufferBlock(object):
 #         while i < len(tmp_segs):
 #             # Go through each segment and identify segments that include the buffer block
 #             s = tmp_segs[i]
-#             if s[LOCATOR] < self.current_bblock.locator_list_entry[OFFSET] and (s[LOCATOR] + s[BLOCKSIZE]) > self.current_bblock.locator_list_entry[OFFSET]:
+#             if s[LOCATOR] < self.current_bblock.locator_list_entry.range_start and (s[LOCATOR] + s.range_size) > self.current_bblock.locator_list_entry.range_start:
 #                 # The segment straddles the previous block and the current buffer block.  Split the segment.
-#                 b1 = self.current_bblock.locator_list_entry[OFFSET] - s[LOCATOR]
-#                 b2 = (s[LOCATOR] + s[BLOCKSIZE]) - self.current_bblock.locator_list_entry[OFFSET]
-#                 bb_seg = [self.current_bblock.locator_list_entry[OFFSET], b2, s[OFFSET]+b1]
-#                 tmp_segs[i] = [s[LOCATOR], b1, s[OFFSET]]
+#                 b1 = self.current_bblock.locator_list_entry.range_start - s[LOCATOR]
+#                 b2 = (s[LOCATOR] + s.range_size) - self.current_bblock.locator_list_entry.range_start
+#                 bb_seg = [self.current_bblock.locator_list_entry.range_start, b2, s.range_start+b1]
+#                 tmp_segs[i] = [s[LOCATOR], b1, s.range_start]
 #                 tmp_segs.insert(i+1, bb_seg)
 #                 bufferblock_segs.append(bb_seg)
 #                 i += 1
-#             elif s[LOCATOR] >= self.current_bblock.locator_list_entry[OFFSET]:
+#             elif s[LOCATOR] >= self.current_bblock.locator_list_entry.range_start:
 #                 # The segment's data is in the buffer block.
 #                 bufferblock_segs.append(s)
 #             i += 1
 
 #         # Now sum up the segments to get the total bytes
 #         # of the file referencing into the buffer block.
-#         write_total = sum([s[BLOCKSIZE] for s in bufferblock_segs])
+#         write_total = sum([s.range_size for s in bufferblock_segs])
 
-#         if write_total < self.current_bblock.locator_list_entry[BLOCKSIZE]:
+#         if write_total < self.current_bblock.locator_list_entry.range_size:
 #             # There is more data in the buffer block than is actually accounted for by segments, so
 #             # re-pack into a new buffer by copying over to a new buffer block.
 #             new_bb = BufferBlock(self.current_bblock.locator,
-#                                  self.current_bblock.locator_list_entry[OFFSET],
+#                                  self.current_bblock.locator_list_entry.range_start,
 #                                  starting_size=write_total)
 #             for t in bufferblock_segs:
-#                 t_start = t[LOCATOR] - self.current_bblock.locator_list_entry[OFFSET]
-#                 t_end = t_start + t[BLOCKSIZE]
-#                 t[0] = self.current_bblock.locator_list_entry[OFFSET] + new_bb.write_pointer
+#                 t_start = t[LOCATOR] - self.current_bblock.locator_list_entry.range_start
+#                 t_end = t_start + t.range_size
+#                 t[0] = self.current_bblock.locator_list_entry.range_start + new_bb.write_pointer
 #                 new_bb.append(self.current_bblock.buffer_block[t_start:t_end])
 
 #             self.current_bblock = new_bb