import config
import errors
def locator_block_size(loc):
    """Return the block size encoded in a block locator.

    loc is a locator string that starts with 32 lowercase hex digits,
    followed by '+' and the decimal block size, optionally followed by
    further '+'-separated fields (which are ignored here).

    Raises ValueError if loc does not start with that pattern.
    """
    m = re.match(r'[0-9a-f]{32}\+(\d+)(\+\S+)*', loc)
    if m is None:
        # Without this check a malformed locator surfaces as an opaque
        # AttributeError raised on None.
        raise ValueError("Invalid block locator: {0!r}".format(loc))
    # int() promotes to an arbitrary-precision integer on its own, so an
    # explicit long() is not needed.
    return int(m.group(1))
+
def normalize_stream(s, stream):
'''
s is the stream name
- stream is a StreamReader object
+ stream is a dict mapping each filename to a list of segment objects, each with the attributes locator (block locator), segment_offset (offset from the beginning of the block) and segment_size; the block size is parsed from the locator
+ returns the stream as a list of tokens
'''
stream_tokens = [s]
sortedfiles = list(stream.keys())
blocks = {}
streamoffset = 0L
+ # Go through each file and add each referenced block exactly once.
for f in sortedfiles:
for b in stream[f]:
- if b[arvados.LOCATOR] not in blocks:
- stream_tokens.append(b[arvados.LOCATOR])
- blocks[b[arvados.LOCATOR]] = streamoffset
- streamoffset += b[arvados.BLOCKSIZE]
+ if b.locator not in blocks:
+ stream_tokens.append(b.locator)
+ blocks[b.locator] = streamoffset
+ streamoffset += locator_block_size(b.locator)
+ # Add the empty block if the stream is otherwise empty.
if len(stream_tokens) == 1:
stream_tokens.append(config.EMPTY_BLOCK_LOCATOR)
for f in sortedfiles:
+ # Add in file segments
current_span = None
fout = f.replace(' ', '\\040')
for segment in stream[f]:
- segmentoffset = blocks[segment[arvados.LOCATOR]] + segment[arvados.OFFSET]
+ # Collapse adjacent segments
+ streamoffset = blocks[segment.locator] + segment.segment_offset
if current_span is None:
- current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
+ current_span = [streamoffset, streamoffset + segment.segment_size]
else:
- if segmentoffset == current_span[1]:
- current_span[1] += segment[arvados.SEGMENTSIZE]
+ if streamoffset == current_span[1]:
+ current_span[1] += segment.segment_size
else:
stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
- current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
+ current_span = [streamoffset, streamoffset + segment.segment_size]
if current_span is not None:
stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
s = re.match(r'^[0-9a-f]{32}\+(\d+)(\+\S+)*$', tok)
if s:
blocksize = long(s.group(1))
- self._data_locators.append([tok, blocksize, streamoffset])
+ self._data_locators.append(Range(tok, streamoffset, blocksize))
streamoffset += blocksize
continue
size = long(s.group(2))
name = s.group(3).replace('\\040', ' ')
if name not in self._files:
- self._files[name] = StreamFileReader(self, [[pos, size, 0]], name)
+ self._files[name] = StreamFileReader(self, [Range(pos, 0, size)], name)
else:
filereader = self._files[name]
- filereader.segments.append([pos, size, filereader.size()])
+ filereader.segments.append(Range(pos, filereader.size(), size))
continue
raise errors.SyntaxError("Invalid manifest format")
def _size(self):
    """Return the end offset of the last entry in self._data_locators.

    That is the last entry's range_start plus its range_size. When there
    are no data locators at all, the size is 0.
    """
    if not self._data_locators:
        # Nothing has been recorded; indexing [-1] would raise IndexError.
        return 0
    last = self._data_locators[-1]
    return last.range_start + last.range_size
# Public wrapper: returns whatever self._size() returns.
def size(self):
return self._size()
if self._keep is None:
self._keep = KeepClient(num_retries=self.num_retries)
data = []
- for locator, blocksize, segmentoffset, segmentsize in locators_and_ranges(self._data_locators, start, size):
- data.append(self._keepget(locator, num_retries=num_retries)[segmentoffset:segmentoffset+segmentsize])
+ for lr in locators_and_ranges(self._data_locators, start, size):
+ data.append(self._keepget(lr.locator, num_retries=num_retries)[lr.segment_offset:lr.segment_offset+lr.segment_size])
return ''.join(data)
# Render this stream as a single line of manifest text.
#
# The line consists of space-separated tokens followed by a newline:
#   1. the stream name, with spaces escaped as \040;
#   2. one token per entry in self._data_locators (its locator);
#   3. for every file in self._files, one "locator:range_size:name" token
#      per segment, with spaces in the file name escaped as \040.
#
# strip: when true, each data locator is reduced to its leading
#        "<32 hex digits>+<digits>" part, dropping any further "+..." fields;
#        when false, locators are emitted unchanged.
def manifest_text(self, strip=False):
manifest_text = [self.name().replace(' ', '\\040')]
if strip:
# Keep only the hash and size portion of each locator.
for d in self._data_locators:
- m = re.match(r'^[0-9a-f]{32}\+\d+', d[LOCATOR])
+ m = re.match(r'^[0-9a-f]{32}\+\d+', d.locator)
manifest_text.append(m.group(0))
else:
- manifest_text.extend([d[LOCATOR] for d in self._data_locators])
- manifest_text.extend([' '.join(["{}:{}:{}".format(seg[LOCATOR], seg[BLOCKSIZE], f.name.replace(' ', '\\040'))
+ manifest_text.extend([d.locator for d in self._data_locators])
+ manifest_text.extend([' '.join(["{}:{}:{}".format(seg.locator, seg.range_size, f.name.replace(' ', '\\040'))
for seg in f.segments])
for f in self._files.values()])
return ' '.join(manifest_text) + '\n'
-class BufferBlock(object):
- def __init__(self, locator, streamoffset, starting_size=2**16):
- self.locator = locator
- self.buffer_block = bytearray(starting_size)
- self.buffer_view = memoryview(self.buffer_block)
- self.write_pointer = 0
- self.locator_list_entry = [locator, 0, streamoffset]
-
- def append(self, data):
- while (self.write_pointer+len(data)) > len(self.buffer_block):
- new_buffer_block = bytearray(len(self.buffer_block) * 2)
- new_buffer_block[0:self.write_pointer] = self.buffer_block[0:self.write_pointer]
- self.buffer_block = new_buffer_block
- self.buffer_view = memoryview(self.buffer_block)
- self.buffer_view[self.write_pointer:self.write_pointer+len(data)] = data
- self.write_pointer += len(data)
- self.locator_list_entry[1] = self.write_pointer
# class StreamWriter(StreamReader):
# def _init_bufferblock(self):
# last = self._data_locators[-1]
-# streamoffset = last[OFFSET] + last[BLOCKSIZE]
-# if last[BLOCKSIZE] == 0:
+# streamoffset = last.range_start + last.range_size
+# if last.range_size == 0:
# del self._data_locators[-1]
# self.current_bblock = BufferBlock("bufferblock%i" % len(self.bufferblocks), streamoffset)
# self.bufferblocks[self.current_bblock.locator] = self.current_bblock
# while i < len(tmp_segs):
# # Go through each segment and identify segments that include the buffer block
# s = tmp_segs[i]
-# if s[LOCATOR] < self.current_bblock.locator_list_entry[OFFSET] and (s[LOCATOR] + s[BLOCKSIZE]) > self.current_bblock.locator_list_entry[OFFSET]:
+# if s[LOCATOR] < self.current_bblock.locator_list_entry.range_start and (s[LOCATOR] + s.range_size) > self.current_bblock.locator_list_entry.range_start:
# # The segment straddles the previous block and the current buffer block. Split the segment.
-# b1 = self.current_bblock.locator_list_entry[OFFSET] - s[LOCATOR]
-# b2 = (s[LOCATOR] + s[BLOCKSIZE]) - self.current_bblock.locator_list_entry[OFFSET]
-# bb_seg = [self.current_bblock.locator_list_entry[OFFSET], b2, s[OFFSET]+b1]
-# tmp_segs[i] = [s[LOCATOR], b1, s[OFFSET]]
+# b1 = self.current_bblock.locator_list_entry.range_start - s[LOCATOR]
+# b2 = (s[LOCATOR] + s.range_size) - self.current_bblock.locator_list_entry.range_start
+# bb_seg = [self.current_bblock.locator_list_entry.range_start, b2, s.range_start+b1]
+# tmp_segs[i] = [s[LOCATOR], b1, s.range_start]
# tmp_segs.insert(i+1, bb_seg)
# bufferblock_segs.append(bb_seg)
# i += 1
-# elif s[LOCATOR] >= self.current_bblock.locator_list_entry[OFFSET]:
+# elif s[LOCATOR] >= self.current_bblock.locator_list_entry.range_start:
# # The segment's data is in the buffer block.
# bufferblock_segs.append(s)
# i += 1
# # Now sum up the segments to get the total bytes
# # of the file referencing into the buffer block.
-# write_total = sum([s[BLOCKSIZE] for s in bufferblock_segs])
+# write_total = sum([s.range_size for s in bufferblock_segs])
-# if write_total < self.current_bblock.locator_list_entry[BLOCKSIZE]:
+# if write_total < self.current_bblock.locator_list_entry.range_size:
# # There is more data in the buffer block than is actually accounted for by segments, so
# # re-pack into a new buffer by copying over to a new buffer block.
# new_bb = BufferBlock(self.current_bblock.locator,
-# self.current_bblock.locator_list_entry[OFFSET],
+# self.current_bblock.locator_list_entry.range_start,
# starting_size=write_total)
# for t in bufferblock_segs:
-# t_start = t[LOCATOR] - self.current_bblock.locator_list_entry[OFFSET]
-# t_end = t_start + t[BLOCKSIZE]
-# t[0] = self.current_bblock.locator_list_entry[OFFSET] + new_bb.write_pointer
+# t_start = t[LOCATOR] - self.current_bblock.locator_list_entry.range_start
+# t_end = t_start + t.range_size
+# t[0] = self.current_bblock.locator_list_entry.range_start + new_bb.write_pointer
# new_bb.append(self.current_bblock.buffer_block[t_start:t_end])
# self.current_bblock = new_bb