-import functools
-import os
-import zlib
+from __future__ import absolute_import
+from __future__ import division
+from future import standard_library
+from future.utils import listitems, listvalues
+standard_library.install_aliases()
+from builtins import range
+from builtins import object
import bz2
-import config
-import hashlib
-import threading
-import Queue
+import collections
import copy
import errno
-import re
+import functools
+import hashlib
import logging
-import collections
+import os
+import queue
+import re
+import sys
+import threading
import uuid
+import zlib
+from . import config
from .errors import KeepWriteError, AssertionError, ArgumentError
from .keep import KeepLocator
from ._normalize_stream import normalize_stream
class ArvadosFileReaderBase(_FileLikeObjectBase):
def __init__(self, name, mode, num_retries=None):
super(ArvadosFileReaderBase, self).__init__(name, mode)
- self._filepos = 0L
+ self._binary = 'b' in mode
+ if sys.version_info >= (3, 0) and not self._binary:
+ raise NotImplementedError("text mode {!r} is not implemented".format(mode))
+ self._filepos = 0
self.num_retries = num_retries
self._readline_cache = (None, None)
pos += self._filepos
elif whence == os.SEEK_END:
pos += self.size()
- if pos < 0L:
+ if pos < 0:
raise IOError(errno.EINVAL, "Tried to seek to negative file offset.")
self._filepos = pos
return self._filepos
def readall(self, size=2**20, num_retries=None):
while True:
data = self.read(size, num_retries=num_retries)
- if data == '':
+ if len(data) == 0:
break
yield data
data = [cache_data]
self._filepos += len(cache_data)
else:
- data = ['']
+ data = [b'']
data_size = len(data[-1])
- while (data_size < size) and ('\n' not in data[-1]):
+ while (data_size < size) and (b'\n' not in data[-1]):
next_read = self.read(2 ** 20, num_retries=num_retries)
if not next_read:
break
data.append(next_read)
data_size += len(next_read)
- data = ''.join(data)
+ data = b''.join(data)
try:
- nextline_index = data.index('\n') + 1
+ nextline_index = data.index(b'\n') + 1
except ValueError:
nextline_index = len(data)
nextline_index = min(nextline_index, size)
self._filepos -= len(data) - nextline_index
self._readline_cache = (self.tell(), data[nextline_index:])
- return data[:nextline_index]
+ return data[:nextline_index].decode()
@_FileLikeObjectBase._before_close
@retry_method
data_size += len(s)
if data_size >= sizehint:
break
- return ''.join(data).splitlines(True)
+ return b''.join(data).decode().splitlines(True)
def size(self):
raise IOError(errno.ENOSYS, "Not implemented")
def read(self, size, num_retries=None):
"""Read up to 'size' bytes from the stream, starting at the current file position"""
if size == 0:
- return ''
+ return b''
- data = ''
+ data = b''
available_chunks = locators_and_ranges(self.segments, self._filepos, size)
if available_chunks:
lr = available_chunks[0]
data = self._stream.readfrom(lr.locator+lr.segment_offset,
- lr.segment_size,
- num_retries=num_retries)
+ lr.segment_size,
+ num_retries=num_retries)
self._filepos += len(data)
return data
def readfrom(self, start, size, num_retries=None):
"""Read up to 'size' bytes from the stream, starting at 'start'"""
if size == 0:
- return ''
+ return b''
data = []
for lr in locators_and_ranges(self.segments, start, size):
data.append(self._stream.readfrom(lr.locator+lr.segment_offset, lr.segment_size,
num_retries=num_retries))
- return ''.join(data)
+ return b''.join(data)
def as_manifest(self):
segs = []
"""
if self._state == _BufferBlock.WRITABLE:
+ if not isinstance(data, bytes) and not isinstance(data, memoryview):
+ data = data.encode()
while (self.write_pointer+len(data)) > len(self.buffer_block):
new_buffer_block = bytearray(len(self.buffer_block) * 2)
new_buffer_block[0:self.write_pointer] = self.buffer_block[0:self.write_pointer]
self.buffer_block = None
self.buffer_view = None
+ @synchronized
+ def repack_writes(self):
+ """Optimize buffer block by repacking segments in file sequence.
+
+ When the client makes random writes, they appear in the buffer block in
+ the sequence they were written rather than the sequence they appear in
+ the file. This makes for inefficient, fragmented manifests. Attempt
+ to optimize by repacking writes in file sequence.
+
+ """
+ if self._state != _BufferBlock.WRITABLE:
+ raise AssertionError("Cannot repack non-writable block")
+
+ segs = self.owner.segments()
+
+ # Collect the segments that reference the buffer block.
+ bufferblock_segs = [s for s in segs if s.locator == self.blockid]
+
+ # Collect total data referenced by segments (could be smaller than
+ # bufferblock size if a portion of the file was written and
+ # then overwritten).
+ write_total = sum([s.range_size for s in bufferblock_segs])
+
+ if write_total < self.size() or len(bufferblock_segs) > 1:
+ # If there's more than one segment referencing this block, it is
+ # due to out-of-order writes and will produce a fragmented
+ # manifest, so try to optimize by re-packing into a new buffer.
+ contents = self.buffer_view[0:self.write_pointer].tobytes()
+ new_bb = _BufferBlock(None, write_total, None)
+ for t in bufferblock_segs:
+ new_bb.append(contents[t.segment_offset:t.segment_offset+t.range_size])
+ t.segment_offset = new_bb.size() - t.range_size
+
+ self.buffer_block = new_bb.buffer_block
+ self.buffer_view = new_bb.buffer_view
+ self.write_pointer = new_bb.write_pointer
+ self._locator = None
+ new_bb.clear()
+ self.owner.set_segments(segs)
+
def __repr__(self):
return "<BufferBlock %s>" % (self.blockid)
else:
loc = self._keep.put(bufferblock.buffer_view[0:bufferblock.write_pointer].tobytes(), copies=self.copies)
bufferblock.set_state(_BufferBlock.COMMITTED, loc)
-
except Exception as e:
bufferblock.set_state(_BufferBlock.ERROR, e)
finally:
# blocks pending. If they are full 64 MiB blocks, that means up to
# 256 MiB of internal buffering, which is the same size as the
# default download block cache in KeepClient.
- self._put_queue = Queue.Queue(maxsize=2)
+ self._put_queue = queue.Queue(maxsize=2)
self._put_threads = []
- for i in xrange(0, self.num_put_threads):
+ for i in range(0, self.num_put_threads):
thread = threading.Thread(target=self._commit_bufferblock_worker)
self._put_threads.append(thread)
thread.daemon = True
@synchronized
def start_get_threads(self):
if self._prefetch_threads is None:
- self._prefetch_queue = Queue.Queue()
+ self._prefetch_queue = queue.Queue()
self._prefetch_threads = []
- for i in xrange(0, self.num_get_threads):
+ for i in range(0, self.num_get_threads):
thread = threading.Thread(target=self._block_prefetch_worker)
self._prefetch_threads.append(thread)
thread.daemon = True
def __exit__(self, exc_type, exc_value, traceback):
self.stop_threads()
+ @synchronized
def repack_small_blocks(self, force=False, sync=False, closed_file_size=0):
"""Packs small blocks together before uploading"""
+
self._pending_write_size += closed_file_size
# Check if there are enough small blocks for filling up one in full
if not (force or (self._pending_write_size >= config.KEEP_BLOCK_SIZE)):
return
- # Search blocks ready for getting packed together before being committed to Keep.
+ # Search blocks ready for getting packed together before being
+ # committed to Keep.
# A WRITABLE block always has an owner.
- # A WRITABLE block with its owner.closed() implies that it's
+ # A WRITABLE block with its owner.closed() implies that its
# size is <= KEEP_BLOCK_SIZE/2.
- with self.lock:
- bufferblocks = self._bufferblocks.values()
-
try:
- for b in bufferblocks:
- if b.state() == _BufferBlock.WRITABLE and b.owner.closed():
- b.owner._repack_writes(0)
+ small_blocks = [b for b in listvalues(self._bufferblocks)
+ if b.state() == _BufferBlock.WRITABLE and b.owner.closed()]
except AttributeError:
# Writable blocks without owner shouldn't exist.
raise UnownedBlockError()
- with self.lock:
- small_blocks = [b for b in self._bufferblocks.values()
- if b.state() == _BufferBlock.WRITABLE and b.owner.closed()]
+ if len(small_blocks) <= 1:
+ # Not enough small blocks for repacking
+ return
- if len(small_blocks) <= 1:
- # Not enough small blocks for repacking
- return
+ for bb in small_blocks:
+ bb.repack_writes()
- # Update the pending write size count with its true value, just in case
- # some small file was opened, written and closed several times.
- self._pending_write_size = sum([b.size() for b in small_blocks])
- if self._pending_write_size < config.KEEP_BLOCK_SIZE and not force:
- return
+ # Update the pending write size count with its true value, just in case
+ # some small file was opened, written and closed several times.
+ self._pending_write_size = sum([b.size() for b in small_blocks])
- new_bb = self._alloc_bufferblock()
- files = []
- while len(small_blocks) > 0 and (new_bb.write_pointer + small_blocks[0].size()) <= config.KEEP_BLOCK_SIZE:
- bb = small_blocks.pop(0)
- self._pending_write_size -= bb.size()
- new_bb.append(bb.buffer_view[0:bb.write_pointer].tobytes())
- files.append((bb, new_bb.write_pointer - bb.size()))
-
- self.commit_bufferblock(new_bb, sync=sync)
-
- for bb, new_bb_segment_offset in files:
- newsegs = []
- for s in bb.owner.segments():
- if s.locator == bb.blockid:
- newsegs.append(Range(new_bb.locator(), s.range_start, s.range_size, new_bb_segment_offset+s.segment_offset))
- else:
- newsegs.append(s)
- bb.owner.set_segments(newsegs)
- self._delete_bufferblock(bb.blockid)
+ if self._pending_write_size < config.KEEP_BLOCK_SIZE and not force:
+ return
+
+ new_bb = self._alloc_bufferblock()
+ new_bb.owner = []
+ files = []
+ while len(small_blocks) > 0 and (new_bb.write_pointer + small_blocks[0].size()) <= config.KEEP_BLOCK_SIZE:
+ bb = small_blocks.pop(0)
+ new_bb.owner.append(bb.owner)
+ self._pending_write_size -= bb.size()
+ new_bb.append(bb.buffer_view[0:bb.write_pointer].tobytes())
+ files.append((bb, new_bb.write_pointer - bb.size()))
+
+ self.commit_bufferblock(new_bb, sync=sync)
+
+ for bb, new_bb_segment_offset in files:
+ newsegs = bb.owner.segments()
+ for s in newsegs:
+ if s.locator == bb.blockid:
+ s.locator = new_bb.blockid
+ s.segment_offset = new_bb_segment_offset+s.segment_offset
+ bb.owner.set_segments(newsegs)
+ self._delete_bufferblock(bb.blockid)
def commit_bufferblock(self, block, sync):
"""Initiate a background upload of a bufferblock.
self.repack_small_blocks(force=True, sync=True)
with self.lock:
- items = self._bufferblocks.items()
+ items = listitems(self._bufferblocks)
for k,v in items:
if v.state() != _BufferBlock.COMMITTED and v.owner:
- v.owner.flush(sync=False)
+            # Ignore blocks with a list of owners: if they're not in COMMITTED
+            # state, they're already being committed asynchronously.
+ if isinstance(v.owner, ArvadosFile):
+ v.owner.flush(sync=False)
with self.lock:
if self._put_queue is not None:
# flush again with sync=True to remove committed bufferblocks from
# the segments.
if v.owner:
- v.owner.flush(sync=True)
+ if isinstance(v.owner, ArvadosFile):
+ v.owner.flush(sync=True)
+ elif isinstance(v.owner, list) and len(v.owner) > 0:
+ # This bufferblock is referenced by many files as a result
+ # of repacking small blocks, so don't delete it when flushing
+ # its owners, just do it after flushing them all.
+ for owner in v.owner:
+ owner.flush(sync=True)
+ self.delete_bufferblock(k)
def block_prefetch(self, locator):
"""Initiate a background download of a block.
with self.lock:
if len(self._segments) != len(othersegs):
return False
- for i in xrange(0, len(othersegs)):
+ for i in range(0, len(othersegs)):
seg1 = self._segments[i]
seg2 = othersegs[i]
loc1 = seg1.locator
"""
self._writers.remove(writer)
- if flush or self.size() > config.KEEP_BLOCK_SIZE / 2:
+ if flush or self.size() > config.KEEP_BLOCK_SIZE // 2:
# File writer closed, not small enough for repacking
self.flush()
elif self.closed():
with self.lock:
if size == 0 or offset >= self.size():
- return ''
+ return b''
readsegs = locators_and_ranges(self._segments, offset, size)
prefetch = locators_and_ranges(self._segments, offset + size, config.KEEP_BLOCK_SIZE, limit=32)
self.parent._my_block_manager().block_prefetch(lr.locator)
locs.add(lr.locator)
- return ''.join(data)
-
- def _repack_writes(self, num_retries):
- """Optimize buffer block by repacking segments in file sequence.
-
- When the client makes random writes, they appear in the buffer block in
- the sequence they were written rather than the sequence they appear in
- the file. This makes for inefficient, fragmented manifests. Attempt
- to optimize by repacking writes in file sequence.
-
- """
- segs = self._segments
-
- # Collect the segments that reference the buffer block.
- bufferblock_segs = [s for s in segs if s.locator == self._current_bblock.blockid]
-
- # Collect total data referenced by segments (could be smaller than
- # bufferblock size if a portion of the file was written and
- # then overwritten).
- write_total = sum([s.range_size for s in bufferblock_segs])
-
- if write_total < self._current_bblock.size() or len(bufferblock_segs) > 1:
- # If there's more than one segment referencing this block, it is
- # due to out-of-order writes and will produce a fragmented
- # manifest, so try to optimize by re-packing into a new buffer.
- contents = self.parent._my_block_manager().get_block_contents(self._current_bblock.blockid, num_retries)
- new_bb = self.parent._my_block_manager().alloc_bufferblock(self._current_bblock.blockid, starting_capacity=write_total, owner=self)
- for t in bufferblock_segs:
- new_bb.append(contents[t.segment_offset:t.segment_offset+t.range_size])
- t.segment_offset = new_bb.size() - t.range_size
- self._current_bblock.clear()
- self._current_bblock = new_bb
+ return b''.join(data)
@must_be_writable
@synchronized
necessary.
"""
+ if not isinstance(data, bytes) and not isinstance(data, memoryview):
+ data = data.encode()
if len(data) == 0:
return
self._current_bblock = self.parent._my_block_manager().alloc_bufferblock(owner=self)
if (self._current_bblock.size() + len(data)) > config.KEEP_BLOCK_SIZE:
- self._repack_writes(num_retries)
+ self._current_bblock.repack_writes()
if (self._current_bblock.size() + len(data)) > config.KEEP_BLOCK_SIZE:
self.parent._my_block_manager().commit_bufferblock(self._current_bblock, sync=False)
self._current_bblock = self.parent._my_block_manager().alloc_bufferblock(owner=self)
if self._current_bblock and self._current_bblock.state() != _BufferBlock.COMMITTED:
if self._current_bblock.state() == _BufferBlock.WRITABLE:
- self._repack_writes(num_retries)
+ self._current_bblock.repack_writes()
if self._current_bblock.state() != _BufferBlock.DELETED:
self.parent._my_block_manager().commit_bufferblock(self._current_bblock, sync=sync)
to_delete.add(s.locator)
s.locator = bb.locator()
for s in to_delete:
- self.parent._my_block_manager().delete_bufferblock(s)
+ # Don't delete the bufferblock if it's owned by many files. It'll be
+ # deleted after all of its owners are flush()ed.
+ if self.parent._my_block_manager().get_bufferblock(s).owner is self:
+ self.parent._my_block_manager().delete_bufferblock(s)
self.parent.notify(MOD, self.parent, self.name, (self, self))
normalize=False, only_committed=False):
buf = ""
filestream = []
- for segment in self.segments():
+ for segment in self._segments:
loc = segment.locator
if self.parent._my_block_manager().is_bufferblock(loc):
if only_committed:
"""
- def __init__(self, arvadosfile, num_retries=None):
- super(ArvadosFileReader, self).__init__(arvadosfile.name, "r", num_retries=num_retries)
+ def __init__(self, arvadosfile, mode="r", num_retries=None):
+ super(ArvadosFileReader, self).__init__(arvadosfile.name, mode=mode, num_retries=num_retries)
self.arvadosfile = arvadosfile
def size(self):
data.append(rd)
self._filepos += len(rd)
rd = self.arvadosfile.readfrom(self._filepos, config.KEEP_BLOCK_SIZE, num_retries)
- return ''.join(data)
+ return b''.join(data)
else:
data = self.arvadosfile.readfrom(self._filepos, size, num_retries, exact=True)
self._filepos += len(data)
"""
def __init__(self, arvadosfile, mode, num_retries=None):
- super(ArvadosFileWriter, self).__init__(arvadosfile, num_retries=num_retries)
- self.mode = mode
+ super(ArvadosFileWriter, self).__init__(arvadosfile, mode=mode, num_retries=num_retries)
self.arvadosfile.add_writer(self)
def writable(self):
@retry_method
def write(self, data, num_retries=None):
if self.mode[0] == "a":
- self.arvadosfile.writeto(self.size(), data, num_retries)
- else:
- self.arvadosfile.writeto(self._filepos, data, num_retries)
- self._filepos += len(data)
+ self._filepos = self.size()
+ self.arvadosfile.writeto(self._filepos, data, num_retries)
+ self._filepos += len(data)
return len(data)
@_FileLikeObjectBase._before_close