11308: Merge branch 'master' into 11308-python3
[arvados.git] / sdk / python / arvados / arvfile.py
index c52c7727aef048cfb1d3af21e18589f590a9ec26..b9faa11c20874eb47fea29c39c57e887c8120586 100644 (file)
@@ -107,11 +107,23 @@ class ArvadosFileReaderBase(_FileLikeObjectBase):
             pos += self._filepos
         elif whence == os.SEEK_END:
             pos += self.size()
-        self._filepos = min(max(pos, 0), self.size())
+        if pos < 0:
+            raise IOError(errno.EINVAL, "Tried to seek to negative file offset.")
+        self._filepos = pos
+        return self._filepos
 
     def tell(self):
         return self._filepos
 
+    def readable(self):
+        return True
+
+    def writable(self):
+        return False
+
+    def seekable(self):
+        return True
+
     @_FileLikeObjectBase._before_close
     @retry_method
     def readall(self, size=2**20, num_retries=None):
@@ -183,13 +195,13 @@ class ArvadosFileReaderBase(_FileLikeObjectBase):
         return b''.join(data).decode().splitlines(True)
 
     def size(self):
-        raise NotImplementedError()
+        raise IOError(errno.ENOSYS, "Not implemented")
 
     def read(self, size, num_retries=None):
-        raise NotImplementedError()
+        raise IOError(errno.ENOSYS, "Not implemented")
 
     def readfrom(self, start, size, num_retries=None):
-        raise NotImplementedError()
+        raise IOError(errno.ENOSYS, "Not implemented")
 
 
 class StreamFileReader(ArvadosFileReaderBase):
@@ -441,6 +453,7 @@ class _BlockManager(object):
         self.copies = copies
         self._pending_write_size = 0
         self.threads_lock = threading.Lock()
+        self.padding_block = None
 
     @synchronized
     def alloc_bufferblock(self, blockid=None, starting_capacity=2**14, owner=None):
@@ -666,6 +679,22 @@ class _BlockManager(object):
     def get_bufferblock(self, locator):
         return self._bufferblocks.get(locator)
 
+    @synchronized
+    def get_padding_block(self):
+        """Get a bufferblock 64 MB in size consisting of all zeros, used as padding
+        when using truncate() to extend the size of a file.
+
+        For reference (and possible future optimization), the md5sum of the
+        padding block is: 7f614da9329cd3aebf59b91aadc30bf0+67108864
+
+        """
+
+        if self.padding_block is None:
+            self.padding_block = self._alloc_bufferblock(starting_capacity=config.KEEP_BLOCK_SIZE)
+            self.padding_block.write_pointer = config.KEEP_BLOCK_SIZE
+            self.commit_bufferblock(self.padding_block, False)
+        return self.padding_block
+
     @synchronized
     def delete_bufferblock(self, locator):
         self._delete_bufferblock(locator)
@@ -913,11 +942,11 @@ class ArvadosFile(object):
     @must_be_writable
     @synchronized
     def truncate(self, size):
-        """Shrink the size of the file.
+        """Shrink or expand the size of the file.
 
         If `size` is less than the size of the file, the file contents after
         `size` will be discarded.  If `size` is greater than the current size
-        of the file, an IOError will be raised.
+        of the file, it will be filled with zero bytes.
 
         """
         if size < self.size():
@@ -938,7 +967,17 @@ class ArvadosFile(object):
             self._segments = new_segs
             self.set_committed(False)
         elif size > self.size():
-            raise IOError(errno.EINVAL, "truncate() does not support extending the file size")
+            padding = self.parent._my_block_manager().get_padding_block()
+            diff = size - self.size()
+            while diff > config.KEEP_BLOCK_SIZE:
+                self._segments.append(Range(padding.blockid, self.size(), config.KEEP_BLOCK_SIZE, 0))
+                diff -= config.KEEP_BLOCK_SIZE
+            if diff > 0:
+                self._segments.append(Range(padding.blockid, self.size(), diff, 0))
+            self.set_committed(False)
+        else:
+            # size == self.size()
+            pass
 
     def readfrom(self, offset, size, num_retries, exact=False):
         """Read up to `size` bytes from the file starting at `offset`.
@@ -974,23 +1013,28 @@ class ArvadosFile(object):
         return b''.join(data)
 
     def _repack_writes(self, num_retries):
-        """Test if the buffer block has more data than actual segments.
+        """Optimize buffer block by repacking segments in file sequence.
 
-        This happens when a buffered write over-writes a file range written in
-        a previous buffered write.  Re-pack the buffer block for efficiency
-        and to avoid leaking information.
+        When the client makes random writes, they appear in the buffer block in
+        the sequence they were written rather than the sequence they appear in
+        the file.  This makes for inefficient, fragmented manifests.  Attempt
+        to optimize by repacking writes in file sequence.
 
         """
         segs = self._segments
 
-        # Sum up the segments to get the total bytes of the file referencing
-        # into the buffer block.
+        # Collect the segments that reference the buffer block.
         bufferblock_segs = [s for s in segs if s.locator == self._current_bblock.blockid]
+
+        # Collect total data referenced by segments (could be smaller than
+        # bufferblock size if a portion of the file was written and
+        # then overwritten).
         write_total = sum([s.range_size for s in bufferblock_segs])
 
-        if write_total < self._current_bblock.size():
-            # There is more data in the buffer block than is actually accounted for by segments, so
-            # re-pack into a new buffer by copying over to a new buffer block.
+        if write_total < self._current_bblock.size() or len(bufferblock_segs) > 1:
+            # If there's more than one segment referencing this block, it is
+            # due to out-of-order writes and will produce a fragmented
+            # manifest, so try to optimize by re-packing into a new buffer.
             contents = self.parent._my_block_manager().get_block_contents(self._current_bblock.blockid, num_retries)
             new_bb = self.parent._my_block_manager().alloc_bufferblock(self._current_bblock.blockid, starting_capacity=write_total, owner=self)
             for t in bufferblock_segs:
@@ -1014,7 +1058,7 @@ class ArvadosFile(object):
             return
 
         if offset > self.size():
-            raise ArgumentError("Offset is past the end of the file")
+            self.truncate(offset)
 
         if len(data) > config.KEEP_BLOCK_SIZE:
             # Chunk it up into smaller writes
@@ -1197,6 +1241,9 @@ class ArvadosFileWriter(ArvadosFileReader):
         super(ArvadosFileWriter, self).__init__(arvadosfile, mode=mode, num_retries=num_retries)
         self.arvadosfile.add_writer(self)
 
+    def writable(self):
+        return True
+
     @_FileLikeObjectBase._before_close
     @retry_method
     def write(self, data, num_retries=None):
@@ -1218,8 +1265,6 @@ class ArvadosFileWriter(ArvadosFileReader):
         if size is None:
             size = self._filepos
         self.arvadosfile.truncate(size)
-        if self._filepos > self.size():
-            self._filepos = self.size()
 
     @_FileLikeObjectBase._before_close
     def flush(self):