21639: Reenable prefetch, but not on every read()
authorPeter Amstutz <peter.amstutz@curii.com>
Fri, 5 Apr 2024 22:53:37 +0000 (18:53 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Fri, 5 Apr 2024 23:06:30 +0000 (19:06 -0400)
Only do prefetch every 128 invocations of read().

This should dramatically reduce the overhead of computing prefetch
while still getting some or moste of the benefits of prefetching.

Indeed, benchmarking suggests that this prefetching strategy, by
advising the kernel to map blocks into RAM, may actually improve
throughput on the high end.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/python/arvados/arvfile.py
sdk/python/arvados/keep.py
services/fuse/arvados_fuse/command.py

index 0cc7d25a331871c88860357853d1a21898eae965..666efb078d58e3a798e9721613919f0d0bf82469 100644 (file)
@@ -825,7 +825,7 @@ class ArvadosFile(object):
     """
 
     __slots__ = ('parent', 'name', '_writers', '_committed',
-                 '_segments', 'lock', '_current_bblock', 'fuse_entry')
+                 '_segments', 'lock', '_current_bblock', 'fuse_entry', '_read_counter')
 
     def __init__(self, parent, name, stream=[], segments=[]):
         """
@@ -846,6 +846,7 @@ class ArvadosFile(object):
         for s in segments:
             self._add_segment(stream, s.locator, s.range_size)
         self._current_bblock = None
+        self._read_counter = 0
 
     def writable(self):
         return self.parent.writable()
@@ -1060,8 +1061,11 @@ class ArvadosFile(object):
             if size == 0 or offset >= self.size():
                 return b''
             readsegs = locators_and_ranges(self._segments, offset, size)
-            if self.parent._my_block_manager()._keep.num_prefetch_threads > 0:
-                prefetch = locators_and_ranges(self._segments, offset + size, config.KEEP_BLOCK_SIZE * self.parent._my_block_manager()._keep.num_prefetch_threads, limit=32)
+            prefetch = None
+            if self.parent._my_block_manager()._keep.num_prefetch_threads > 0 and (self._read_counter % 128) == 0:
+                prefetch = locators_and_ranges(self._segments, offset + size, config.KEEP_BLOCK_SIZE * self.parent._my_block_manager()._keep.num_prefetch_threads,
+                                               limit=(1+self.parent._my_block_manager()._keep.num_prefetch_threads))
+            self._read_counter += 1
 
         locs = set()
         data = []
@@ -1074,7 +1078,7 @@ class ArvadosFile(object):
             else:
                 break
 
-        if self.parent._my_block_manager()._keep.num_prefetch_threads > 0:
+        if prefetch:
             for lr in prefetch:
                 if lr.locator not in locs:
                     self.parent._my_block_manager().block_prefetch(lr.locator)
index a8246210793e6964ca89ea6f5f0f0d7a5ac4d497..d1be6b931e7b0ea1ae8009076a0c684aedaa3a2b 100644 (file)
@@ -1181,6 +1181,8 @@ class KeepClient(object):
                         # result, so if it is already in flight return
                         # immediately.  Clear 'slot' to prevent
                         # finally block from calling slot.set()
+                        if slot.ready.is_set():
+                            slot.get()
                         slot = None
                         return None
 
index 1398b92e8797c6bcdf540df3423df9ed62154e3c..f52121d862b60ed1e16ba94dc594a5f1a32feffc 100644 (file)
@@ -490,13 +490,6 @@ class Mount(object):
                                                       disk_cache=self.args.disk_cache,
                                                       disk_cache_dir=self.args.disk_cache_dir)
 
-            # Profiling indicates that prefetching has more of a
-            # negative impact on the read() fast path (by requiring it
-            # to do more work and take additional locks) than benefit.
-            # Also, the kernel does some readahead itself, which has a
-            # similar effect.
-            prefetch_threads = 0
-
             self.api = arvados.safeapi.ThreadSafeApiCache(
                 apiconfig=arvados.config.settings(),
                 api_params={
@@ -504,7 +497,6 @@ class Mount(object):
                 },
                 keep_params={
                     'block_cache': block_cache,
-                    'num_prefetch_threads': prefetch_threads,
                     'num_retries': self.args.retries,
                 },
                 version='v1',