20422: Do GET when cache slot is empty instead of raising an error 20422-cache-slot
authorPeter Amstutz <peter.amstutz@curii.com>
Mon, 24 Apr 2023 20:59:27 +0000 (16:59 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Mon, 24 Apr 2023 20:59:27 +0000 (16:59 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

sdk/python/arvados/keep.py

index 6804f355a8cdc12217e59f7d259fe9852d91aac4..8658774cbb6544950209d588d73867d20a423869 100644 (file)
@@ -1174,21 +1174,37 @@ class KeepClient(object):
         try:
             locator = KeepLocator(loc_s)
             if method == "GET":
-                slot, first = self.block_cache.reserve_cache(locator.md5sum)
-                if not first:
+                while slot is None:
+                    slot, first = self.block_cache.reserve_cache(locator.md5sum)
+                    if first:
+                        # Fresh and empty "first time it is used" slot
+                        break
                     if prefetch:
-                        # this is request for a prefetch, if it is
-                        # already in flight, return immediately.
-                        # clear 'slot' to prevent finally block from
-                        # calling slot.set()
+                        # this is request for a prefetch to fill in
+                        # the cache, don't need to wait for the
+                        # result, so if it is already in flight return
+                        # immediately.  Clear 'slot' to prevent
+                        # finally block from calling slot.set()
                         slot = None
                         return None
-                    self.hits_counter.add(1)
+
                     blob = slot.get()
-                    if blob is None:
-                        raise arvados.errors.KeepReadError(
-                            "failed to read {}".format(loc_s))
-                    return blob
+                    if blob is not None:
+                        self.hits_counter.add(1)
+                        return blob
+
+                    # If blob is None, this means either
+                    #
+                    # (a) another thread was fetching this block and
+                    # failed with an error or
+                    #
+                    # (b) cache thrashing caused the slot to be
+                    # evicted (content set to None) by another thread
+                    # between the call to reserve_cache() and get().
+                    #
+                    # We'll handle these cases by reserving a new slot
+                    # and then doing a full GET request.
+                    slot = None
 
             self.misses_counter.add(1)