18842: Clean up keep cache set logic a little more

[arvados.git] / sdk / python / arvados / keep.py
diff --git a/sdk/python/arvados/keep.py b/sdk/python/arvados/keep.py

index 0018687ff35a585c33ce07378acb7f05e0b98522..8dbad2abb18aeb99a39ab896490751651c45c6b3 100644 (file)
--- a/sdk/python/arvados/keep.py
+++ b/sdk/python/arvados/keep.py
@@ -15,6 +15,7 @@ from builtins import object
  import collections
  import datetime
  import hashlib
  import collections
  import datetime
  import hashlib
+import errno
  import io
  import logging
  import math
  import io
  import logging
  import math
@@ -26,8 +27,10 @@ import socket
  import ssl
  import sys
  import threading
  import ssl
  import sys
  import threading
+import resource
  from . import timer
  import urllib.parse
  from . import timer
  import urllib.parse
+import traceback
  
  if sys.version_info >= (3, 0):
      from io import BytesIO
  
  if sys.version_info >= (3, 0):
      from io import BytesIO
@@ -39,6 +42,7 @@ import arvados.config as config
  import arvados.errors
  import arvados.retry as retry
  import arvados.util
  import arvados.errors
  import arvados.retry as retry
  import arvados.util
+import arvados.diskcache
  
  _logger = logging.getLogger('arvados.keep')
  global_client_object = None
  
  _logger = logging.getLogger('arvados.keep')
  global_client_object = None
@@ -159,7 +163,6 @@ class Keep(object):
                 config.get('ARVADOS_API_TOKEN'),
                 config.flag_is_true('ARVADOS_API_HOST_INSECURE'),
                 config.get('ARVADOS_KEEP_PROXY'),
                 config.get('ARVADOS_API_TOKEN'),
                 config.flag_is_true('ARVADOS_API_HOST_INSECURE'),
                 config.get('ARVADOS_KEEP_PROXY'),
-               config.get('ARVADOS_EXTERNAL_CLIENT') == 'true',
                 os.environ.get('KEEP_LOCAL_STORE'))
          if (global_client_object is None) or (cls._last_key != key):
              global_client_object = KeepClient()
                 os.environ.get('KEEP_LOCAL_STORE'))
          if (global_client_object is None) or (cls._last_key != key):
              global_client_object = KeepClient()
@@ -175,11 +178,50 @@ class Keep(object):
          return Keep.global_client_object().put(data, **kwargs)
  
  class KeepBlockCache(object):
          return Keep.global_client_object().put(data, **kwargs)
  
  class KeepBlockCache(object):
-    # Default RAM cache is 256MiB
-    def __init__(self, cache_max=(256 * 1024 * 1024)):
+    def __init__(self, cache_max=0, max_slots=0, disk_cache=False, disk_cache_dir=None):
          self.cache_max = cache_max
          self._cache = []
          self._cache_lock = threading.Lock()
          self.cache_max = cache_max
          self._cache = []
          self._cache_lock = threading.Lock()
+        self._max_slots = max_slots
+        self._disk_cache = disk_cache
+        self._disk_cache_dir = disk_cache_dir
+
+        if self._disk_cache and self._disk_cache_dir is None:
+            self._disk_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "arvados", "keep")
+            os.makedirs(self._disk_cache_dir, mode=0o700, exist_ok=True)
+
+        if self._max_slots == 0:
+            if self._disk_cache:
+                # Default max slots to half of the soft limit on open
+                # file descriptors.  RLIMIT_NOFILE typically defaults to
+                # 1024 on Linux, so this will be 512 slots.
+                self._max_slots = resource.getrlimit(resource.RLIMIT_NOFILE)[0] / 2
+            else:
+                # RAM cache slots
+                self._max_slots = 512
+
+        if self.cache_max == 0:
+            if self._disk_cache:
+                fs = os.statvfs(self._disk_cache_dir)
+                # Calculation of available space incorporates existing cache usage
+                existing_usage = arvados.diskcache.DiskCacheSlot.cache_usage(self._disk_cache_dir)
+                avail = (fs.f_bavail * fs.f_bsize + existing_usage) / 4
+                maxdisk = int((fs.f_blocks * fs.f_bsize) * 0.10)
+                # pick smallest of:
+                # 10% of total disk size
+                # 25% of available space
+                # max_slots * 64 MiB
+                self.cache_max = min(min(maxdisk, avail), (self._max_slots * 64 * 1024 * 1024))
+            else:
+                # 256 MiB in RAM
+                self.cache_max = (256 * 1024 * 1024)
+
+        self.cache_max = max(self.cache_max, 64 * 1024 * 1024)
+
+        if self._disk_cache:
+            self._cache = arvados.diskcache.DiskCacheSlot.init_cache(self._disk_cache_dir, self._max_slots)
+            self.cap_cache()
+
  
      class CacheSlot(object):
          __slots__ = ("locator", "ready", "content")
  
      class CacheSlot(object):
          __slots__ = ("locator", "ready", "content")
@@ -203,6 +245,9 @@ class KeepBlockCache(object):
              else:
                  return len(self.content)
  
              else:
                  return len(self.content)
  
+        def evict(self):
+            return True
+
      def cap_cache(self):
          '''Cap the cache size to self.cache_max'''
          with self._cache_lock:
      def cap_cache(self):
          '''Cap the cache size to self.cache_max'''
          with self._cache_lock:
@@ -210,12 +255,27 @@ class KeepBlockCache(object):
              # None (that means there was an error reading the block).
              self._cache = [c for c in self._cache if not (c.ready.is_set() and c.content is None)]
              sm = sum([slot.size() for slot in self._cache])
              # None (that means there was an error reading the block).
              self._cache = [c for c in self._cache if not (c.ready.is_set() and c.content is None)]
              sm = sum([slot.size() for slot in self._cache])
-            while len(self._cache) > 0 and sm > self.cache_max:
+            while len(self._cache) > 0 and (sm > self.cache_max or len(self._cache) > self._max_slots):
                  for i in range(len(self._cache)-1, -1, -1):
                  for i in range(len(self._cache)-1, -1, -1):
+                    # start from the back, find a slot that is a candidate to evict
                      if self._cache[i].ready.is_set():
                      if self._cache[i].ready.is_set():
+                        sz = self._cache[i].size()
+
+                        # If evict returns false it means the
+                        # underlying disk cache couldn't lock the file
+                        # for deletion because another process was using
+                        # it. Don't count it as reducing the amount
+                        # of data in the cache, find something else to
+                        # throw out.
+                        if self._cache[i].evict():
+                            sm -= sz
+
+                        # either way we forget about it.  either the
+                        # other process will delete it, or if we need
+                        # it again and it is still there, we'll find
+                        # it on disk.
                          del self._cache[i]
                          break
                          del self._cache[i]
                          break
-                sm = sum([slot.size() for slot in self._cache])
  
      def _get(self, locator):
          # Test if the locator is already in the cache
  
      def _get(self, locator):
          # Test if the locator is already in the cache
@@ -227,6 +287,12 @@ class KeepBlockCache(object):
                      del self._cache[i]
                      self._cache.insert(0, n)
                  return n
                      del self._cache[i]
                      self._cache.insert(0, n)
                  return n
+        if self._disk_cache:
+            # see if it exists on disk
+            n = arvados.diskcache.DiskCacheSlot.get_from_disk(locator, self._disk_cache_dir)
+            if n is not None:
+                self._cache.insert(0, n)
+                return n
          return None
  
      def get(self, locator):
          return None
  
      def get(self, locator):
@@ -242,10 +308,54 @@ class KeepBlockCache(object):
                  return n, False
              else:
                  # Add a new cache slot for the locator
                  return n, False
              else:
                  # Add a new cache slot for the locator
-                n = KeepBlockCache.CacheSlot(locator)
+                if self._disk_cache:
+                    n = arvados.diskcache.DiskCacheSlot(locator, self._disk_cache_dir)
+                else:
+                    n = KeepBlockCache.CacheSlot(locator)
                  self._cache.insert(0, n)
                  return n, True
  
                  self._cache.insert(0, n)
                  return n, True
  
+    def set(self, slot, blob):
+        try:
+            slot.set(blob)
+            return
+        except OSError as e:
+            if e.errno == errno.ENOMEM:
+                # Reduce max slots to current - 4, cap cache and retry
+                with self._cache_lock:
+                    self._max_slots = max(4, len(self._cache) - 4)
+            elif e.errno == errno.ENOSPC:
+                # Reduce disk max space to current - 256 MiB, cap cache and retry
+                with self._cache_lock:
+                    sm = sum([st.size() for st in self._cache])
+                    self.cache_max = max((256 * 1024 * 1024), sm - (256 * 1024 * 1024))
+            elif e.errno == errno.ENODEV:
+                _logger.error("Unable to use disk cache: The underlying filesystem does not support memory mapping.")
+        except Exception as e:
+            pass
+        finally:
+            # Check if we should evict things from the cache.  Either
+            # because we added a new thing or there was an error and
+            # we possibly adjusted the limits down, so we might need
+            # to push something out.
+            self.cap_cache()
+
+        try:
+            # Only gets here if there was an error the first time. The
+            # exception handler adjusts limits downward in some cases
+            # to free up resources, which may allow this retry to
+            # succeed.
+            slot.set(blob)
+            self.cap_cache()
+        except Exception as e:
+            # It failed again.  Give up.
+            raise arvados.errors.KeepCacheError("Unable to save block %s to disk cache: %s" % (slot.locator, e))
+        finally:
+            # Set the notice that we are done with the cache
+            # slot one way or another.
+            slot.ready.set()
+
+
  class Counter(object):
      def __init__(self, v=0):
          self._lk = threading.Lock()
  class Counter(object):
      def __init__(self, v=0):
          self._lk = threading.Lock()
@@ -376,6 +486,7 @@ class KeepClient(object):
                      curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
                      if self.insecure:
                          curl.setopt(pycurl.SSL_VERIFYPEER, 0)
                      curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
                      if self.insecure:
                          curl.setopt(pycurl.SSL_VERIFYPEER, 0)
+                        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
                      else:
                          curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
                      if method == "HEAD":
                      else:
                          curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
                      if method == "HEAD":
@@ -478,6 +589,7 @@ class KeepClient(object):
                      curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
                      if self.insecure:
                          curl.setopt(pycurl.SSL_VERIFYPEER, 0)
                      curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
                      if self.insecure:
                          curl.setopt(pycurl.SSL_VERIFYPEER, 0)
+                        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
                      else:
                          curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
                      self._setcurltimeouts(curl, timeout)
                      else:
                          curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
                      self._setcurltimeouts(curl, timeout)
@@ -1034,9 +1146,10 @@ class KeepClient(object):
          else:
              return None
  
          else:
              return None
  
-    def get_from_cache(self, loc):
+    def get_from_cache(self, loc_s):
          """Fetch a block only if is in the cache, otherwise return None."""
          """Fetch a block only if is in the cache, otherwise return None."""
-        slot = self.block_cache.get(loc)
+        locator = KeepLocator(loc_s)
+        slot = self.block_cache.get(locator.md5sum)
          if slot is not None and slot.ready.is_set():
              return slot.get()
          else:
          if slot is not None and slot.ready.is_set():
              return slot.get()
          else:
@@ -1055,7 +1168,7 @@ class KeepClient(object):
      def get(self, loc_s, **kwargs):
          return self._get_or_head(loc_s, method="GET", **kwargs)
  
      def get(self, loc_s, **kwargs):
          return self._get_or_head(loc_s, method="GET", **kwargs)
  
-    def _get_or_head(self, loc_s, method="GET", num_retries=None, request_id=None, headers=None):
+    def _get_or_head(self, loc_s, method="GET", num_retries=None, request_id=None, headers=None, prefetch=False):
          """Get data from Keep.
  
          This method fetches one or more blocks of data from Keep.  It
          """Get data from Keep.
  
          This method fetches one or more blocks of data from Keep.  It
@@ -1094,6 +1207,13 @@ class KeepClient(object):
              if method == "GET":
                  slot, first = self.block_cache.reserve_cache(locator.md5sum)
                  if not first:
              if method == "GET":
                  slot, first = self.block_cache.reserve_cache(locator.md5sum)
                  if not first:
+                    if prefetch:
+                        # this is a request for a prefetch; if it is
+                        # already in flight, return immediately.
+                        # clear 'slot' to prevent finally block from
+                        # calling slot.set()
+                        slot = None
+                        return None
                      self.hits_counter.add(1)
                      blob = slot.get()
                      if blob is None:
                      self.hits_counter.add(1)
                      blob = slot.get()
                      if blob is None:
@@ -1161,8 +1281,7 @@ class KeepClient(object):
                  return blob
          finally:
              if slot is not None:
                  return blob
          finally:
              if slot is not None:
-                slot.set(blob)
-                self.block_cache.cap_cache()
+                self.block_cache.set(slot, blob)
  
          # Q: Including 403 is necessary for the Keep tests to continue
          # passing, but maybe they should expect KeepReadError instead?
  
          # Q: Including 403 is necessary for the Keep tests to continue
          # passing, but maybe they should expect KeepReadError instead?
@@ -1330,6 +1449,3 @@ class KeepClient(object):
              return True
          if os.path.exists(os.path.join(self.local_store, locator.md5sum)):
              return True
              return True
          if os.path.exists(os.path.join(self.local_store, locator.md5sum)):
              return True
-
-    def is_cached(self, locator):
-        return self.block_cache.reserve_cache(expect_hash)