import ssl
import sys
import threading
+import resource
from . import timer
import urllib.parse
import arvados.errors
import arvados.retry as retry
import arvados.util
+import arvados.diskcache
_logger = logging.getLogger('arvados.keep')
global_client_object = None
config.get('ARVADOS_API_TOKEN'),
config.flag_is_true('ARVADOS_API_HOST_INSECURE'),
config.get('ARVADOS_KEEP_PROXY'),
- config.get('ARVADOS_EXTERNAL_CLIENT') == 'true',
os.environ.get('KEEP_LOCAL_STORE'))
if (global_client_object is None) or (cls._last_key != key):
global_client_object = KeepClient()
return Keep.global_client_object().put(data, **kwargs)
class KeepBlockCache(object):
    def __init__(self, cache_max=0, max_slots=0, disk_cache=False, disk_cache_dir=None):
        """Initialize the Keep block cache.

        :cache_max: maximum cache size in bytes.  0 means choose a default
          based on the backing store (disk or RAM).

        :max_slots: maximum number of cached blocks.  0 means choose a
          default (half the NOFILE limit for a disk cache, 512 for RAM).

        :disk_cache: if True, cache blocks in files on disk instead of RAM.

        :disk_cache_dir: directory for the disk cache; defaults to
          ~/.cache/arvados/keep when disk_cache is enabled.
        """
        self.cache_max = cache_max
        self._cache = []
        self._cache_lock = threading.Lock()
        self._max_slots = max_slots
        self._disk_cache = disk_cache
        self._disk_cache_dir = disk_cache_dir

        if self._disk_cache and self._disk_cache_dir is None:
            self._disk_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "arvados", "keep")
            os.makedirs(self._disk_cache_dir, mode=0o700, exist_ok=True)

        if self._max_slots == 0:
            if self._disk_cache:
                # Each cached block holds an open file handle, so default
                # max slots to half of the maximum file handles.  NOFILE
                # typically defaults to 1024 on Linux so this will be 512
                # slots.  Use floor division: a slot count must be an int.
                self._max_slots = resource.getrlimit(resource.RLIMIT_NOFILE)[0] // 2
            else:
                # RAM cache slots
                self._max_slots = 512

        if self.cache_max == 0:
            if self._disk_cache:
                fs = os.statvfs(self._disk_cache_dir)
                # pick smallest of:
                #   25% of available space
                #   10% of total disk size
                #   max_slots * 64 MiB
                avail = (fs.f_bavail * fs.f_bsize) // 4
                maxdisk = int((fs.f_blocks * fs.f_bsize) * 0.10)
                self.cache_max = min(maxdisk, avail, self._max_slots * 64 * 1024 * 1024)
            else:
                # Default RAM cache is 256 MiB
                self.cache_max = (256 * 1024 * 1024)

        # Never allow the cache to be smaller than 64 MiB (the maximum
        # size of a single Keep block).
        self.cache_max = max(self.cache_max, 64 * 1024 * 1024)

        if self._disk_cache:
            # Populate the cache from blocks already on disk, then trim
            # it to the configured limits.
            self._cache = arvados.diskcache.DiskCacheSlot.init_cache(self._disk_cache_dir, self._max_slots)
            self.cap_cache()
class CacheSlot(object):
__slots__ = ("locator", "ready", "content")
else:
return len(self.content)
def evict(self):
    """Evict this slot; RAM slots need no cleanup, so always succeed."""
    return True
def cap_cache(self):
    """Cap the cache size to self.cache_max and self._max_slots."""
    with self._cache_lock:
        # Discard slots whose read finished with an error (ready is
        # set but there is no content to serve).
        self._cache = [c for c in self._cache
                       if not (c.ready.is_set() and c.content is None)]
        total = sum(slot.size() for slot in self._cache)
        while self._cache and (total > self.cache_max or len(self._cache) > self._max_slots):
            # Starting from the back, find a slot that is a candidate
            # to evict.
            for idx in range(len(self._cache)-1, -1, -1):
                if self._cache[idx].ready.is_set():
                    slot_size = self._cache[idx].size()

                    # If evict returns false it means the underlying
                    # disk cache couldn't lock the file for deletion
                    # because another process was using it.  Don't
                    # count it as reducing the amount of data in the
                    # cache, find something else to throw out.
                    if self._cache[idx].evict():
                        total -= slot_size

                    # Either way we forget about it: either the other
                    # process will delete it, or if we need it again
                    # and it is still there, we'll find it on disk.
                    del self._cache[idx]
                    break
def _get(self, locator):
# Test if the locator is already in the cache
del self._cache[i]
self._cache.insert(0, n)
return n
+ if self._disk_cache:
+ # see if it exists on disk
+ n = arvados.diskcache.DiskCacheSlot.get_from_disk(locator, self._disk_cache_dir)
+ if n is not None:
+ self._cache.insert(0, n)
+ return n
return None
def get(self, locator):
return n, False
else:
# Add a new cache slot for the locator
- n = KeepBlockCache.CacheSlot(locator)
+ if self._disk_cache:
+ n = arvados.diskcache.DiskCacheSlot(locator, self._disk_cache_dir)
+ else:
+ n = KeepBlockCache.CacheSlot(locator)
self._cache.insert(0, n)
return n, True
curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
if self.insecure:
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
+ curl.setopt(pycurl.SSL_VERIFYHOST, 0)
else:
curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
if method == "HEAD":
curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
if self.insecure:
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
+ curl.setopt(pycurl.SSL_VERIFYHOST, 0)
else:
curl.setopt(pycurl.CAINFO, arvados.util.ca_certs_path())
self._setcurltimeouts(curl, timeout)
else:
return None
def get_from_cache(self, loc_s):
    """Fetch a block only if it is already in the cache; otherwise return None."""
    # Cache slots are keyed by the bare md5 hash, so parse the full
    # locator string first.
    locator = KeepLocator(loc_s)
    slot = self.block_cache.get(locator.md5sum)
    if slot is None or not slot.ready.is_set():
        return None
    return slot.get()
def get(self, loc_s, **kwargs):
return self._get_or_head(loc_s, method="GET", **kwargs)
- def _get_or_head(self, loc_s, method="GET", num_retries=None, request_id=None, headers=None):
+ def _get_or_head(self, loc_s, method="GET", num_retries=None, request_id=None, headers=None, prefetch=False):
"""Get data from Keep.
This method fetches one or more blocks of data from Keep. It
self.get_counter.add(1)
+ request_id = (request_id or
+ (hasattr(self, 'api_client') and self.api_client.request_id) or
+ arvados.util.new_request_id())
+ if headers is None:
+ headers = {}
+ headers['X-Request-Id'] = request_id
+
slot = None
blob = None
try:
if method == "GET":
slot, first = self.block_cache.reserve_cache(locator.md5sum)
if not first:
+ if prefetch:
+ # this is request for a prefetch, if it is
+ # already in flight, return immediately.
+ # clear 'slot' to prevent finally block from
+ # calling slot.set()
+ slot = None
+ return None
self.hits_counter.add(1)
blob = slot.get()
if blob is None:
self.misses_counter.add(1)
- if headers is None:
- headers = {}
- headers['X-Request-Id'] = (request_id or
- (hasattr(self, 'api_client') and self.api_client.request_id) or
- arvados.util.new_request_id())
-
# If the locator has hints specifying a prefix (indicating a
# remote keepproxy) or the UUID of a local gateway service,
# read data from the indicated service(s) instead of the usual
for key in sorted_roots)
if not roots_map:
raise arvados.errors.KeepReadError(
- "failed to read {}: no Keep services available ({})".format(
- loc_s, loop.last_result()))
+ "[{}] failed to read {}: no Keep services available ({})".format(
+ request_id, loc_s, loop.last_result()))
elif not_founds == len(sorted_roots):
raise arvados.errors.NotFoundError(
- "{} not found".format(loc_s), service_errors)
+ "[{}] {} not found".format(request_id, loc_s), service_errors)
else:
raise arvados.errors.KeepReadError(
- "failed to read {} after {}".format(loc_s, loop.attempts_str()), service_errors, label="service")
+ "[{}] failed to read {} after {}".format(request_id, loc_s, loop.attempts_str()), service_errors, label="service")
@retry.retry_method
def put(self, data, copies=2, num_retries=None, request_id=None, classes=None):
return loc_s
locator = KeepLocator(loc_s)
+ request_id = (request_id or
+ (hasattr(self, 'api_client') and self.api_client.request_id) or
+ arvados.util.new_request_id())
headers = {
- 'X-Request-Id': (request_id or
- (hasattr(self, 'api_client') and self.api_client.request_id) or
- arvados.util.new_request_id()),
+ 'X-Request-Id': request_id,
'X-Keep-Desired-Replicas': str(copies),
}
roots_map = {}
return writer_pool.response()
if not roots_map:
raise arvados.errors.KeepWriteError(
- "failed to write {}: no Keep services available ({})".format(
- data_hash, loop.last_result()))
+ "[{}] failed to write {}: no Keep services available ({})".format(
+ request_id, data_hash, loop.last_result()))
else:
service_errors = ((key, roots_map[key].last_result()['error'])
for key in sorted_roots
if roots_map[key].last_result()['error'])
raise arvados.errors.KeepWriteError(
- "failed to write {} after {} (wanted {} copies but wrote {})".format(
- data_hash, loop.attempts_str(), (copies, classes), writer_pool.done()), service_errors, label="service")
+ "[{}] failed to write {} after {} (wanted {} copies but wrote {})".format(
+ request_id, data_hash, loop.attempts_str(), (copies, classes), writer_pool.done()), service_errors, label="service")
def local_store_put(self, data, copies=1, num_retries=None, classes=[]):
"""A stub for put().
return True
if os.path.exists(os.path.join(self.local_store, locator.md5sum)):
return True
-
- def is_cached(self, locator):
- return self.block_cache.reserve_cache(expect_hash)