import subprocess
import sys
import time
+import resource
import arvados.commands._util as arv_cmd
from arvados_fuse import crunchstat
# NOTE(review): this is a unified-diff hunk, not runnable Python. Lines
# prefixed '-' are removed by the patch, '+' are added, unprefixed lines are
# unchanged context (leading indentation has been stripped by extraction).
# The hunk is the interior of the mount ArgumentParser's option definitions;
# the enclosing method header is outside this view.
#
# First two lines are the tail of an add_argument() call whose option name
# (presumably one of the --mount-tmp style options) is cut off above.
type=str, metavar='PATH', action='append', default=[],
help="Create a new collection, mount it in read/write mode at mountpoint/PATH, and delete it when unmounting.")
+
self.add_argument('--debug', action='store_true', help="""Debug mode""")
self.add_argument('--logfile', help="""Write debug logs and errors to the specified file (default stderr).""")
self.add_argument('--foreground', action='store_true', help="""Run in foreground (default is to daemonize unless --exec specified)""", default=False)
self.add_argument('--encoding', type=str, help="Character encoding to use for filesystem, default is utf-8 (see Python codec registry for list of available encodings)", default="utf-8")
# Patch: --file-cache default changes from a hard 256 MiB to 0. Per the
# comment added later in _setup_api, 0 is a sentinel meaning "let
# KeepBlockCache choose a default based on whether disk_cache is enabled".
- self.add_argument('--file-cache', type=int, help="File data cache size, in bytes (default 256MiB)", default=256*1024*1024)
- self.add_argument('--directory-cache', type=int, help="Directory data cache size, in bytes (default 128MiB)", default=128*1024*1024)
+ self.add_argument('--file-cache', type=int, help="File data cache size, in bytes (default 8 GiB for disk-based cache or 256 MiB with RAM-only cache)", default=0)
+ self.add_argument('--directory-cache', type=int, help="Directory data cache size, in bytes (default 128 MiB)", default=128*1024*1024)
+
# Patch: new mutually exclusive --ram-cache / --disk-cache switches sharing
# dest='disk_cache' (disk cache is the default). NOTE(review): default=True
# is declared on both actions of the group — redundant but harmless, since
# argparse applies the last-seen default for a shared dest.
+ cachetype = self.add_mutually_exclusive_group()
+ cachetype.add_argument('--ram-cache', action='store_false', dest='disk_cache', help="Use in-memory caching only", default=True)
+ cachetype.add_argument('--disk-cache', action='store_true', dest='disk_cache', help="Use disk based caching (default)", default=True)
+
+ self.add_argument('--disk-cache-dir', type=str, help="Disk cache location (default ~/.cache/arvados/keep)", default=None)
self.add_argument('--disable-event-listening', action='store_true', help="Don't subscribe to events on the API server", dest="disable_event_listening", default=False)
self.add_argument('--read-only', action='store_false', help="Mount will be read only (default)", dest="enable_write", default=False)
self.add_argument('--read-write', action='store_true', help="Mount will be read-write", dest="enable_write", default=False)
# Patch: --storage-classes takes a comma-separated string; it is split into
# a list later, in the _setup_mount hunk below.
+ self.add_argument('--storage-classes', type=str, metavar='CLASSES', help="Specify comma separated list of storage classes to be used when saving data of new collections", default=None)
self.add_argument('--crunchstat-interval', type=float, help="Write stats to stderr every N seconds (default disabled)", default=0)
self.add_argument('--unmount-timeout',
type=float, default=2.0,
help="Time to wait for graceful shutdown after --exec program exits and filesystem is unmounted")
-
# Patch: new --filters option parsed via arv_cmd.JSONArgument with
# arv_cmd.validate_filters (accepts a JSON string or a path to a JSON file).
+ self.add_argument(
+ '--filters',
+ type=arv_cmd.JSONArgument(arv_cmd.validate_filters),
+ help="""Filters to apply to all project, shared, and tag directory
+contents. Pass filters as either a JSON string or a path to a JSON file.
+The JSON object should be a list of filters in Arvados API list filter syntax.
+""")
self.add_argument('--exec', type=str, nargs=argparse.REMAINDER,
dest="exec_args", metavar=('command', 'args', '...', '--'),
help="""Mount, run a command, then unmount and exit""")
# NOTE(review): diff hunk inside the mounter's initialization sequence (the
# enclosing method header is outside this view). The patch splits the single
# setup try/except into two and inserts an RLIMIT_NOFILE adjustment between
# logging setup and API/mount setup.
try:
self._setup_logging()
+ except Exception as e:
+ self.logger.exception("exception during setup: %s", e)
+ exit(1)
+
# Raise the soft open-file limit so keepclient can open enough cache files.
# Per the inline comment, keepclient allocates RLIMIT_NOFILE / 8 cache slots
# of 64 MiB each, so meeting --file-cache requires
# (file_cache / 64 MiB) * 8 file handles; otherwise a floor of 10240 is used.
+ try:
+ nofile_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
+
+ minlimit = 10240
+ if self.args.file_cache:
+ # Adjust the file handle limit so it can meet
+ # the desired cache size. Multiply by 8 because the
+ # number of 64 MiB cache slots that keepclient
+ # allocates is RLIMIT_NOFILE / 8
+ minlimit = int((self.args.file_cache/(64*1024*1024)) * 8)
+
# Only the soft limit is raised, clamped to the existing hard limit
# (an unprivileged process cannot raise the hard limit).
+ if nofile_limit[0] < minlimit:
+ resource.setrlimit(resource.RLIMIT_NOFILE, (min(minlimit, nofile_limit[1]), nofile_limit[1]))
+
+ if minlimit > nofile_limit[1]:
+ self.logger.warning("file handles required to meet --file-cache (%s) exceeds hard file handle limit (%s), cache size will be smaller than requested", minlimit, nofile_limit[1])
+
# Best-effort: failure to adjust the limit is logged but not fatal.
+ except Exception as e:
+ self.logger.warning("unable to adjust file handle limit: %s", e)
+
# NOTE(review): the message says "based on available disk" but the value
# logged is derived from RLIMIT_NOFILE ((soft_limit // 8) * 64 MiB) —
# wording worth confirming against the disk-cache sizing logic.
+ nofile_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
+ self.logger.info("file cache capped at %s bytes or less based on available disk (RLIMIT_NOFILE is %s)", ((nofile_limit[0]//8)*64*1024*1024), nofile_limit)
+
+ try:
self._setup_api()
self._setup_mount()
except Exception as e:
# Patch: drops the "arv-mount: " prefix from the log message.
- self.logger.exception("arv-mount: exception during setup: %s", e)
+ self.logger.exception("exception during setup: %s", e)
exit(1)
def __enter__(self):
# NOTE(review): diff hunk covering _setup_api. The patch replaces the plain
# KeepBlockCache(file_cache) with a disk-cache-aware block cache, caps the
# number of prefetch threads, adds num_retries to the API client parameters,
# and pins the API version to 'v1'.
def _setup_api(self):
try:
+ # default value of file_cache is 0, this tells KeepBlockCache to
+ # choose a default based on whether disk_cache is enabled or not.
+
+ block_cache = arvados.keep.KeepBlockCache(cache_max=self.args.file_cache,
+ disk_cache=self.args.disk_cache,
+ disk_cache_dir=self.args.disk_cache_dir)
+
+ # If there's too many prefetch threads and you
+ # max out the CPU, delivering data to the FUSE
+ # layer actually ends up being slower.
+ # Experimentally, capping 7 threads seems to
+ # be a sweet spot.
# Thread count = clamp(cache_max // 64 MiB - 1, 1, 7); reads the
# resolved cache_max off the constructed block_cache, not the raw arg.
+ prefetch_threads = min(max((block_cache.cache_max // (64 * 1024 * 1024)) - 1, 1), 7)
+
self.api = arvados.safeapi.ThreadSafeApiCache(
apiconfig=arvados.config.settings(),
+ api_params={
+ 'num_retries': self.args.retries,
+ },
keep_params={
- 'block_cache': arvados.keep.KeepBlockCache(self.args.file_cache),
+ 'block_cache': block_cache,
+ 'num_prefetch_threads': prefetch_threads,
'num_retries': self.args.retries,
- })
+ },
+ version='v1',
+ )
# Missing ARVADOS_* configuration surfaces as KeyError from
# arvados.config.settings(); treated as a fatal setup error.
except KeyError as e:
self.logger.error("Missing environment: %s", e)
exit(1)
# NOTE(review): diff hunk from the interior of _setup_mount (the def line is
# outside this view and the hunk is truncated mid-statement at the end).
# The patch threads enable_write and --filters through the directory
# constructors and passes the parsed --storage-classes to writable mounts.
usr = self.api.users().current().execute(num_retries=self.args.retries)
now = time.time()
dir_class = None
# Patch: dir_args grows two positional entries — enable_write and filters —
# so every *dir_args callee below must accept the extended signature.
- dir_args = [llfuse.ROOT_INODE, self.operations.inodes, self.api, self.args.retries]
+ dir_args = [
+ llfuse.ROOT_INODE,
+ self.operations.inodes,
+ self.api,
+ self.args.retries,
+ self.args.enable_write,
+ self.args.filters,
+ ]
mount_readme = False
# Patch: '--storage-classes a, b' becomes ['a', 'b'] (spaces stripped
# before splitting on commas); None when the option was not given.
+ storage_classes = None
+ if self.args.storage_classes is not None:
+ storage_classes = self.args.storage_classes.replace(' ', '').split(',')
+ self.logger.info("Storage classes requested for new collections: {}".format(', '.join(storage_classes)))
+
if self.args.collection is not None:
# Set up the request handler with the collection at the root
# First check that the collection is readable
mount_readme = True
if dir_class is not None:
# Patch: TagsDirectory and CollectionDirectory do not take a
# storage_classes kwarg, so it is only passed to the other classes.
- ent = dir_class(*dir_args)
+ if dir_class in [TagsDirectory, CollectionDirectory]:
+ ent = dir_class(*dir_args)
+ else:
+ ent = dir_class(*dir_args, storage_classes=storage_classes)
self.operations.inodes.add_entry(ent)
self.listen_for_events = ent.want_event_subscribe()
return
# Patch: the root Directory constructor likewise gains enable_write and
# filters arguments.
e = self.operations.inodes.add_entry(Directory(
- llfuse.ROOT_INODE, self.operations.inodes, self.api.config))
+ llfuse.ROOT_INODE,
+ self.operations.inodes,
+ self.api.config,
+ self.args.enable_write,
+ self.args.filters,
+ ))
dir_args[0] = e.inode
for name in self.args.mount_by_id:
- self._add_mount(e, name, MagicDirectory(*dir_args, pdh_only=False))
+ self._add_mount(e, name, MagicDirectory(*dir_args, pdh_only=False, storage_classes=storage_classes))
# NOTE(review): the pdh_only=True MagicDirectory and TagsDirectory mounts
# are left without storage_classes — presumably deliberate (read-only /
# unsupported), consistent with the dir_class branch above; worth confirming.
for name in self.args.mount_by_pdh:
self._add_mount(e, name, MagicDirectory(*dir_args, pdh_only=True))
for name in self.args.mount_by_tag:
self._add_mount(e, name, TagsDirectory(*dir_args))
for name in self.args.mount_home:
- self._add_mount(e, name, ProjectDirectory(*dir_args, project_object=usr, poll=True))
+ self._add_mount(e, name, ProjectDirectory(*dir_args, project_object=usr, poll=True, storage_classes=storage_classes))
for name in self.args.mount_shared:
- self._add_mount(e, name, SharedDirectory(*dir_args, exclude=usr, poll=True))
+ self._add_mount(e, name, SharedDirectory(*dir_args, exclude=usr, poll=True, storage_classes=storage_classes))
for name in self.args.mount_tmp:
- self._add_mount(e, name, TmpCollectionDirectory(*dir_args))
+ self._add_mount(e, name, TmpCollectionDirectory(*dir_args, storage_classes=storage_classes))
if mount_readme:
# Truncated here: the _readme_text(...) call continues past this view.
text = self._readme_text(