21452: Add --filters option to arv-mount
authorBrett Smith <brett.smith@curii.com>
Mon, 12 Feb 2024 02:02:19 +0000 (21:02 -0500)
committerBrett Smith <brett.smith@curii.com>
Mon, 12 Feb 2024 02:05:02 +0000 (21:05 -0500)
This allows users to specify arbitrary API filters for the queries
arv-mount uses to build directory listings. This gives users a way to
filter out unwanted entries from a mount, similar to the filtering in
Workbench.

Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith@curii.com>

sdk/python/arvados/commands/_util.py
sdk/python/tests/test_cmd_util.py [new file with mode: 0644]
services/api/test/fixtures/collections.yml
services/api/test/fixtures/groups.yml
services/fuse/arvados_fuse/command.py
services/fuse/arvados_fuse/fusedir.py
services/fuse/tests/mount_test_base.py
services/fuse/tests/test_mount_filters.py [new file with mode: 0644]

index c267932e66176b332a6c6494736c6de966657e20..6c792b2e0d54d7f1e25ffa9850723b4dc9289cc0 100644 (file)
@@ -4,11 +4,21 @@
 
 import argparse
 import errno
+import json
 import logging
 import os
+import re
 import signal
 import sys
 
+FILTER_STR_RE = re.compile(r'''
+^\(
+\ *(\w+)
+\ *(<|<=|=|>=|>)
+\ *(\w+)
+\ *\)$
+''', re.ASCII | re.VERBOSE)
+
 def _pos_int(s):
     num = int(s)
     if num < 0:
@@ -62,3 +72,87 @@ def install_signal_handlers():
 def restore_signal_handlers():
     for sigcode, orig_handler in orig_signal_handlers.items():
         signal.signal(sigcode, orig_handler)
+
+def validate_filters(filters):
+    """Validate user-provided filters
+
+    This function validates that a user-defined object represents valid
+    Arvados filters that can be passed to an API client: that it's a list of
+    3-element lists with the field name and operator given as strings. If any
+    of these conditions are not true, it raises a ValueError with details about
+    the problem.
+
+    It returns validated filters. Currently the provided filters are returned
+    unmodified. Future versions of this function may clean up the filters with
+    "obvious" type conversions, so callers SHOULD use the returned value for
+    Arvados API calls.
+    """
+    if not isinstance(filters, list):
+        raise ValueError(f"filters are not a list: {filters!r}")
+    for index, f in enumerate(filters):
+        if isinstance(f, str):
+            match = FILTER_STR_RE.fullmatch(f)
+            if match is None:
+                raise ValueError(f"filter at index {index} has invalid syntax: {f!r}")
+            s, op, o = match.groups()
+            if s[0].isdigit():
+                raise ValueError(f"filter at index {index} has invalid syntax: bad field name {s!r}")
+            if o[0].isdigit():
+                raise ValueError(f"filter at index {index} has invalid syntax: bad field name {o!r}")
+            continue
+        elif not isinstance(f, list):
+            raise ValueError(f"filter at index {index} is not a string or list: {f!r}")
+        try:
+            s, op, o = f
+        except ValueError:
+            raise ValueError(
+                f"filter at index {index} does not have three items (field name, operator, operand): {f!r}",
+            ) from None
+        if not isinstance(s, str):
+            raise ValueError(f"filter at index {index} field name is not a string: {s!r}")
+        if not isinstance(op, str):
+            raise ValueError(f"filter at index {index} operator is not a string: {op!r}")
+    return filters
+
+
+class JSONArgument:
+    """Parse a JSON file from a command line argument string or path
+
+    JSONArgument objects can be called with a string and return an arbitrary
+    object. First it will try to decode the string as JSON. If that fails, it
+    will try to open a file at the path named by the string, and decode it as
+    JSON. If that fails, it raises ValueError with more detail.
+
+    This is designed to be used as an argparse argument type.
+    Typical usage looks like:
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--object', type=JSONArgument(), ...)
+
+    You can construct JSONArgument with an optional validation function. If
+    given, it is called with the object decoded from user input, and its
+    return value replaces it. It should raise ValueError if there is a problem
+    with the input. (argparse turns ValueError into a useful error message.)
+
+        filters_type = JSONArgument(validate_filters)
+        parser.add_argument('--filters', type=filters_type, ...)
+    """
+    def __init__(self, validator=None):
+        self.validator = validator
+
+    def __call__(self, value):
+        try:
+            retval = json.loads(value)
+        except json.JSONDecodeError:
+            try:
+                with open(value, 'rb') as json_file:
+                    retval = json.load(json_file)
+            except json.JSONDecodeError as error:
+                raise ValueError(f"error decoding JSON from file {value!r}: {error}") from None
+            except (FileNotFoundError, ValueError):
+                raise ValueError(f"not a valid JSON string or file path: {value!r}") from None
+            except OSError as error:
+                raise ValueError(f"error reading JSON file path {value!r}: {error.strerror}") from None
+        if self.validator is not None:
+            retval = self.validator(retval)
+        return retval
diff --git a/sdk/python/tests/test_cmd_util.py b/sdk/python/tests/test_cmd_util.py
new file mode 100644 (file)
index 0000000..ffd45aa
--- /dev/null
@@ -0,0 +1,194 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import contextlib
+import copy
+import itertools
+import json
+import os
+import tempfile
+import unittest
+
+from pathlib import Path
+
+from parameterized import parameterized
+
+import arvados.commands._util as cmd_util
+
+FILE_PATH = Path(__file__)
+
+class ValidateFiltersTestCase(unittest.TestCase):
+    # Values that are not valid as a filter field name or operator.
+    NON_FIELD_TYPES = [
+        None,
+        123,
+        ('name', '=', 'tuple'),
+        {'filters': ['name', '=', 'object']},
+    ]
+    NON_FILTER_TYPES = NON_FIELD_TYPES + ['string']
+    VALID_FILTERS = [
+        ['owner_uuid', '=', 'zzzzz-tpzed-12345abcde67890'],
+        ['name', 'in', ['foo', 'bar']],
+        '(replication_desired > replication_confirmed)',
+        '(replication_confirmed>=replication_desired)',
+    ]
+
+    @parameterized.expand(itertools.combinations(VALID_FILTERS, 2))
+    def test_valid_filters(self, f1, f2):
+        # Deep-copy the input so we catch any mutation by validate_filters.
+        expected = [f1, f2]
+        actual = cmd_util.validate_filters(copy.deepcopy(expected))
+        self.assertEqual(actual, expected)
+
+    @parameterized.expand([(t,) for t in NON_FILTER_TYPES])
+    def test_filters_wrong_type(self, value):
+        with self.assertRaisesRegex(ValueError, r'^filters are not a list\b'):
+            cmd_util.validate_filters(value)
+
+    @parameterized.expand([(t,) for t in NON_FIELD_TYPES])
+    def test_single_filter_wrong_type(self, value):
+        with self.assertRaisesRegex(ValueError, r'^filter at index 0 is not a string or list\b'):
+            cmd_util.validate_filters([value])
+
+    @parameterized.expand([
+        ([],),
+        (['owner_uuid'],),
+        (['owner_uuid', 'zzzzz-tpzed-12345abcde67890'],),
+        (['name', 'not in', 'foo', 'bar'],),
+        (['name', 'in', 'foo', 'bar', 'baz'],),
+    ])
+    def test_filters_wrong_arity(self, value):
+        with self.assertRaisesRegex(ValueError, r'^filter at index 0 does not have three items\b'):
+            cmd_util.validate_filters([value])
+
+    @parameterized.expand(itertools.product(
+        [0, 1],
+        NON_FIELD_TYPES,
+    ))
+    def test_filter_definition_wrong_type(self, index, bad_value):
+        value = ['owner_uuid', '=', 'zzzzz-tpzed-12345abcde67890']
+        value[index] = bad_value
+        name = ('field name', 'operator')[index]
+        with self.assertRaisesRegex(ValueError, rf'^filter at index 0 {name} is not a string\b'):
+            cmd_util.validate_filters([value])
+
+    @parameterized.expand([
+        # Not enclosed in parentheses
+        'foo = bar',
+        '(foo) < bar',
+        'foo > (bar)',
+        # Not exactly one operator
+        '(a >= b >= c)',
+        '(foo)',
+        '(file_count version)',
+        # Invalid field identifiers
+        '(version = 1)',
+        '(2 = file_count)',
+        '(replication.desired <= replication.confirmed)',
+        # Invalid whitespace
+        '(file_count\t=\tversion)',
+        '(file_count >= version\n)',
+    ])
+    def test_invalid_string_filter(self, value):
+        with self.assertRaisesRegex(ValueError, r'^filter at index 0 has invalid syntax\b'):
+            cmd_util.validate_filters([value])
+
+
+class JSONArgumentTestCase(unittest.TestCase):
+    JSON_OBJECTS = [
+        None,
+        123,
+        456.789,
+        'string',
+        ['list', 1],
+        {'object': True, 'yaml': False},
+    ]
+
+    @classmethod
+    def setUpClass(cls):
+        cls.json_file = tempfile.NamedTemporaryFile(
+            'w+',
+            encoding='utf-8',
+            prefix='argtest',
+            suffix='.json',
+        )
+        cls.parser = cmd_util.JSONArgument()
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.json_file.close()
+
+    def setUp(self):
+        self.json_file.seek(0)
+        self.json_file.truncate()
+
+    @parameterized.expand((obj,) for obj in JSON_OBJECTS)
+    def test_valid_argument_string(self, obj):
+        actual = self.parser(json.dumps(obj))
+        self.assertEqual(actual, obj)
+
+    @parameterized.expand((obj,) for obj in JSON_OBJECTS)
+    def test_valid_argument_path(self, obj):
+        json.dump(obj, self.json_file)
+        self.json_file.flush()
+        actual = self.parser(self.json_file.name)
+        self.assertEqual(actual, obj)
+
+    @parameterized.expand([
+        '',
+        '\0',
+        None,
+    ])
+    def test_argument_not_json_or_path(self, value):
+        if value is None:
+            with tempfile.NamedTemporaryFile() as gone_file:
+                value = gone_file.name
+        with self.assertRaisesRegex(ValueError, r'\bnot a valid JSON string or file path\b'):
+            self.parser(value)
+
+    @parameterized.expand([
+        FILE_PATH.parent,
+        FILE_PATH / 'nonexistent.json',
+        None,
+    ])
+    def test_argument_path_unreadable(self, path):
+        if path is None:
+            bad_file = tempfile.NamedTemporaryFile()
+            os.chmod(bad_file.fileno(), 0o000)
+            path = bad_file.name
+            @contextlib.contextmanager
+            def ctx():
+                try:
+                    yield
+                finally:
+                    os.chmod(bad_file.fileno(), 0o600)
+        else:
+            ctx = contextlib.nullcontext
+        with self.assertRaisesRegex(ValueError, rf'^error reading JSON file path {str(path)!r}: '), ctx():
+            self.parser(str(path))
+
+    @parameterized.expand([
+        FILE_PATH,
+        None,
+    ])
+    def test_argument_path_not_json(self, path):
+        if path is None:
+            path = self.json_file.name
+        with self.assertRaisesRegex(ValueError, rf'^error decoding JSON from file {str(path)!r}'):
+            self.parser(str(path))
+
+
+class JSONArgumentValidationTestCase(unittest.TestCase):
+    @parameterized.expand((obj,) for obj in JSONArgumentTestCase.JSON_OBJECTS)
+    def test_object_returned_from_validator(self, value):
+        parser = cmd_util.JSONArgument(lambda _: copy.deepcopy(value))
+        self.assertEqual(parser('{}'), value)
+
+    @parameterized.expand((obj,) for obj in JSONArgumentTestCase.JSON_OBJECTS)
+    def test_exception_raised_from_validator(self, value):
+        json_value = json.dumps(value)
+        def raise_func(_):
+            raise ValueError(json_value)
+        parser = cmd_util.JSONArgument(raise_func)
+        with self.assertRaises(ValueError) as exc_check:
+            parser(json_value)
+        self.assertEqual(exc_check.exception.args, (json_value,))
index 5a3242e4ffce028bb95df3e9beaaa36f9bf5059f..0c5e9e987aa69d72baf8d008c629841e501a7958 100644 (file)
@@ -220,6 +220,51 @@ foo_collection_in_aproject:
   manifest_text: ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n"
   name: "zzzzz-4zz18-fy296fx3hot09f7 added sometime"
 
+fuse_filters_test_foo:
+  uuid: zzzzz-4zz18-4e2kjqv891jl3p3
+  current_version_uuid: zzzzz-4zz18-4e2kjqv891jl3p3
+  portable_data_hash: 1f4b0bc7583c2a7f9102c395f4ffc5e3+45
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-000000000000000
+  owner_uuid: zzzzz-tpzed-fusefiltertest1
+  created_at: 2024-02-09T12:01:00Z
+  modified_at: 2024-02-09T12:01:01Z
+  updated_at: 2024-02-09T12:01:01Z
+  manifest_text: ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n"
+  name: foo
+  properties:
+    MainFile: foo
+
+fuse_filters_test_bar:
+  uuid: zzzzz-4zz18-qpxqtq2wbjnu630
+  current_version_uuid: zzzzz-4zz18-qpxqtq2wbjnu630
+  portable_data_hash: fa7aeb5140e2848d39b416daeef4ffc5+45
+  owner_uuid: zzzzz-tpzed-000000000000000
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-d9tiejq69daie8f
+  created_at: 2024-02-09T12:02:00Z
+  modified_at: 2024-02-09T12:02:01Z
+  updated_at: 2024-02-09T12:02:01Z
+  manifest_text: ". 37b51d194a7513e45b56f6524f2d51f2+3 0:3:bar\n"
+  name: bar
+  properties:
+    MainFile: bar
+
+fuse_filters_test_baz:
+  uuid: zzzzz-4zz18-ls97ezovrkkpfxz
+  current_version_uuid: zzzzz-4zz18-ls97ezovrkkpfxz
+  portable_data_hash: ea10d51bcf88862dbcc36eb292017dfd+45
+  owner_uuid: zzzzz-tpzed-000000000000000
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-d9tiejq69daie8f
+  created_at: 2024-02-09T12:03:00Z
+  modified_at: 2024-02-09T12:03:01Z
+  updated_at: 2024-02-09T12:03:01Z
+  manifest_text: ". 73feffa4b7f6bb68e44cf984c85f6e88+3 0:3:baz\n"
+  name: baz
+  properties:
+    MainFile: baz
+
 user_agreement_in_anonymously_accessible_project:
   uuid: zzzzz-4zz18-uukreo9rbgwsujr
   current_version_uuid: zzzzz-4zz18-uukreo9rbgwsujr
index 9a2dc169b63aec6ff8d624bf4128c69483e0ce3b..9034ac6ee7d2dd72928388b51b4461bff2814af8 100644 (file)
@@ -172,6 +172,17 @@ afiltergroup5:
   properties:
     filters: [["collections.properties.listprop","contains","elem1"],["uuid", "is_a", "arvados#collection"]]
 
+fuse_filters_test_project:
+  uuid: zzzzz-j7d0g-fusefiltertest1
+  owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  created_at: 2024-02-09T12:00:00Z
+  modified_at: 2024-02-09T12:00:01Z
+  updated_at: 2024-02-09T12:00:01Z
+  name: FUSE Filters Test Project 1
+  group_class: project
+
 future_project_viewing_group:
   uuid: zzzzz-j7d0g-futrprojviewgrp
   owner_uuid: zzzzz-tpzed-000000000000000
index 9c607c7f0c3dbf97ffdfa0f98a76e4489731c71f..610da477cad48e4dcf63d3f019c597256174a759 100644 (file)
@@ -117,7 +117,13 @@ class ArgumentParser(argparse.ArgumentParser):
         self.add_argument('--unmount-timeout',
                           type=float, default=2.0,
                           help="Time to wait for graceful shutdown after --exec program exits and filesystem is unmounted")
-
+        self.add_argument(
+            '--filters',
+            type=arv_cmd.JSONArgument(arv_cmd.validate_filters),
+            help="""Filters to apply to all project, shared, and tag directory
+contents. Pass filters as either a JSON string or a path to a JSON file.
+The JSON object should be a list of filters in Arvados API list filter syntax.
+""")
         self.add_argument('--exec', type=str, nargs=argparse.REMAINDER,
                             dest="exec_args", metavar=('command', 'args', '...', '--'),
                             help="""Mount, run a command, then unmount and exit""")
@@ -300,7 +306,14 @@ class Mount(object):
         usr = self.api.users().current().execute(num_retries=self.args.retries)
         now = time.time()
         dir_class = None
-        dir_args = [llfuse.ROOT_INODE, self.operations.inodes, self.api, self.args.retries, self.args.enable_write]
+        dir_args = [
+            llfuse.ROOT_INODE,
+            self.operations.inodes,
+            self.api,
+            self.args.retries,
+            self.args.enable_write,
+            self.args.filters,
+        ]
         mount_readme = False
 
         storage_classes = None
@@ -366,7 +379,12 @@ class Mount(object):
             return
 
         e = self.operations.inodes.add_entry(Directory(
-            llfuse.ROOT_INODE, self.operations.inodes, self.api.config, self.args.enable_write))
+            llfuse.ROOT_INODE,
+            self.operations.inodes,
+            self.api.config,
+            self.args.enable_write,
+            self.args.filters,
+        ))
         dir_args[0] = e.inode
 
         for name in self.args.mount_by_id:
index 8faf01cb6c4a4ddc58c31ddbe224360870d7026b..e3b8dd4c2cca29616626dab55f6d440c22b58f51 100644 (file)
@@ -36,7 +36,7 @@ class Directory(FreshBase):
     and the value referencing a File or Directory object.
     """
 
-    def __init__(self, parent_inode, inodes, apiconfig, enable_write):
+    def __init__(self, parent_inode, inodes, apiconfig, enable_write, filters):
         """parent_inode is the integer inode number"""
 
         super(Directory, self).__init__()
@@ -50,6 +50,19 @@ class Directory(FreshBase):
         self._entries = {}
         self._mtime = time.time()
         self._enable_write = enable_write
+        self._filters = filters or []
+
+    def _filters_for(self, subtype, *, qualified):
+        for f in self._filters:
+            f_type, _, f_name = f[0].partition('.')
+            if not f_name:
+                yield f
+            elif f_type != subtype:
+                pass
+            elif qualified:
+                yield f
+            else:
+                yield [f_name, *f[1:]]
 
     def forward_slash_subst(self):
         if not hasattr(self, '_fsns'):
@@ -270,8 +283,8 @@ class CollectionDirectoryBase(Directory):
 
     """
 
-    def __init__(self, parent_inode, inodes, apiconfig, enable_write, collection, collection_root):
-        super(CollectionDirectoryBase, self).__init__(parent_inode, inodes, apiconfig, enable_write)
+    def __init__(self, parent_inode, inodes, apiconfig, enable_write, filters, collection, collection_root):
+        super(CollectionDirectoryBase, self).__init__(parent_inode, inodes, apiconfig, enable_write, filters)
         self.apiconfig = apiconfig
         self.collection = collection
         self.collection_root = collection_root
@@ -287,7 +300,15 @@ class CollectionDirectoryBase(Directory):
             item.fuse_entry.dead = False
             self._entries[name] = item.fuse_entry
         elif isinstance(item, arvados.collection.RichCollectionBase):
-            self._entries[name] = self.inodes.add_entry(CollectionDirectoryBase(self.inode, self.inodes, self.apiconfig, self._enable_write, item, self.collection_root))
+            self._entries[name] = self.inodes.add_entry(CollectionDirectoryBase(
+                self.inode,
+                self.inodes,
+                self.apiconfig,
+                self._enable_write,
+                self._filters,
+                item,
+                self.collection_root,
+            ))
             self._entries[name].populate(mtime)
         else:
             self._entries[name] = self.inodes.add_entry(FuseArvadosFile(self.inode, item, mtime, self._enable_write))
@@ -434,8 +455,8 @@ class CollectionDirectoryBase(Directory):
 class CollectionDirectory(CollectionDirectoryBase):
     """Represents the root of a directory tree representing a collection."""
 
-    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, collection_record=None, explicit_collection=None):
-        super(CollectionDirectory, self).__init__(parent_inode, inodes, api.config, enable_write, None, self)
+    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, filters=None, collection_record=None, explicit_collection=None):
+        super(CollectionDirectory, self).__init__(parent_inode, inodes, api.config, enable_write, filters, None, self)
         self.api = api
         self.num_retries = num_retries
         self._poll = True
@@ -637,7 +658,7 @@ class TmpCollectionDirectory(CollectionDirectoryBase):
         def save_new(self):
             pass
 
-    def __init__(self, parent_inode, inodes, api_client, num_retries, enable_write, storage_classes=None):
+    def __init__(self, parent_inode, inodes, api_client, num_retries, enable_write, filters=None, storage_classes=None):
         collection = self.UnsaveableCollection(
             api_client=api_client,
             keep_client=api_client.keep,
@@ -646,7 +667,7 @@ class TmpCollectionDirectory(CollectionDirectoryBase):
         # This is always enable_write=True because it never tries to
         # save to the backend
         super(TmpCollectionDirectory, self).__init__(
-            parent_inode, inodes, api_client.config, True, collection, self)
+            parent_inode, inodes, api_client.config, True, filters, collection, self)
         self.populate(self.mtime())
 
     def on_event(self, *args, **kwargs):
@@ -742,8 +763,8 @@ and the directory will appear if it exists.
 
 """.lstrip()
 
-    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, pdh_only=False, storage_classes=None):
-        super(MagicDirectory, self).__init__(parent_inode, inodes, api.config, enable_write)
+    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, filters, pdh_only=False, storage_classes=None):
+        super(MagicDirectory, self).__init__(parent_inode, inodes, api.config, enable_write, filters)
         self.api = api
         self.num_retries = num_retries
         self.pdh_only = pdh_only
@@ -759,8 +780,14 @@ and the directory will appear if it exists.
             # If we're the root directory, add an identical by_id subdirectory.
             if self.inode == llfuse.ROOT_INODE:
                 self._entries['by_id'] = self.inodes.add_entry(MagicDirectory(
-                    self.inode, self.inodes, self.api, self.num_retries, self._enable_write,
-                    self.pdh_only))
+                    self.inode,
+                    self.inodes,
+                    self.api,
+                    self.num_retries,
+                    self._enable_write,
+                    self._filters,
+                    self.pdh_only,
+                ))
 
     def __contains__(self, k):
         if k in self._entries:
@@ -774,15 +801,34 @@ and the directory will appear if it exists.
 
             if group_uuid_pattern.match(k):
                 project = self.api.groups().list(
-                    filters=[['group_class', 'in', ['project','filter']], ["uuid", "=", k]]).execute(num_retries=self.num_retries)
+                    filters=[
+                        ['group_class', 'in', ['project','filter']],
+                        ["uuid", "=", k],
+                        *self._filters_for('groups', qualified=False),
+                    ],
+                ).execute(num_retries=self.num_retries)
                 if project[u'items_available'] == 0:
                     return False
                 e = self.inodes.add_entry(ProjectDirectory(
-                    self.inode, self.inodes, self.api, self.num_retries, self._enable_write,
-                    project[u'items'][0], storage_classes=self.storage_classes))
+                    self.inode,
+                    self.inodes,
+                    self.api,
+                    self.num_retries,
+                    self._enable_write,
+                    self._filters,
+                    project[u'items'][0],
+                    storage_classes=self.storage_classes,
+                ))
             else:
                 e = self.inodes.add_entry(CollectionDirectory(
-                        self.inode, self.inodes, self.api, self.num_retries, self._enable_write, k))
+                    self.inode,
+                    self.inodes,
+                    self.api,
+                    self.num_retries,
+                    self._enable_write,
+                    self._filters,
+                    k,
+                ))
 
             if e.update():
                 if k not in self._entries:
@@ -816,8 +862,8 @@ and the directory will appear if it exists.
 class TagsDirectory(Directory):
     """A special directory that contains as subdirectories all tags visible to the user."""
 
-    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, poll_time=60):
-        super(TagsDirectory, self).__init__(parent_inode, inodes, api.config, enable_write)
+    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, filters, poll_time=60):
+        super(TagsDirectory, self).__init__(parent_inode, inodes, api.config, enable_write, filters)
         self.api = api
         self.num_retries = num_retries
         self._poll = True
@@ -831,15 +877,32 @@ class TagsDirectory(Directory):
     def update(self):
         with llfuse.lock_released:
             tags = self.api.links().list(
-                filters=[['link_class', '=', 'tag'], ["name", "!=", ""]],
-                select=['name'], distinct=True, limit=1000
-                ).execute(num_retries=self.num_retries)
+                filters=[
+                    ['link_class', '=', 'tag'],
+                    ['name', '!=', ''],
+                    *self._filters_for('links', qualified=False),
+                ],
+                select=['name'],
+                distinct=True,
+                limit=1000,
+            ).execute(num_retries=self.num_retries)
         if "items" in tags:
-            self.merge(tags['items']+[{"name": n} for n in self._extra],
-                       lambda i: i['name'],
-                       lambda a, i: a.tag == i['name'],
-                       lambda i: TagDirectory(self.inode, self.inodes, self.api, self.num_retries, self._enable_write,
-                                              i['name'], poll=self._poll, poll_time=self._poll_time))
+            self.merge(
+                tags['items']+[{"name": n} for n in self._extra],
+                lambda i: i['name'],
+                lambda a, i: a.tag == i['name'],
+                lambda i: TagDirectory(
+                    self.inode,
+                    self.inodes,
+                    self.api,
+                    self.num_retries,
+                    self._enable_write,
+                    self._filters,
+                    i['name'],
+                    poll=self._poll,
+                    poll_time=self._poll_time,
+                ),
+            )
 
     @use_counter
     @check_update
@@ -848,7 +911,12 @@ class TagsDirectory(Directory):
             return super(TagsDirectory, self).__getitem__(item)
         with llfuse.lock_released:
             tags = self.api.links().list(
-                filters=[['link_class', '=', 'tag'], ['name', '=', item]], limit=1
+                filters=[
+                    ['link_class', '=', 'tag'],
+                    ['name', '=', item],
+                    *self._filters_for('links', qualified=False),
+                ],
+                limit=1,
             ).execute(num_retries=self.num_retries)
         if tags["items"]:
             self._extra.add(item)
@@ -873,9 +941,9 @@ class TagDirectory(Directory):
     to the user that are tagged with a particular tag.
     """
 
-    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, tag,
+    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, filters, tag,
                  poll=False, poll_time=60):
-        super(TagDirectory, self).__init__(parent_inode, inodes, api.config, enable_write)
+        super(TagDirectory, self).__init__(parent_inode, inodes, api.config, enable_write, filters)
         self.api = api
         self.num_retries = num_retries
         self.tag = tag
@@ -889,23 +957,36 @@ class TagDirectory(Directory):
     def update(self):
         with llfuse.lock_released:
             taggedcollections = self.api.links().list(
-                filters=[['link_class', '=', 'tag'],
-                         ['name', '=', self.tag],
-                         ['head_uuid', 'is_a', 'arvados#collection']],
-                select=['head_uuid']
-                ).execute(num_retries=self.num_retries)
-        self.merge(taggedcollections['items'],
-                   lambda i: i['head_uuid'],
-                   lambda a, i: a.collection_locator == i['head_uuid'],
-                   lambda i: CollectionDirectory(self.inode, self.inodes, self.api, self.num_retries, self._enable_write, i['head_uuid']))
+                filters=[
+                    ['link_class', '=', 'tag'],
+                    ['name', '=', self.tag],
+                    ['head_uuid', 'is_a', 'arvados#collection'],
+                    *self._filters_for('links', qualified=False),
+                ],
+                select=['head_uuid'],
+            ).execute(num_retries=self.num_retries)
+        self.merge(
+            taggedcollections['items'],
+            lambda i: i['head_uuid'],
+            lambda a, i: a.collection_locator == i['head_uuid'],
+            lambda i: CollectionDirectory(
+                self.inode,
+                self.inodes,
+                self.api,
+                self.num_retries,
+                self._enable_write,
+                self._filters,
+                i['head_uuid'],
+            ),
+        )
 
 
 class ProjectDirectory(Directory):
     """A special directory that contains the contents of a project."""
 
-    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, project_object,
-                 poll=True, poll_time=3, storage_classes=None):
-        super(ProjectDirectory, self).__init__(parent_inode, inodes, api.config, enable_write)
+    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, filters,
+                 project_object, poll=True, poll_time=3, storage_classes=None):
+        super(ProjectDirectory, self).__init__(parent_inode, inodes, api.config, enable_write, filters)
         self.api = api
         self.num_retries = num_retries
         self.project_object = project_object
@@ -922,14 +1003,14 @@ class ProjectDirectory(Directory):
         return True
 
     def createDirectory(self, i):
+        common_args = (self.inode, self.inodes, self.api, self.num_retries, self._enable_write, self._filters)
         if collection_uuid_pattern.match(i['uuid']):
-            return CollectionDirectory(self.inode, self.inodes, self.api, self.num_retries, self._enable_write, i)
+            return CollectionDirectory(*common_args, i)
         elif group_uuid_pattern.match(i['uuid']):
-            return ProjectDirectory(self.inode, self.inodes, self.api, self.num_retries, self._enable_write,
-                                    i, self._poll, self._poll_time, self.storage_classes)
+            return ProjectDirectory(*common_args, i, self._poll, self._poll_time, self.storage_classes)
         elif link_uuid_pattern.match(i['uuid']):
             if i['head_kind'] == 'arvados#collection' or portable_data_hash_pattern.match(i['head_uuid']):
-                return CollectionDirectory(self.inode, self.inodes, self.api, self.num_retries, self._enable_write, i['head_uuid'])
+                return CollectionDirectory(*common_args, i['head_uuid'])
             else:
                 return None
         elif uuid_pattern.match(i['uuid']):
@@ -990,19 +1071,27 @@ class ProjectDirectory(Directory):
                     self.project_object = self.api.users().get(
                         uuid=self.project_uuid).execute(num_retries=self.num_retries)
                 # do this in 2 steps until #17424 is fixed
-                contents = list(arvados.util.keyset_list_all(self.api.groups().contents,
-                                                        order_key="uuid",
-                                                        num_retries=self.num_retries,
-                                                        uuid=self.project_uuid,
-                                                        filters=[["uuid", "is_a", "arvados#group"],
-                                                                 ["groups.group_class", "in", ["project","filter"]]]))
-                contents.extend(filter(lambda i: i["current_version_uuid"] == i["uuid"],
-                                       arvados.util.keyset_list_all(self.api.groups().contents,
-                                                             order_key="uuid",
-                                                             num_retries=self.num_retries,
-                                                             uuid=self.project_uuid,
-                                                             filters=[["uuid", "is_a", "arvados#collection"]])))
-
+                contents = list(arvados.util.keyset_list_all(
+                    self.api.groups().contents,
+                    order_key='uuid',
+                    num_retries=self.num_retries,
+                    uuid=self.project_uuid,
+                    filters=[
+                        ['uuid', 'is_a', 'arvados#group'],
+                        ['groups.group_class', 'in', ['project', 'filter']],
+                        *self._filters_for('groups', qualified=True),
+                    ],
+                ))
+                contents.extend(obj for obj in arvados.util.keyset_list_all(
+                    self.api.groups().contents,
+                    order_key='uuid',
+                    num_retries=self.num_retries,
+                    uuid=self.project_uuid,
+                    filters=[
+                        ['uuid', 'is_a', 'arvados#collection'],
+                        *self._filters_for('collections', qualified=True),
+                    ],
+                ) if obj['current_version_uuid'] == obj['uuid'])
 
             # end with llfuse.lock_released, re-acquire lock
 
@@ -1032,14 +1121,24 @@ class ProjectDirectory(Directory):
                 namefilter = ["name", "=", k]
             else:
                 namefilter = ["name", "in", [k, k2]]
-            contents = self.api.groups().list(filters=[["owner_uuid", "=", self.project_uuid],
-                                                       ["group_class", "in", ["project","filter"]],
-                                                       namefilter],
-                                              limit=2).execute(num_retries=self.num_retries)["items"]
+            contents = self.api.groups().list(
+                filters=[
+                    ["owner_uuid", "=", self.project_uuid],
+                    ["group_class", "in", ["project","filter"]],
+                    namefilter,
+                    *self._filters_for('groups', qualified=False),
+                ],
+                limit=2,
+            ).execute(num_retries=self.num_retries)["items"]
             if not contents:
-                contents = self.api.collections().list(filters=[["owner_uuid", "=", self.project_uuid],
-                                                                namefilter],
-                                                       limit=2).execute(num_retries=self.num_retries)["items"]
+                contents = self.api.collections().list(
+                    filters=[
+                        ["owner_uuid", "=", self.project_uuid],
+                        namefilter,
+                        *self._filters_for('collections', qualified=False),
+                    ],
+                    limit=2,
+                ).execute(num_retries=self.num_retries)["items"]
         if contents:
             if len(contents) > 1 and contents[1]['name'] == k:
                 # If "foo/bar" and "foo[SUBST]bar" both exist, use
@@ -1193,9 +1292,9 @@ class ProjectDirectory(Directory):
 class SharedDirectory(Directory):
     """A special directory that represents users or groups who have shared projects with me."""
 
-    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, exclude,
-                 poll=False, poll_time=60, storage_classes=None):
-        super(SharedDirectory, self).__init__(parent_inode, inodes, api.config, enable_write)
+    def __init__(self, parent_inode, inodes, api, num_retries, enable_write, filters,
+                 exclude, poll=False, poll_time=60, storage_classes=None):
+        super(SharedDirectory, self).__init__(parent_inode, inodes, api.config, enable_write, filters)
         self.api = api
         self.num_retries = num_retries
         self.current_user = api.users().current().execute(num_retries=num_retries)
@@ -1221,11 +1320,17 @@ class SharedDirectory(Directory):
                 if 'httpMethod' in methods.get('shared', {}):
                     page = []
                     while True:
-                        resp = self.api.groups().shared(filters=[['group_class', 'in', ['project','filter']]]+page,
-                                                        order="uuid",
-                                                        limit=10000,
-                                                        count="none",
-                                                        include="owner_uuid").execute()
+                        resp = self.api.groups().shared(
+                            filters=[
+                                ['group_class', 'in', ['project','filter']],
+                                *page,
+                                *self._filters_for('groups', qualified=False),
+                            ],
+                            order="uuid",
+                            limit=10000,
+                            count="none",
+                            include="owner_uuid",
+                        ).execute()
                         if not resp["items"]:
                             break
                         page = [["uuid", ">", resp["items"][len(resp["items"])-1]["uuid"]]]
@@ -1240,8 +1345,12 @@ class SharedDirectory(Directory):
                         self.api.groups().list,
                         order_key="uuid",
                         num_retries=self.num_retries,
-                        filters=[['group_class','in',['project','filter']]],
-                        select=["uuid", "owner_uuid"]))
+                        filters=[
+                            ['group_class', 'in', ['project','filter']],
+                            *self._filters_for('groups', qualified=False),
+                        ],
+                        select=["uuid", "owner_uuid"],
+                    ))
                     for ob in all_projects:
                         objects[ob['uuid']] = ob
 
@@ -1255,13 +1364,20 @@ class SharedDirectory(Directory):
                         self.api.users().list,
                         order_key="uuid",
                         num_retries=self.num_retries,
-                        filters=[['uuid','in', list(root_owners)]])
+                        filters=[
+                            ['uuid', 'in', list(root_owners)],
+                            *self._filters_for('users', qualified=False),
+                        ],
+                    )
                     lgroups = arvados.util.keyset_list_all(
                         self.api.groups().list,
                         order_key="uuid",
                         num_retries=self.num_retries,
-                        filters=[['uuid','in', list(root_owners)+roots]])
-
+                        filters=[
+                            ['uuid', 'in', list(root_owners)+roots],
+                            *self._filters_for('groups', qualified=False),
+                        ],
+                    )
                     for l in lusers:
                         objects[l["uuid"]] = l
                     for l in lgroups:
@@ -1283,11 +1399,23 @@ class SharedDirectory(Directory):
 
             # end with llfuse.lock_released, re-acquire lock
 
-            self.merge(contents.items(),
-                       lambda i: i[0],
-                       lambda a, i: a.uuid() == i[1]['uuid'],
-                       lambda i: ProjectDirectory(self.inode, self.inodes, self.api, self.num_retries, self._enable_write,
-                                                  i[1], poll=self._poll, poll_time=self._poll_time, storage_classes=self.storage_classes))
+            self.merge(
+                contents.items(),
+                lambda i: i[0],
+                lambda a, i: a.uuid() == i[1]['uuid'],
+                lambda i: ProjectDirectory(
+                    self.inode,
+                    self.inodes,
+                    self.api,
+                    self.num_retries,
+                    self._enable_write,
+                    self._filters,
+                    i[1],
+                    poll=self._poll,
+                    poll_time=self._poll_time,
+                    storage_classes=self.storage_classes,
+                ),
+            )
         except Exception:
             _logger.exception("arv-mount shared dir error")
         finally:
index c316010f6c48b17b5d7aa35b4fe96d1021bfb49d..8a3522e0cb0df7e11aec61279ab530d3d2395e44 100644 (file)
@@ -72,15 +72,22 @@ class MountTestBase(unittest.TestCase):
         llfuse.close()
 
     def make_mount(self, root_class, **root_kwargs):
-        enable_write = True
-        if 'enable_write' in root_kwargs:
-            enable_write = root_kwargs.pop('enable_write')
+        enable_write = root_kwargs.pop('enable_write', True)
         self.operations = fuse.Operations(
-            os.getuid(), os.getgid(),
+            os.getuid(),
+            os.getgid(),
             api_client=self.api,
-            enable_write=enable_write)
+            enable_write=enable_write,
+        )
         self.operations.inodes.add_entry(root_class(
-            llfuse.ROOT_INODE, self.operations.inodes, self.api, 0, enable_write, **root_kwargs))
+            llfuse.ROOT_INODE,
+            self.operations.inodes,
+            self.api,
+            0,
+            enable_write,
+            root_kwargs.pop('filters', None),
+            **root_kwargs,
+        ))
         llfuse.init(self.operations, self.mounttmp, [])
         self.llfuse_thread = threading.Thread(None, lambda: self._llfuse_main())
         self.llfuse_thread.daemon = True
diff --git a/services/fuse/tests/test_mount_filters.py b/services/fuse/tests/test_mount_filters.py
new file mode 100644 (file)
index 0000000..5f32453
--- /dev/null
@@ -0,0 +1,223 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+import collections
+import itertools
+import json
+import re
+import unittest
+
+from pathlib import Path
+
+from parameterized import parameterized
+
+from arvados_fuse import fusedir
+
+from .integration_test import IntegrationTest
+from .mount_test_base import MountTestBase
+from .run_test_server import fixture
+
+_COLLECTIONS = fixture('collections')
+_GROUPS = fixture('groups')
+_LINKS = fixture('links')
+_USERS = fixture('users')
+
+class DirectoryFiltersTestCase(MountTestBase):
+    DEFAULT_ROOT_KWARGS = {
+        'enable_write': False,
+        'filters': [
+            ['collections.name', 'like', 'zzzzz-4zz18-%'],
+            # This matches both "A Project" (which we use as the test root)
+            # and "A Subproject" (which we assert is found under it).
+            ['groups.name', 'like', 'A %roject'],
+        ],
+    }
+    EXPECTED_PATHS = frozenset([
+        _COLLECTIONS['foo_collection_in_aproject']['name'],
+        _GROUPS['asubproject']['name'],
+    ])
+    CHECKED_PATHS = EXPECTED_PATHS.union([
+        _COLLECTIONS['collection_to_move_around_in_aproject']['name'],
+        _GROUPS['subproject_in_active_user_home_project_to_test_unique_key_violation']['name'],
+    ])
+
+    @parameterized.expand([
+        (fusedir.MagicDirectory, {}, _GROUPS['aproject']['uuid']),
+        (fusedir.ProjectDirectory, {'project_object': _GROUPS['aproject']}, '.'),
+        (fusedir.SharedDirectory, {'exclude': None}, Path(
+            '{first_name} {last_name}'.format_map(_USERS['active']),
+            _GROUPS['aproject']['name'],
+        )),
+    ])
+    def test_filtered_path_exists(self, root_class, root_kwargs, subdir):
+        root_kwargs = collections.ChainMap(root_kwargs, self.DEFAULT_ROOT_KWARGS)
+        self.make_mount(root_class, **root_kwargs)
+        dir_path = Path(self.mounttmp, subdir)
+        actual = frozenset(
+            basename
+            for basename in self.CHECKED_PATHS
+            if (dir_path / basename).exists()
+        )
+        self.assertEqual(
+            actual,
+            self.EXPECTED_PATHS,
+            "mount existence checks did not match expected results",
+        )
+
+    @parameterized.expand([
+        (fusedir.MagicDirectory, {}, _GROUPS['aproject']['uuid']),
+        (fusedir.ProjectDirectory, {'project_object': _GROUPS['aproject']}, '.'),
+        (fusedir.SharedDirectory, {'exclude': None}, Path(
+            '{first_name} {last_name}'.format_map(_USERS['active']),
+            _GROUPS['aproject']['name'],
+        )),
+    ])
+    def test_filtered_path_listing(self, root_class, root_kwargs, subdir):
+        root_kwargs = collections.ChainMap(root_kwargs, self.DEFAULT_ROOT_KWARGS)
+        self.make_mount(root_class, **root_kwargs)
+        actual = frozenset(path.name for path in Path(self.mounttmp, subdir).iterdir())
+        self.assertEqual(
+            actual & self.EXPECTED_PATHS,
+            self.EXPECTED_PATHS,
+            "mount listing did not include minimum matches",
+        )
+        extra = frozenset(
+            name
+            for name in actual
+            if not (name.startswith('zzzzz-4zz18-') or name.endswith('roject'))
+        )
+        self.assertFalse(
+            extra,
+            "mount listing included results outside filters",
+        )
+
+
+class TagFiltersTestCase(MountTestBase):
+    COLL_UUID = _COLLECTIONS['foo_collection_in_aproject']['uuid']
+    TAG_NAME = _LINKS['foo_collection_tag']['name']
+
+    @parameterized.expand([
+        '=',
+        '!=',
+    ])
+    def test_tag_directory_filters(self, op):
+        self.make_mount(
+            fusedir.TagDirectory,
+            enable_write=False,
+            filters=[
+                ['links.head_uuid', op, self.COLL_UUID],
+            ],
+            tag=self.TAG_NAME,
+        )
+        checked_path = Path(self.mounttmp, self.COLL_UUID)
+        self.assertEqual(checked_path.exists(), op == '=')
+
+    @parameterized.expand(itertools.product(
+        ['in', 'not in'],
+        ['=', '!='],
+    ))
+    def test_tags_directory_filters(self, coll_op, link_op):
+        self.make_mount(
+            fusedir.TagsDirectory,
+            enable_write=False,
+            filters=[
+                ['links.head_uuid', coll_op, [self.COLL_UUID]],
+                ['links.name', link_op, self.TAG_NAME],
+            ],
+        )
+        if link_op == '!=':
+            filtered_path = Path(self.mounttmp, self.TAG_NAME)
+        elif coll_op == 'not in':
+            # As of 2024-02-09, the foo tag applies to only a single collection.
+            # If you filter it out via head_uuid, then it disappears completely
+            # from the TagsDirectory. Hence we set that tag directory as
+            # filtered_path. If any of this changes in the future,
+            # it would be fine to append self.COLL_UUID to filtered_path here.
+            filtered_path = Path(self.mounttmp, self.TAG_NAME)
+        else:
+            filtered_path = Path(self.mounttmp, self.TAG_NAME, self.COLL_UUID, 'foo', 'nonexistent')
+        expect_path = filtered_path.parent
+        self.assertTrue(
+            expect_path.exists(),
+            f"path not found but should exist: {expect_path}",
+        )
+        self.assertFalse(
+            filtered_path.exists(),
+            f"path was found but should be filtered out: {filtered_path}",
+        )
+
+
+class FiltersIntegrationTest(IntegrationTest):
+    COLLECTIONS_BY_PROP = {
+        coll['properties']['MainFile']: coll
+        for coll in _COLLECTIONS.values()
+        if coll['owner_uuid'] == _GROUPS['fuse_filters_test_project']['uuid']
+    }
+    PROP_VALUES = list(COLLECTIONS_BY_PROP)
+
+    for test_n, query in enumerate(['foo', 'ba?']):
+        @IntegrationTest.mount([
+            '--filters', json.dumps([
+                ['collections.properties.MainFile', 'like', query],
+            ]),
+            '--mount-by-pdh', 'by_pdh',
+            '--mount-by-id', 'by_id',
+            '--mount-home', 'home',
+        ])
+        def _test_func(self, query=query):
+            pdh_path = Path(self.mnt, 'by_pdh')
+            id_path = Path(self.mnt, 'by_id')
+            home_path = Path(self.mnt, 'home')
+            query_re = re.compile(query.replace('?', '.'))
+            for prop_val, coll in self.COLLECTIONS_BY_PROP.items():
+                should_exist = query_re.fullmatch(prop_val) is not None
+                for path in [
+                        pdh_path / coll['portable_data_hash'],
+                        id_path / coll['portable_data_hash'],
+                        id_path / coll['uuid'],
+                        home_path / coll['name'],
+                ]:
+                    self.assertEqual(
+                        path.exists(),
+                        should_exist,
+                        f"{path} from MainFile={prop_val} exists!={should_exist}",
+                    )
+        exec(f"test_collection_properties_filters_{test_n} = _test_func")
+
+    for test_n, mount_opts in enumerate([
+            ['--home'],
+            ['--project', _GROUPS['aproject']['uuid']],
+    ]):
+        @IntegrationTest.mount([
+            '--filters', json.dumps([
+                ['collections.name', 'like', 'zzzzz-4zz18-%'],
+                ['groups.name', 'like', 'A %roject'],
+            ]),
+            *mount_opts,
+        ])
+        def _test_func(self, mount_opts=mount_opts):
+            root_path = Path(self.mnt)
+            root_depth = len(root_path.parts)
+            max_depth = 0
+            name_re = re.compile(r'(zzzzz-4zz18-.*|A .*roject)')
+            dir_queue = [root_path]
+            while dir_queue:
+                root_path = dir_queue.pop()
+                max_depth = max(max_depth, len(root_path.parts))
+                for child in root_path.iterdir():
+                    if not child.is_dir():
+                        continue
+                    match = name_re.fullmatch(child.name)
+                    self.assertIsNotNone(
+                        match,
+                        "found directory with name that should've been filtered",
+                    )
+                    if not match.group(1).startswith('zzzzz-4zz18-'):
+                        dir_queue.append(child)
+            self.assertGreaterEqual(
+                max_depth,
+                root_depth + (2 if mount_opts[0] == '--home' else 1),
+                "test descended fewer subdirectories than expected",
+            )
+        exec(f"test_multiple_name_filters_{test_n} = _test_func")