12430: Avoid loading collections excluded by globs.
authorTom Clegg <tom@curii.com>
Tue, 30 Apr 2024 15:18:31 +0000 (11:18 -0400)
committerTom Clegg <tom@curii.com>
Tue, 30 Apr 2024 15:18:31 +0000 (11:18 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

go.mod
go.sum
lib/crunchrun/copier.go
lib/crunchrun/copier_test.go

diff --git a/go.mod b/go.mod
index 62a31c1c58501ada1bbc3b8d6b81f3b134253161..e26a8699a4753502548ffa86a546f268d8f2c59e 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
        github.com/arvados/cgofuse v1.2.0-arvados1
        github.com/aws/aws-sdk-go v1.44.174
        github.com/aws/aws-sdk-go-v2 v0.23.0
+       github.com/bmatcuk/doublestar/v4 v4.6.1
        github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092
        github.com/coreos/go-oidc/v3 v3.5.0
        github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
@@ -65,7 +66,6 @@ require (
        github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 // indirect
        github.com/beorn7/perks v1.0.1 // indirect
        github.com/bgentry/speakeasy v0.1.0 // indirect
-       github.com/bmatcuk/doublestar v1.3.4 // indirect
        github.com/cespare/xxhash/v2 v2.2.0 // indirect
        github.com/davecgh/go-spew v1.1.1 // indirect
        github.com/dimchansky/utfbom v1.1.1 // indirect
diff --git a/go.sum b/go.sum
index fe6febb04364fd0250105dc6161a5dff11349ba2..9d445185fb35dbe1a1c6e0ef2f810ef6411d6cd4 100644 (file)
--- a/go.sum
+++ b/go.sum
@@ -59,8 +59,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY=
 github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
-github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0=
-github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
+github.com/bmatcuk/doublestar/v4 v4.6.1 h1:FH9SifrbvJhnlQpztAx++wlkk70QBf0iBWDwNy7PA4I=
+github.com/bmatcuk/doublestar/v4 v4.6.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
 github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
 github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092 h1:0Di2onNnlN5PAyWPbqlPyN45eOQ+QW/J9eqLynt4IV4=
 github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092/go.mod h1:8IzBjZCRSnsvM6MJMG8HNNtnzMl48H22rbJL2kRUJ0Y=
index 8d986d7066e703fcebb2de6c2afc675ed51ab727..507029b36edfcf49b7037807e365cbf79be54e2f 100644 (file)
@@ -18,7 +18,7 @@ import (
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/keepclient"
        "git.arvados.org/arvados.git/sdk/go/manifest"
-       "github.com/bmatcuk/doublestar"
+       "github.com/bmatcuk/doublestar/v4"
 )
 
 type printfer interface {
@@ -115,14 +115,44 @@ func (cp *copier) Copy() (string, error) {
        return collfs.MarshalManifest(".")
 }
 
-func (cp *copier) matchGlobs(path string) bool {
+func (cp *copier) matchGlobs(path string, isDir bool) bool {
        // An entry in the top level of the output directory looks
        // like "/foo", but globs look like "foo", so we strip the
        // leading "/" before matching.
        path = strings.TrimLeft(path, "/")
        for _, glob := range cp.globs {
-               if match, _ := doublestar.Match(glob, path); match {
+               if !isDir && strings.HasSuffix(glob, "/**") {
+                       // doublestar.Match("f*/**", "ff") and
+                       // doublestar.Match("f*/**", "ff/gg") both
+                       // return true, but (to be compatible with
+                       // bash shopt) "ff" should match only if it is
+                       // a directory.
+                       //
+                       // To avoid errant matches, we add the file's
+                       // basename to the end of the pattern:
+                       //
+                       // Match("f*/**/ff", "ff") => false
+                       // Match("f*/**/gg", "ff/gg") => true
+                       //
+                       // Of course, we need to escape basename in
+                       // case it contains *, ?, \, etc.
+                       _, name := filepath.Split(path)
+                       escapedName := strings.TrimSuffix(strings.Replace(name, "", "\\", -1), "\\")
+                       if match, _ := doublestar.Match(glob+"/"+escapedName, path); match {
+                               return true
+                       }
+               } else if match, _ := doublestar.Match(glob, path); match {
                        return true
+               } else if isDir {
+                       // Workaround doublestar bug (v4.6.1).
+                       // "foo*/**" should match "foo", but does not,
+                       // because isZeroLengthPattern does not accept
+                       // "*/**" as a zero length pattern.
+                       if trunc := strings.TrimSuffix(glob, "*/**"); trunc != glob {
+                               if match, _ := doublestar.Match(trunc, path); match {
+                                       return true
+                               }
+                       }
                }
        }
        return false
@@ -141,7 +171,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() {
        }
        keepdirs := make(map[string]bool)
        for _, path := range cp.dirs {
-               if cp.matchGlobs(path) {
+               if cp.matchGlobs(path, true) {
                        keepdirs[path] = true
                }
        }
@@ -154,7 +184,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() {
        }
        var keepfiles []filetodo
        for _, file := range cp.files {
-               if cp.matchGlobs(file.dst) {
+               if cp.matchGlobs(file.dst, false) {
                        keepfiles = append(keepfiles, file)
                }
        }
@@ -182,7 +212,7 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem)
        }
        include := make(map[string]bool)
        err := fs.WalkDir(arvados.FS(collfs), "", func(path string, ent fs.DirEntry, err error) error {
-               if cp.matchGlobs(path) {
+               if cp.matchGlobs(path, ent.IsDir()) {
                        for i, c := range path {
                                if i > 0 && c == '/' {
                                        include[path[:i]] = true
@@ -213,6 +243,42 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem)
        return err
 }
 
+// Return true if it's possible for any descendant of the given path
+// to match anything in cp.globs.  Used by walkMount to avoid loading
+// collections that are mounted underneath ctrOutputPath but excluded
+// by globs.
+func (cp *copier) subtreeCouldMatch(path string) bool {
+       if len(cp.globs) == 0 {
+               return true
+       }
+       pathdepth := 1 + strings.Count(path, "/")
+       for _, glob := range cp.globs {
+               globdepth := 0
+               lastsep := 0
+               for i, c := range glob {
+                       if c != '/' || !doublestar.ValidatePattern(glob[:i]) {
+                               // Escaped "/", or "/" in a character
+                               // class, is not a path separator.
+                               continue
+                       }
+                       if glob[lastsep:i] == "**" {
+                               return true
+                       }
+                       lastsep = i + 1
+                       if globdepth++; globdepth == pathdepth {
+                               if match, _ := doublestar.Match(glob[:i]+"/*", path+"/z"); match {
+                                       return true
+                               }
+                               break
+                       }
+               }
+               if globdepth < pathdepth && glob[lastsep:] == "**" {
+                       return true
+               }
+       }
+       return false
+}
+
 func (cp *copier) copyFile(fs arvados.CollectionFileSystem, f filetodo) (int64, error) {
        cp.logger.Printf("copying %q (%d bytes)", strings.TrimLeft(f.dst, "/"), f.size)
        dst, err := fs.OpenFile(f.dst, os.O_CREATE|os.O_WRONLY, 0666)
@@ -267,9 +333,8 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b
        // copy, relative to its mount point -- ".", "./foo.txt", ...
        srcRelPath := filepath.Join(".", srcMount.Path, src[len(srcRoot):])
 
-       // outputRelPath is the path relative in the output directory
-       // that corresponds to the path in the output collection where
-       // the file will go, for logging
+       // outputRelPath is the destination path relative to the
+       // output directory. Used for logging and glob matching.
        var outputRelPath = ""
        if strings.HasPrefix(src, cp.ctrOutputDir) {
                outputRelPath = strings.TrimPrefix(src[len(cp.ctrOutputDir):], "/")
@@ -283,6 +348,9 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b
 
        switch {
        case srcMount.ExcludeFromOutput:
+       case outputRelPath != "*" && !cp.subtreeCouldMatch(outputRelPath):
+               cp.logger.Printf("not copying %q because contents cannot match output globs", outputRelPath)
+               return nil
        case srcMount.Kind == "tmp":
                // Handle by walking the host filesystem.
                return cp.walkHostFS(dest, src, maxSymlinks, walkMountsBelow)
@@ -434,6 +502,8 @@ func (cp *copier) walkHostFS(dest, src string, maxSymlinks int, includeMounts bo
                                // (...except mount types that are
                                // handled as regular files.)
                                continue
+                       } else if isMount && !cp.subtreeCouldMatch(src[len(cp.ctrOutputDir)+1:]) {
+                               continue
                        }
                        err = cp.walkHostFS(dest, src, maxSymlinks, false)
                        if err != nil {
index a1fc81c716dd7466be75065cdc597ff0cc50da36..9deba783b2da475f6186f8a15f7a563140721eb8 100644 (file)
@@ -214,6 +214,68 @@ func (s *copierSuite) TestWritableMountBelow(c *check.C) {
        })
 }
 
+// Check some glob-matching edge cases. In particular, check that
+// patterns like "foo/**" do not match regular files named "foo"
+// (unless of course they are inside a directory named "foo").
+func (s *copierSuite) TestMatchGlobs(c *check.C) {
+       s.cp.globs = []string{"foo*/**"}
+       c.Check(s.cp.matchGlobs("foo", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("food", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("foo", false), check.Equals, false)
+       c.Check(s.cp.matchGlobs("food", false), check.Equals, false)
+
+       s.cp.globs = []string{"ba[!/]/foo*/**"}
+       c.Check(s.cp.matchGlobs("bar/foo", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("bar/food", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("bar/foo", false), check.Equals, false)
+       c.Check(s.cp.matchGlobs("bar/food", false), check.Equals, false)
+       c.Check(s.cp.matchGlobs("bar/foo/z\\[", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("bar/food/z\\[", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("bar/foo/z\\[", false), check.Equals, true)
+       c.Check(s.cp.matchGlobs("bar/food/z\\[", false), check.Equals, true)
+
+       s.cp.globs = []string{"waz/**/foo*/**"}
+       c.Check(s.cp.matchGlobs("waz/quux/foo", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("waz/quux/food", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("waz/quux/foo", false), check.Equals, false)
+       c.Check(s.cp.matchGlobs("waz/quux/food", false), check.Equals, false)
+       c.Check(s.cp.matchGlobs("waz/quux/foo/foo", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("waz/quux/food/foo", true), check.Equals, true)
+       c.Check(s.cp.matchGlobs("waz/quux/foo/foo", false), check.Equals, true)
+       c.Check(s.cp.matchGlobs("waz/quux/food/foo", false), check.Equals, true)
+}
+
+func (s *copierSuite) TestSubtreeCouldMatch(c *check.C) {
+       for _, trial := range []struct {
+               mount string // relative to output dir
+               glob  string
+               could bool
+       }{
+               {mount: "abc", glob: "*"},
+               {mount: "abc", glob: "abc/*", could: true},
+               {mount: "abc", glob: "a*/**", could: true},
+               {mount: "abc", glob: "**", could: true},
+               {mount: "abc", glob: "*/*", could: true},
+               {mount: "abc", glob: "**/*.txt", could: true},
+               {mount: "abc/def", glob: "*"},
+               {mount: "abc/def", glob: "*/*"},
+               {mount: "abc/def", glob: "*/*.txt"},
+               {mount: "abc/def", glob: "*/*/*", could: true},
+               {mount: "abc/def", glob: "**", could: true},
+               {mount: "abc/def", glob: "**/bar", could: true},
+               {mount: "abc/def", glob: "abc/**", could: true},
+               {mount: "abc/def/ghi", glob: "*c/**/bar", could: true},
+               {mount: "abc/def/ghi", glob: "*c/*f/bar"},
+               {mount: "abc/def/ghi", glob: "abc/d[^/]f/ghi/*", could: true},
+       } {
+               c.Logf("=== %+v", trial)
+               got := (&copier{
+                       globs: []string{trial.glob},
+               }).subtreeCouldMatch(trial.mount)
+               c.Check(got, check.Equals, trial.could)
+       }
+}
+
 func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
        bindtmp := c.MkDir()
        s.cp.mounts["/ctr/outdir/include/includer"] = arvados.Mount{
@@ -234,7 +296,7 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
                PortableDataHash: arvadostest.FooCollectionPDH,
                Writable:         true,
        }
-       s.cp.mounts["/ctr/outdir/nonexistent-collection"] = arvados.Mount{
+       s.cp.mounts["/ctr/outdir/nonexistent/collection"] = arvados.Mount{
                // As extra assurance, plant a collection that will
                // fail if copier attempts to load its manifest.  (For
                // performance reasons it's important that copier
@@ -244,8 +306,8 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
                PortableDataHash: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+1234",
        }
        s.cp.globs = []string{
-               "?ncl*/*r",
-               "*/?ncl*",
+               "?ncl*/*r/*",
+               "*/?ncl*/**",
        }
        c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includer", 0755), check.IsNil)
        c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includew", 0755), check.IsNil)
@@ -264,14 +326,18 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
        c.Check(err, check.IsNil)
        c.Log(s.log.String())
 
-       c.Check(s.cp.dirs, check.DeepEquals, []string{"/include", "/include/includew"})
+       // Note it's OK that "/exclude" is not excluded by walkMount:
+       // it is just a local filesystem directory, not a mount point
+       // that's expensive to walk.  In real-life usage, it will be
+       // removed from cp.dirs before any copying happens.
+       c.Check(s.cp.dirs, check.DeepEquals, []string{"/exclude", "/include", "/include/includew"})
        c.Check(s.cp.files, check.DeepEquals, []filetodo{
                {src: s.cp.hostOutputDir + "/include/includew/foo", dst: "/include/includew/foo", size: 3},
        })
        c.Check(s.cp.manifest, check.Matches, `(?ms).*\./include/includer .*`)
        c.Check(s.cp.manifest, check.Not(check.Matches), `(?ms).*exclude.*`)
        c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excluder\\".*`)
-       c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excludew\\".*`)
+       c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"nonexistent/collection\\".*`)
 }
 
 func (s *copierSuite) writeFileInOutputDir(c *check.C, path, data string) {