From ec0b74da392f739d4340b7a9120f2f90f5871f47 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Tue, 30 Apr 2024 11:18:31 -0400 Subject: [PATCH] 12430: Avoid loading collections excluded by globs. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- go.mod | 2 +- go.sum | 4 +- lib/crunchrun/copier.go | 88 ++++++++++++++++++++++++++++++++---- lib/crunchrun/copier_test.go | 76 +++++++++++++++++++++++++++++-- 4 files changed, 153 insertions(+), 17 deletions(-) diff --git a/go.mod b/go.mod index 62a31c1c58..e26a8699a4 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/arvados/cgofuse v1.2.0-arvados1 github.com/aws/aws-sdk-go v1.44.174 github.com/aws/aws-sdk-go-v2 v0.23.0 + github.com/bmatcuk/doublestar/v4 v4.6.1 github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092 github.com/coreos/go-oidc/v3 v3.5.0 github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e @@ -65,7 +66,6 @@ require ( github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bgentry/speakeasy v0.1.0 // indirect - github.com/bmatcuk/doublestar v1.3.4 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/dimchansky/utfbom v1.1.1 // indirect diff --git a/go.sum b/go.sum index fe6febb043..9d445185fb 100644 --- a/go.sum +++ b/go.sum @@ -59,8 +59,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= -github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0= -github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE= +github.com/bmatcuk/doublestar/v4 v4.6.1 h1:FH9SifrbvJhnlQpztAx++wlkk70QBf0iBWDwNy7PA4I= +github.com/bmatcuk/doublestar/v4 v4.6.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps= github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092 h1:0Di2onNnlN5PAyWPbqlPyN45eOQ+QW/J9eqLynt4IV4= github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092/go.mod h1:8IzBjZCRSnsvM6MJMG8HNNtnzMl48H22rbJL2kRUJ0Y= diff --git a/lib/crunchrun/copier.go b/lib/crunchrun/copier.go index 8d986d7066..507029b36e 100644 --- a/lib/crunchrun/copier.go +++ b/lib/crunchrun/copier.go @@ -18,7 +18,7 @@ import ( "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/keepclient" "git.arvados.org/arvados.git/sdk/go/manifest" - "github.com/bmatcuk/doublestar" + "github.com/bmatcuk/doublestar/v4" ) type printfer interface { @@ -115,14 +115,44 @@ func (cp *copier) Copy() (string, error) { return collfs.MarshalManifest(".") } -func (cp *copier) matchGlobs(path string) bool { +func (cp *copier) matchGlobs(path string, isDir bool) bool { // An entry in the top level of the output directory looks // like "/foo", but globs look like "foo", so we strip the // leading "/" before matching. path = strings.TrimLeft(path, "/") for _, glob := range cp.globs { - if match, _ := doublestar.Match(glob, path); match { + if !isDir && strings.HasSuffix(glob, "/**") { + // doublestar.Match("f*/**", "ff") and + // doublestar.Match("f*/**", "ff/gg") both + // return true, but (to be compatible with + // bash shopt) "ff" should match only if it is + // a directory. + // + // To avoid errant matches, we add the file's + // basename to the end of the pattern: + // + // Match("f*/**/ff", "ff") => false + // Match("f*/**/gg", "ff/gg") => true + // + // Of course, we need to escape basename in + // case it contains *, ?, \, etc. + _, name := filepath.Split(path) + escapedName := strings.TrimSuffix(strings.Replace(name, "", "\\", -1), "\\") + if match, _ := doublestar.Match(glob+"/"+escapedName, path); match { + return true + } + } else if match, _ := doublestar.Match(glob, path); match { return true + } else if isDir { + // Workaround doublestar bug (v4.6.1). + // "foo*/**" should match "foo", but does not, + // because isZeroLengthPattern does not accept + // "*/**" as a zero length pattern. + if trunc := strings.TrimSuffix(glob, "*/**"); trunc != glob { + if match, _ := doublestar.Match(trunc, path); match { + return true + } + } } } return false @@ -141,7 +171,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() { } keepdirs := make(map[string]bool) for _, path := range cp.dirs { - if cp.matchGlobs(path) { + if cp.matchGlobs(path, true) { keepdirs[path] = true } } @@ -154,7 +184,7 @@ func (cp *copier) applyGlobsToFilesAndDirs() { } var keepfiles []filetodo for _, file := range cp.files { - if cp.matchGlobs(file.dst) { + if cp.matchGlobs(file.dst, false) { keepfiles = append(keepfiles, file) } } @@ -182,7 +212,7 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem) } include := make(map[string]bool) err := fs.WalkDir(arvados.FS(collfs), "", func(path string, ent fs.DirEntry, err error) error { - if cp.matchGlobs(path) { + if cp.matchGlobs(path, ent.IsDir()) { for i, c := range path { if i > 0 && c == '/' { include[path[:i]] = true @@ -213,6 +243,42 @@ func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem) return err } +// Return true if it's possible for any descendant of the given path +// to match anything in cp.globs. Used by walkMount to avoid loading +// collections that are mounted underneath ctrOutputPath but excluded +// by globs. +func (cp *copier) subtreeCouldMatch(path string) bool { + if len(cp.globs) == 0 { + return true + } + pathdepth := 1 + strings.Count(path, "/") + for _, glob := range cp.globs { + globdepth := 0 + lastsep := 0 + for i, c := range glob { + if c != '/' || !doublestar.ValidatePattern(glob[:i]) { + // Escaped "/", or "/" in a character + // class, is not a path separator. + continue + } + if glob[lastsep:i] == "**" { + return true + } + lastsep = i + 1 + if globdepth++; globdepth == pathdepth { + if match, _ := doublestar.Match(glob[:i]+"/*", path+"/z"); match { + return true + } + break + } + } + if globdepth < pathdepth && glob[lastsep:] == "**" { + return true + } + } + return false +} + func (cp *copier) copyFile(fs arvados.CollectionFileSystem, f filetodo) (int64, error) { cp.logger.Printf("copying %q (%d bytes)", strings.TrimLeft(f.dst, "/"), f.size) dst, err := fs.OpenFile(f.dst, os.O_CREATE|os.O_WRONLY, 0666) @@ -267,9 +333,8 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b // copy, relative to its mount point -- ".", "./foo.txt", ... srcRelPath := filepath.Join(".", srcMount.Path, src[len(srcRoot):]) - // outputRelPath is the path relative in the output directory - // that corresponds to the path in the output collection where - // the file will go, for logging + // outputRelPath is the destination path relative to the + // output directory. Used for logging and glob matching. var outputRelPath = "" if strings.HasPrefix(src, cp.ctrOutputDir) { outputRelPath = strings.TrimPrefix(src[len(cp.ctrOutputDir):], "/") @@ -283,6 +348,9 @@ func (cp *copier) walkMount(dest, src string, maxSymlinks int, walkMountsBelow b switch { case srcMount.ExcludeFromOutput: + case outputRelPath != "*" && !cp.subtreeCouldMatch(outputRelPath): + cp.logger.Printf("not copying %q because contents cannot match output globs", outputRelPath) + return nil case srcMount.Kind == "tmp": // Handle by walking the host filesystem. return cp.walkHostFS(dest, src, maxSymlinks, walkMountsBelow) @@ -434,6 +502,8 @@ func (cp *copier) walkHostFS(dest, src string, maxSymlinks int, includeMounts bo // (...except mount types that are // handled as regular files.) continue + } else if isMount && !cp.subtreeCouldMatch(src[len(cp.ctrOutputDir)+1:]) { + continue } err = cp.walkHostFS(dest, src, maxSymlinks, false) if err != nil { diff --git a/lib/crunchrun/copier_test.go b/lib/crunchrun/copier_test.go index a1fc81c716..9deba783b2 100644 --- a/lib/crunchrun/copier_test.go +++ b/lib/crunchrun/copier_test.go @@ -214,6 +214,68 @@ func (s *copierSuite) TestWritableMountBelow(c *check.C) { }) } +// Check some glob-matching edge cases. In particular, check that +// patterns like "foo/**" do not match regular files named "foo" +// (unless of course they are inside a directory named "foo"). +func (s *copierSuite) TestMatchGlobs(c *check.C) { + s.cp.globs = []string{"foo*/**"} + c.Check(s.cp.matchGlobs("foo", true), check.Equals, true) + c.Check(s.cp.matchGlobs("food", true), check.Equals, true) + c.Check(s.cp.matchGlobs("foo", false), check.Equals, false) + c.Check(s.cp.matchGlobs("food", false), check.Equals, false) + + s.cp.globs = []string{"ba[!/]/foo*/**"} + c.Check(s.cp.matchGlobs("bar/foo", true), check.Equals, true) + c.Check(s.cp.matchGlobs("bar/food", true), check.Equals, true) + c.Check(s.cp.matchGlobs("bar/foo", false), check.Equals, false) + c.Check(s.cp.matchGlobs("bar/food", false), check.Equals, false) + c.Check(s.cp.matchGlobs("bar/foo/z\\[", true), check.Equals, true) + c.Check(s.cp.matchGlobs("bar/food/z\\[", true), check.Equals, true) + c.Check(s.cp.matchGlobs("bar/foo/z\\[", false), check.Equals, true) + c.Check(s.cp.matchGlobs("bar/food/z\\[", false), check.Equals, true) + + s.cp.globs = []string{"waz/**/foo*/**"} + c.Check(s.cp.matchGlobs("waz/quux/foo", true), check.Equals, true) + c.Check(s.cp.matchGlobs("waz/quux/food", true), check.Equals, true) + c.Check(s.cp.matchGlobs("waz/quux/foo", false), check.Equals, false) + c.Check(s.cp.matchGlobs("waz/quux/food", false), check.Equals, false) + c.Check(s.cp.matchGlobs("waz/quux/foo/foo", true), check.Equals, true) + c.Check(s.cp.matchGlobs("waz/quux/food/foo", true), check.Equals, true) + c.Check(s.cp.matchGlobs("waz/quux/foo/foo", false), check.Equals, true) + c.Check(s.cp.matchGlobs("waz/quux/food/foo", false), check.Equals, true) +} + +func (s *copierSuite) TestSubtreeCouldMatch(c *check.C) { + for _, trial := range []struct { + mount string // relative to output dir + glob string + could bool + }{ + {mount: "abc", glob: "*"}, + {mount: "abc", glob: "abc/*", could: true}, + {mount: "abc", glob: "a*/**", could: true}, + {mount: "abc", glob: "**", could: true}, + {mount: "abc", glob: "*/*", could: true}, + {mount: "abc", glob: "**/*.txt", could: true}, + {mount: "abc/def", glob: "*"}, + {mount: "abc/def", glob: "*/*"}, + {mount: "abc/def", glob: "*/*.txt"}, + {mount: "abc/def", glob: "*/*/*", could: true}, + {mount: "abc/def", glob: "**", could: true}, + {mount: "abc/def", glob: "**/bar", could: true}, + {mount: "abc/def", glob: "abc/**", could: true}, + {mount: "abc/def/ghi", glob: "*c/**/bar", could: true}, + {mount: "abc/def/ghi", glob: "*c/*f/bar"}, + {mount: "abc/def/ghi", glob: "abc/d[^/]f/ghi/*", could: true}, + } { + c.Logf("=== %+v", trial) + got := (&copier{ + globs: []string{trial.glob}, + }).subtreeCouldMatch(trial.mount) + c.Check(got, check.Equals, trial.could) + } +} + func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) { bindtmp := c.MkDir() s.cp.mounts["/ctr/outdir/include/includer"] = arvados.Mount{ @@ -234,7 +296,7 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) { PortableDataHash: arvadostest.FooCollectionPDH, Writable: true, } - s.cp.mounts["/ctr/outdir/nonexistent-collection"] = arvados.Mount{ + s.cp.mounts["/ctr/outdir/nonexistent/collection"] = arvados.Mount{ // As extra assurance, plant a collection that will // fail if copier attempts to load its manifest. (For // performance reasons it's important that copier @@ -244,8 +306,8 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) { PortableDataHash: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+1234", } s.cp.globs = []string{ - "?ncl*/*r", - "*/?ncl*", + "?ncl*/*r/*", + "*/?ncl*/**", } c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includer", 0755), check.IsNil) c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includew", 0755), check.IsNil) @@ -264,14 +326,18 @@ func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) { c.Check(err, check.IsNil) c.Log(s.log.String()) - c.Check(s.cp.dirs, check.DeepEquals, []string{"/include", "/include/includew"}) + // Note it's OK that "/exclude" is not excluded by walkMount: + // it is just a local filesystem directory, not a mount point + // that's expensive to walk. In real-life usage, it will be + // removed from cp.dirs before any copying happens. + c.Check(s.cp.dirs, check.DeepEquals, []string{"/exclude", "/include", "/include/includew"}) c.Check(s.cp.files, check.DeepEquals, []filetodo{ {src: s.cp.hostOutputDir + "/include/includew/foo", dst: "/include/includew/foo", size: 3}, }) c.Check(s.cp.manifest, check.Matches, `(?ms).*\./include/includer .*`) c.Check(s.cp.manifest, check.Not(check.Matches), `(?ms).*exclude.*`) c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excluder\\".*`) - c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excludew\\".*`) + c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"nonexistent/collection\\".*`) } func (s *copierSuite) writeFileInOutputDir(c *check.C, path, data string) { -- 2.30.2