21891: Skip extra serialization step.
[arvados.git] / lib / crunchrun / copier_test.go
index 30e13f65033eef8dc2e93d5a0ba3acbd6356b155..3348a879a7fd98867aef3ba7dd46999089c9c750 100644 (file)
@@ -6,15 +6,18 @@ package crunchrun
 
 import (
        "bytes"
+       "encoding/json"
+       "fmt"
        "io"
        "io/fs"
-       "io/ioutil"
        "os"
        "sort"
        "syscall"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadosclient"
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
+       "git.arvados.org/arvados.git/sdk/go/keepclient"
        "github.com/sirupsen/logrus"
        check "gopkg.in/check.v1"
 )
@@ -29,8 +32,17 @@ type copierSuite struct {
 func (s *copierSuite) SetUpTest(c *check.C) {
        tmpdir := c.MkDir()
        s.log = bytes.Buffer{}
+
+       cl, err := arvadosclient.MakeArvadosClient()
+       c.Assert(err, check.IsNil)
+       kc, err := keepclient.MakeKeepClient(cl)
+       c.Assert(err, check.IsNil)
+       collfs, err := (&arvados.Collection{}).FileSystem(arvados.NewClientFromEnv(), kc)
+       c.Assert(err, check.IsNil)
+
        s.cp = copier{
                client:        arvados.NewClientFromEnv(),
+               keepClient:    kc,
                hostOutputDir: tmpdir,
                ctrOutputDir:  "/ctr/outdir",
                mounts: map[string]arvados.Mount{
@@ -40,6 +52,7 @@ func (s *copierSuite) SetUpTest(c *check.C) {
                        "/secret_text": {Kind: "text", Content: "xyzzy"},
                },
                logger: &logrus.Logger{Out: &s.log, Formatter: &logrus.TextFormatter{}, Level: logrus.InfoLevel},
+               staged: collfs,
        }
 }
 
@@ -117,9 +130,7 @@ func (s *copierSuite) TestSymlinkToMountedCollection(c *check.C) {
        }
 
        // simulate mounted writable collection
-       bindtmp, err := ioutil.TempDir("", "crunch-run.test.")
-       c.Assert(err, check.IsNil)
-       defer os.RemoveAll(bindtmp)
+       bindtmp := c.MkDir()
        f, err := os.OpenFile(bindtmp+"/.arvados#collection", os.O_CREATE|os.O_WRONLY, 0644)
        c.Assert(err, check.IsNil)
        _, err = io.WriteString(f, `{"manifest_text":". 37b51d194a7513e45b56f6524f2d51f2+3 0:3:bar\n"}`)
@@ -140,7 +151,16 @@ func (s *copierSuite) TestSymlinkToMountedCollection(c *check.C) {
 
        err = s.cp.walkMount("", s.cp.ctrOutputDir, 10, true)
        c.Check(err, check.IsNil)
-       c.Check(s.cp.manifest, check.Matches, `(?ms)\./l_dir acbd\S+ 0:3:foo\n\. acbd\S+ 0:3:l_file\n\. 37b5\S+ 0:3:l_file_w\n`)
+       s.checkStagedFile(c, "l_dir/foo", 3)
+       s.checkStagedFile(c, "l_file", 3)
+       s.checkStagedFile(c, "l_file_w", 3)
+}
+
+func (s *copierSuite) checkStagedFile(c *check.C, path string, size int64) {
+       fi, err := s.cp.staged.Stat(path)
+       if c.Check(err, check.IsNil) {
+               c.Check(fi.Size(), check.Equals, size)
+       }
 }
 
 func (s *copierSuite) TestSymlink(c *check.C) {
@@ -217,6 +237,192 @@ func (s *copierSuite) TestWritableMountBelow(c *check.C) {
        })
 }
 
+// TestMatchGlobs checks some glob-matching edge cases. In
+// particular, it checks that patterns like "foo/**" do not match
+// regular files named "foo" (unless of course they are inside a
+// directory named "foo").
+func (s *copierSuite) TestMatchGlobs(c *check.C) {
+       // probe is one matchGlobs call: its two arguments and the
+       // expected result.  The boolean argument is presumably "path
+       // is a directory", going by the expectations below -- confirm
+       // against matchGlobs.
+       type probe struct {
+               path  string
+               isDir bool
+               want  bool
+       }
+       for _, group := range []struct {
+               glob   string
+               probes []probe
+       }{
+               {"foo*/**", []probe{
+                       {"foo", true, true},
+                       {"food", true, true},
+                       {"foo", false, false},
+                       {"food", false, false},
+                       {"foo/bar", false, true},
+                       {"food/bar", false, true},
+                       {"foo/bar", true, true},
+                       {"food/bar", true, true},
+               }},
+               {"ba[!/]/foo*/**", []probe{
+                       {"bar/foo", true, true},
+                       {"bar/food", true, true},
+                       {"bar/foo", false, false},
+                       {"bar/food", false, false},
+                       {"bar/foo/z\\[", true, true},
+                       {"bar/food/z\\[", true, true},
+                       {"bar/foo/z\\[", false, true},
+                       {"bar/food/z\\[", false, true},
+               }},
+               {"waz/**/foo*/**", []probe{
+                       {"waz/quux/foo", true, true},
+                       {"waz/quux/food", true, true},
+                       {"waz/quux/foo", false, false},
+                       {"waz/quux/food", false, false},
+                       {"waz/quux/foo/foo", true, true},
+                       {"waz/quux/food/foo", true, true},
+                       {"waz/quux/foo/foo", false, true},
+                       {"waz/quux/food/foo", false, true},
+               }},
+               {"foo/**/*", []probe{
+                       {"foo", false, false},
+                       {"foo/bar", false, true},
+                       {"foo/bar/baz", false, true},
+                       {"foo/bar/baz/waz", false, true},
+               }},
+       } {
+               // Each group is evaluated with exactly one glob
+               // configured on the suite's copier.
+               s.cp.globs = []string{group.glob}
+               for _, p := range group.probes {
+                       got := s.cp.matchGlobs(p.path, p.isDir)
+                       c.Check(got, check.Equals, p.want,
+                               check.Commentf("glob %q, path %q, isDir %v", group.glob, p.path, p.isDir))
+               }
+       }
+}
+
+func (s *copierSuite) TestSubtreeCouldMatch(c *check.C) {
+       for _, trial := range []struct {
+               mount string // relative to output dir
+               glob  string
+               could bool
+       }{
+               {mount: "abc", glob: "*"},
+               {mount: "abc", glob: "abc/*", could: true},
+               {mount: "abc", glob: "a*/**", could: true},
+               {mount: "abc", glob: "**", could: true},
+               {mount: "abc", glob: "*/*", could: true},
+               {mount: "abc", glob: "**/*.txt", could: true},
+               {mount: "abc/def", glob: "*"},
+               {mount: "abc/def", glob: "*/*"},
+               {mount: "abc/def", glob: "*/*.txt"},
+               {mount: "abc/def", glob: "*/*/*", could: true},
+               {mount: "abc/def", glob: "**", could: true},
+               {mount: "abc/def", glob: "**/bar", could: true},
+               {mount: "abc/def", glob: "abc/**", could: true},
+               {mount: "abc/def/ghi", glob: "*c/**/bar", could: true},
+               {mount: "abc/def/ghi", glob: "*c/*f/bar"},
+               {mount: "abc/def/ghi", glob: "abc/d[^/]f/ghi/*", could: true},
+       } {
+               c.Logf("=== %+v", trial)
+               got := (&copier{
+                       globs: []string{trial.glob},
+               }).subtreeCouldMatch(trial.mount)
+               c.Check(got, check.Equals, trial.could)
+       }
+}
+
+// TestCopyFromLargeCollection_Readonly runs the shared
+// large-collection test body with writable set to false.
+func (s *copierSuite) TestCopyFromLargeCollection_Readonly(c *check.C) {
+       const writable = false
+       s.testCopyFromLargeCollection(c, writable)
+}
+
+// TestCopyFromLargeCollection_Writable runs the shared
+// large-collection test body with writable set to true.
+func (s *copierSuite) TestCopyFromLargeCollection_Writable(c *check.C) {
+       const writable = true
+       s.testCopyFromLargeCollection(c, writable)
+}
+
+func (s *copierSuite) testCopyFromLargeCollection(c *check.C, writable bool) {
+       bindtmp := c.MkDir()
+       mtxt := arvadostest.FakeManifest(100, 100, 2, 4<<20)
+       pdh := arvados.PortableDataHash(mtxt)
+       json, err := json.Marshal(arvados.Collection{ManifestText: mtxt, PortableDataHash: pdh})
+       c.Assert(err, check.IsNil)
+       err = os.WriteFile(bindtmp+"/.arvados#collection", json, 0644)
+       // This symlink tricks walkHostFS into calling walkMount on
+       // the fakecollection dir. If we did the obvious thing instead
+       // (i.e., mount a collection under the output dir) walkMount
+       // would see that our fakecollection dir is actually a regular
+       // directory, conclude that the mount has been deleted and
+       // replaced by a regular directory tree, and process the tree
+       // as regular files, bypassing the manifest-copying code path
+       // we're trying to test.
+       err = os.Symlink("/fakecollection", s.cp.hostOutputDir+"/fakecollection")
+       c.Assert(err, check.IsNil)
+       s.cp.mounts["/fakecollection"] = arvados.Mount{
+               Kind:             "collection",
+               PortableDataHash: pdh,
+               Writable:         writable,
+       }
+       s.cp.bindmounts = map[string]bindmount{
+               "/fakecollection": bindmount{HostPath: bindtmp, ReadOnly: !writable},
+       }
+       s.cp.manifestCache = map[string]string{pdh: mtxt}
+       err = s.cp.walkMount("", s.cp.ctrOutputDir, 10, true)
+       c.Check(err, check.IsNil)
+       c.Log(s.log.String())
+
+       // Check some files to ensure they were copied properly.
+       for i := 0; i < 100; i += 13 {
+               for j := 0; j < 100; j += 17 {
+                       fnm := fmt.Sprintf("/fakecollection/dir%d/dir%d/file%d", i, j, j)
+                       _, err := s.cp.staged.Stat(fnm)
+                       c.Assert(err, check.IsNil, check.Commentf("%s", fnm))
+               }
+       }
+}
+
+func (s *copierSuite) TestMountBelowExcludedByGlob(c *check.C) {
+       bindtmp := c.MkDir()
+       s.cp.mounts["/ctr/outdir/include/includer"] = arvados.Mount{
+               Kind:             "collection",
+               PortableDataHash: arvadostest.FooCollectionPDH,
+       }
+       s.cp.mounts["/ctr/outdir/include/includew"] = arvados.Mount{
+               Kind:             "collection",
+               PortableDataHash: arvadostest.FooCollectionPDH,
+               Writable:         true,
+       }
+       s.cp.mounts["/ctr/outdir/exclude/excluder"] = arvados.Mount{
+               Kind:             "collection",
+               PortableDataHash: arvadostest.FooCollectionPDH,
+       }
+       s.cp.mounts["/ctr/outdir/exclude/excludew"] = arvados.Mount{
+               Kind:             "collection",
+               PortableDataHash: arvadostest.FooCollectionPDH,
+               Writable:         true,
+       }
+       s.cp.mounts["/ctr/outdir/nonexistent/collection"] = arvados.Mount{
+               // As extra assurance, plant a collection that will
+               // fail if copier attempts to load its manifest.  (For
+               // performance reasons it's important that copier
+               // doesn't try to load the manifest before deciding
+               // not to copy the contents.)
+               Kind:             "collection",
+               PortableDataHash: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+1234",
+       }
+       s.cp.globs = []string{
+               "?ncl*/*r/*",
+               "*/?ncl*/**",
+       }
+       c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includer", 0755), check.IsNil)
+       c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/include/includew", 0755), check.IsNil)
+       c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/exclude/excluder", 0755), check.IsNil)
+       c.Assert(os.MkdirAll(s.cp.hostOutputDir+"/exclude/excludew", 0755), check.IsNil)
+       s.writeFileInOutputDir(c, "include/includew/foo", "foo")
+       s.writeFileInOutputDir(c, "exclude/excludew/foo", "foo")
+       s.cp.bindmounts = map[string]bindmount{
+               "/ctr/outdir/include/includew": bindmount{HostPath: bindtmp, ReadOnly: false},
+       }
+       s.cp.bindmounts = map[string]bindmount{
+               "/ctr/outdir/include/excludew": bindmount{HostPath: bindtmp, ReadOnly: false},
+       }
+
+       err := s.cp.walkMount("", s.cp.ctrOutputDir, 10, true)
+       c.Check(err, check.IsNil)
+       c.Log(s.log.String())
+
+       // Note it's OK that "/exclude" is not excluded by walkMount:
+       // it is just a local filesystem directory, not a mount point
+       // that's expensive to walk.  In real-life usage, it will be
+       // removed from cp.dirs before any copying happens.
+       c.Check(s.cp.dirs, check.DeepEquals, []string{"/exclude", "/include", "/include/includew"})
+       c.Check(s.cp.files, check.DeepEquals, []filetodo{
+               {src: s.cp.hostOutputDir + "/include/includew/foo", dst: "/include/includew/foo", size: 3},
+       })
+       manifest, err := s.cp.staged.MarshalManifest(".")
+       c.Assert(err, check.IsNil)
+       c.Check(manifest, check.Matches, `(?ms).*\./include/includer .*`)
+       c.Check(manifest, check.Not(check.Matches), `(?ms).*exclude.*`)
+       c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"exclude/excluder\\".*`)
+       c.Check(s.log.String(), check.Matches, `(?ms).*not copying \\"nonexistent/collection\\".*`)
+}
+
 func (s *copierSuite) writeFileInOutputDir(c *check.C, path, data string) {
        f, err := os.OpenFile(s.cp.hostOutputDir+"/"+path, os.O_CREATE|os.O_WRONLY, 0644)
        c.Assert(err, check.IsNil)
@@ -226,7 +432,7 @@ func (s *copierSuite) writeFileInOutputDir(c *check.C, path, data string) {
 }
 
 // applyGlobsToFilesAndDirs uses the same glob-matching code as
-// applyGlobsToCollectionFS, so we don't need to test all of the same
+// applyGlobsToStaged, so we don't need to test all of the same
 // glob-matching behavior covered in TestApplyGlobsToCollectionFS.  We
 // do need to check that (a) the glob is actually being used to filter
 // out files, and (b) non-matching dirs still included if and only if
@@ -388,8 +594,8 @@ func (s *copierSuite) TestApplyGlobsToCollectionFS(c *check.C) {
                c.Logf("=== globs: %q", trial.globs)
                collfs, err := (&arvados.Collection{ManifestText: ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:foo 0:0:bar 0:0:baz/quux 0:0:baz/parent1/item1\n"}).FileSystem(nil, nil)
                c.Assert(err, check.IsNil)
-               cp := copier{globs: trial.globs}
-               err = cp.applyGlobsToCollectionFS(collfs)
+               cp := copier{globs: trial.globs, staged: collfs}
+               err = cp.applyGlobsToStaged()
                if !c.Check(err, check.IsNil) {
                        continue
                }