From dcdf385b2852acf95f41e2340d07cd68cb34e371 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 4 Apr 2024 13:51:22 -0400 Subject: [PATCH] 12430: Drop non-matching files from output if output_glob specified. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- .../container_requests.html.textile.liquid | 3 + .../methods/containers.html.textile.liquid | 3 +- go.mod | 1 + go.sum | 2 + lib/crunchrun/copier.go | 116 +++++++++++- lib/crunchrun/copier_test.go | 172 ++++++++++++++++++ lib/crunchrun/crunchrun.go | 1 + lib/crunchrun/integration_test.go | 71 +++++--- sdk/go/arvados/container.go | 2 + sdk/python/arvados-v1-discovery.json | 6 + services/api/app/models/container.rb | 18 +- services/api/app/models/container_request.rb | 12 +- ...329173437_add_output_glob_to_containers.rb | 10 + ...733_add_output_glob_index_to_containers.rb | 17 ++ services/api/db/structure.sql | 19 +- .../api/test/unit/container_request_test.rb | 10 +- services/api/test/unit/container_test.rb | 2 + 17 files changed, 418 insertions(+), 47 deletions(-) create mode 100644 services/api/db/migrate/20240329173437_add_output_glob_to_containers.rb create mode 100644 services/api/db/migrate/20240402162733_add_output_glob_index_to_containers.rb diff --git a/doc/api/methods/container_requests.html.textile.liquid b/doc/api/methods/container_requests.html.textile.liquid index 1c269fb3e6..66368848c6 100644 --- a/doc/api/methods/container_requests.html.textile.liquid +++ b/doc/api/methods/container_requests.html.textile.liquid @@ -49,6 +49,9 @@ table(table table-bordered table-condensed). |cwd|string|Initial working directory, given as an absolute path (in the container) or a path relative to the WORKDIR given in the image's Dockerfile.|Required.| |command|array of strings|Command to execute in the container.|Required. e.g., @["echo","hello"]@| |output_path|string|Path to a directory or file inside the container that should be preserved as container's output when it finishes. This path must be one of the mount targets. For best performance, point output_path to a writable collection mount. See "Pre-populate output using Mount points":#pre-populate-output for details regarding optional output pre-population using mount points and "Symlinks in output":#symlinks-in-output for additional details.|Required.| +|output_glob|array of strings|Glob patterns determining which files (of those present in the output directory when the container finishes) will be included in the output collection. If multiple patterns are given, files that match any pattern are included. If null or empty, all files will be included.|"Doublestar" patterns are accepted: +@foo/**@ matches the entire tree rooted at @foo@ in the top level of the output directory. +@**/fo[og]@ matches all files named @foo@ or @fog@ anywhere in the tree.| |output_name|string|Desired name for the output collection. If null or empty, a name will be assigned automatically.|| |output_ttl|integer|Desired lifetime for the output collection, in seconds. If zero, the output collection will not be deleted automatically.|| |priority|integer|Range 0-1000. Indicate scheduling order preference.|Clients are expected to submit container requests with zero priority in order to preview the container that will be used to satisfy it. Priority can be null if and only if state!="Committed". See "below for more details":#priority .| diff --git a/doc/api/methods/containers.html.textile.liquid b/doc/api/methods/containers.html.textile.liquid index 1d2fed768c..f6f59291da 100644 --- a/doc/api/methods/containers.html.textile.liquid +++ b/doc/api/methods/containers.html.textile.liquid @@ -30,9 +30,10 @@ table(table table-bordered table-condensed). |finished_at|datetime|When this container finished.|Null if container has not yet finished.| |log|string|Portable data hash of a collection containing the log messages produced when executing the container.|Null if container has not yet started. The Crunch system will periodically update this field for a running container.| |environment|hash|Environment variables and values that should be set in the container environment (@docker run --env@). This augments and (when conflicts exist) overrides environment variables given in the image's Dockerfile.|Must be equal to a ContainerRequest's environment in order to satisfy the ContainerRequest.| -|cwd|string|Initial working directory.|Must be equal to a ContainerRequest's cwd in order to satisfy the ContainerRequest| +|cwd|string|Initial working directory.|Must be equal to a ContainerRequest's cwd in order to satisfy the ContainerRequest.| |command|array of strings|Command to execute.| Must be equal to a ContainerRequest's command in order to satisfy the ContainerRequest.| |output_path|string|Path to a directory or file inside the container that should be preserved as this container's output when it finishes.|Must be equal to a ContainerRequest's output_path in order to satisfy the ContainerRequest.| +|output_glob|array of strings|Glob patterns determining which files will be included in the output collection. See corresponding attribute in the "container_requests resource":container_requests.html.|Must be equal to a ContainerRequest's output_glob in order to satisfy the ContainerRequest.| |mounts|hash|Must contain the same keys as the ContainerRequest being satisfied. Each value must be within the range of values described in the ContainerRequest at the time the Container is assigned to the ContainerRequest.|See "Mount types":#mount_types for more details.| |secret_mounts|hash|Must contain the same keys as the ContainerRequest being satisfied. Each value must be within the range of values described in the ContainerRequest at the time the Container is assigned to the ContainerRequest.|Not returned in API responses. Reset to empty when state is "Complete" or "Cancelled".| |runtime_constraints|hash|Compute resources, and access to the outside world, that are / were available to the container. diff --git a/go.mod b/go.mod index 0011d7970f..62a31c1c58 100644 --- a/go.mod +++ b/go.mod @@ -65,6 +65,7 @@ require ( github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bgentry/speakeasy v0.1.0 // indirect + github.com/bmatcuk/doublestar v1.3.4 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/dimchansky/utfbom v1.1.1 // indirect diff --git a/go.sum b/go.sum index fb2fe5e3f0..fe6febb043 100644 --- a/go.sum +++ b/go.sum @@ -59,6 +59,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQkY= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/bmatcuk/doublestar v1.3.4 h1:gPypJ5xD31uhX6Tf54sDPUOBXTqKH4c9aPY66CyQrS0= +github.com/bmatcuk/doublestar v1.3.4/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE= github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps= github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092 h1:0Di2onNnlN5PAyWPbqlPyN45eOQ+QW/J9eqLynt4IV4= github.com/bradleypeabody/godap v0.0.0-20170216002349-c249933bc092/go.mod h1:8IzBjZCRSnsvM6MJMG8HNNtnzMl48H22rbJL2kRUJ0Y= diff --git a/lib/crunchrun/copier.go b/lib/crunchrun/copier.go index a081c5d325..8d986d7066 100644 --- a/lib/crunchrun/copier.go +++ b/lib/crunchrun/copier.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" "io" + "io/fs" "os" "path/filepath" "sort" @@ -17,6 +18,7 @@ import ( "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/keepclient" "git.arvados.org/arvados.git/sdk/go/manifest" + "github.com/bmatcuk/doublestar" ) type printfer interface { @@ -54,6 +56,7 @@ type copier struct { keepClient IKeepClient hostOutputDir string ctrOutputDir string + globs []string bindmounts map[string]bindmount mounts map[string]arvados.Mount secretMounts map[string]arvados.Mount @@ -72,12 +75,17 @@ func (cp *copier) Copy() (string, error) { if err != nil { return "", fmt.Errorf("error scanning files to copy to output: %v", err) } - fs, err := (&arvados.Collection{ManifestText: cp.manifest}).FileSystem(cp.client, cp.keepClient) + collfs, err := (&arvados.Collection{ManifestText: cp.manifest}).FileSystem(cp.client, cp.keepClient) if err != nil { return "", fmt.Errorf("error creating Collection.FileSystem: %v", err) } + err = cp.applyGlobsToCollectionFS(collfs) + if err != nil { + return "", fmt.Errorf("error while removing non-matching files from output collection: %w", err) + } + cp.applyGlobsToFilesAndDirs() for _, d := range cp.dirs { - err = fs.Mkdir(d, 0777) + err = collfs.Mkdir(d, 0777) if err != nil && err != os.ErrExist { return "", fmt.Errorf("error making directory %q in output collection: %v", d, err) } @@ -91,20 +99,118 @@ func (cp *copier) Copy() (string, error) { // open so f's data can be packed with it). dir, _ := filepath.Split(f.dst) if dir != lastparentdir || unflushed > keepclient.BLOCKSIZE { - if err := fs.Flush("/"+lastparentdir, dir != lastparentdir); err != nil { + if err := collfs.Flush("/"+lastparentdir, dir != lastparentdir); err != nil { return "", fmt.Errorf("error flushing output collection file data: %v", err) } unflushed = 0 } lastparentdir = dir - n, err := cp.copyFile(fs, f) + n, err := cp.copyFile(collfs, f) if err != nil { return "", fmt.Errorf("error copying file %q into output collection: %v", f, err) } unflushed += n } - return fs.MarshalManifest(".") + return collfs.MarshalManifest(".") +} + +func (cp *copier) matchGlobs(path string) bool { + // An entry in the top level of the output directory looks + // like "/foo", but globs look like "foo", so we strip the + // leading "/" before matching. + path = strings.TrimLeft(path, "/") + for _, glob := range cp.globs { + if match, _ := doublestar.Match(glob, path); match { + return true + } + } + return false +} + +// Delete entries from cp.files that do not match cp.globs. +// +// Delete entries from cp.dirs that do not match cp.globs. +// +// Ensure parent/ancestor directories of remaining cp.files and +// cp.dirs entries are still present in cp.dirs, even if they do not +// match cp.globs themselves. +func (cp *copier) applyGlobsToFilesAndDirs() { + if len(cp.globs) == 0 { + return + } + keepdirs := make(map[string]bool) + for _, path := range cp.dirs { + if cp.matchGlobs(path) { + keepdirs[path] = true + } + } + for path := range keepdirs { + for i, c := range path { + if i > 0 && c == '/' { + keepdirs[path[:i]] = true + } + } + } + var keepfiles []filetodo + for _, file := range cp.files { + if cp.matchGlobs(file.dst) { + keepfiles = append(keepfiles, file) + } + } + for _, file := range keepfiles { + for i, c := range file.dst { + if i > 0 && c == '/' { + keepdirs[file.dst[:i]] = true + } + } + } + cp.dirs = nil + for path := range keepdirs { + cp.dirs = append(cp.dirs, path) + } + sort.Strings(cp.dirs) + cp.files = keepfiles +} + +// Delete files in collfs that do not match cp.globs. Also delete +// directories that are empty (after deleting non-matching files) and +// do not match cp.globs themselves. +func (cp *copier) applyGlobsToCollectionFS(collfs arvados.CollectionFileSystem) error { + if len(cp.globs) == 0 { + return nil + } + include := make(map[string]bool) + err := fs.WalkDir(arvados.FS(collfs), "", func(path string, ent fs.DirEntry, err error) error { + if cp.matchGlobs(path) { + for i, c := range path { + if i > 0 && c == '/' { + include[path[:i]] = true + } + } + include[path] = true + } + return nil + }) + if err != nil { + return err + } + err = fs.WalkDir(arvados.FS(collfs), "", func(path string, ent fs.DirEntry, err error) error { + if err != nil || path == "" { + return err + } + if !include[path] { + err := collfs.RemoveAll(path) + if err != nil { + return err + } + if ent.IsDir() { + return fs.SkipDir + } + } + return nil + }) + return err } func (cp *copier) copyFile(fs arvados.CollectionFileSystem, f filetodo) (int64, error) { diff --git a/lib/crunchrun/copier_test.go b/lib/crunchrun/copier_test.go index c8936d1a9f..bf7e789e45 100644 --- a/lib/crunchrun/copier_test.go +++ b/lib/crunchrun/copier_test.go @@ -7,8 +7,10 @@ package crunchrun import ( "bytes" "io" + "io/fs" "io/ioutil" "os" + "sort" "syscall" "git.arvados.org/arvados.git/sdk/go/arvados" @@ -222,3 +224,173 @@ func (s *copierSuite) writeFileInOutputDir(c *check.C, path, data string) { c.Assert(err, check.IsNil) c.Assert(f.Close(), check.IsNil) } + +func (s *copierSuite) TestApplyGlobsToFilesAndDirs(c *check.C) { + dirs := []string{"dir1", "dir1/dir11", "dir1/dir12", "dir2"} + files := []string{"dir1/file11", "dir1/dir11/file111", "dir2/file2"} + for _, trial := range []struct { + globs []string + dirs []string + files []string + }{ + { + globs: []string{}, + dirs: append([]string{}, dirs...), + files: append([]string{}, files...), + }, + { + globs: []string{"**"}, + dirs: append([]string{}, dirs...), + files: append([]string{}, files...), + }, + { + globs: []string{"**/file111"}, + dirs: []string{"dir1", "dir1/dir11"}, + files: []string{"dir1/dir11/file111"}, + }, + { + globs: []string{"nothing"}, + dirs: nil, + files: nil, + }, + { + globs: []string{"**/dir12"}, + dirs: []string{"dir1", "dir1/dir12"}, + files: nil, + }, + { + globs: []string{"**/file*"}, + dirs: []string{"dir1", "dir1/dir11", "dir2"}, + files: append([]string{}, files...), + }, + { + globs: []string{"**/dir1[12]"}, + dirs: []string{"dir1", "dir1/dir11", "dir1/dir12"}, + files: nil, + }, + { + globs: []string{"**/dir1[^2]"}, + dirs: []string{"dir1", "dir1/dir11"}, + files: nil, + }, + } { + c.Logf("=== globs: %q", trial.globs) + cp := copier{ + globs: trial.globs, + dirs: dirs, + } + for _, path := range files { + cp.files = append(cp.files, filetodo{dst: path}) + } + cp.applyGlobsToFilesAndDirs() + var gotFiles []string + for _, file := range cp.files { + gotFiles = append(gotFiles, file.dst) + } + c.Check(cp.dirs, check.DeepEquals, trial.dirs) + c.Check(gotFiles, check.DeepEquals, trial.files) + } +} + +func (s *copierSuite) TestApplyGlobsToCollectionFS(c *check.C) { + for _, trial := range []struct { + globs []string + expect []string + }{ + { + globs: nil, + expect: []string{"foo", "bar", "baz/quux", "baz/parent1/item1"}, + }, + { + globs: []string{"foo"}, + expect: []string{"foo"}, + }, + { + globs: []string{"baz/parent1/item1"}, + expect: []string{"baz/parent1/item1"}, + }, + { + globs: []string{"**"}, + expect: []string{"foo", "bar", "baz/quux", "baz/parent1/item1"}, + }, + { + globs: []string{"**/*"}, + expect: []string{"foo", "bar", "baz/quux", "baz/parent1/item1"}, + }, + { + globs: []string{"*"}, + expect: []string{"foo", "bar"}, + }, + { + globs: []string{"baz"}, + expect: nil, + }, + { + globs: []string{"b*/**"}, + expect: []string{"baz/quux", "baz/parent1/item1"}, + }, + { + globs: []string{"baz"}, + expect: nil, + }, + { + globs: []string{"baz/**"}, + expect: []string{"baz/quux", "baz/parent1/item1"}, + }, + { + globs: []string{"baz/*"}, + expect: []string{"baz/quux"}, + }, + { + globs: []string{"baz/**/*uu?"}, + expect: []string{"baz/quux"}, + }, + { + globs: []string{"**/*m1"}, + expect: []string{"baz/parent1/item1"}, + }, + { + globs: []string{"*/*/*/**/*1"}, + expect: nil, + }, + { + globs: []string{"f*", "**/q*"}, + expect: []string{"foo", "baz/quux"}, + }, + { + globs: []string{"\\"}, // invalid pattern matches nothing + expect: nil, + }, + { + globs: []string{"\\", "foo"}, + expect: []string{"foo"}, + }, + { + globs: []string{"foo/**"}, + expect: nil, + }, + { + globs: []string{"foo*/**"}, + expect: nil, + }, + } { + c.Logf("=== globs: %q", trial.globs) + collfs, err := (&arvados.Collection{ManifestText: ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:foo 0:0:bar 0:0:baz/quux 0:0:baz/parent1/item1\n"}).FileSystem(nil, nil) + c.Assert(err, check.IsNil) + cp := copier{globs: trial.globs} + err = cp.applyGlobsToCollectionFS(collfs) + if !c.Check(err, check.IsNil) { + continue + } + var got []string + fs.WalkDir(arvados.FS(collfs), "", func(path string, ent fs.DirEntry, err error) error { + if !ent.IsDir() { + got = append(got, path) + } + return nil + }) + sort.Strings(got) + sort.Strings(trial.expect) + c.Check(got, check.DeepEquals, trial.expect) + } +} diff --git a/lib/crunchrun/crunchrun.go b/lib/crunchrun/crunchrun.go index bde13424dd..1f1c5675aa 100644 --- a/lib/crunchrun/crunchrun.go +++ b/lib/crunchrun/crunchrun.go @@ -1365,6 +1365,7 @@ func (runner *ContainerRunner) CaptureOutput(bindmounts map[string]bindmount) er keepClient: runner.ContainerKeepClient, hostOutputDir: runner.HostOutputDir, ctrOutputDir: runner.Container.OutputPath, + globs: runner.Container.OutputGlob, bindmounts: bindmounts, mounts: runner.Container.Mounts, secretMounts: runner.SecretMounts, diff --git a/lib/crunchrun/integration_test.go b/lib/crunchrun/integration_test.go index 4f0100b267..def5f07fc0 100644 --- a/lib/crunchrun/integration_test.go +++ b/lib/crunchrun/integration_test.go @@ -148,6 +148,7 @@ func (s *integrationSuite) setup(c *C) { "state": s.cr.State, "command": s.cr.Command, "output_path": s.cr.OutputPath, + "output_glob": s.cr.OutputGlob, "container_image": s.cr.ContainerImage, "mounts": s.cr.Mounts, "runtime_constraints": s.cr.RuntimeConstraints, @@ -272,6 +273,19 @@ func (s *integrationSuite) TestRunTrivialContainerWithNoLocalKeepstore(c *C) { c.Check(s.logFiles["crunch-run.txt"], Matches, `(?ms).*loaded config file \Q`+os.Getenv("ARVADOS_CONFIG")+`\E\n.*`) } +func (s *integrationSuite) TestRunTrivialContainerWithOutputGlob(c *C) { + s.cr.OutputGlob = []string{"js?n"} + s.testRunTrivialContainer(c) + fs, err := s.outputCollection.FileSystem(s.client, s.kc) + c.Assert(err, IsNil) + _, err = fs.Stat("json") + c.Check(err, IsNil) + _, err = fs.Stat("inputfile") + c.Check(err, Equals, os.ErrNotExist) + _, err = fs.Stat("emptydir") + c.Check(err, Equals, os.ErrNotExist) +} + func (s *integrationSuite) testRunTrivialContainer(c *C) { if err := exec.Command("which", s.engine).Run(); err != nil { c.Skip(fmt.Sprintf("%s: %s", s.engine, err)) @@ -321,34 +335,37 @@ func (s *integrationSuite) testRunTrivialContainer(c *C) { var output arvados.Collection err = s.client.RequestAndDecode(&output, "GET", "arvados/v1/collections/"+s.cr.OutputUUID, nil, nil) c.Assert(err, IsNil) - fs, err = output.FileSystem(s.client, s.kc) - c.Assert(err, IsNil) - if f, err := fs.Open("inputfile"); c.Check(err, IsNil) { - defer f.Close() - buf, err := ioutil.ReadAll(f) - c.Check(err, IsNil) - c.Check(string(buf), Equals, "inputdata") - } - if f, err := fs.Open("json"); c.Check(err, IsNil) { - defer f.Close() - buf, err := ioutil.ReadAll(f) - c.Check(err, IsNil) - c.Check(string(buf), Equals, `["foo",{"foo":"bar"},null]`) - } - if fi, err := fs.Stat("emptydir"); c.Check(err, IsNil) { - c.Check(fi.IsDir(), Equals, true) - } - if d, err := fs.Open("emptydir"); c.Check(err, IsNil) { - defer d.Close() - fis, err := d.Readdir(-1) + s.outputCollection = output + + if len(s.cr.OutputGlob) == 0 { + fs, err = output.FileSystem(s.client, s.kc) c.Assert(err, IsNil) - // crunch-run still saves a ".keep" file to preserve - // empty dirs even though that shouldn't be - // necessary. Ideally we would do: - // c.Check(fis, HasLen, 0) - for _, fi := range fis { - c.Check(fi.Name(), Equals, ".keep") + if f, err := fs.Open("inputfile"); c.Check(err, IsNil) { + defer f.Close() + buf, err := ioutil.ReadAll(f) + c.Check(err, IsNil) + c.Check(string(buf), Equals, "inputdata") + } + if f, err := fs.Open("json"); c.Check(err, IsNil) { + defer f.Close() + buf, err := ioutil.ReadAll(f) + c.Check(err, IsNil) + c.Check(string(buf), Equals, `["foo",{"foo":"bar"},null]`) + } + if fi, err := fs.Stat("emptydir"); c.Check(err, IsNil) { + c.Check(fi.IsDir(), Equals, true) + } + if d, err := fs.Open("emptydir"); c.Check(err, IsNil) { + defer d.Close() + fis, err := d.Readdir(-1) + c.Assert(err, IsNil) + // crunch-run still saves a ".keep" file to preserve + // empty dirs even though that shouldn't be + // necessary. Ideally we would do: + // c.Check(fis, HasLen, 0) + for _, fi := range fis { + c.Check(fi.Name(), Equals, ".keep") + } } } - s.outputCollection = output } diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go index 91c8fbfe29..aefad9b5e5 100644 --- a/sdk/go/arvados/container.go +++ b/sdk/go/arvados/container.go @@ -23,6 +23,7 @@ type Container struct { Mounts map[string]Mount `json:"mounts"` Output string `json:"output"` OutputPath string `json:"output_path"` + OutputGlob []string `json:"output_glob"` Priority int64 `json:"priority"` RuntimeConstraints RuntimeConstraints `json:"runtime_constraints"` State ContainerState `json:"state"` @@ -68,6 +69,7 @@ type ContainerRequest struct { Cwd string `json:"cwd"` Command []string `json:"command"` OutputPath string `json:"output_path"` + OutputGlob []string `json:"output_glob"` OutputName string `json:"output_name"` OutputTTL int `json:"output_ttl"` Priority int `json:"priority"` diff --git a/sdk/python/arvados-v1-discovery.json b/sdk/python/arvados-v1-discovery.json index 232c88d067..6e3fcdc00c 100644 --- a/sdk/python/arvados-v1-discovery.json +++ b/sdk/python/arvados-v1-discovery.json @@ -9618,6 +9618,9 @@ }, "subrequests_cost": { "type": "float" + }, + "output_glob": { + "type": "Array" } } }, @@ -9764,6 +9767,9 @@ }, "cumulative_cost": { "type": "float" + }, + "output_glob": { + "type": "Array" } } }, diff --git a/services/api/app/models/container.rb b/services/api/app/models/container.rb index ee338b81ff..08dad2314e 100644 --- a/services/api/app/models/container.rb +++ b/services/api/app/models/container.rb @@ -30,6 +30,7 @@ class Container < ArvadosModel serialize :runtime_constraints, Hash serialize :command, Array serialize :scheduling_parameters, Hash + serialize :output_glob, Array after_find :fill_container_defaults_after_find before_validation :fill_field_defaults, :if => :new_record? @@ -73,6 +74,7 @@ class Container < ArvadosModel t.add :mounts t.add :output t.add :output_path + t.add :output_glob t.add :priority t.add :progress t.add :runtime_constraints @@ -164,6 +166,7 @@ class Container < ArvadosModel cwd: req.cwd, environment: req.environment, output_path: req.output_path, + output_glob: req.output_glob, container_image: resolve_container_image(req.container_image), mounts: resolve_mounts(req.mounts), runtime_constraints: resolve_runtime_constraints(req.runtime_constraints), @@ -263,6 +266,9 @@ class Container < ArvadosModel candidates = candidates.where('output_path = ?', attrs[:output_path]) log_reuse_info(candidates) { "after filtering on output_path #{attrs[:output_path].inspect}" } + candidates = candidates.where_serialized(:output_glob, attrs[:output_glob], md5: true) + log_reuse_info(candidates) { "after filtering on output_glob #{attrs[:output_glob].inspect}" } + image = resolve_container_image(attrs[:container_image]) candidates = candidates.where('container_image = ?', image) log_reuse_info(candidates) { "after filtering on container_image #{image.inspect} (resolved from #{attrs[:container_image].inspect})" } @@ -482,6 +488,7 @@ class Container < ArvadosModel self.environment ||= {} self.runtime_constraints ||= {} self.mounts ||= {} + self.output_glob ||= [] self.cwd ||= "." self.priority ||= 0 self.scheduling_parameters ||= {} @@ -531,11 +538,11 @@ class Container < ArvadosModel if self.new_record? permitted.push(:owner_uuid, :command, :container_image, :cwd, - :environment, :mounts, :output_path, :priority, - :runtime_constraints, :scheduling_parameters, - :secret_mounts, :runtime_token, - :runtime_user_uuid, :runtime_auth_scopes, - :output_storage_classes) + :environment, :mounts, :output_path, :output_glob, + :priority, :runtime_constraints, + :scheduling_parameters, :secret_mounts, + :runtime_token, :runtime_user_uuid, + :runtime_auth_scopes, :output_storage_classes) end case self.state @@ -798,6 +805,7 @@ class Container < ArvadosModel cwd: self.cwd, environment: self.environment, output_path: self.output_path, + output_glob: self.output_glob, container_image: self.container_image, mounts: self.mounts, runtime_constraints: self.runtime_constraints, diff --git a/services/api/app/models/container_request.rb b/services/api/app/models/container_request.rb index f5789f31f6..9b3d427594 100644 --- a/services/api/app/models/container_request.rb +++ b/services/api/app/models/container_request.rb @@ -34,6 +34,7 @@ class ContainerRequest < ArvadosModel serialize :runtime_constraints, Hash serialize :command, Array serialize :scheduling_parameters, Hash + serialize :output_glob, Array after_find :fill_container_defaults_after_find after_initialize { @state_was_when_initialized = self.state_was } # see finalize_if_needed @@ -73,6 +74,7 @@ class ContainerRequest < ArvadosModel t.add :name t.add :output_name t.add :output_path + t.add :output_glob t.add :output_uuid t.add :output_ttl t.add :priority @@ -104,7 +106,7 @@ class ContainerRequest < ArvadosModel AttrsPermittedAlways = [:owner_uuid, :state, :name, :description, :properties] AttrsPermittedBeforeCommit = [:command, :container_count_max, :container_image, :cwd, :environment, :filters, :mounts, - :output_path, :priority, :runtime_token, + :output_path, :output_glob, :priority, :runtime_token, :runtime_constraints, :state, :container_uuid, :use_existing, :scheduling_parameters, :secret_mounts, :output_name, :output_ttl, :output_storage_classes, :output_properties] @@ -307,7 +309,7 @@ class ContainerRequest < ArvadosModel end def self.full_text_searchable_columns - super - ["mounts", "secret_mounts", "secret_mounts_md5", "runtime_token", "output_storage_classes"] + super - ["mounts", "secret_mounts", "secret_mounts_md5", "runtime_token", "output_storage_classes", "output_glob"] end def set_priority_zero @@ -326,6 +328,7 @@ class ContainerRequest < ArvadosModel self.container_count_max ||= Rails.configuration.Containers.MaxRetryAttempts self.scheduling_parameters ||= {} self.output_ttl ||= 0 + self.output_glob ||= [] self.priority ||= 0 end @@ -442,6 +445,11 @@ class ContainerRequest < ArvadosModel errors.add(:environment, "must be an map of String to String but has entry #{k.class} to #{v.class}") end end + output_glob.each do |g| + if !g.is_a? String + errors.add(:output_glob, "must be an array of strings but has entry #{g.class}") + end + end [:mounts, :secret_mounts].each do |m| self[m].each do |k, v| if !k.is_a?(String) || !v.is_a?(Hash) diff --git a/services/api/db/migrate/20240329173437_add_output_glob_to_containers.rb b/services/api/db/migrate/20240329173437_add_output_glob_to_containers.rb new file mode 100644 index 0000000000..481cad123f --- /dev/null +++ b/services/api/db/migrate/20240329173437_add_output_glob_to_containers.rb @@ -0,0 +1,10 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + +class AddOutputGlobToContainers < ActiveRecord::Migration[7.0] + def change + add_column :containers, :output_glob, :text, default: '[]' + add_column :container_requests, :output_glob, :text, default: '[]' + end +end diff --git a/services/api/db/migrate/20240402162733_add_output_glob_index_to_containers.rb b/services/api/db/migrate/20240402162733_add_output_glob_index_to_containers.rb new file mode 100644 index 0000000000..00050f8a78 --- /dev/null +++ b/services/api/db/migrate/20240402162733_add_output_glob_index_to_containers.rb @@ -0,0 +1,17 @@ +# Copyright (C) The Arvados Authors. All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0 + +require './db/migrate/20161213172944_full_text_search_indexes' + +class AddOutputGlobIndexToContainers < ActiveRecord::Migration[4.2] + def up + ActiveRecord::Base.connection.execute 'DROP INDEX index_containers_on_reuse_columns' + ActiveRecord::Base.connection.execute 'CREATE INDEX index_containers_on_reuse_columns on containers (md5(command), cwd, md5(environment), output_path, md5(output_glob), container_image, md5(mounts), secret_mounts_md5, md5(runtime_constraints))' + FullTextSearchIndexes.new.replace_index('container_requests') + end + def down + ActiveRecord::Base.connection.execute 'DROP INDEX index_containers_on_reuse_columns' + ActiveRecord::Base.connection.execute 'CREATE INDEX index_containers_on_reuse_columns on containers (md5(command), cwd, md5(environment), output_path, container_image, md5(mounts), secret_mounts_md5, md5(runtime_constraints))' + end +end diff --git a/services/api/db/structure.sql b/services/api/db/structure.sql index c0d4263d97..08862c60c2 100644 --- a/services/api/db/structure.sql +++ b/services/api/db/structure.sql @@ -568,7 +568,8 @@ CREATE TABLE public.container_requests ( runtime_token text, output_storage_classes jsonb DEFAULT '["default"]'::jsonb, output_properties jsonb DEFAULT '{}'::jsonb, - cumulative_cost double precision DEFAULT 0.0 NOT NULL + cumulative_cost double precision DEFAULT 0.0 NOT NULL, + output_glob text DEFAULT '[]'::text ); @@ -634,7 +635,8 @@ CREATE TABLE public.containers ( output_storage_classes jsonb DEFAULT '["default"]'::jsonb, output_properties jsonb DEFAULT '{}'::jsonb, cost double precision DEFAULT 0.0 NOT NULL, - subrequests_cost double precision DEFAULT 0.0 NOT NULL + subrequests_cost double precision DEFAULT 0.0 NOT NULL, + output_glob text DEFAULT '[]'::text ); @@ -1855,6 +1857,13 @@ CREATE INDEX collections_search_index ON public.collections USING btree (owner_u CREATE INDEX collections_trgm_text_search_idx ON public.collections USING gin (((((((((((((((((((COALESCE(owner_uuid, ''::character varying))::text || ' '::text) || (COALESCE(modified_by_client_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(modified_by_user_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(portable_data_hash, ''::character varying))::text) || ' '::text) || (COALESCE(uuid, ''::character varying))::text) || ' '::text) || (COALESCE(name, ''::character varying))::text) || ' '::text) || (COALESCE(description, ''::character varying))::text) || ' '::text) || COALESCE((properties)::text, ''::text)) || ' '::text) || COALESCE(file_names, ''::text))) public.gin_trgm_ops); +-- +-- Name: container_requests_full_text_search_idx; Type: INDEX; Schema: public; Owner: - +-- + +CREATE INDEX container_requests_full_text_search_idx ON public.container_requests USING gin (to_tsvector('english'::regconfig, substr((((((((((((((((((((((((((((((((((((((((((((((COALESCE(uuid, ''::character varying))::text || ' '::text) || (COALESCE(owner_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(modified_by_client_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(modified_by_user_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(name, ''::character varying))::text) || ' '::text) || COALESCE(description, ''::text)) || ' '::text) || COALESCE((properties)::text, ''::text)) || ' '::text) || (COALESCE(state, ''::character varying))::text) || ' '::text) || (COALESCE(requesting_container_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(container_uuid, ''::character varying))::text) || ' '::text) || COALESCE(runtime_constraints, ''::text)) || ' '::text) || (COALESCE(container_image, ''::character varying))::text) || ' '::text) || COALESCE(environment, ''::text)) || ' '::text) || (COALESCE(cwd, ''::character varying))::text) || ' '::text) || COALESCE(command, ''::text)) || ' '::text) || (COALESCE(output_path, ''::character varying))::text) || ' '::text) || COALESCE(filters, ''::text)) || ' '::text) || COALESCE(scheduling_parameters, ''::text)) || ' '::text) || (COALESCE(output_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(log_uuid, ''::character varying))::text) || ' '::text) || (COALESCE(output_name, ''::character varying))::text) || ' '::text) || COALESCE((output_properties)::text, ''::text)) || ' '::text) || COALESCE(output_glob, ''::text)), 0, 8000))); + + -- -- Name: container_requests_index_on_properties; Type: INDEX; Schema: public; Owner: - -- @@ -2167,7 +2176,7 @@ CREATE INDEX index_containers_on_queued_state ON public.containers USING btree ( -- Name: index_containers_on_reuse_columns; Type: INDEX; Schema: public; Owner: - -- -CREATE INDEX index_containers_on_reuse_columns ON public.containers USING btree (md5(command), cwd, md5(environment), output_path, container_image, md5(mounts), secret_mounts_md5, md5(runtime_constraints)); +CREATE INDEX index_containers_on_reuse_columns ON public.containers USING btree (md5(command), cwd, md5(environment), output_path, md5(output_glob), container_image, md5(mounts), secret_mounts_md5, md5(runtime_constraints)); -- @@ -3316,4 +3325,6 @@ INSERT INTO "schema_migrations" (version) VALUES ('20230815160000'), ('20230821000000'), ('20230922000000'), -('20231013000000'); +('20231013000000'), +('20240329173437'), +('20240402162733'); diff --git a/services/api/test/unit/container_request_test.rb b/services/api/test/unit/container_request_test.rb index d25c08a579..fa7910d597 100644 --- a/services/api/test/unit/container_request_test.rb +++ b/services/api/test/unit/container_request_test.rb @@ -112,11 +112,15 @@ class ContainerRequestTest < ActiveSupport::TestCase {"mounts" => {"FOO" => {}}}, {"mounts" => {"FOO" => {"kind" => "tmp", "capacity" => 42.222}}}, {"command" => ["echo", 55]}, - {"environment" => {"FOO" => 55}} + {"environment" => {"FOO" => 55}}, + {"output_glob" => [false]}, + {"output_glob" => [["bad"]]}, + {"output_glob" => "bad"}, + {"output_glob" => ["nope", -1]}, ].each do |value| test "Create with invalid #{value}" do set_user_from_auth :active - assert_raises(ActiveRecord::RecordInvalid) do + assert_raises(ActiveRecord::RecordInvalid, Serializer::TypeMismatch) do cr = create_minimal_req!({state: "Committed", priority: 1}.merge(value)) cr.save! @@ -127,7 +131,7 @@ class ContainerRequestTest < ActiveSupport::TestCase set_user_from_auth :active cr = create_minimal_req!(state: "Uncommitted", priority: 1) cr.save! - assert_raises(ActiveRecord::RecordInvalid) do + assert_raises(ActiveRecord::RecordInvalid, Serializer::TypeMismatch) do cr = ContainerRequest.find_by_uuid cr.uuid cr.update!({state: "Committed", priority: 1}.merge(value)) diff --git a/services/api/test/unit/container_test.rb b/services/api/test/unit/container_test.rb index 09b885b391..60dcb48f18 100644 --- a/services/api/test/unit/container_test.rb +++ b/services/api/test/unit/container_test.rb @@ -22,6 +22,7 @@ class ContainerTest < ActiveSupport::TestCase cwd: "test", command: ["echo", "hello"], output_path: "test", + output_glob: [], runtime_constraints: { "API" => false, "keep_cache_disk" => 0, @@ -48,6 +49,7 @@ class ContainerTest < ActiveSupport::TestCase environment: {}, mounts: {}, output_path: "test", + output_glob: [], runtime_auth_scopes: ["all"], runtime_constraints: { "API" => false, -- 2.30.2