From ccefd86d022a00e89af37c6c7f9e71d4d68178ef Mon Sep 17 00:00:00 2001
From: Ward Vandewege
Date: Mon, 29 Jun 2020 16:53:38 -0400
Subject: [PATCH] 16573: add a deduplication-report command to arvados-client

Arvados-DCO-1.1-Signed-off-by: Ward Vandewege
---
 .licenseignore                         |   4 +-
 cmd/arvados-client/cmd.go              |   4 +-
 go.mod                                 |   1 +
 go.sum                                 |   2 +
 lib/deduplicationreport/command.go     |  41 +++++
 lib/deduplicationreport/report.go      | 235 +++++++++++++++++++++++++
 lib/deduplicationreport/report_test.go | 134 ++++++++++++++
 7 files changed, 419 insertions(+), 2 deletions(-)
 create mode 100644 lib/deduplicationreport/command.go
 create mode 100644 lib/deduplicationreport/report.go
 create mode 100644 lib/deduplicationreport/report_test.go

diff --git a/.licenseignore b/.licenseignore
index ad80dc3f4b..81f6b7181d 100644
--- a/.licenseignore
+++ b/.licenseignore
@@ -79,4 +79,6 @@ lib/dispatchcloud/test/sshkey_*
 *.asc
 sdk/java-v2/build.gradle
 sdk/java-v2/settings.gradle
-sdk/cwl/tests/wf/feddemo
\ No newline at end of file
+sdk/cwl/tests/wf/feddemo
+go.mod
+go.sum
diff --git a/cmd/arvados-client/cmd.go b/cmd/arvados-client/cmd.go
index 887bc62bb3..bcc3dda09a 100644
--- a/cmd/arvados-client/cmd.go
+++ b/cmd/arvados-client/cmd.go
@@ -9,6 +9,7 @@ import (
 
 	"git.arvados.org/arvados.git/lib/cli"
 	"git.arvados.org/arvados.git/lib/cmd"
+	"git.arvados.org/arvados.git/lib/deduplicationreport"
 	"git.arvados.org/arvados.git/lib/mount"
 )
 
@@ -52,7 +53,8 @@ var (
 		"virtual_machine":          cli.APICall,
 		"workflow":                 cli.APICall,
 
-		"mount": mount.Command,
+		"mount":                mount.Command,
+		"deduplication-report": deduplicationreport.Command,
 	})
 )
 
diff --git a/go.mod b/go.mod
index cc5457975f..1fde587e64 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,7 @@ require (
 	github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1
 	github.com/docker/go-connections v0.3.0 // indirect
 	github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d // indirect
+	github.com/dustin/go-humanize v1.0.0
 	github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
 	github.com/fsnotify/fsnotify v1.4.9
 	github.com/ghodss/yaml v1.0.0
diff --git a/go.sum b/go.sum
index 38153ce3ea..c9b7f74e3b 100644
--- a/go.sum
+++ b/go.sum
@@ -56,6 +56,8 @@ github.com/docker/go-connections v0.3.0 h1:3lOnM9cSzgGwx8VfK/NGOW5fLQ0GjIlCkaktF
 github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d h1:dVaNRYvaGV23AdNdsm+4y1mPN0tj3/1v6taqKMmM6Ko=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
 github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
diff --git a/lib/deduplicationreport/command.go b/lib/deduplicationreport/command.go
new file mode 100644
index 0000000000..1199bc0ae2
--- /dev/null
+++ b/lib/deduplicationreport/command.go
@@ -0,0 +1,41 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package deduplicationreport
+
+import (
+	"io"
+	"strings"
+
+	"git.arvados.org/arvados.git/lib/config"
+	"git.arvados.org/arvados.git/sdk/go/ctxlog"
+	"github.com/sirupsen/logrus"
+)
+
+// Command is the entry point for the "deduplication-report" subcommand of
+// arvados-client.
+var Command command
+
+type command struct{}
+
+// NoPrefixFormatter is a logrus formatter that emits only the message of each
+// entry, without timestamp, level or field decoration.
+type NoPrefixFormatter struct{}
+
+// Format returns the entry's message terminated by a newline, so that
+// consecutive log entries never run together on one line.
+func (f *NoPrefixFormatter) Format(entry *logrus.Entry) ([]byte, error) {
+	return []byte(strings.TrimSuffix(entry.Message, "\n") + "\n"), nil
+}
+
+// RunCommand implements the subcommand "deduplication-report ..."
+func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+	logger := ctxlog.New(stderr, "text", "info")
+	logger.SetFormatter(new(NoPrefixFormatter))
+
+	loader := config.NewLoader(stdin, logger)
+	loader.SkipLegacy = true
+
+	return report(prog, args, loader, logger, stdout, stderr)
+}
diff --git a/lib/deduplicationreport/report.go b/lib/deduplicationreport/report.go
new file mode 100644
index 0000000000..b7699fcb25
--- /dev/null
+++ b/lib/deduplicationreport/report.go
@@ -0,0 +1,235 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"strings"
+
+	"git.arvados.org/arvados.git/lib/config"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvadosclient"
+	"git.arvados.org/arvados.git/sdk/go/manifest"
+
+	"github.com/dustin/go-humanize"
+	"github.com/sirupsen/logrus"
+)
+
+// deDuplicate returns the distinct elements of inputs, in order of first
+// appearance.
+func deDuplicate(inputs []string) (trimmed []string) {
+	seen := make(map[string]bool)
+	for _, uuid := range inputs {
+		if !seen[uuid] {
+			seen[uuid] = true
+			trimmed = append(trimmed, uuid)
+		}
+	}
+	return
+}
+
+// parseFlags parses the command line and returns the de-duplicated list of
+// collection arguments. A non-zero exitcode means the arguments were invalid.
+// When usage was requested, exitcode is zero and inputs is empty; the caller
+// must not generate a report in that case.
+func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
+	flags := flag.NewFlagSet("", flag.ContinueOnError)
+	flags.SetOutput(stderr)
+	flags.Usage = func() {
+		fmt.Fprintf(flags.Output(), `
+Usage:
+  %s [options ...] <collection-uuid> <collection-uuid> ...
+
+  %s [options ...] <collection-pdh>,<collection-uuid> \
+     <collection-pdh>,<collection-uuid> ...
+
+  This program analyzes the overlap in blocks used by 2 or more collections. It
+  prints a deduplication report that shows the nominal space used by the list
+  of collections, as well as the actual size and the amount of space that is
+  saved by Keep's deduplication.
+
+  The list of collections may be provided in two ways. A list of collection
+  uuids is sufficient. Alternatively, the PDH for each collection may also be
+  provided. This will greatly speed up operation when the list contains
+  multiple collections with the same PDH.
+
+  Exit status will be zero if there were no errors generating the report.
+
+Example:
+
+  Use the 'arv' and 'jq' commands to get the list of the 100
+  largest collections and generate the deduplication report:
+
+  arv collection list --order 'file_size_total desc' | \
+    jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
+    tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \
+    xargs %s
+
+Options:
+`, prog, prog, prog)
+		flags.PrintDefaults()
+	}
+	loader.SetupFlags(flags)
+	loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
+	err := flags.Parse(args)
+	if err == flag.ErrHelp {
+		return 0, nil
+	} else if err != nil {
+		return 2, nil
+	}
+
+	inputs = deDuplicate(flags.Args())
+	if len(inputs) < 2 {
+		logger.Error("Error: at least 2 different collections UUIDs required")
+		flags.Usage()
+		return 2, inputs
+	}
+
+	lvl, err := logrus.ParseLevel(*loglevel)
+	if err != nil {
+		logger.Errorf("Error: invalid log level %q: %s", *loglevel, err)
+		return 2, inputs
+	}
+	logger.SetLevel(lvl)
+	return 0, inputs
+}
+
+// blockList returns the size of each distinct block referenced by the
+// collection's manifest, keyed by block digest.
+func blockList(collection arvados.Collection) (blocks map[string]int) {
+	blocks = make(map[string]int)
+	m := manifest.Manifest{Text: collection.ManifestText}
+	blockChannel := m.BlockIterWithDuplicates()
+	for b := range blockChannel {
+		blocks[b.Digest.String()] = b.Size
+	}
+	return
+}
+
+// humanBytes formats a byte count for display. Negative counts keep their
+// sign instead of wrapping around to a huge unsigned value.
+func humanBytes(n int64) string {
+	if n < 0 {
+		return "-" + humanize.IBytes(uint64(-n))
+	}
+	return humanize.IBytes(uint64(n))
+}
+
+// report generates the deduplication report for the collections named in args
+// and writes it to stdout; diagnostics go to logger. It returns the exit code
+// for the command: 0 on success, 1 if a collection could not be processed, 2
+// if the arguments were invalid.
+func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
+	var inputs []string
+	exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
+	if exitcode != 0 || len(inputs) == 0 {
+		// Invalid arguments, or usage was requested and has already been
+		// printed: there is nothing to report on.
+		return
+	}
+
+	// Arvados Client setup
+	arv, err := arvadosclient.MakeArvadosClient()
+	if err != nil {
+		logger.Errorf("error creating Arvados object: %s", err)
+		exitcode = 1
+		return
+	}
+
+	type Col struct {
+		FileSizeTotal int64
+		FileCount     int64
+	}
+
+	blocks := make(map[string]map[string]int)
+	pdhs := make(map[string]Col)
+	var nominalSize int64
+
+	for _, input := range inputs {
+		var uuid string
+		var pdh string
+		if strings.Contains(input, ",") {
+			// The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
+			tmp := strings.Split(input, ",")
+			pdh = tmp[0]
+			uuid = tmp[1]
+		} else {
+			// The input must be a plain uuid
+			uuid = input
+		}
+		if !strings.Contains(uuid, "-4zz18-") {
+			logger.Error("uuid must refer to collection object")
+			exitcode = 1
+			return
+		}
+		if _, ok := pdhs[pdh]; ok {
+			// We've processed a collection with this pdh already. Simply add its
+			// size to the totals and move on to the next one.
+			// Note that we simply trust the PDH matches the collection UUID here,
+			// in other words, we use it over the UUID. If they don't match, the report
+			// will be wrong.
+			nominalSize += pdhs[pdh].FileSizeTotal
+		} else {
+			var collection arvados.Collection
+			err = arv.Get("collections", uuid, nil, &collection)
+			if err != nil {
+				logger.Errorf("Error: unable to retrieve collection: %s\n", err)
+				exitcode = 1
+				return
+			}
+			blocks[uuid] = blockList(collection)
+			if pdh != "" && collection.PortableDataHash != pdh {
+				logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s\n", uuid, collection.PortableDataHash, pdh)
+				exitcode = 1
+				return
+			}
+			if pdh == "" {
+				pdh = collection.PortableDataHash
+			}
+
+			col := Col{}
+			if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
+				nominalSize += collection.FileSizeTotal
+				col.FileSizeTotal = collection.FileSizeTotal
+				col.FileCount = int64(collection.FileCount)
+			} else {
+				// Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
+				var collSize int64
+				for _, size := range blocks[uuid] {
+					collSize += int64(size)
+				}
+				nominalSize += collSize
+				col.FileSizeTotal = collSize
+			}
+			pdhs[pdh] = col
+		}
+
+		if pdhs[pdh].FileCount != 0 {
+			fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
+		} else {
+			fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
+		}
+	}
+
+	var totalSize int64
+	seen := make(map[string]bool)
+	for _, v := range blocks {
+		for pdh, size := range v {
+			if _, ok := seen[pdh]; !ok {
+				seen[pdh] = true
+				totalSize += int64(size)
+			}
+		}
+	}
+	fmt.Fprintln(stdout)
+	fmt.Fprintf(stdout, "Collections:                 %15d\n", len(inputs))
+	fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
+	fmt.Fprintf(stdout, "Actual size of stored data:  %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
+	fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanBytes(nominalSize-totalSize))
+
+	return exitcode
+}
diff --git a/lib/deduplicationreport/report_test.go b/lib/deduplicationreport/report_test.go
new file mode 100644
index 0000000000..6e7cd3af52
--- /dev/null
+++ b/lib/deduplicationreport/report_test.go
@@ -0,0 +1,134 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+	"bytes"
+	"testing"
+
+	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/arvadostest"
+	"gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) {
+	check.TestingT(t)
+}
+
+var _ = check.Suite(&Suite{})
+
+type Suite struct{}
+
+func (s *Suite) TearDownSuite(c *check.C) {
+	// Undo any changes/additions to the database so they don't affect subsequent tests.
+	arvadostest.ResetEnv()
+}
+
+func (*Suite) TestUsage(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{"-log-level=debug"}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 2)
+	c.Check(stdout.String(), check.Equals, "")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Matches, `(?ms).*Usage:.*`)
+}
+
+func (*Suite) TestHelp(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Asking for help must print usage and exit 0 without generating a report
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{"-help"}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 0)
+	c.Check(stdout.String(), check.Equals, "")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Matches, `(?ms).*Usage:.*`)
+}
+
+func (*Suite) TestTwoIdenticalUUIDs(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Run dedupreport with 2 identical uuids
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 2)
+	c.Check(stdout.String(), check.Equals, "")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Matches, `(?ms).*Error: at least 2 different collections UUIDs required.*`)
+}
+
+func (*Suite) TestTwoUUIDsInvalidPDH(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Run dedupreport with pdh,uuid where pdh does not match
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooAndBarFilesInDirPDH + "," + arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 1)
+	c.Check(stdout.String(), check.Equals, "")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Matches, `(?ms).*Error: the collection with UUID zzzzz-4zz18-fy296fx3hot09f7 has PDH 1f4b0bc7583c2a7f9102c395f4ffc5e3\+45, but a different PDH was provided in the arguments: 870369fc72738603c2fad16664e50e2d\+58.*`)
+}
+
+func (*Suite) TestNonExistentCollection(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Run dedupreport with many UUIDs
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.NonexistentCollection}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 1)
+	c.Check(stdout.String(), check.Equals, "Collection zzzzz-4zz18-fy296fx3hot09f7: pdh 1f4b0bc7583c2a7f9102c395f4ffc5e3+45; nominal size 3 (3 B)\n")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Matches, `(?ms).*Error: unable to retrieve collection:.*404 Not Found.*`)
+}
+
+func (*Suite) TestManyUUIDsNoOverlap(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Run dedupreport with 5 UUIDs
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.HelloWorldCollection, arvadostest.FooBarDirCollection, arvadostest.WazVersion1Collection, arvadostest.UserAgreementCollection}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 0)
+	c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Equals, "")
+}
+
+func (*Suite) TestTwoOverlappingCollections(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Create two collections
+	arv := arvados.NewClientFromEnv()
+
+	var c1 arvados.Collection
+	err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
+	c.Assert(err, check.Equals, nil)
+
+	var c2 arvados.Collection
+	err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
+	c.Assert(err, check.Equals, nil)
+
+	// Run dedupreport with 2 arguments: uuid uuid
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 0)
+	c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Equals, "")
+}
+
+func (*Suite) TestTwoOverlappingCollectionsWithPDH(c *check.C) {
+	var stdout, stderr bytes.Buffer
+	// Create two collections
+	arv := arvados.NewClientFromEnv()
+
+	var c1 arvados.Collection
+	err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
+	c.Assert(err, check.Equals, nil)
+
+	var c2 arvados.Collection
+	err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
+	c.Assert(err, check.Equals, nil)
+
+	// Run dedupreport with 2 arguments: pdh,uuid uuid
+	exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.PortableDataHash + "," + c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
+	c.Check(exitcode, check.Equals, 0)
+	c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+	c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+	c.Log(stderr.String())
+	c.Check(stderr.String(), check.Equals, "")
+}
-- 
2.30.2