From 544d7aae0d58a25e8c761c638167c3564de06af5 Mon Sep 17 00:00:00 2001 From: Ward Vandewege Date: Sun, 12 Jul 2020 21:34:02 -0400 Subject: [PATCH] 16573: address review comments. Arvados-DCO-1.1-Signed-off-by: Ward Vandewege --- lib/deduplicationreport/report.go | 23 +++++---- lib/deduplicationreport/report_test.go | 68 ++++++++++++-------------- 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/lib/deduplicationreport/report.go b/lib/deduplicationreport/report.go index b7699fcb25..663e734274 100644 --- a/lib/deduplicationreport/report.go +++ b/lib/deduplicationreport/report.go @@ -22,7 +22,7 @@ import ( func deDuplicate(inputs []string) (trimmed []string) { seen := make(map[string]bool) for _, uuid := range inputs { - if _, ok := seen[uuid]; !ok { + if !seen[uuid] { seen[uuid] = true trimmed = append(trimmed, uuid) } @@ -42,9 +42,9 @@ Usage: , ... This program analyzes the overlap in blocks used by 2 or more collections. It - prints a deduplication report that shows the nominal space used by the list - of collection, as well as the actual size and the amount of space that is - saved by Keep's deduplication. + prints a deduplication report that shows the nominal space used by the + collections, as well as the actual size and the amount of space that is saved + by Keep's deduplication. The list of collections may be provided in two ways. A list of collection uuids is sufficient. Alternatively, the PDH for each collection may also be @@ -58,9 +58,9 @@ Example: Use the 'arv' and 'jq' commands to get the list of the 100 largest collections and generate the deduplication report: - arv collection list --order 'file_size_total desc' | \ + arv collection list --order 'file_size_total desc' --limit 100 | \ jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \ - tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \ + tail -n+2 |sed -e 's/"//g'|tr '\n' ' ' | \ xargs %s Options: @@ -80,8 +80,8 @@ Options: inputs = deDuplicate(inputs) - if len(inputs) < 2 { - logger.Error("Error: at least 2 different collections UUIDs required") + if len(inputs) < 1 { + logger.Errorf("Error: no collections provided\n") flags.Usage() return 2, inputs } @@ -115,7 +115,7 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo // Arvados Client setup arv, err := arvadosclient.MakeArvadosClient() if err != nil { - logger.Errorf("error creating Arvados object: %s", err) + logger.Errorf("Error creating Arvados object: %s\n", err) exitcode = 1 return } @@ -129,7 +129,6 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo pdhs := make(map[string]Col) var nominalSize int64 - fmt.Println() for _, input := range inputs { var uuid string var pdh string @@ -143,7 +142,7 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo uuid = input } if !strings.Contains(uuid, "-4zz18-") { - logger.Error("uuid must refer to collection object") + logger.Errorf("Error: uuid must refer to collection object\n") exitcode = 1 return } @@ -201,7 +200,7 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo seen := make(map[string]bool) for _, v := range blocks { for pdh, size := range v { - if _, ok := seen[pdh]; !ok { + if !seen[pdh] { seen[pdh] = true totalSize += int64(size) } diff --git a/lib/deduplicationreport/report_test.go b/lib/deduplicationreport/report_test.go index 6e7cd3af52..dc760f78ad 100644 --- a/lib/deduplicationreport/report_test.go +++ b/lib/deduplicationreport/report_test.go @@ -39,10 +39,13 @@ func (*Suite) TestTwoIdenticalUUIDs(c *check.C) { var stdout, stderr bytes.Buffer // Run dedupreport with 2 identical uuids exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr) - c.Check(exitcode, check.Equals, 2) - c.Check(stdout.String(), check.Equals, "") + c.Check(exitcode, check.Equals, 0) + //c.Check(stdout.String(), check.Equals, "") + c.Check(stdout.String(), check.Matches, "(?ms).*Collections:[[:space:]]+1.*") + c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+3 bytes \\(3 B\\).*") + c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+3 bytes \\(3 B\\).*") + c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*") c.Log(stderr.String()) - c.Check(stderr.String(), check.Matches, `(?ms).*Error: at least 2 different collections UUIDs required.*`) } func (*Suite) TestTwoUUIDsInvalidPDH(c *check.C) { @@ -70,6 +73,7 @@ func (*Suite) TestManyUUIDsNoOverlap(c *check.C) { // Run dedupreport with 5 UUIDs exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.HelloWorldCollection, arvadostest.FooBarDirCollection, arvadostest.WazVersion1Collection, arvadostest.UserAgreementCollection}, &bytes.Buffer{}, &stdout, &stderr) c.Check(exitcode, check.Equals, 0) + c.Check(stdout.String(), check.Matches, "(?ms).*Collections:[[:space:]]+5.*") c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*") c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*") c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*") @@ -83,42 +87,34 @@ func (*Suite) TestTwoOverlappingCollections(c *check.C) { arv := arvados.NewClientFromEnv() var c1 arvados.Collection - err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}}) - c.Assert(err, check.Equals, nil) - - var c2 arvados.Collection - err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}}) - c.Assert(err, check.Equals, nil) - - // Run dedupreport with 2 arguments: uuid uuid - exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr) - c.Check(exitcode, check.Equals, 0) - c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*") - c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*") - c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*") - c.Log(stderr.String()) - c.Check(stderr.String(), check.Equals, "") -} - -func (*Suite) TestTwoOverlappingCollectionsWithPDH(c *check.C) { - var stdout, stderr bytes.Buffer - // Create two collections - arv := arvados.NewClientFromEnv() - - var c1 arvados.Collection - err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}}) + err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4 0:4:foo\n"}}) c.Assert(err, check.Equals, nil) var c2 arvados.Collection - err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}}) + err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4 d3b07384d113edec49eaa6238ad5ff00+4 0:4:bar 4:4:foo\n"}}) c.Assert(err, check.Equals, nil) - // Run dedupreport with 2 arguments: pdh,uuid uuid - exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.PortableDataHash + "," + c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr) - c.Check(exitcode, check.Equals, 0) - c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*") - c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*") - c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*") - c.Log(stderr.String()) - c.Check(stderr.String(), check.Equals, "") + for _, trial := range []struct { + field1 string + field2 string + }{ + { + // Run dedupreport with 2 arguments: uuid uuid + field1: c1.UUID, + field2: c2.UUID, + }, + { + // Run dedupreport with 2 arguments: pdh,uuid uuid + field1: c1.PortableDataHash + "," + c1.UUID, + field2: c2.UUID, + }, + } { + exitcode := Command.RunCommand("deduplicationreport.test", []string{trial.field1, trial.field2}, &bytes.Buffer{}, &stdout, &stderr) + c.Check(exitcode, check.Equals, 0) + c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*") + c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*") + c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*") + c.Log(stderr.String()) + c.Check(stderr.String(), check.Equals, "") + } } -- 2.30.2