16573: address review comments.
authorWard Vandewege <ward@curii.com>
Mon, 13 Jul 2020 01:34:02 +0000 (21:34 -0400)
committerWard Vandewege <ward@curii.com>
Mon, 13 Jul 2020 01:34:02 +0000 (21:34 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

lib/deduplicationreport/report.go
lib/deduplicationreport/report_test.go

index b7699fcb2554ffa8955dc0fb0b4628088dc74eb0..663e734274b3cbafe1ebe55be9706983270e5b67 100644 (file)
@@ -22,7 +22,7 @@ import (
 func deDuplicate(inputs []string) (trimmed []string) {
        seen := make(map[string]bool)
        for _, uuid := range inputs {
-               if _, ok := seen[uuid]; !ok {
+               if !seen[uuid] {
                        seen[uuid] = true
                        trimmed = append(trimmed, uuid)
                }
@@ -42,9 +42,9 @@ Usage:
      <collection-pdh>,<collection_uuid> ...
 
   This program analyzes the overlap in blocks used by 2 or more collections. It
-  prints a deduplication report that shows the nominal space used by the list
-  of collection, as well as the actual size and the amount of space that is
-  saved by Keep's deduplication.
+  prints a deduplication report that shows the nominal space used by the
+  collections, as well as the actual size and the amount of space that is saved
+  by Keep's deduplication.
 
   The list of collections may be provided in two ways. A list of collection
   uuids is sufficient. Alternatively, the PDH for each collection may also be
@@ -58,9 +58,9 @@ Example:
   Use the 'arv' and 'jq' commands to get the list of the 100
   largest collections and generate the deduplication report:
 
-  arv collection list --order 'file_size_total desc' | \
+  arv collection list --order 'file_size_total desc' --limit 100 | \
     jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
-    tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \
+    tail -n+2 |sed -e 's/"//g'|tr '\n' ' ' | \
     xargs %s
 
 Options:
@@ -80,8 +80,8 @@ Options:
 
        inputs = deDuplicate(inputs)
 
-       if len(inputs) < 2 {
-               logger.Error("Error: at least 2 different collections UUIDs required")
+       if len(inputs) < 1 {
+               logger.Errorf("Error: no collections provided\n")
                flags.Usage()
                return 2, inputs
        }
@@ -115,7 +115,7 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo
        // Arvados Client setup
        arv, err := arvadosclient.MakeArvadosClient()
        if err != nil {
-               logger.Errorf("error creating Arvados object: %s", err)
+               logger.Errorf("Error creating Arvados object: %s\n", err)
                exitcode = 1
                return
        }
@@ -129,7 +129,6 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo
        pdhs := make(map[string]Col)
        var nominalSize int64
 
-       fmt.Println()
        for _, input := range inputs {
                var uuid string
                var pdh string
@@ -143,7 +142,7 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo
                        uuid = input
                }
                if !strings.Contains(uuid, "-4zz18-") {
-                       logger.Error("uuid must refer to collection object")
+                       logger.Errorf("Error: uuid must refer to collection object\n")
                        exitcode = 1
                        return
                }
@@ -201,7 +200,7 @@ func report(prog string, args []string, loader *config.Loader, logger *logrus.Lo
        seen := make(map[string]bool)
        for _, v := range blocks {
                for pdh, size := range v {
-                       if _, ok := seen[pdh]; !ok {
+                       if !seen[pdh] {
                                seen[pdh] = true
                                totalSize += int64(size)
                        }
index 6e7cd3af52f57d183e94bfe7f5b41d8e29032569..dc760f78ad77cd1b4856c6c01ccb1bbd0ae7b188 100644 (file)
@@ -39,10 +39,13 @@ func (*Suite) TestTwoIdenticalUUIDs(c *check.C) {
        var stdout, stderr bytes.Buffer
        // Run dedupreport with 2 identical uuids
        exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.FooCollection}, &bytes.Buffer{}, &stdout, &stderr)
-       c.Check(exitcode, check.Equals, 2)
-       c.Check(stdout.String(), check.Equals, "")
+       c.Check(exitcode, check.Equals, 0)
+       //c.Check(stdout.String(), check.Equals, "")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Collections:[[:space:]]+1.*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+3 bytes \\(3 B\\).*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+3 bytes \\(3 B\\).*")
+       c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*")
        c.Log(stderr.String())
-       c.Check(stderr.String(), check.Matches, `(?ms).*Error: at least 2 different collections UUIDs required.*`)
 }
 
 func (*Suite) TestTwoUUIDsInvalidPDH(c *check.C) {
@@ -70,6 +73,7 @@ func (*Suite) TestManyUUIDsNoOverlap(c *check.C) {
        // Run dedupreport with 5 UUIDs
        exitcode := Command.RunCommand("deduplicationreport.test", []string{arvadostest.FooCollection, arvadostest.HelloWorldCollection, arvadostest.FooBarDirCollection, arvadostest.WazVersion1Collection, arvadostest.UserAgreementCollection}, &bytes.Buffer{}, &stdout, &stderr)
        c.Check(exitcode, check.Equals, 0)
+       c.Check(stdout.String(), check.Matches, "(?ms).*Collections:[[:space:]]+5.*")
        c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
        c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*")
        c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*")
@@ -83,42 +87,34 @@ func (*Suite) TestTwoOverlappingCollections(c *check.C) {
        arv := arvados.NewClientFromEnv()
 
        var c1 arvados.Collection
-       err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
-       c.Assert(err, check.Equals, nil)
-
-       var c2 arvados.Collection
-       err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
-       c.Assert(err, check.Equals, nil)
-
-       // Run dedupreport with 2 arguments: uuid uuid
-       exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
-       c.Check(exitcode, check.Equals, 0)
-       c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
-       c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
-       c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
-       c.Log(stderr.String())
-       c.Check(stderr.String(), check.Equals, "")
-}
-
-func (*Suite) TestTwoOverlappingCollectionsWithPDH(c *check.C) {
-       var stdout, stderr bytes.Buffer
-       // Create two collections
-       arv := arvados.NewClientFromEnv()
-
-       var c1 arvados.Collection
-       err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4+A2705511e0c47c92cc73e9ddc95b9822ef774c406@5f0de808 0:4:foo\n"}})
+       err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4 0:4:foo\n"}})
        c.Assert(err, check.Equals, nil)
 
        var c2 arvados.Collection
-       err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4+A1544eb0cee937934dc565d2b11836c804384c139@5f0e0bf9 d3b07384d113edec49eaa6238ad5ff00+4+A60746cad7ecc16fe26a0c17c55af90db675369c2@5f0e0bf9 0:4:bar 4:4:foo\n"}})
+       err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4 d3b07384d113edec49eaa6238ad5ff00+4 0:4:bar 4:4:foo\n"}})
        c.Assert(err, check.Equals, nil)
 
-       // Run dedupreport with 2 arguments: pdh,uuid uuid
-       exitcode := Command.RunCommand("deduplicationreport.test", []string{c1.PortableDataHash + "," + c1.UUID, c2.UUID}, &bytes.Buffer{}, &stdout, &stderr)
-       c.Check(exitcode, check.Equals, 0)
-       c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
-       c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
-       c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
-       c.Log(stderr.String())
-       c.Check(stderr.String(), check.Equals, "")
+       for _, trial := range []struct {
+               field1 string
+               field2 string
+       }{
+               {
+                       // Run dedupreport with 2 arguments: uuid uuid
+                       field1: c1.UUID,
+                       field2: c2.UUID,
+               },
+               {
+                       // Run dedupreport with 2 arguments: pdh,uuid uuid
+                       field1: c1.PortableDataHash + "," + c1.UUID,
+                       field2: c2.UUID,
+               },
+       } {
+               exitcode := Command.RunCommand("deduplicationreport.test", []string{trial.field1, trial.field2}, &bytes.Buffer{}, &stdout, &stderr)
+               c.Check(exitcode, check.Equals, 0)
+               c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+               c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+               c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+               c.Log(stderr.String())
+               c.Check(stderr.String(), check.Equals, "")
+       }
 }