16573: add a deduplication-report command to arvados-client
[arvados.git] / lib / deduplicationreport / report.go
diff --git a/lib/deduplicationreport/report.go b/lib/deduplicationreport/report.go
new file mode 100644 (file)
index 0000000..b7699fc
--- /dev/null
@@ -0,0 +1,217 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+       "flag"
+       "fmt"
+       "io"
+       "strings"
+
+       "git.arvados.org/arvados.git/lib/config"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/arvadosclient"
+       "git.arvados.org/arvados.git/sdk/go/manifest"
+
+       "github.com/dustin/go-humanize"
+       "github.com/sirupsen/logrus"
+)
+
+func deDuplicate(inputs []string) (trimmed []string) {
+       seen := make(map[string]bool)
+       for _, uuid := range inputs {
+               if _, ok := seen[uuid]; !ok {
+                       seen[uuid] = true
+                       trimmed = append(trimmed, uuid)
+               }
+       }
+       return
+}
+
+func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       flags.Usage = func() {
+               fmt.Fprintf(flags.Output(), `
+Usage:
+  %s [options ...] <collection-uuid> <collection-uuid> ...
+
+  %s [options ...] <collection-pdh>,<collection_uuid> \
+     <collection-pdh>,<collection_uuid> ...
+
+  This program analyzes the overlap in blocks used by 2 or more collections. It
+  prints a deduplication report that shows the nominal space used by the list
+  of collection, as well as the actual size and the amount of space that is
+  saved by Keep's deduplication.
+
+  The list of collections may be provided in two ways. A list of collection
+  uuids is sufficient. Alternatively, the PDH for each collection may also be
+  provided. This is will greatly speed up operation when the list contains
+  multiple collections with the same PDH.
+
+  Exit status will be zero if there were no errors generating the report.
+
+Example:
+
+  Use the 'arv' and 'jq' commands to get the list of the 100
+  largest collections and generate the deduplication report:
+
+  arv collection list --order 'file_size_total desc' | \
+    jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
+    tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \
+    xargs %s
+
+Options:
+`, prog, prog, prog)
+               flags.PrintDefaults()
+       }
+       loader.SetupFlags(flags)
+       loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
+       err := flags.Parse(args)
+       if err == flag.ErrHelp {
+               return 0, inputs
+       } else if err != nil {
+               return 2, inputs
+       }
+
+       inputs = flags.Args()
+
+       inputs = deDuplicate(inputs)
+
+       if len(inputs) < 2 {
+               logger.Error("Error: at least 2 different collections UUIDs required")
+               flags.Usage()
+               return 2, inputs
+       }
+
+       lvl, err := logrus.ParseLevel(*loglevel)
+       if err != nil {
+               return 2, inputs
+       }
+       logger.SetLevel(lvl)
+       return
+}
+
+func blockList(collection arvados.Collection) (blocks map[string]int) {
+       blocks = make(map[string]int)
+       m := manifest.Manifest{Text: collection.ManifestText}
+       blockChannel := m.BlockIterWithDuplicates()
+       for b := range blockChannel {
+               blocks[b.Digest.String()] = b.Size
+       }
+       return
+}
+
+func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
+
+       var inputs []string
+       exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
+       if exitcode != 0 {
+               return
+       }
+
+       // Arvados Client setup
+       arv, err := arvadosclient.MakeArvadosClient()
+       if err != nil {
+               logger.Errorf("error creating Arvados object: %s", err)
+               exitcode = 1
+               return
+       }
+
+       type Col struct {
+               FileSizeTotal int64
+               FileCount     int64
+       }
+
+       blocks := make(map[string]map[string]int)
+       pdhs := make(map[string]Col)
+       var nominalSize int64
+
+       fmt.Println()
+       for _, input := range inputs {
+               var uuid string
+               var pdh string
+               if strings.Contains(input, ",") {
+                       // The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
+                       tmp := strings.Split(input, ",")
+                       pdh = tmp[0]
+                       uuid = tmp[1]
+               } else {
+                       // The input must be a plain uuid
+                       uuid = input
+               }
+               if !strings.Contains(uuid, "-4zz18-") {
+                       logger.Error("uuid must refer to collection object")
+                       exitcode = 1
+                       return
+               }
+               if _, ok := pdhs[pdh]; ok {
+                       // We've processed a collection with this pdh already. Simply add its
+                       // size to the totals and move on to the next one.
+                       // Note that we simply trust the PDH matches the collection UUID here,
+                       // in other words, we use it over the UUID. If they don't match, the report
+                       // will be wrong.
+                       nominalSize += pdhs[pdh].FileSizeTotal
+               } else {
+                       var collection arvados.Collection
+                       err = arv.Get("collections", uuid, nil, &collection)
+                       if err != nil {
+                               logger.Errorf("Error: unable to retrieve collection: %s\n", err)
+                               exitcode = 1
+                               return
+                       }
+                       blocks[uuid] = make(map[string]int)
+                       blocks[uuid] = blockList(collection)
+                       if pdh != "" && collection.PortableDataHash != pdh {
+                               logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s\n", uuid, collection.PortableDataHash, pdh)
+                               exitcode = 1
+                               return
+                       }
+                       if pdh == "" {
+                               pdh = collection.PortableDataHash
+                       }
+
+                       col := Col{}
+                       if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
+                               nominalSize += collection.FileSizeTotal
+                               col.FileSizeTotal = collection.FileSizeTotal
+                               col.FileCount = int64(collection.FileCount)
+                       } else {
+                               // Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
+                               var collSize int64
+                               for _, size := range blocks[uuid] {
+                                       collSize += int64(size)
+                               }
+                               nominalSize += collSize
+                               col.FileSizeTotal = collSize
+                       }
+                       pdhs[pdh] = col
+               }
+
+               if pdhs[pdh].FileCount != 0 {
+                       fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
+               } else {
+                       fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
+               }
+       }
+
+       var totalSize int64
+       seen := make(map[string]bool)
+       for _, v := range blocks {
+               for pdh, size := range v {
+                       if _, ok := seen[pdh]; !ok {
+                               seen[pdh] = true
+                               totalSize += int64(size)
+                       }
+               }
+       }
+       fmt.Fprintln(stdout)
+       fmt.Fprintf(stdout, "Collections:                 %15d\n", len(inputs))
+       fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
+       fmt.Fprintf(stdout, "Actual size of stored data:  %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
+       fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))
+
+       return exitcode
+}