1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
5 package deduplicationreport
13 "git.arvados.org/arvados.git/sdk/go/arvados"
14 "git.arvados.org/arvados.git/sdk/go/arvadosclient"
15 "git.arvados.org/arvados.git/sdk/go/manifest"
17 "github.com/dustin/go-humanize"
18 "github.com/sirupsen/logrus"
// deDuplicate returns the entries of inputs with repeated values removed.
//
// The first occurrence of each distinct string is kept and the relative
// order of the surviving entries is the same as in inputs. Comparison is an
// exact string match, so two arguments that differ in any way are treated as
// distinct. A nil or empty inputs slice yields a nil result.
func deDuplicate(inputs []string) (trimmed []string) {
	// seen records every value already copied into trimmed.
	seen := make(map[string]struct{}, len(inputs))
	for _, uuid := range inputs {
		if _, dup := seen[uuid]; dup {
			// Already emitted once; drop the repeat.
			continue
		}
		seen[uuid] = struct{}{}
		trimmed = append(trimmed, uuid)
	}
	return trimmed
}
// parseFlags parses the command-line arguments in args for the
// deduplication report.
//
// Visible behaviour: a FlagSet is created with flag.ContinueOnError and its
// output directed to stderr; a custom Usage function prints the help text
// below (a format string with %s placeholders) to flags.Output(); a string
// flag "log-level" with default "info" is registered; args are parsed, and
// flag.ErrHelp is distinguished from any other parse error; the remaining
// positional arguments are passed through deDuplicate; an error with the text
// "Error: no collections provided" is constructed; and the log-level value is
// handed to logrus.ParseLevel.
//
// NOTE(review): the return statements, the condition guarding the "no
// collections" error, and the use made of the parsed level are on lines that
// are not visible in this excerpt. Presumably the ([]string, error) result is
// the de-duplicated argument list plus any error; confirm against the full
// file.
// NOTE(review): the usage text reads "This is will greatly speed up", which
// looks like a typo for "This will". It is a runtime string, so it is left
// untouched here.
32 func parseFlags(prog string, args []string, logger *logrus.Logger, stderr io.Writer) ([]string, error) {
// An empty FlagSet name is used; ContinueOnError means Parse reports errors
// to the caller (see the err checks further down).
33 flags := flag.NewFlagSet("", flag.ContinueOnError)
// Usage and error output of the FlagSet go to the caller-supplied writer.
34 flags.SetOutput(stderr)
35 flags.Usage = func() {
36 fmt.Fprintf(flags.Output(), `
38 %s [options ...] <collection-uuid> <collection-uuid> ...
40 %s [options ...] <collection-pdh>,<collection_uuid> \
41 <collection-pdh>,<collection_uuid> ...
43 This program analyzes the overlap in blocks used by 2 or more collections. It
44 prints a deduplication report that shows the nominal space used by the
45 collections, as well as the actual size and the amount of space that is saved
46 by Keep's deduplication.
48 The list of collections may be provided in two ways. A list of collection
49 uuids is sufficient. Alternatively, the PDH for each collection may also be
50 provided. This is will greatly speed up operation when the list contains
51 multiple collections with the same PDH.
53 Exit status will be zero if there were no errors generating the report.
57 Use the 'arv' and 'jq' commands to get the list of the 100
58 largest collections and generate the deduplication report:
60 arv collection list --order 'file_size_total desc' --limit 100 | \
61 jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
62 sed -e 's/"//g'|tr '\n' ' ' | \
69 loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
70 err := flags.Parse(args)
71 if err == flag.ErrHelp {
73 } else if err != nil {
77 inputs := flags.Args()
79 inputs = deDuplicate(inputs)
82 err = fmt.Errorf("Error: no collections provided")
86 lvl, err := logrus.ParseLevel(*loglevel)
// blockList builds a map from block digest (in its String() form) to block
// size for the blocks referenced by collection.ManifestText.
//
// The manifest text is wrapped in a manifest.Manifest and iterated with
// BlockIterWithDuplicates. Because the result is keyed by digest, a digest
// that the iterator yields more than once occupies a single map entry, and
// the size stored is the one from the last occurrence.
//
// NOTE(review): the closing lines of this function (including the return of
// the named result blocks) are not visible in this excerpt.
94 func blockList(collection arvados.Collection) (blocks map[string]int) {
95 blocks = make(map[string]int)
// Only the ManifestText field of the collection is consulted here.
96 m := manifest.Manifest{Text: collection.ManifestText}
// Presumably a channel of block locators, going by the variable name; verify
// against the manifest package.
97 blockChannel := m.BlockIterWithDuplicates()
98 for b := range blockChannel {
// Repeated digests overwrite the same key, so the map holds each block once.
99 blocks[b.Digest.String()] = b.Size
// report generates the deduplication report for the collections named in
// args and writes it to stdout.
//
// Visible flow:
//   - args are parsed with parseFlags; flag.ErrHelp is handled separately
//     from other errors, which are logged through logger.
//   - an Arvados client is created with arvadosclient.MakeArvadosClient.
//   - each input is either "pdh,uuid" or a plain uuid; the uuid must contain
//     "-4zz18-", otherwise an error is logged.
//   - if the pdh has already been recorded in pdhs, its stored FileSizeTotal
//     is added to nominalSize; otherwise the collection is fetched with
//     arv.Get, its blocks are recorded via blockList, and its size is taken
//     from the collection record or, failing that, summed from its blocks.
//   - one line per collection is printed, followed by a four-line summary
//     (collection count, nominal size, actual size, bytes saved).
//
// NOTE(review): the exit-code return statements, the declarations of inputs,
// err, uuid, pdh, col, collSize and totalSize, the Col type, and several
// guard conditions are on lines not visible in this excerpt; the values
// returned in exitcode cannot be stated from here.
104 func report(prog string, args []string, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
108 inputs, err = parseFlags(prog, args, logger, stderr)
109 if err == flag.ErrHelp {
111 } else if err != nil {
112 logger.Error(err.Error())
116 // Arvados Client setup
117 arv, err := arvadosclient.MakeArvadosClient()
119 logger.Errorf("Error creating Arvados object: %s", err)
// blocks: collection uuid -> (block digest -> block size), filled from blockList.
129 blocks := make(map[string]map[string]int)
// pdhs: portable data hash -> cached size/count (FileSizeTotal, FileCount) for that content.
130 pdhs := make(map[string]Col)
// nominalSize accumulates the per-collection sizes, counting every input.
131 var nominalSize int64
133 for _, input := range inputs {
136 if strings.Contains(input, ",") {
137 // The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
138 tmp := strings.Split(input, ",")
142 // The input must be a plain uuid
// Substring check on the uuid; inputs without "-4zz18-" are reported as not
// being collection objects.
145 if !strings.Contains(uuid, "-4zz18-") {
146 logger.Errorf("Error: uuid must refer to collection object")
150 if _, ok := pdhs[pdh]; ok {
151 // We've processed a collection with this pdh already. Simply add its
152 // size to the totals and move on to the next one.
153 // Note that we simply trust the PDH matches the collection UUID here,
154 // in other words, we use it over the UUID. If they don't match, the report
156 nominalSize += pdhs[pdh].FileSizeTotal
158 var collection arvados.Collection
159 err = arv.Get("collections", uuid, nil, &collection)
161 logger.Errorf("Error: unable to retrieve collection: %s", err)
// NOTE(review): the map allocated on the next line is replaced immediately by
// the blockList result on the line after it, so the make appears redundant.
165 blocks[uuid] = make(map[string]int)
166 blocks[uuid] = blockList(collection)
// A caller-supplied pdh that disagrees with the fetched record is reported as an error.
167 if pdh != "" && collection.PortableDataHash != pdh {
168 logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s", uuid, collection.PortableDataHash, pdh)
// From here on the pdh from the fetched collection record is used.
173 pdh = collection.PortableDataHash
// Prefer the size and count stored on the collection record when either is non-zero.
177 if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
178 nominalSize += collection.FileSizeTotal
179 col.FileSizeTotal = collection.FileSizeTotal
180 col.FileCount = int64(collection.FileCount)
182 // Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
// Fallback: sum the block sizes recorded for this uuid. blockList keys its map
// by digest, so a block repeated within the collection is counted once here.
184 for _, size := range blocks[uuid] {
185 collSize += int64(size)
187 nominalSize += collSize
188 col.FileSizeTotal = collSize
// The file count is printed only when it is non-zero (it is not set on the
// fallback path above).
193 if pdhs[pdh].FileCount != 0 {
194 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
196 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
// Presumably used so that a block shared by several collections is added to
// totalSize only once; the check itself is on lines not visible here.
201 seen := make(map[string]bool)
202 for _, v := range blocks {
// NOTE(review): the keys of v come from blockList and are block digests, so
// the loop variable named pdh holds a block digest, not a collection PDH.
203 for pdh, size := range v {
206 totalSize += int64(size)
211 fmt.Fprintf(stdout, "Collections: %15d\n", len(inputs))
212 fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
213 fmt.Fprintf(stdout, "Actual size of stored data: %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
// NOTE(review): the uint64 conversion below would wrap to a very large value
// if nominalSize were ever smaller than totalSize; confirm that cannot occur.
214 fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))