1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
5 package deduplicationreport
13 "git.arvados.org/arvados.git/lib/config"
14 "git.arvados.org/arvados.git/sdk/go/arvados"
15 "git.arvados.org/arvados.git/sdk/go/arvadosclient"
16 "git.arvados.org/arvados.git/sdk/go/manifest"
18 "github.com/dustin/go-humanize"
19 "github.com/sirupsen/logrus"
// deDuplicate returns inputs with repeated entries removed. The first
// occurrence of each value is kept and the original order is preserved.
// A nil or empty input yields a nil slice, so callers can keep testing
// the result with len().
func deDuplicate(inputs []string) (trimmed []string) {
	// seen records every value that has already been emitted.
	seen := make(map[string]struct{}, len(inputs))
	for _, uuid := range inputs {
		if _, dup := seen[uuid]; dup {
			continue
		}
		seen[uuid] = struct{}{}
		trimmed = append(trimmed, uuid)
	}
	return trimmed
}
// parseFlags builds a flag set that reports to stderr, registers the config
// loader's flags plus a -log-level option, and parses args. The collection
// arguments are passed through deDuplicate before being returned in inputs,
// and an error is logged when no collections were provided. The -log-level
// value is parsed with logrus.ParseLevel.
//
// NOTE(review): the statements that assign exitcode and populate inputs are
// not visible in this excerpt; confirm the exact exit-code values there.
33 func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
34 flags := flag.NewFlagSet("", flag.ContinueOnError)
35 flags.SetOutput(stderr)
36 flags.Usage = func() {
37 fmt.Fprintf(flags.Output(), `
39 %s [options ...] <collection-uuid> <collection-uuid> ...
41 %s [options ...] <collection-pdh>,<collection_uuid> \
42 <collection-pdh>,<collection_uuid> ...
44 This program analyzes the overlap in blocks used by 2 or more collections. It
45 prints a deduplication report that shows the nominal space used by the
46 collections, as well as the actual size and the amount of space that is saved
47 by Keep's deduplication.
49 The list of collections may be provided in two ways. A list of collection
50 uuids is sufficient. Alternatively, the PDH for each collection may also be
51 provided. This will greatly speed up operation when the list contains
52 multiple collections with the same PDH.
54 Exit status will be zero if there were no errors generating the report.
58 Use the 'arv' and 'jq' commands to get the list of the 100
59 largest collections and generate the deduplication report:
61 arv collection list --order 'file_size_total desc' --limit 100 | \
62 jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
63 tail -n+2 |sed -e 's/"//g'|tr '\n' ' ' | \
70 loader.SetupFlags(flags)
71 loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
72 err := flags.Parse(args)
73 if err == flag.ErrHelp {
75 } else if err != nil {
81 inputs = deDuplicate(inputs)
84 logger.Errorf("Error: no collections provided")
89 lvl, err := logrus.ParseLevel(*loglevel)
// blockList fills the named result blocks with one entry per block found in
// the collection's manifest text, mapping the block digest (as a string) to
// the block size.
//
// The manifest is walked with BlockIterWithDuplicates, but because the result
// is keyed by digest, a block that is referenced more than once still ends up
// as a single map entry.
97 func blockList(collection arvados.Collection) (blocks map[string]int) {
98 blocks = make(map[string]int)
99 m := manifest.Manifest{Text: collection.ManifestText}
100 blockChannel := m.BlockIterWithDuplicates()
// Drain the iterator channel; a repeated digest overwrites its earlier entry.
101 for b := range blockChannel {
102 blocks[b.Digest.String()] = b.Size
// report parses the command line with parseFlags, fetches each requested
// collection through the Arvados client, prints one line per collection to
// stdout (UUID, PDH, nominal size and, when known, file count), and finishes
// with a summary of nominal size, actual size and the space saved by
// deduplication. Errors are reported through logger.
107 func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
110 exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
115 // Arvados Client setup
116 arv, err := arvadosclient.MakeArvadosClient()
118 logger.Errorf("Error creating Arvados object: %s", err)
// blocks maps a collection UUID to that collection's digest->size map
// (as produced by blockList).
128 blocks := make(map[string]map[string]int)
// pdhs is keyed by portable data hash; the Col value (declared elsewhere)
// carries the FileSizeTotal and FileCount used below.
129 pdhs := make(map[string]Col)
// nominalSize accumulates the size of every input collection, counting
// collections that share a PDH once per input.
130 var nominalSize int64
132 for _, input := range inputs {
135 if strings.Contains(input, ",") {
136 // The input is in the format pdh,uuid. This will allow us to save time on duplicate PDHs
137 tmp := strings.Split(input, ",")
141 // The input must be a plain uuid
// NOTE(review): this is a substring test for the "-4zz18-" infix, not a
// full validation of the UUID format.
144 if !strings.Contains(uuid, "-4zz18-") {
145 logger.Errorf("Error: uuid must refer to collection object")
149 if _, ok := pdhs[pdh]; ok {
150 // We've processed a collection with this pdh already. Simply add its
151 // size to the totals and move on to the next one.
152 // Note that we simply trust the PDH matches the collection UUID here,
153 // in other words, we use it over the UUID. If they don't match, the report
155 nominalSize += pdhs[pdh].FileSizeTotal
157 var collection arvados.Collection
158 err = arv.Get("collections", uuid, nil, &collection)
160 logger.Errorf("Error: unable to retrieve collection: %s", err)
// NOTE(review): the map allocated here is replaced by the very next
// assignment, so this allocation is redundant.
164 blocks[uuid] = make(map[string]int)
165 blocks[uuid] = blockList(collection)
// A PDH supplied on the command line must agree with the one stored on the
// fetched collection.
166 if pdh != "" && collection.PortableDataHash != pdh {
167 logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s", uuid, collection.PortableDataHash, pdh)
172 pdh = collection.PortableDataHash
// Prefer the size and count cached on the collection record when either
// is non-zero.
176 if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
177 nominalSize += collection.FileSizeTotal
178 col.FileSizeTotal = collection.FileSizeTotal
179 col.FileCount = int64(collection.FileCount)
181 // Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
// NOTE(review): blocks[uuid] holds one entry per distinct digest, so this
// fallback sums each distinct block once even if the manifest references
// it several times.
183 for _, size := range blocks[uuid] {
184 collSize += int64(size)
186 nominalSize += collSize
187 col.FileSizeTotal = collSize
// The file count is only printed when it is non-zero.
192 if pdhs[pdh].FileCount != 0 {
193 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
195 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
// Walk every collection's block map to accumulate totalSize.
200 seen := make(map[string]bool)
201 for _, v := range blocks {
// NOTE(review): the key iterated here is a block digest (the inner maps
// come from blockList), not a portable data hash, despite the name.
202 for pdh, size := range v {
205 totalSize += int64(size)
210 fmt.Fprintf(stdout, "Collections: %15d\n", len(inputs))
211 fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
212 fmt.Fprintf(stdout, "Actual size of stored data: %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
// NOTE(review): the difference is converted to uint64 for humanize.IBytes;
// a negative value would wrap to a huge number. Confirm totalSize can never
// exceed nominalSize.
213 fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))