1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
5 package deduplicationreport
13 "git.arvados.org/arvados.git/lib/cmd"
14 "git.arvados.org/arvados.git/sdk/go/arvados"
15 "git.arvados.org/arvados.git/sdk/go/arvadosclient"
16 "git.arvados.org/arvados.git/sdk/go/manifest"
18 "github.com/dustin/go-humanize"
19 "github.com/sirupsen/logrus"
// deDuplicate returns the entries of inputs with repeated values removed.
// The first occurrence of each value is kept and the original order is
// preserved. The result is nil when inputs is empty.
func deDuplicate(inputs []string) (trimmed []string) {
	// Set of values already emitted; struct{} values carry no storage.
	seen := make(map[string]struct{}, len(inputs))
	for _, uuid := range inputs {
		if _, dup := seen[uuid]; dup {
			continue
		}
		seen[uuid] = struct{}{}
		trimmed = append(trimmed, uuid)
	}
	return trimmed
}
// parseFlags handles the command line for the deduplication report.
//
// Visible steps: it creates a flag set named after prog with
// flag.ContinueOnError, installs a usage message that is written to
// flags.Output(), registers the -log-level option (default "info"), and
// delegates parsing to cmd.ParseFlags, passing stderr. The remaining
// positional arguments are run through deDuplicate and become inputs.
// The option value is parsed with logrus.ParseLevel. The messages
// "Error: no collections provided" and "Error: cannot parse log level: ..."
// are written to stderr.
//
// NOTE(review): the usage text contains the typo "This is will greatly speed
// up operation"; it should read "This will greatly speed up operation".
// NOTE(review): this listing appears to be missing lines (the conditions
// guarding the two error messages, the returned exit codes, the use of lvl,
// and the tail of the usage text), so those details cannot be stated from
// here -- confirm against the complete file.
33 // parseFlags returns either some inputs to process, or (if there are
34 // no inputs to process) a nil slice and a suitable exit code.
35 func parseFlags(prog string, args []string, logger *logrus.Logger, stderr io.Writer) (inputs []string, exitcode int) {
36 flags := flag.NewFlagSet(prog, flag.ContinueOnError)
37 flags.Usage = func() {
38 fmt.Fprintf(flags.Output(), `
40 %s [options ...] <collection-uuid> <collection-uuid> ...
42 %s [options ...] <collection-pdh>,<collection-uuid> \
43 <collection-pdh>,<collection-uuid> ...
45 This program analyzes the overlap in blocks used by 2 or more collections. It
46 prints a deduplication report that shows the nominal space used by the
47 collections, as well as the actual size and the amount of space that is saved
48 by Keep's deduplication.
50 The list of collections may be provided in two ways. A list of collection
51 uuids is sufficient. Alternatively, the PDH for each collection may also be
52 provided. This is will greatly speed up operation when the list contains
53 multiple collections with the same PDH.
55 Exit status will be zero if there were no errors generating the report.
59 Use the 'arv' and 'jq' commands to get the list of the 100
60 largest collections and generate the deduplication report:
62 arv collection list --order 'file_size_total desc' --limit 100 | \
63 jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
64 sed -e 's/"//g'|tr '\n' ' ' | \
71 loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
72 if ok, code := cmd.ParseFlags(flags, prog, args, "collection-uuid [...]", stderr); !ok {
76 inputs = deDuplicate(flags.Args())
79 fmt.Fprintf(stderr, "Error: no collections provided\n")
83 lvl, err := logrus.ParseLevel(*loglevel)
85 fmt.Fprintf(stderr, "Error: cannot parse log level: %s\n", err)
92 func blockList(collection arvados.Collection) (blocks map[string]int) {
93 blocks = make(map[string]int)
94 m := manifest.Manifest{Text: collection.ManifestText}
95 blockChannel := m.BlockIterWithDuplicates()
96 for b := range blockChannel {
97 blocks[b.Digest.String()] = b.Size
// report produces the deduplication report for the collections named on the
// command line.
//
// Visible flow: the arguments are handed to parseFlags, an Arvados client is
// created, and each input (either "uuid" or "pdh,uuid") is processed in turn.
// For a PDH not seen before, the collection is fetched by UUID and its block
// map is recorded; for a PDH already seen, only the cached size is added to
// the nominal total. One "Collection ..." line per input and a four-line
// summary (collection count, nominal size, actual size, savings) are written
// to stdout. Failures are logged through logger.
//
// NOTE(review): this listing appears to be missing lines (declarations of
// Col, col, uuid, pdh, totalSize and collSize, the error-handling exits, and
// the end of the function). Comments below describe only what is visible.
102 func report(prog string, args []string, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
105 inputs, exitcode = parseFlags(prog, args, logger, stderr)
110 // Arvados Client setup
111 arv, err := arvadosclient.MakeArvadosClient()
113 logger.Errorf("Error creating Arvados object: %s", err)
// blocks: collection UUID -> (block digest -> block size), filled from blockList.
// pdhs: portable data hash -> Col summary (FileSizeTotal / FileCount are the
// fields used below; the Col type itself is not visible here).
// nominalSize: running sum of the per-collection sizes, before deduplication.
123 blocks := make(map[string]map[string]int)
124 pdhs := make(map[string]Col)
125 var nominalSize int64
127 for _, input := range inputs {
130 if strings.Contains(input, ",") {
131 // The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
132 tmp := strings.Split(input, ",")
136 // The input must be a plain uuid
// "-4zz18-" is the substring required of every uuid; anything else is
// reported as not referring to a collection object.
139 if !strings.Contains(uuid, "-4zz18-") {
140 logger.Errorf("Error: uuid must refer to collection object")
144 if _, ok := pdhs[pdh]; ok {
145 // We've processed a collection with this pdh already. Simply add its
146 // size to the totals and move on to the next one.
147 // Note that we simply trust the PDH matches the collection UUID here,
148 // in other words, we use it over the UUID. If they don't match, the report
150 nominalSize += pdhs[pdh].FileSizeTotal
// PDH not cached yet: fetch the collection record by UUID.
152 var collection arvados.Collection
153 err = arv.Get("collections", uuid, nil, &collection)
155 logger.Errorf("Error: unable to retrieve collection: %s", err)
// NOTE(review): the map allocated on the next line is overwritten by the
// assignment right after it, so the make() looks redundant.
159 blocks[uuid] = make(map[string]int)
160 blocks[uuid] = blockList(collection)
// A PDH supplied on the command line must match the one stored on the
// fetched collection.
161 if pdh != "" && collection.PortableDataHash != pdh {
162 logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s", uuid, collection.PortableDataHash, pdh)
167 pdh = collection.PortableDataHash
// Use the size/count cached on the collection record when either is non-zero.
171 if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
172 nominalSize += collection.FileSizeTotal
173 col.FileSizeTotal = collection.FileSizeTotal
174 col.FileCount = int64(collection.FileCount)
176 // Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
// NOTE(review): blocks[uuid] is keyed by block digest (see blockList), so a
// block referenced several times by the manifest is summed once here; this
// fallback may differ from the true total file size -- confirm intent.
178 for _, size := range blocks[uuid] {
179 collSize += int64(size)
181 nominalSize += collSize
182 col.FileSizeTotal = collSize
// Per-collection line; the file count is printed only when the cached
// FileCount is non-zero (the second Fprintf is presumably the else branch).
187 if pdhs[pdh].FileCount != 0 {
188 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
190 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
// Walk every recorded block map to accumulate totalSize; seen is presumably
// used to count each digest only once across collections (the guard is not
// visible here).
// NOTE(review): the inner loop variable is named pdh, but the keys of these
// maps are block digests (see blockList), not collection PDHs.
195 seen := make(map[string]bool)
196 for _, v := range blocks {
197 for pdh, size := range v {
200 totalSize += int64(size)
// Summary. NOTE(review): if totalSize can ever exceed nominalSize (TODO
// confirm), uint64(nominalSize-totalSize) on the last line would wrap to a
// very large value in the humanized output.
205 fmt.Fprintf(stdout, "Collections: %15d\n", len(inputs))
206 fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
207 fmt.Fprintf(stdout, "Actual size of stored data: %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
208 fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))