1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
5 package deduplicationreport
14 "git.arvados.org/arvados.git/lib/cmd"
15 "git.arvados.org/arvados.git/sdk/go/arvados"
16 "git.arvados.org/arvados.git/sdk/go/arvadosclient"
17 "git.arvados.org/arvados.git/sdk/go/blockdigest"
19 "github.com/dustin/go-humanize"
20 "github.com/sirupsen/logrus"
// deDuplicate returns the entries of inputs with repeated values removed.
// The first occurrence of each value is kept and the relative order of the
// survivors matches the input order. The result is nil when inputs is empty.
func deDuplicate(inputs []string) (trimmed []string) {
	seen := make(map[string]bool, len(inputs))
	for _, uuid := range inputs {
		if seen[uuid] {
			// Already emitted once; drop the repeat.
			continue
		}
		seen[uuid] = true
		trimmed = append(trimmed, uuid)
	}
	return trimmed
}
34 // parseFlags returns either some inputs to process, or (if there are
35 // no inputs to process) a nil slice and a suitable exit code.
//
// prog names the flag set and args holds the raw command line arguments.
// The positional arguments left after flag parsing are run through
// deDuplicate before being assigned to inputs. A -log-level flag (default
// "info") is defined and its value is parsed with logrus.ParseLevel. The
// messages "no collections provided" and "cannot parse log level" are
// written to stderr.
//
// NOTE(review): the usage text below reads "This is will greatly speed
// up"; that looks like a typo for "This will greatly speed up" — confirm
// and fix the string.
36 func parseFlags(prog string, args []string, logger *logrus.Logger, stderr io.Writer) (inputs []string, exitcode int) {
// ContinueOnError makes flag parsing report problems to the caller instead
// of terminating the process.
37 flags := flag.NewFlagSet(prog, flag.ContinueOnError)
// Custom usage text, written to the flag set's configured output.
38 flags.Usage = func() {
39 fmt.Fprintf(flags.Output(), `
41 %s [options ...] <collection-uuid> <collection-uuid> ...
43 %s [options ...] <collection-pdh>,<collection-uuid> \
44 <collection-pdh>,<collection-uuid> ...
46 This program analyzes the overlap in blocks used by 2 or more collections. It
47 prints a deduplication report that shows the nominal space used by the
48 collections, as well as the actual size and the amount of space that is saved
49 by Keep's deduplication.
51 The list of collections may be provided in two ways. A list of collection
52 uuids is sufficient. Alternatively, the PDH for each collection may also be
53 provided. This is will greatly speed up operation when the list contains
54 multiple collections with the same PDH.
56 Exit status will be zero if there were no errors generating the report.
60 Use the 'arv' and 'jq' commands to get the list of the 100
61 largest collections and generate the deduplication report:
63 arv collection list --order 'file_size_total desc' --limit 100 | \
64 jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
65 sed -e 's/"//g'|tr '\n' ' ' | \
72 loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
73 if ok, code := cmd.ParseFlags(flags, prog, args, "collection-uuid [...]", stderr); !ok {
77 inputs = deDuplicate(flags.Args())
80 fmt.Fprintf(stderr, "Error: no collections provided\n")
84 lvl, err := logrus.ParseLevel(*loglevel)
86 fmt.Fprintf(stderr, "Error: cannot parse log level: %s\n", err)
93 func blockList(collection arvados.Collection) (blocks map[string]int) {
94 blocks = make(map[string]int)
95 for _, token := range bytes.Split([]byte(collection.ManifestText), []byte{' '}) {
96 if blockdigest.IsBlockLocator(string(token)) {
97 loc, _ := blockdigest.ParseBlockLocator(string(token))
98 blocks[loc.Digest.String()] = loc.Size
// report produces the deduplication report for the collections named in
// args. One line per input and the summary totals are written to stdout;
// stderr is handed to parseFlags for its diagnostics, and problems talking
// to Arvados are reported through logger. exitcode starts out as the value
// returned by parseFlags.
// NOTE(review): each error path below logs a message; the statements that
// set a non-zero exitcode and return are assumed to follow — confirm.
104 func report(prog string, args []string, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
107 inputs, exitcode = parseFlags(prog, args, logger, stderr)
112 // Arvados Client setup
113 arv, err := arvadosclient.MakeArvadosClient()
115 logger.Errorf("Error creating Arvados object: %s", err)
// blocks maps a collection uuid to that collection's block digest -> block
// size table (as produced by blockList).
125 blocks := make(map[string]map[string]int)
// pdhs is keyed by portable data hash and holds the FileSizeTotal/FileCount
// summary used for inputs whose PDH has already been handled.
126 pdhs := make(map[string]Col)
// nominalSize adds up the size of every input, counting shared data each time.
127 var nominalSize int64
129 for _, input := range inputs {
132 if strings.Contains(input, ",") {
133 // The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
134 tmp := strings.Split(input, ",")
138 // The input must be a plain uuid
// A uuid that does not contain "-4zz18-" is reported as not referring to a
// collection.
141 if !strings.Contains(uuid, "-4zz18-") {
142 logger.Errorf("Error: uuid must refer to collection object")
146 if _, ok := pdhs[pdh]; ok {
147 // We've processed a collection with this pdh already. Simply add its
148 // size to the totals and move on to the next one.
149 // Note that we simply trust the PDH matches the collection UUID here,
150 // in other words, we use it over the UUID. If they don't match, the report
152 nominalSize += pdhs[pdh].FileSizeTotal
// PDH not seen before: fetch the collection record by uuid.
154 var collection arvados.Collection
155 err = arv.Get("collections", uuid, nil, &collection)
157 logger.Errorf("Error: unable to retrieve collection: %s", err)
// NOTE(review): this allocation is replaced by the assignment on the next
// line and looks redundant.
161 blocks[uuid] = make(map[string]int)
162 blocks[uuid] = blockList(collection)
// A PDH supplied on the command line must agree with the one on the record.
163 if pdh != "" && collection.PortableDataHash != pdh {
164 logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s", uuid, collection.PortableDataHash, pdh)
// Adopt the PDH from the record (presumably only when none was supplied —
// the guard is not shown here; confirm).
169 pdh = collection.PortableDataHash
// Use the totals stored on the collection record when either one is set.
173 if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
174 nominalSize += collection.FileSizeTotal
175 col.FileSizeTotal = collection.FileSizeTotal
176 col.FileCount = int64(collection.FileCount)
178 // Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
// Fallback: sum the sizes in this collection's digest -> size table. Each
// digest is counted once, and no file count is derived on this path.
180 for _, size := range blocks[uuid] {
181 collSize += int64(size)
183 nominalSize += collSize
184 col.FileSizeTotal = collSize
// The per-collection line includes the file count only when it is non-zero.
189 if pdhs[pdh].FileCount != 0 {
190 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)), pdhs[pdh].FileCount)
192 fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, pdhs[pdh].FileSizeTotal, humanize.IBytes(uint64(pdhs[pdh].FileSizeTotal)))
// Actual size: walk every collection's block table and add up block sizes.
// NOTE(review): the inner loop variable is named pdh but holds a block
// digest (the key of the per-collection table), not a collection PDH; seen
// is presumably used to count each digest once — confirm.
197 seen := make(map[string]bool)
198 for _, v := range blocks {
199 for pdh, size := range v {
202 totalSize += int64(size)
// Summary section.
207 fmt.Fprintf(stdout, "Collections: %15d\n", len(inputs))
208 fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
209 fmt.Fprintf(stdout, "Actual size of stored data: %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
// NOTE(review): the difference is converted to uint64 for the humanized
// figure; if nominalSize could ever be smaller than totalSize the value
// would wrap around — confirm that cannot happen.
210 fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", nominalSize-totalSize, humanize.IBytes(uint64(nominalSize-totalSize)))