"io"
"strings"
- "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/lib/cmd"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/arvadosclient"
"git.arvados.org/arvados.git/sdk/go/manifest"
// deDuplicate returns the entries of inputs with repeated values removed,
// keeping the first occurrence of each value and preserving the original
// order. The result is nil when inputs is empty.
func deDuplicate(inputs []string) (trimmed []string) {
	seen := make(map[string]bool)
	for _, uuid := range inputs {
		// A missing key reads as false, so no separate "ok" lookup is needed.
		if !seen[uuid] {
			seen[uuid] = true
			trimmed = append(trimmed, uuid)
		}
	}
	return trimmed
}
-func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
- flags := flag.NewFlagSet("", flag.ContinueOnError)
- flags.SetOutput(stderr)
+// parseFlags returns either some inputs to process, or (if there are
+// no inputs to process) a nil slice and a suitable exit code.
+func parseFlags(prog string, args []string, logger *logrus.Logger, stderr io.Writer) (inputs []string, exitcode int) {
+ flags := flag.NewFlagSet(prog, flag.ContinueOnError)
flags.Usage = func() {
fmt.Fprintf(flags.Output(), `
Usage:
%s [options ...] <collection-uuid> <collection-uuid> ...
- %s [options ...] <collection-pdh>,<collection_uuid> \
- <collection-pdh>,<collection_uuid> ...
+ %s [options ...] <collection-pdh>,<collection-uuid> \
+ <collection-pdh>,<collection-uuid> ...
This program analyzes the overlap in blocks used by 2 or more collections. It
- prints a deduplication report that shows the nominal space used by the list
- of collection, as well as the actual size and the amount of space that is
- saved by Keep's deduplication.
+ prints a deduplication report that shows the nominal space used by the
+ collections, as well as the actual size and the amount of space that is saved
+ by Keep's deduplication.
The list of collections may be provided in two ways. A list of collection
uuids is sufficient. Alternatively, the PDH for each collection may also be
Use the 'arv' and 'jq' commands to get the list of the 100
largest collections and generate the deduplication report:
- arv collection list --order 'file_size_total desc' | \
+ arv collection list --order 'file_size_total desc' --limit 100 | \
jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
- tail -n100 |sed -e 's/"//g'|tr '\n' ' ' | \
+ sed -e 's/"//g'|tr '\n' ' ' | \
xargs %s
Options:
`, prog, prog, prog)
flags.PrintDefaults()
}
- loader.SetupFlags(flags)
loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
- err := flags.Parse(args)
- if err == flag.ErrHelp {
- return 0, inputs
- } else if err != nil {
- return 2, inputs
+ if ok, code := cmd.ParseFlags(flags, prog, args, "collection-uuid [...]", stderr); !ok {
+ return nil, code
}
- inputs = flags.Args()
+ inputs = deDuplicate(flags.Args())
- inputs = deDuplicate(inputs)
-
- if len(inputs) < 2 {
- logger.Error("Error: at least 2 different collections UUIDs required")
- flags.Usage()
- return 2, inputs
+ if len(inputs) < 1 {
+ fmt.Fprintf(stderr, "Error: no collections provided\n")
+ return nil, 2
}
lvl, err := logrus.ParseLevel(*loglevel)
if err != nil {
- return 2, inputs
+ fmt.Fprintf(stderr, "Error: cannot parse log level: %s\n", err)
+ return nil, 2
}
logger.SetLevel(lvl)
- return
+ return inputs, 0
}
func blockList(collection arvados.Collection) (blocks map[string]int) {
return
}
-func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
-
+func report(prog string, args []string, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
var inputs []string
- exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
- if exitcode != 0 {
+
+ inputs, exitcode = parseFlags(prog, args, logger, stderr)
+ if inputs == nil {
return
}
// Arvados Client setup
arv, err := arvadosclient.MakeArvadosClient()
if err != nil {
- logger.Errorf("error creating Arvados object: %s", err)
+ logger.Errorf("Error creating Arvados object: %s", err)
exitcode = 1
return
}
pdhs := make(map[string]Col)
var nominalSize int64
- fmt.Println()
for _, input := range inputs {
var uuid string
var pdh string
uuid = input
}
if !strings.Contains(uuid, "-4zz18-") {
- logger.Error("uuid must refer to collection object")
+ logger.Errorf("Error: uuid must refer to collection object")
exitcode = 1
return
}
var collection arvados.Collection
err = arv.Get("collections", uuid, nil, &collection)
if err != nil {
- logger.Errorf("Error: unable to retrieve collection: %s\n", err)
+ logger.Errorf("Error: unable to retrieve collection: %s", err)
exitcode = 1
return
}
blocks[uuid] = make(map[string]int)
blocks[uuid] = blockList(collection)
if pdh != "" && collection.PortableDataHash != pdh {
- logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s\n", uuid, collection.PortableDataHash, pdh)
+ logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s", uuid, collection.PortableDataHash, pdh)
exitcode = 1
return
}
seen := make(map[string]bool)
for _, v := range blocks {
for pdh, size := range v {
- if _, ok := seen[pdh]; !ok {
+ if !seen[pdh] {
seen[pdh] = true
totalSize += int64(size)
}