*.asc
sdk/java-v2/build.gradle
sdk/java-v2/settings.gradle
-sdk/cwl/tests/wf/feddemo
\ No newline at end of file
+sdk/cwl/tests/wf/feddemo
+go.mod
+go.sum
"git.arvados.org/arvados.git/lib/cli"
"git.arvados.org/arvados.git/lib/cmd"
+ "git.arvados.org/arvados.git/lib/deduplicationreport"
"git.arvados.org/arvados.git/lib/mount"
)
"virtual_machine": cli.APICall,
"workflow": cli.APICall,
- "mount": mount.Command,
+ "mount": mount.Command,
+ "deduplication-report": deduplicationreport.Command,
})
)
github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1
github.com/docker/go-connections v0.3.0 // indirect
github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d // indirect
+ github.com/dustin/go-humanize v1.0.0
github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 // indirect
github.com/fsnotify/fsnotify v1.4.9
github.com/ghodss/yaml v1.0.0
github.com/imdario/mergo v0.3.8-0.20190415133143-5ef87b449ca7
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
github.com/jmcvetta/randutil v0.0.0-20150817122601-2bb1b664bcff
+ github.com/jmoiron/sqlx v1.2.0
github.com/julienschmidt/httprouter v1.2.0
github.com/karalabe/xgo v0.0.0-20191115072854-c5ccff8648a7 // indirect
github.com/kevinburke/ssh_config v0.0.0-20171013211458-802051befeb5 // indirect
github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d h1:dVaNRYvaGV23AdNdsm+4y1mPN0tj3/1v6taqKMmM6Ko=
github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568 h1:BHsljHzVlRcyQhjrss6TZTdY2VfCqZPbv5k3iBFa2ZQ=
github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
github.com/go-ldap/ldap v3.0.3+incompatible/go.mod h1:qfd9rJvER9Q0/D/Sqn1DfHRoBp40uXYvFoEVrNEPqRc=
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk=
+github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/gogo/protobuf v1.1.1 h1:72R+M5VuhED/KujmZVcIquuo8mBgX4oVda//DQb3PXo=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/jmcvetta/randutil v0.0.0-20150817122601-2bb1b664bcff/go.mod h1:ddfPX8Z28YMjiqoaJhNBzWHapTHXejnB5cDCUWDwriw=
github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af h1:pmfjZENx5imkbgOkpRUYLnmbU7UEFbjtDA2hxJ1ichM=
github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k=
+github.com/jmoiron/sqlx v1.2.0 h1:41Ip0zITnmWNR/vHV+S4m+VoUivnWY5E4OJfLZjCJMA=
+github.com/jmoiron/sqlx v1.2.0/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
+github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/lib/pq v1.3.0 h1:/qkRGz8zljWiDcFvgpwUpwIAPu3r07TDvs3Rws+o/pU=
github.com/lib/pq v1.3.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/marstr/guid v1.1.1-0.20170427235115-8bdf7d1a087c h1:ouxemItv3B/Zh008HJkEXDYCN3BIRyNHxtUN7ThJ5Js=
github.com/marstr/guid v1.1.1-0.20170427235115-8bdf7d1a087c/go.mod h1:74gB1z2wpxxInTG6yaqA7KrtM0NZ+RbrcqDvYHefzho=
+github.com/mattn/go-sqlite3 v1.9.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mitchellh/go-homedir v0.0.0-20161203194507-b8bc1bf76747 h1:eQox4Rh4ewJF+mqYPxCkmBAirRnPaHEB26UkNuPyjlk=
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+// Package api provides types used by controller/server-component
+// packages.
+package api
+
+import "context"
+
+// A RoutableFunc calls an API method (sometimes via a wrapped
+// RoutableFunc) that has real argument types.
+//
+// It receives the call's context and an untyped options value, and
+// returns an untyped response value or an error.
+//
+// (It is used by ctrlctx to manage database transactions, so moving
+// it to the router package would cause a circular dependency
+// router->arvadostest->ctrlctx->router.)
+type RoutableFunc func(ctx context.Context, opts interface{}) (interface{}, error)
import (
"context"
- "database/sql"
"errors"
"fmt"
"net/http"
"time"
"git.arvados.org/arvados.git/lib/controller/federation"
- "git.arvados.org/arvados.git/lib/controller/localdb"
"git.arvados.org/arvados.git/lib/controller/railsproxy"
"git.arvados.org/arvados.git/lib/controller/router"
+ "git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"git.arvados.org/arvados.git/sdk/go/health"
"git.arvados.org/arvados.git/sdk/go/httpserver"
+ "github.com/jmoiron/sqlx"
_ "github.com/lib/pq"
)
proxy *proxy
secureClient *http.Client
insecureClient *http.Client
- pgdb *sql.DB
+ pgdb *sqlx.DB
pgdbMtx sync.Mutex
}
Routes: health.Routes{"ping": func() error { _, err := h.db(context.TODO()); return err }},
})
- rtr := router.New(federation.New(h.Cluster), localdb.WrapCallsInTransactions(h.db))
+ rtr := router.New(federation.New(h.Cluster), ctrlctx.WrapCallsInTransactions(h.db))
mux.Handle("/arvados/v1/config", rtr)
mux.Handle("/"+arvados.EndpointUserAuthenticate.Path, rtr)
var errDBConnection = errors.New("database connection error")
-func (h *Handler) db(ctx context.Context) (*sql.DB, error) {
+func (h *Handler) db(ctx context.Context) (*sqlx.DB, error) {
h.pgdbMtx.Lock()
defer h.pgdbMtx.Unlock()
if h.pgdb != nil {
return h.pgdb, nil
}
- db, err := sql.Open("postgres", h.Cluster.PostgreSQL.Connection.String())
+ db, err := sqlx.Open("postgres", h.Cluster.PostgreSQL.Connection.String())
if err != nil {
ctxlog.FromContext(ctx).WithError(err).Error("postgresql connect failed")
return nil, errDBConnection
func NewConn(cluster *arvados.Cluster) *Conn {
railsProxy := railsproxy.NewConn(cluster)
- return &Conn{
+ var conn Conn
+ conn = Conn{
cluster: cluster,
railsProxy: railsProxy,
loginController: chooseLoginController(cluster, railsProxy),
}
+ return &conn
}
func (conn *Conn) Logout(ctx context.Context, opts arvados.LogoutOptions) (arvados.LogoutResponse, error) {
"strings"
"git.arvados.org/arvados.git/lib/controller/rpc"
+ "git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/auth"
"git.arvados.org/arvados.git/sdk/go/httpserver"
return
}
token := target.Query().Get("api_token")
- tx, err := currenttx(ctx)
+ tx, err := ctrlctx.CurrentTx(ctx)
if err != nil {
return
}
}
var exp sql.NullString
var scopes []byte
- err = tx.QueryRowContext(ctx, "select uuid, api_token, expires_at, scopes from api_client_authorizations where api_token=$1", tokensecret).Scan(&resp.UUID, &resp.APIToken, &exp, &scopes)
+ err = tx.QueryRowxContext(ctx, "select uuid, api_token, expires_at, scopes from api_client_authorizations where api_token=$1", tokensecret).Scan(&resp.UUID, &resp.APIToken, &exp, &scopes)
if err != nil {
return
}
import (
"context"
- "database/sql"
"encoding/json"
"net"
"net/http"
"git.arvados.org/arvados.git/lib/config"
"git.arvados.org/arvados.git/lib/controller/railsproxy"
+ "git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/arvadostest"
"git.arvados.org/arvados.git/sdk/go/auth"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"github.com/bradleypeabody/godap"
+ "github.com/jmoiron/sqlx"
check "gopkg.in/check.v1"
)
cluster *arvados.Cluster
ctrl *ldapLoginController
ldap *godap.LDAPServer // fake ldap server that accepts auth goodusername/goodpassword
- db *sql.DB
+ db *sqlx.DB
// transaction context
ctx context.Context
- rollback func()
+ rollback func() error
}
func (s *LDAPSuite) TearDownSuite(c *check.C) {
Cluster: s.cluster,
RailsProxy: railsproxy.NewConn(s.cluster),
}
- s.db = testdb(c, s.cluster)
+ s.db = arvadostest.DB(c, s.cluster)
}
func (s *LDAPSuite) SetUpTest(c *check.C) {
- s.ctx, s.rollback = testctx(c, s.db)
+ tx, err := s.db.Beginx()
+ c.Assert(err, check.IsNil)
+ s.ctx = ctrlctx.NewWithTransaction(context.Background(), tx)
+ s.rollback = tx.Rollback
}
func (s *LDAPSuite) TearDownTest(c *check.C) {
- s.rollback()
+ if s.rollback != nil {
+ s.rollback()
+ }
}
func (s *LDAPSuite) TestLoginSuccess(c *check.C) {
"net/http"
"strings"
+ "git.arvados.org/arvados.git/lib/controller/api"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/auth"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
type router struct {
mux *mux.Router
backend arvados.API
- wrapCalls func(RoutableFunc) RoutableFunc
+ wrapCalls func(api.RoutableFunc) api.RoutableFunc
}
// New returns a new router (which implements the http.Handler
// the returned method is used in its place. This can be used to
// install hooks before and after each API call and alter responses;
// see ctrlctx.WrapCallsInTransactions for an example.
-func New(backend arvados.API, wrapCalls func(RoutableFunc) RoutableFunc) *router {
+func New(backend arvados.API, wrapCalls func(api.RoutableFunc) api.RoutableFunc) *router {
rtr := &router{
mux: mux.NewRouter(),
backend: backend,
return rtr
}
-type RoutableFunc func(ctx context.Context, opts interface{}) (interface{}, error)
-
func (rtr *router) addRoutes() {
for _, route := range []struct {
endpoint arvados.APIEndpoint
defaultOpts func() interface{}
- exec RoutableFunc
+ exec api.RoutableFunc
}{
{
arvados.EndpointConfigGet,
"GET": "HEAD", // Accept HEAD at any GET route
}
-func (rtr *router) addRoute(endpoint arvados.APIEndpoint, defaultOpts func() interface{}, exec RoutableFunc) {
+func (rtr *router) addRoute(endpoint arvados.APIEndpoint, defaultOpts func() interface{}, exec api.RoutableFunc) {
methods := []string{endpoint.Method}
if alt, ok := altMethod[endpoint.Method]; ok {
methods = append(methods, alt)
//
// SPDX-License-Identifier: AGPL-3.0
-package localdb
+package ctrlctx
import (
"context"
- "database/sql"
"errors"
"sync"
- "git.arvados.org/arvados.git/lib/controller/router"
+ "git.arvados.org/arvados.git/lib/controller/api"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/jmoiron/sqlx"
+ _ "github.com/lib/pq"
+)
+
+var (
+ ErrNoTransaction = errors.New("bug: there is no transaction in this context")
+ ErrContextFinished = errors.New("refusing to start a transaction after wrapped function already returned")
)
// WrapCallsInTransactions returns a call wrapper (suitable for
//
// The wrapper calls getdb() to get a database handle before each API
// call.
-func WrapCallsInTransactions(getdb func(context.Context) (*sql.DB, error)) func(router.RoutableFunc) router.RoutableFunc {
- return func(origFunc router.RoutableFunc) router.RoutableFunc {
+func WrapCallsInTransactions(getdb func(context.Context) (*sqlx.DB, error)) func(api.RoutableFunc) api.RoutableFunc {
+ return func(origFunc api.RoutableFunc) api.RoutableFunc {
return func(ctx context.Context, opts interface{}) (_ interface{}, err error) {
- ctx, finishtx := starttx(ctx, getdb)
+ ctx, finishtx := New(ctx, getdb)
defer finishtx(&err)
return origFunc(ctx, opts)
}
}
}
-// ContextWithTransaction returns a child context in which the given
+// NewWithTransaction returns a child context in which the given
// transaction will be used by any localdb API call that needs one.
// The caller is responsible for calling Commit or Rollback on tx.
-func ContextWithTransaction(ctx context.Context, tx *sql.Tx) context.Context {
+func NewWithTransaction(ctx context.Context, tx *sqlx.Tx) context.Context {
txn := &transaction{tx: tx}
txn.setup.Do(func() {})
return context.WithValue(ctx, contextKeyTransaction, txn)
var contextKeyTransaction = contextKeyT("transaction")
type transaction struct {
- tx *sql.Tx
+ tx *sqlx.Tx
err error
- getdb func(context.Context) (*sql.DB, error)
+ getdb func(context.Context) (*sqlx.DB, error)
setup sync.Once
}
-type transactionFinishFunc func(*error)
+type finishFunc func(*error)
-// starttx returns a new child context that can be used with
-// currenttx(). It does not open a database transaction until the
-// first call to currenttx().
+// New returns a new child context that can be used with
+// CurrentTx(). It does not open a database transaction until the
+// first call to CurrentTx().
//
// The caller must eventually call the returned finishtx() func to
// commit or rollback the transaction, if any.
//
// func example(ctx context.Context) (err error) {
-// ctx, finishtx := starttx(ctx, dber)
+// ctx, finishtx := New(ctx, dber)
// defer finishtx(&err)
// // ...
-// tx, err := currenttx(ctx)
+// tx, err := CurrentTx(ctx)
// if err != nil {
// return fmt.Errorf("example: %s", err)
// }
//
// If *err is non-nil, finishtx() rolls back the transaction, and
// does not modify *err.
-func starttx(ctx context.Context, getdb func(context.Context) (*sql.DB, error)) (context.Context, transactionFinishFunc) {
+func New(ctx context.Context, getdb func(context.Context) (*sqlx.DB, error)) (context.Context, finishFunc) {
txn := &transaction{getdb: getdb}
return context.WithValue(ctx, contextKeyTransaction, txn), func(err *error) {
txn.setup.Do(func() {
// Using (*sync.Once)Do() prevents a future
- // call to currenttx() from opening a
+ // call to CurrentTx() from opening a
// transaction which would never get committed
- // or rolled back. If currenttx() hasn't been
+ // or rolled back. If CurrentTx() hasn't been
// called before now, future calls will return
// this error.
- txn.err = errors.New("refusing to start a transaction after wrapped function already returned")
+ txn.err = ErrContextFinished
})
if txn.tx == nil {
// we never [successfully] started a transaction
}
}
-func currenttx(ctx context.Context) (*sql.Tx, error) {
+func CurrentTx(ctx context.Context) (*sqlx.Tx, error) {
txn, ok := ctx.Value(contextKeyTransaction).(*transaction)
if !ok {
- return nil, errors.New("bug: there is no transaction in this context")
+ return nil, ErrNoTransaction
}
txn.setup.Do(func() {
if db, err := txn.getdb(ctx); err != nil {
txn.err = err
} else {
- txn.tx, txn.err = db.Begin()
+ txn.tx, txn.err = db.Beginx()
}
})
return txn.tx, txn.err
//
// SPDX-License-Identifier: AGPL-3.0
-package localdb
+package ctrlctx
import (
"context"
- "database/sql"
"sync"
"sync/atomic"
+ "testing"
"git.arvados.org/arvados.git/lib/config"
- "git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/jmoiron/sqlx"
_ "github.com/lib/pq"
check "gopkg.in/check.v1"
)
-// testdb returns a DB connection for the given cluster config.
-func testdb(c *check.C, cluster *arvados.Cluster) *sql.DB {
- db, err := sql.Open("postgres", cluster.PostgreSQL.Connection.String())
- c.Assert(err, check.IsNil)
- return db
-}
-
-// testctx returns a context suitable for running a test case in a new
-// transaction, and a rollback func which the caller should call after
-// the test.
-func testctx(c *check.C, db *sql.DB) (ctx context.Context, rollback func()) {
- tx, err := db.Begin()
- c.Assert(err, check.IsNil)
- return ContextWithTransaction(context.Background(), tx), func() {
- c.Check(tx.Rollback(), check.IsNil)
- }
+// Gocheck boilerplate
+func Test(t *testing.T) {
+ check.TestingT(t)
}
var _ = check.Suite(&DatabaseSuite{})
c.Assert(err, check.IsNil)
var getterCalled int64
- getter := func(context.Context) (*sql.DB, error) {
+ getter := func(context.Context) (*sqlx.DB, error) {
atomic.AddInt64(&getterCalled, 1)
- return testdb(c, cluster), nil
+ db, err := sqlx.Open("postgres", cluster.PostgreSQL.Connection.String())
+ c.Assert(err, check.IsNil)
+ return db, nil
}
wrapper := WrapCallsInTransactions(getter)
wrappedFunc := wrapper(func(ctx context.Context, opts interface{}) (interface{}, error) {
- txes := make([]*sql.Tx, 20)
+ txes := make([]*sqlx.Tx, 20)
var wg sync.WaitGroup
for i := range txes {
i := i
wg.Add(1)
go func() {
- // Concurrent calls to currenttx(),
+ // Concurrent calls to CurrentTx(),
// with different children of the same
// parent context, will all return the
// same transaction.
defer wg.Done()
ctx, cancel := context.WithCancel(ctx)
defer cancel()
- tx, err := currenttx(ctx)
+ tx, err := CurrentTx(ctx)
c.Check(err, check.IsNil)
txes[i] = tx
}()
c.Check(err, check.IsNil)
c.Check(getterCalled, check.Equals, int64(1))
- // When a wrapped func returns without calling currenttx(),
- // calling currenttx() later shouldn't start a new
+ // When a wrapped func returns without calling CurrentTx(),
+ // calling CurrentTx() later shouldn't start a new
// transaction.
var savedctx context.Context
ok, err = wrapper(func(ctx context.Context, opts interface{}) (interface{}, error) {
})(context.Background(), "blah")
c.Check(ok, check.Equals, true)
c.Check(err, check.IsNil)
- tx, err := currenttx(savedctx)
+ tx, err := CurrentTx(savedctx)
c.Check(tx, check.IsNil)
c.Check(err, check.NotNil)
}
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package deduplicationreport
+
+import (
+ "io"
+
+ "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/sdk/go/ctxlog"
+ "github.com/sirupsen/logrus"
+)
+
+// Command is the handler for the "deduplication-report" subcommand;
+// its RunCommand method is the entry point.
+var Command command
+
+// command is an empty struct that exists to carry the RunCommand
+// method.
+type command struct{}
+
+// NoPrefixFormatter is a log formatter that emits only the message
+// text of each log entry.
+type NoPrefixFormatter struct{}
+
+// Format returns the entry's message as bytes with nothing added
+// before or after it (no timestamp, level, fields or trailing
+// newline). It never returns an error.
+func (f *NoPrefixFormatter) Format(entry *logrus.Entry) ([]byte, error) {
+	message := []byte(entry.Message)
+	return message, nil
+}
+
+// RunCommand implements the subcommand "deduplication-report <collection> <collection> ..."
+func (command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+ var err error
+ logger := ctxlog.New(stderr, "text", "info")
+ defer func() {
+ if err != nil {
+ logger.WithError(err).Error("fatal")
+ }
+ }()
+
+ logger.SetFormatter(new(NoPrefixFormatter))
+
+ loader := config.NewLoader(stdin, logger)
+ loader.SkipLegacy = true
+
+ exitcode := report(prog, args, loader, logger, stdout, stderr)
+
+ return exitcode
+}
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+ "flag"
+ "fmt"
+ "io"
+ "strings"
+
+ "git.arvados.org/arvados.git/lib/config"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/arvadosclient"
+ "git.arvados.org/arvados.git/sdk/go/manifest"
+
+ "github.com/dustin/go-humanize"
+ "github.com/sirupsen/logrus"
+)
+
+// deDuplicate returns the distinct entries of inputs, each at the
+// position of its first occurrence. The result is nil when inputs is
+// empty.
+func deDuplicate(inputs []string) (trimmed []string) {
+	known := make(map[string]struct{})
+	for _, item := range inputs {
+		if _, dup := known[item]; dup {
+			continue
+		}
+		known[item] = struct{}{}
+		trimmed = append(trimmed, item)
+	}
+	return trimmed
+}
+
+// parseFlags parses the command line of the deduplication report.
+//
+// It returns the exit code and the de-duplicated list of positional
+// arguments (collection UUIDs or "pdh,uuid" pairs):
+//
+//   - exit code 0 and a nil list when usage was requested (-help);
+//   - exit code 2 on a usage error (unknown flag, no collections
+//     given, invalid -log-level), after reporting the problem on
+//     stderr;
+//   - exit code 0 and a non-empty list otherwise, in which case the
+//     logger's level has been set from -log-level.
+func parseFlags(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stderr io.Writer) (exitcode int, inputs []string) {
+	flags := flag.NewFlagSet("", flag.ContinueOnError)
+	flags.SetOutput(stderr)
+	flags.Usage = func() {
+		fmt.Fprintf(flags.Output(), `
+Usage:
+ %s [options ...] <collection-uuid> <collection-uuid> ...
+
+ %s [options ...] <collection-pdh>,<collection_uuid> \
+ <collection-pdh>,<collection_uuid> ...
+
+ This program analyzes the overlap in blocks used by 2 or more collections. It
+ prints a deduplication report that shows the nominal space used by the
+ collections, as well as the actual size and the amount of space that is saved
+ by Keep's deduplication.
+
+ The list of collections may be provided in two ways. A list of collection
+ uuids is sufficient. Alternatively, the PDH for each collection may also be
+ provided. This will greatly speed up operation when the list contains
+ multiple collections with the same PDH.
+
+ Exit status will be zero if there were no errors generating the report.
+
+Example:
+
+ Use the 'arv' and 'jq' commands to get the list of the 100
+ largest collections and generate the deduplication report:
+
+ arv collection list --order 'file_size_total desc' --limit 100 | \
+ jq -r '.items[] | [.portable_data_hash,.uuid] |@csv' | \
+ tail -n+2 |sed -e 's/"//g'|tr '\n' ' ' | \
+ xargs %s
+
+Options:
+`, prog, prog, prog)
+		flags.PrintDefaults()
+	}
+	loader.SetupFlags(flags)
+	loglevel := flags.String("log-level", "info", "logging level (debug, info, ...)")
+	err := flags.Parse(args)
+	if err == flag.ErrHelp {
+		// Usage has already been printed by the flag package.
+		return 0, inputs
+	} else if err != nil {
+		return 2, inputs
+	}
+
+	// Repeated arguments are only counted once.
+	inputs = deDuplicate(flags.Args())
+
+	if len(inputs) < 1 {
+		logger.Errorf("Error: no collections provided")
+		flags.Usage()
+		return 2, inputs
+	}
+
+	lvl, err := logrus.ParseLevel(*loglevel)
+	if err != nil {
+		// Say why we are exiting instead of failing silently.
+		logger.Errorf("Error: invalid log level %q: %s", *loglevel, err)
+		return 2, inputs
+	}
+	logger.SetLevel(lvl)
+	return 0, inputs
+}
+
+// blockList maps the digest of every block yielded by the manifest's
+// block iterator to that block's size. A digest that is yielded more
+// than once ends up as a single map entry.
+func blockList(collection arvados.Collection) (blocks map[string]int) {
+	mft := manifest.Manifest{Text: collection.ManifestText}
+	blocks = map[string]int{}
+	for blk := range mft.BlockIterWithDuplicates() {
+		digest := blk.Digest.String()
+		blocks[digest] = blk.Size
+	}
+	return blocks
+}
+
+// report generates the deduplication report for the collections named
+// in args and writes it to stdout; problems are logged through logger.
+//
+// For every input (a collection UUID, or "pdh,uuid") it prints one
+// line with the collection's nominal size, followed by a summary with
+// the nominal total, the total size of the distinct blocks, and the
+// difference between the two.
+//
+// The returned exit code is whatever parseFlags reported if parsing
+// did not yield any input, 1 when the API client cannot be created, an
+// input is not a collection UUID, a collection cannot be retrieved or
+// a supplied PDH does not match the collection, and 0 otherwise.
+func report(prog string, args []string, loader *config.Loader, logger *logrus.Logger, stdout, stderr io.Writer) (exitcode int) {
+
+	var inputs []string
+	exitcode, inputs = parseFlags(prog, args, loader, logger, stderr)
+	if exitcode != 0 || len(inputs) == 0 {
+		// parseFlags signals "usage was requested" (-help) with exit
+		// code 0 and no inputs. There is nothing to analyze in that
+		// case, so stop before creating an API client or printing an
+		// empty report.
+		return
+	}
+
+	// Arvados Client setup
+	arv, err := arvadosclient.MakeArvadosClient()
+	if err != nil {
+		logger.Errorf("Error creating Arvados object: %s", err)
+		exitcode = 1
+		return
+	}
+
+	type Col struct {
+		FileSizeTotal int64
+		FileCount     int64
+	}
+
+	// blocks: collection uuid -> (block digest -> block size)
+	blocks := make(map[string]map[string]int)
+	// pdhs: portable data hash -> sizes of a collection with that pdh
+	pdhs := make(map[string]Col)
+	var nominalSize int64
+
+	for _, input := range inputs {
+		var uuid string
+		var pdh string
+		if strings.Contains(input, ",") {
+			// The input is in the format pdh,uuid. This will allow us to save time on duplicate pdh's
+			tmp := strings.Split(input, ",")
+			pdh = tmp[0]
+			uuid = tmp[1]
+		} else {
+			// The input must be a plain uuid
+			uuid = input
+		}
+		if !strings.Contains(uuid, "-4zz18-") {
+			logger.Errorf("Error: uuid must refer to collection object")
+			exitcode = 1
+			return
+		}
+
+		col, known := pdhs[pdh]
+		if known {
+			// We've processed a collection with this pdh already. Simply add its
+			// size to the totals and move on to the next one.
+			// Note that we simply trust the PDH matches the collection UUID here,
+			// in other words, we use it over the UUID. If they don't match, the report
+			// will be wrong.
+			nominalSize += col.FileSizeTotal
+		} else {
+			var collection arvados.Collection
+			err = arv.Get("collections", uuid, nil, &collection)
+			if err != nil {
+				logger.Errorf("Error: unable to retrieve collection: %s", err)
+				exitcode = 1
+				return
+			}
+			// Validate a caller-supplied PDH before spending time on
+			// the manifest.
+			if pdh != "" && collection.PortableDataHash != pdh {
+				logger.Errorf("Error: the collection with UUID %s has PDH %s, but a different PDH was provided in the arguments: %s", uuid, collection.PortableDataHash, pdh)
+				exitcode = 1
+				return
+			}
+			if pdh == "" {
+				pdh = collection.PortableDataHash
+			}
+			blocks[uuid] = blockList(collection)
+
+			if collection.FileSizeTotal != 0 || collection.FileCount != 0 {
+				col.FileSizeTotal = collection.FileSizeTotal
+				col.FileCount = int64(collection.FileCount)
+			} else {
+				// Collections created with old Arvados versions do not always have the total file size and count cached in the collections object
+				for _, size := range blocks[uuid] {
+					col.FileSizeTotal += int64(size)
+				}
+			}
+			nominalSize += col.FileSizeTotal
+			pdhs[pdh] = col
+		}
+
+		if col.FileCount != 0 {
+			fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s); file count %d\n", uuid, pdh, col.FileSizeTotal, humanize.IBytes(uint64(col.FileSizeTotal)), col.FileCount)
+		} else {
+			fmt.Fprintf(stdout, "Collection %s: pdh %s; nominal size %d (%s)\n", uuid, pdh, col.FileSizeTotal, humanize.IBytes(uint64(col.FileSizeTotal)))
+		}
+	}
+
+	// Count every distinct block once, no matter how many collections
+	// refer to it.
+	var totalSize int64
+	seen := make(map[string]bool)
+	for _, collBlocks := range blocks {
+		for digest, size := range collBlocks {
+			if !seen[digest] {
+				seen[digest] = true
+				totalSize += int64(size)
+			}
+		}
+	}
+
+	// nominalSize is built from cached file sizes while totalSize is
+	// built from block sizes, so the difference is not guaranteed to be
+	// non-negative. Format a negative value with an explicit sign
+	// instead of letting it wrap around in the uint64 conversion.
+	saved := nominalSize - totalSize
+	var savedHuman string
+	if saved < 0 {
+		savedHuman = "-" + humanize.IBytes(uint64(-saved))
+	} else {
+		savedHuman = humanize.IBytes(uint64(saved))
+	}
+
+	fmt.Fprintln(stdout)
+	fmt.Fprintf(stdout, "Collections: %15d\n", len(inputs))
+	fmt.Fprintf(stdout, "Nominal size of stored data: %15d bytes (%s)\n", nominalSize, humanize.IBytes(uint64(nominalSize)))
+	fmt.Fprintf(stdout, "Actual size of stored data: %15d bytes (%s)\n", totalSize, humanize.IBytes(uint64(totalSize)))
+	fmt.Fprintf(stdout, "Saved by Keep deduplication: %15d bytes (%s)\n", saved, savedHuman)
+
+	return exitcode
+}
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package deduplicationreport
+
+import (
+ "bytes"
+ "testing"
+
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+ "git.arvados.org/arvados.git/sdk/go/arvadostest"
+ "gopkg.in/check.v1"
+)
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+ check.TestingT(t)
+}
+
+var _ = check.Suite(&Suite{})
+
+// Suite holds the deduplication-report tests; it carries no state.
+type Suite struct{}
+
+// TearDownSuite resets the test environment via arvadostest.ResetEnv().
+func (s *Suite) TearDownSuite(c *check.C) {
+ // Undo any changes/additions to the database so they don't affect subsequent tests.
+ arvadostest.ResetEnv()
+}
+
+// TestUsage runs the command without any collection arguments and
+// expects exit code 2, nothing on stdout, and the usage text on
+// stderr.
+func (*Suite) TestUsage(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{"-log-level=debug"}
+	status := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(status, check.Equals, 2)
+	c.Check(outBuf.String(), check.Equals, "")
+	errText := errBuf.String()
+	c.Log(errText)
+	c.Check(errText, check.Matches, `(?ms).*Usage:.*`)
+}
+
+// TestTwoIdenticalUUIDs passes the same collection UUID twice and
+// expects it to be counted once in the report.
+func (*Suite) TestTwoIdenticalUUIDs(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{arvadostest.FooCollection, arvadostest.FooCollection}
+	status := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(status, check.Equals, 0)
+	for _, pattern := range []string{
+		"(?ms).*Collections:[[:space:]]+1.*",
+		"(?ms).*Nominal size of stored data:[[:space:]]+3 bytes \\(3 B\\).*",
+		"(?ms).*Actual size of stored data:[[:space:]]+3 bytes \\(3 B\\).*",
+		"(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*",
+	} {
+		c.Check(outBuf.String(), check.Matches, pattern)
+	}
+	c.Log(errBuf.String())
+}
+
+// TestTwoUUIDsInvalidPDH passes a "pdh,uuid" argument whose PDH does
+// not belong to the collection and expects exit code 1, nothing on
+// stdout, and a mismatch error on stderr.
+func (*Suite) TestTwoUUIDsInvalidPDH(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{
+		arvadostest.FooAndBarFilesInDirPDH + "," + arvadostest.FooCollection,
+		arvadostest.FooCollection,
+	}
+	status := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(status, check.Equals, 1)
+	c.Check(outBuf.String(), check.Equals, "")
+	errText := errBuf.String()
+	c.Log(errText)
+	c.Check(errText, check.Matches, `(?ms).*Error: the collection with UUID zzzzz-4zz18-fy296fx3hot09f7 has PDH 1f4b0bc7583c2a7f9102c395f4ffc5e3\+45, but a different PDH was provided in the arguments: 870369fc72738603c2fad16664e50e2d\+58.*`)
+}
+
+// TestNonExistentCollection passes one existing and one nonexistent
+// collection UUID. It expects the line for the existing collection on
+// stdout, exit code 1, and a retrieval error on stderr.
+func (*Suite) TestNonExistentCollection(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{arvadostest.FooCollection, arvadostest.NonexistentCollection}
+	status := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(status, check.Equals, 1)
+	c.Check(outBuf.String(), check.Equals, "Collection zzzzz-4zz18-fy296fx3hot09f7: pdh 1f4b0bc7583c2a7f9102c395f4ffc5e3+45; nominal size 3 (3 B)\n")
+	errText := errBuf.String()
+	c.Log(errText)
+	c.Check(errText, check.Matches, `(?ms).*Error: unable to retrieve collection:.*404 Not Found.*`)
+}
+
+// TestManyUUIDsNoOverlap runs the report on 5 collection UUIDs and
+// expects the nominal and actual sizes to be equal, with nothing
+// written to stderr.
+func (*Suite) TestManyUUIDsNoOverlap(c *check.C) {
+	var outBuf, errBuf bytes.Buffer
+	args := []string{
+		arvadostest.FooCollection,
+		arvadostest.HelloWorldCollection,
+		arvadostest.FooBarDirCollection,
+		arvadostest.WazVersion1Collection,
+		arvadostest.UserAgreementCollection,
+	}
+	status := Command.RunCommand("deduplicationreport.test", args, &bytes.Buffer{}, &outBuf, &errBuf)
+	c.Check(status, check.Equals, 0)
+	for _, pattern := range []string{
+		"(?ms).*Collections:[[:space:]]+5.*",
+		"(?ms).*Nominal size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*",
+		"(?ms).*Actual size of stored data:[[:space:]]+249049 bytes \\(243 KiB\\).*",
+		"(?ms).*Saved by Keep deduplication:[[:space:]]+0 bytes \\(0 B\\).*",
+	} {
+		c.Check(outBuf.String(), check.Matches, pattern)
+	}
+	errText := errBuf.String()
+	c.Log(errText)
+	c.Check(errText, check.Equals, "")
+}
+
+// TestTwoOverlappingCollections creates two collections that share one
+// block and checks the reported nominal size, actual size and savings,
+// once with plain UUID arguments and once with a "pdh,uuid" argument.
+func (*Suite) TestTwoOverlappingCollections(c *check.C) {
+	// Create two collections
+	arv := arvados.NewClientFromEnv()
+
+	var c1 arvados.Collection
+	err := arv.RequestAndDecode(&c1, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". d3b07384d113edec49eaa6238ad5ff00+4 0:4:foo\n"}})
+	c.Assert(err, check.IsNil)
+
+	var c2 arvados.Collection
+	err = arv.RequestAndDecode(&c2, "POST", "arvados/v1/collections", nil, map[string]interface{}{"collection": map[string]interface{}{"manifest_text": ". c157a79031e1c40f85931829bc5fc552+4 d3b07384d113edec49eaa6238ad5ff00+4 0:4:bar 4:4:foo\n"}})
+	c.Assert(err, check.IsNil)
+
+	for _, trial := range []struct {
+		field1 string
+		field2 string
+	}{
+		{
+			// Run dedupreport with 2 arguments: uuid uuid
+			field1: c1.UUID,
+			field2: c2.UUID,
+		},
+		{
+			// Run dedupreport with 2 arguments: pdh,uuid uuid
+			field1: c1.PortableDataHash + "," + c1.UUID,
+			field2: c2.UUID,
+		},
+	} {
+		// Use fresh buffers for every trial: with shared buffers the
+		// output of an earlier trial would still be present and could
+		// satisfy the pattern checks of a later one.
+		var stdout, stderr bytes.Buffer
+		exitcode := Command.RunCommand("deduplicationreport.test", []string{trial.field1, trial.field2}, &bytes.Buffer{}, &stdout, &stderr)
+		c.Check(exitcode, check.Equals, 0)
+		c.Check(stdout.String(), check.Matches, "(?ms).*Nominal size of stored data:[[:space:]]+12 bytes \\(12 B\\).*")
+		c.Check(stdout.String(), check.Matches, "(?ms).*Actual size of stored data:[[:space:]]+8 bytes \\(8 B\\).*")
+		c.Check(stdout.String(), check.Matches, "(?ms).*Saved by Keep deduplication:[[:space:]]+4 bytes \\(4 B\\).*")
+		c.Log(stderr.String())
+		c.Check(stderr.String(), check.Equals, "")
+	}
+}
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package arvadostest
+
+import (
+ "context"
+
+ "git.arvados.org/arvados.git/lib/ctrlctx"
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+ "github.com/jmoiron/sqlx"
+ _ "github.com/lib/pq"
+ "gopkg.in/check.v1"
+)
+
+// DB opens a "postgres" database handle using the PostgreSQL
+// connection settings of the given cluster config, and asserts that
+// opening it succeeded.
+func DB(c *check.C, cluster *arvados.Cluster) *sqlx.DB {
+	dsn := cluster.PostgreSQL.Connection.String()
+	handle, err := sqlx.Open("postgres", dsn)
+	c.Assert(err, check.IsNil)
+	return handle
+}
+
+// TransactionContext begins a transaction on db and returns a context
+// that carries it, for running a test case inside that transaction.
+// The returned rollback func rolls the transaction back and checks
+// that the rollback succeeded; the caller should invoke it after the
+// test.
+func TransactionContext(c *check.C, db *sqlx.DB) (ctx context.Context, rollback func()) {
+	tx, err := db.Beginx()
+	c.Assert(err, check.IsNil)
+	ctx = ctrlctx.NewWithTransaction(context.Background(), tx)
+	rollback = func() {
+		c.Check(tx.Rollback(), check.IsNil)
+	}
+	return ctx, rollback
+}