[Service]
Type=notify
EnvironmentFile=-/etc/arvados/environment
-ExecStart=/usr/bin/keep-balance -commit-pulls -commit-trash
+ExecStart=/usr/bin/keep-balance
# Set a reasonable default for the open file limit
LimitNOFILE=65536
Restart=always
# once.
BalanceUpdateLimit: 100000
+ # Maximum number of "pull block from other server" and "trash
+ # block" requests to send to each keepstore server at a
+ # time. Smaller values use less memory in keepstore and
+ # keep-balance. Larger values allow more progress per
+ # keep-balance iteration. Setting both limits to zero amounts
+ # to a "dry run".
+ BalancePullLimit: 100000
+ BalanceTrashLimit: 100000
+
# Default lifetime for ephemeral collections: 2 weeks. This must not
# be less than BlobSigningTTL.
DefaultTrashLifetime: 336h
"Collections.BalanceCollectionBatch": false,
"Collections.BalanceCollectionBuffers": false,
"Collections.BalancePeriod": false,
+ "Collections.BalancePullLimit": false,
"Collections.BalanceTimeout": false,
+ "Collections.BalanceTrashLimit": false,
"Collections.BalanceUpdateLimit": false,
"Collections.BlobDeleteConcurrency": false,
"Collections.BlobMissingReport": false,
BalanceCollectionBuffers int
BalanceTimeout Duration
BalanceUpdateLimit int
+ BalancePullLimit int
+ BalanceTrashLimit int
WebDAVCache WebDAVCacheConfig
client.Timeout = 0
rs := bal.rendezvousState()
- if runOptions.CommitTrash && rs != runOptions.SafeRendezvousState {
+ if cluster.Collections.BalanceTrashLimit > 0 && rs != runOptions.SafeRendezvousState {
if runOptions.SafeRendezvousState != "" {
bal.logf("notice: KeepServices list has changed since last run")
}
if err = bal.GetCurrentState(ctx, client, cluster.Collections.BalanceCollectionBatch, cluster.Collections.BalanceCollectionBuffers); err != nil {
return
}
+ bal.setupLookupTables(cluster)
bal.ComputeChangeSets()
bal.PrintStatistics()
if err = bal.CheckSanityLate(); err != nil {
}
lbFile = nil
}
- if runOptions.CommitPulls {
+ if cluster.Collections.BalancePullLimit > 0 {
err = bal.CommitPulls(ctx, client)
if err != nil {
// Skip trash if we can't pull. (Too cautious?)
return
}
}
- if runOptions.CommitTrash {
+ if cluster.Collections.BalanceTrashLimit > 0 {
err = bal.CommitTrash(ctx, client)
if err != nil {
return
// This just calls balanceBlock() once for each block, using a
// pool of worker goroutines.
defer bal.time("changeset_compute", "wall clock time to compute changesets")()
- bal.setupLookupTables()
type balanceTask struct {
blkid arvados.SizedDigest
bal.collectStatistics(results)
}
-func (bal *Balancer) setupLookupTables() {
+func (bal *Balancer) setupLookupTables(cluster *arvados.Cluster) {
bal.serviceRoots = make(map[string]string)
bal.classes = defaultClasses
bal.mountsByClass = map[string]map[*KeepMount]bool{"default": {}}
// class" case in balanceBlock depends on the order classes
// are considered.
sort.Strings(bal.classes)
+
+ for _, srv := range bal.KeepServices {
+ srv.ChangeSet = &ChangeSet{
+ PullLimit: cluster.Collections.BalancePullLimit,
+ TrashLimit: cluster.Collections.BalanceTrashLimit,
+ }
+ }
}
const (
}
type balancerStats struct {
- lost blocksNBytes
- overrep blocksNBytes
- unref blocksNBytes
- garbage blocksNBytes
- underrep blocksNBytes
- unachievable blocksNBytes
- justright blocksNBytes
- desired blocksNBytes
- current blocksNBytes
- pulls int
- trashes int
- replHistogram []int
- classStats map[string]replicationStats
+ lost blocksNBytes
+ overrep blocksNBytes
+ unref blocksNBytes
+ garbage blocksNBytes
+ underrep blocksNBytes
+ unachievable blocksNBytes
+ justright blocksNBytes
+ desired blocksNBytes
+ current blocksNBytes
+ pulls int
+ pullsDeferred int
+ trashes int
+ trashesDeferred int
+ replHistogram []int
+ classStats map[string]replicationStats
// collectionBytes / collectionBlockBytes = deduplication ratio
collectionBytes int64 // sum(bytes in referenced blocks) across all collections
}
for _, srv := range bal.KeepServices {
s.pulls += len(srv.ChangeSet.Pulls)
+ s.pullsDeferred += srv.ChangeSet.PullsDeferred
s.trashes += len(srv.ChangeSet.Trashes)
+ s.trashesDeferred += srv.ChangeSet.TrashesDeferred
}
bal.stats = s
bal.Metrics.UpdateStats(s)
_, err := s.db.Exec(`delete from collections`)
c.Assert(err, check.IsNil)
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
- Logger: ctxlog.TestLogger(c),
+ Logger: ctxlog.TestLogger(c),
}
s.stub.serveCurrentUserAdmin()
s.stub.serveZeroCollections()
func (s *runSuite) TestRefuseBadIndex(c *check.C) {
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
ChunkPrefix: "abc",
Logger: ctxlog.TestLogger(c),
}
func (s *runSuite) TestRefuseNonAdmin(c *check.C) {
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
- Logger: ctxlog.TestLogger(c),
+ Logger: ctxlog.TestLogger(c),
}
s.stub.serveCurrentUserNotAdmin()
s.stub.serveZeroCollections()
s.SetUpTest(c)
c.Logf("trying invalid prefix %q", trial.prefix)
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
ChunkPrefix: trial.prefix,
Logger: ctxlog.TestLogger(c),
}
func (s *runSuite) TestRefuseSameDeviceDifferentVolumes(c *check.C) {
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
- Logger: ctxlog.TestLogger(c),
+ Logger: ctxlog.TestLogger(c),
}
s.stub.serveCurrentUserAdmin()
s.stub.serveZeroCollections()
s.config.Collections.BlobMissingReport = lostf.Name()
defer os.Remove(lostf.Name())
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
- Logger: ctxlog.TestLogger(c),
+ Logger: ctxlog.TestLogger(c),
}
s.stub.serveCurrentUserAdmin()
s.stub.serveFooBarFileCollections()
}
func (s *runSuite) TestDryRun(c *check.C) {
+ s.config.Collections.BalanceTrashLimit = 0
+ s.config.Collections.BalancePullLimit = 0
opts := RunOptions{
- CommitPulls: false,
- CommitTrash: false,
- Logger: ctxlog.TestLogger(c),
+ Logger: ctxlog.TestLogger(c),
}
s.stub.serveCurrentUserAdmin()
collReqs := s.stub.serveFooBarFileCollections()
}
c.Check(trashReqs.Count(), check.Equals, 0)
c.Check(pullReqs.Count(), check.Equals, 0)
- c.Check(bal.stats.pulls, check.Not(check.Equals), 0)
+ c.Check(bal.stats.pulls, check.Equals, 0)
+ c.Check(bal.stats.pullsDeferred, check.Not(check.Equals), 0)
+ c.Check(bal.stats.trashes, check.Equals, 0)
+ c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
}
s.config.Collections.BlobMissingReport = c.MkDir() + "/keep-balance-lost-blocks-test-"
s.config.ManagementToken = "xyzzy"
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
- Logger: ctxlog.TestLogger(c),
- Dumper: ctxlog.TestLogger(c),
+ Logger: ctxlog.TestLogger(c),
+ Dumper: ctxlog.TestLogger(c),
}
s.stub.serveCurrentUserAdmin()
s.stub.serveFooBarFileCollections()
func (s *runSuite) TestChunkPrefix(c *check.C) {
s.config.Collections.BlobMissingReport = c.MkDir() + "/keep-balance-lost-blocks-test-"
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
ChunkPrefix: "ac", // catch "foo" but not "bar"
Logger: ctxlog.TestLogger(c),
Dumper: ctxlog.TestLogger(c),
func (s *runSuite) TestRunForever(c *check.C) {
s.config.ManagementToken = "xyzzy"
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
- Logger: ctxlog.TestLogger(c),
- Dumper: ctxlog.TestLogger(c),
+ Logger: ctxlog.TestLogger(c),
+ Dumper: ctxlog.TestLogger(c),
}
s.stub.serveCurrentUserAdmin()
s.stub.serveFooBarFileCollections()
"testing"
"time"
+ "git.arvados.org/arvados.git/lib/config"
"git.arvados.org/arvados.git/sdk/go/arvados"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
check "gopkg.in/check.v1"
type balancerSuite struct {
Balancer
+ config *arvados.Cluster
srvs []*KeepService
blks map[string]tester
knownRendezvous [][]int
bal.signatureTTL = 3600
bal.Logger = ctxlog.TestLogger(c)
+
+ cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
+ c.Assert(err, check.Equals, nil)
+ bal.config, err = cfg.GetCluster("")
+ c.Assert(err, check.Equals, nil)
}
func (bal *balancerSuite) SetUpTest(c *check.C) {
// the appropriate changes for that block have been added to the
// changesets.
func (bal *balancerSuite) try(c *check.C, t tester) {
- bal.setupLookupTables()
+ bal.setupLookupTables(bal.config)
blk := &BlockState{
Replicas: bal.replList(t.known, t.current),
Desired: t.desired,
for i, t := range t.timestamps {
blk.Replicas[i].Mtime = t
}
- for _, srv := range bal.srvs {
- srv.ChangeSet = &ChangeSet{}
- }
result := bal.balanceBlock(knownBlkid(t.known), blk)
var didPull, didTrash slots
// ChangeSet is a set of change requests that will be sent to a
// keepstore server.
type ChangeSet struct {
- Pulls []Pull
- Trashes []Trash
- mutex sync.Mutex
+ PullLimit int
+ TrashLimit int
+
+ Pulls []Pull
+ PullsDeferred int // number that weren't added because of PullLimit
+ Trashes []Trash
+ TrashesDeferred int // number that weren't added because of TrashLimit
+ mutex sync.Mutex
}
// AddPull adds a Pull operation.
func (cs *ChangeSet) AddPull(p Pull) {
cs.mutex.Lock()
- cs.Pulls = append(cs.Pulls, p)
+ if len(cs.Pulls) < cs.PullLimit {
+ cs.Pulls = append(cs.Pulls, p)
+ } else {
+ cs.PullsDeferred++
+ }
cs.mutex.Unlock()
}
// AddTrash adds a Trash operation
func (cs *ChangeSet) AddTrash(t Trash) {
cs.mutex.Lock()
- cs.Trashes = append(cs.Trashes, t)
+ if len(cs.Trashes) < cs.TrashLimit {
+ cs.Trashes = append(cs.Trashes, t)
+ } else {
+ cs.TrashesDeferred++
+ }
cs.mutex.Unlock()
}
func (cs *ChangeSet) String() string {
cs.mutex.Lock()
defer cs.mutex.Unlock()
- return fmt.Sprintf("ChangeSet{Pulls:%d, Trashes:%d}", len(cs.Pulls), len(cs.Trashes))
+ return fmt.Sprintf("ChangeSet{Pulls:%d, Trashes:%d} Deferred{Pulls:%d Trashes:%d}", len(cs.Pulls), len(cs.Trashes), cs.PullsDeferred, cs.TrashesDeferred)
}
logger := logrus.New()
logger.Out = io.MultiWriter(&logBuf, os.Stderr)
opts := RunOptions{
- CommitPulls: true,
- CommitTrash: true,
CommitConfirmedFields: true,
Logger: logger,
}
nextOpts, err := bal.Run(context.Background(), s.client, s.config, opts)
c.Check(err, check.IsNil)
c.Check(nextOpts.SafeRendezvousState, check.Not(check.Equals), "")
- c.Check(nextOpts.CommitPulls, check.Equals, true)
if iter == 0 {
c.Check(logBuf.String(), check.Matches, `(?ms).*ChangeSet{Pulls:1.*`)
c.Check(logBuf.String(), check.Not(check.Matches), `(?ms).*ChangeSet{.*Trashes:[^0]}*`)
flags := flag.NewFlagSet(prog, flag.ContinueOnError)
flags.BoolVar(&options.Once, "once", false,
"balance once and then exit")
- flags.BoolVar(&options.CommitPulls, "commit-pulls", false,
- "send pull requests (make more replicas of blocks that are underreplicated or are not in optimal rendezvous probe order)")
- flags.BoolVar(&options.CommitTrash, "commit-trash", false,
- "send trash requests (delete unreferenced old blocks, and excess replicas of overreplicated blocks)")
+ deprCommitPulls := flags.Bool("commit-pulls", true,
+ "send pull requests (must be true -- configure Collections.BalancePullLimit = 0 to disable.)")
+ deprCommitTrash := flags.Bool("commit-trash", true,
+ "send trash requests (must be true -- configure Collections.BalanceTrashLimit = 0 to disable.)")
flags.BoolVar(&options.CommitConfirmedFields, "commit-confirmed-fields", true,
"update collection fields (replicas_confirmed, storage_classes_confirmed, etc.)")
flags.StringVar(&options.ChunkPrefix, "chunk-prefix", "",
return code
}
+ if !*deprCommitPulls || !*deprCommitTrash {
+ fmt.Fprint(stderr,
+ "Usage error: the -commit-pulls or -commit-trash command line flags are no longer supported.\n",
+ "Use Collections.BalancePullLimit and Collections.BalanceTrashLimit instead.\n")
+ return cmd.EX_USAGE
+ }
+
// Drop our custom args that would be rejected by the generic
// service.Command
args = nil
// RunOptions fields are controlled by command line flags.
type RunOptions struct {
Once bool
- CommitPulls bool
- CommitTrash bool
CommitConfirmedFields bool
ChunkPrefix string
Logger logrus.FieldLogger
logger.Printf("starting up: will scan every %v and on SIGUSR1", srv.Cluster.Collections.BalancePeriod)
for {
- if !srv.RunOptions.CommitPulls && !srv.RunOptions.CommitTrash {
+ if srv.Cluster.Collections.BalancePullLimit < 1 && srv.Cluster.Collections.BalanceTrashLimit < 1 {
logger.Print("WARNING: Will scan periodically, but no changes will be committed.")
- logger.Print("======= Consider using -commit-pulls and -commit-trash flags.")
+ logger.Print("======= Consider using non-zero BalancePullLimit and BalanceTrashLimit configs.")
}
if !dblock.KeepBalanceService.Check() {