client.Timeout = 0
rs := bal.rendezvousState()
- if runOptions.CommitTrash && rs != runOptions.SafeRendezvousState {
+ if cluster.Collections.BalanceTrashLimit > 0 && rs != runOptions.SafeRendezvousState {
if runOptions.SafeRendezvousState != "" {
bal.logf("notice: KeepServices list has changed since last run")
}
if err = bal.GetCurrentState(ctx, client, cluster.Collections.BalanceCollectionBatch, cluster.Collections.BalanceCollectionBuffers); err != nil {
return
}
+ bal.setupLookupTables(cluster)
bal.ComputeChangeSets()
bal.PrintStatistics()
if err = bal.CheckSanityLate(); err != nil {
	return
}
lbFile = nil
}
- if runOptions.CommitPulls {
+ if cluster.Collections.BalancePullLimit > 0 {
err = bal.CommitPulls(ctx, client)
if err != nil {
// Skip trash if we can't pull. (Too cautious?)
return
}
}
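// Illustrative aside, not part of the diff: the boolean
// runOptions.CommitPulls / runOptions.CommitTrash switches are
// replaced by numeric per-run limits in the cluster config, where a
// zero limit disables the corresponding phase. A minimal sketch of
// that gating (the helper name and values are examples only):
func phasesEnabled(cluster *arvados.Cluster) (pulls, trash bool) {
	// Mirrors the conditions used above: each phase runs only
	// when its configured limit is greater than zero.
	return cluster.Collections.BalancePullLimit > 0,
		cluster.Collections.BalanceTrashLimit > 0
}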
- if runOptions.CommitTrash {
+ if cluster.Collections.BalanceTrashLimit > 0 {
err = bal.CommitTrash(ctx, client)
if err != nil {
return
rwdev := map[string]*KeepService{}
for _, srv := range bal.KeepServices {
for _, mnt := range srv.mounts {
- if !mnt.ReadOnly {
+ if mnt.AllowWrite {
rwdev[mnt.UUID] = srv
}
}
for _, srv := range bal.KeepServices {
var dedup []*KeepMount
for _, mnt := range srv.mounts {
- if mnt.ReadOnly && rwdev[mnt.UUID] != nil {
+ if !mnt.AllowWrite && rwdev[mnt.UUID] != nil {
bal.logf("skipping srv %s readonly mount %q because same volume is mounted read-write on srv %s", srv, mnt.UUID, rwdev[mnt.UUID])
} else {
dedup = append(dedup, mnt)
// This just calls balanceBlock() once for each block, using a
// pool of worker goroutines.
defer bal.time("changeset_compute", "wall clock time to compute changesets")()
- bal.setupLookupTables()
type balanceTask struct {
blkid arvados.SizedDigest
bal.collectStatistics(results)
}
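// Illustrative aside, not part of the diff: a generic sketch of the
// "pool of worker goroutines" pattern mentioned in the comment above,
// not the actual implementation. runBalanceWorkers, nWorkers, and the
// handle callback are hypothetical names; requires the standard
// "sync" package.
func runBalanceWorkers(nWorkers int, tasks <-chan balanceTask, handle func(balanceTask)) {
	var wg sync.WaitGroup
	for i := 0; i < nWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Each worker drains the shared task channel,
			// calling the handler (e.g. balanceBlock) once
			// per block.
			for task := range tasks {
				handle(task)
			}
		}()
	}
	wg.Wait()
}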
-func (bal *Balancer) setupLookupTables() {
+func (bal *Balancer) setupLookupTables(cluster *arvados.Cluster) {
bal.serviceRoots = make(map[string]string)
bal.classes = defaultClasses
bal.mountsByClass = map[string]map[*KeepMount]bool{"default": {}}
for _, mnt := range srv.mounts {
bal.mounts++
- // All mounts on a read-only service are
- // effectively read-only.
- mnt.ReadOnly = mnt.ReadOnly || srv.ReadOnly
+ if srv.ReadOnly {
+ // All mounts on a read-only service
+ // are effectively read-only.
+ mnt.AllowWrite = false
+ }
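// Illustrative aside, not part of the diff: the single ReadOnly flag
// is replaced by two independent permissions, so "no new writes" and
// "no trashing existing replicas" can differ per mount. How the flags
// are derived from volume configuration is an assumption here; only
// the AllowWrite/AllowTrash field names come from the code in this
// diff, and allowTrashWhenReadOnly is a hypothetical knob.
func effectivePermissions(readOnly, allowTrashWhenReadOnly bool) (allowWrite, allowTrash bool) {
	allowWrite = !readOnly
	// A read-only mount might still be allowed to trash, e.g.
	// while a volume is being drained.
	allowTrash = !readOnly || allowTrashWhenReadOnly
	return
}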
for class := range mnt.StorageClasses {
if mbc := bal.mountsByClass[class]; mbc == nil {
// class" case in balanceBlock depends on the order classes
// are considered.
sort.Strings(bal.classes)
+
+ for _, srv := range bal.KeepServices {
+ srv.ChangeSet = &ChangeSet{
+ PullLimit: cluster.Collections.BalancePullLimit,
+ TrashLimit: cluster.Collections.BalanceTrashLimit,
+ }
+ }
}
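// Illustrative aside, not part of the diff: a sketch of how a
// ChangeSet might enforce the per-server limits set above, which
// would also account for the PullsDeferred/TrashesDeferred counters
// used in the statistics below. This is an assumption about ChangeSet
// internals, not its actual implementation; a real version would also
// need synchronization, since worker goroutines add pulls
// concurrently.
func (cs *ChangeSet) addPullSketch(p Pull) {
	if len(cs.Pulls) < cs.PullLimit {
		cs.Pulls = append(cs.Pulls, p)
	} else {
		// Over the limit: count the pull as deferred instead
		// of queueing it, so it can be retried on a later run.
		cs.PullsDeferred++
	}
}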
const (
slots = append(slots, slot{
mnt: mnt,
repl: repl,
- want: repl != nil && mnt.ReadOnly,
+ want: repl != nil && !mnt.AllowTrash,
})
}
}
protMnt[slot.mnt] = true
replProt += slot.mnt.Replication
}
- if replWant < desired && (slot.repl != nil || !slot.mnt.ReadOnly) {
+ if replWant < desired && (slot.repl != nil || slot.mnt.AllowWrite) {
slots[i].want = true
wantSrv[slot.mnt.KeepService] = true
wantMnt[slot.mnt] = true
}
blockState := computeBlockState(slots, nil, len(blk.Replicas), 0)
- var lost bool
- var changes []string
+ // Sort the slots by rendezvous order. This ensures "trash the
+ // first of N replicas with identical timestamps" is
+ // predictable (helpful for testing) and well distributed
+ // across servers.
+ sort.Slice(slots, func(i, j int) bool {
+ si, sj := slots[i], slots[j]
+ if orderi, orderj := srvRendezvous[si.mnt.KeepService], srvRendezvous[sj.mnt.KeepService]; orderi != orderj {
+ return orderi < orderj
+ } else {
+ return rendezvousLess(si.mnt.UUID, sj.mnt.UUID, blkid)
+ }
+ })
+
+ var (
+ lost bool
+ changes []string
+ trashedMtime = make(map[int64]bool, len(slots))
+ )
for _, slot := range slots {
// TODO: request a Touch if Mtime is duplicated.
var change int
switch {
case !slot.want && slot.repl != nil && slot.repl.Mtime < bal.MinMtime:
- slot.mnt.KeepService.AddTrash(Trash{
- SizedDigest: blkid,
- Mtime: slot.repl.Mtime,
- From: slot.mnt,
- })
- change = changeTrash
+ if trashedMtime[slot.repl.Mtime] {
+ // Don't trash multiple replicas with
+ // identical timestamps. If they are
+ // multiple views of the same backing
+ // storage, asking both servers to
+ // trash is redundant and can cause
+ // races (see #20242). If they are
+ // distinct replicas that happen to
+ // have identical timestamps, we'll
+ // get this one on the next sweep.
+ change = changeNone
+ } else {
+ slot.mnt.KeepService.AddTrash(Trash{
+ SizedDigest: blkid,
+ Mtime: slot.repl.Mtime,
+ From: slot.mnt,
+ })
+ change = changeTrash
+ trashedMtime[slot.repl.Mtime] = true
+ }
case slot.repl == nil && slot.want && len(blk.Replicas) == 0:
lost = true
change = changeNone
- case slot.repl == nil && slot.want && !slot.mnt.ReadOnly:
+ case slot.repl == nil && slot.want && slot.mnt.AllowWrite:
slot.mnt.KeepService.AddPull(Pull{
SizedDigest: blkid,
From: blk.Replicas[0].KeepMount.KeepService,
}
type balancerStats struct {
- lost blocksNBytes
- overrep blocksNBytes
- unref blocksNBytes
- garbage blocksNBytes
- underrep blocksNBytes
- unachievable blocksNBytes
- justright blocksNBytes
- desired blocksNBytes
- current blocksNBytes
- pulls int
- trashes int
- replHistogram []int
- classStats map[string]replicationStats
+ lost blocksNBytes
+ overrep blocksNBytes
+ unref blocksNBytes
+ garbage blocksNBytes
+ underrep blocksNBytes
+ unachievable blocksNBytes
+ justright blocksNBytes
+ desired blocksNBytes
+ current blocksNBytes
+ pulls int
+ pullsDeferred int
+ trashes int
+ trashesDeferred int
+ replHistogram []int
+ classStats map[string]replicationStats
// collectionBytes / collectionBlockBytes = deduplication ratio
collectionBytes int64 // sum(bytes in referenced blocks) across all collections
}
for _, srv := range bal.KeepServices {
s.pulls += len(srv.ChangeSet.Pulls)
+ s.pullsDeferred += srv.ChangeSet.PullsDeferred
s.trashes += len(srv.ChangeSet.Trashes)
+ s.trashesDeferred += srv.ChangeSet.TrashesDeferred
}
bal.stats = s
bal.Metrics.UpdateStats(s)
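// Illustrative aside, not part of the diff: with the two new counters
// accumulated above, a summary line could report how much work was
// postponed by the configured limits. The wording below is an example
// only, not the actual PrintStatistics output.
bal.logf("%d pulls (%d deferred by BalancePullLimit), %d trashes (%d deferred by BalanceTrashLimit)",
	s.pulls, s.pullsDeferred, s.trashes, s.trashesDeferred)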