Merge branch 'main' from workbench2.git
[arvados.git] / services / keep-balance / balance.go
index 33c907c2031ac97dbcabe8742ad2313ead157f5f..e44dfeda8748aec51e18cd55118c15d509d8fc12 100644 (file)
@@ -227,7 +227,7 @@ func (bal *Balancer) cleanupMounts() {
        rwdev := map[string]*KeepService{}
        for _, srv := range bal.KeepServices {
                for _, mnt := range srv.mounts {
-                       if !mnt.ReadOnly {
+                       if mnt.AllowWrite {
                                rwdev[mnt.UUID] = srv
                        }
                }
@@ -237,7 +237,7 @@ func (bal *Balancer) cleanupMounts() {
        for _, srv := range bal.KeepServices {
                var dedup []*KeepMount
                for _, mnt := range srv.mounts {
-                       if mnt.ReadOnly && rwdev[mnt.UUID] != nil {
+                       if !mnt.AllowWrite && rwdev[mnt.UUID] != nil {
                                bal.logf("skipping srv %s readonly mount %q because same volume is mounted read-write on srv %s", srv, mnt.UUID, rwdev[mnt.UUID])
                        } else {
                                dedup = append(dedup, mnt)
@@ -587,9 +587,11 @@ func (bal *Balancer) setupLookupTables() {
                for _, mnt := range srv.mounts {
                        bal.mounts++
 
-                       // All mounts on a read-only service are
-                       // effectively read-only.
-                       mnt.ReadOnly = mnt.ReadOnly || srv.ReadOnly
+                       if srv.ReadOnly {
+                               // All mounts on a read-only service
+                               // are effectively read-only.
+                               mnt.AllowWrite = false
+                       }
 
                        for class := range mnt.StorageClasses {
                                if mbc := bal.mountsByClass[class]; mbc == nil {
@@ -667,7 +669,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
                        slots = append(slots, slot{
                                mnt:  mnt,
                                repl: repl,
-                               want: repl != nil && mnt.ReadOnly,
+                               want: repl != nil && !mnt.AllowTrash,
                        })
                }
        }
@@ -756,7 +758,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
                                protMnt[slot.mnt] = true
                                replProt += slot.mnt.Replication
                        }
-                       if replWant < desired && (slot.repl != nil || !slot.mnt.ReadOnly) {
+                       if replWant < desired && (slot.repl != nil || slot.mnt.AllowWrite) {
                                slots[i].want = true
                                wantSrv[slot.mnt.KeepService] = true
                                wantMnt[slot.mnt] = true
@@ -829,23 +831,53 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
        }
        blockState := computeBlockState(slots, nil, len(blk.Replicas), 0)
 
-       var lost bool
-       var changes []string
+       // Sort the slots by rendezvous order. This ensures "trash the
+       // first of N replicas with identical timestamps" is
+       // predictable (helpful for testing) and well distributed
+       // across servers.
+       sort.Slice(slots, func(i, j int) bool {
+               si, sj := slots[i], slots[j]
+               if orderi, orderj := srvRendezvous[si.mnt.KeepService], srvRendezvous[sj.mnt.KeepService]; orderi != orderj {
+                       return orderi < orderj
+               } else {
+                       return rendezvousLess(si.mnt.UUID, sj.mnt.UUID, blkid)
+               }
+       })
+
+       var (
+               lost         bool
+               changes      []string
+               trashedMtime = make(map[int64]bool, len(slots))
+       )
        for _, slot := range slots {
                // TODO: request a Touch if Mtime is duplicated.
                var change int
                switch {
                case !slot.want && slot.repl != nil && slot.repl.Mtime < bal.MinMtime:
-                       slot.mnt.KeepService.AddTrash(Trash{
-                               SizedDigest: blkid,
-                               Mtime:       slot.repl.Mtime,
-                               From:        slot.mnt,
-                       })
-                       change = changeTrash
+                       if trashedMtime[slot.repl.Mtime] {
+                               // Don't trash multiple replicas with
+                               // identical timestamps. If they are
+                               // multiple views of the same backing
+                               // storage, asking both servers to
+                               // trash is redundant and can cause
+                               // races (see #20242). If they are
+                               // distinct replicas that happen to
+                               // have identical timestamps, we'll
+                               // get this one on the next sweep.
+                               change = changeNone
+                       } else {
+                               slot.mnt.KeepService.AddTrash(Trash{
+                                       SizedDigest: blkid,
+                                       Mtime:       slot.repl.Mtime,
+                                       From:        slot.mnt,
+                               })
+                               change = changeTrash
+                               trashedMtime[slot.repl.Mtime] = true
+                       }
                case slot.repl == nil && slot.want && len(blk.Replicas) == 0:
                        lost = true
                        change = changeNone
-               case slot.repl == nil && slot.want && !slot.mnt.ReadOnly:
+               case slot.repl == nil && slot.want && slot.mnt.AllowWrite:
                        slot.mnt.KeepService.AddPull(Pull{
                                SizedDigest: blkid,
                                From:        blk.Replicas[0].KeepMount.KeepService,