20242: Trash only one when identical replicas are eligible to trash. 20242-dup-trash-lists
author Tom Clegg <tom@curii.com>
Thu, 23 Mar 2023 18:36:55 +0000 (14:36 -0400)
committer Tom Clegg <tom@curii.com>
Thu, 23 Mar 2023 19:03:58 +0000 (15:03 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

services/keep-balance/balance.go
services/keep-balance/balance_test.go

index 33c907c2031ac97dbcabe8742ad2313ead157f5f..215c5e1b5be1355e9f6793da54ab0c3148eff242 100644 (file)
@@ -829,19 +829,49 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
        }
        blockState := computeBlockState(slots, nil, len(blk.Replicas), 0)
 
-       var lost bool
-       var changes []string
+       // Sort the slots by rendezvous order. This ensures "trash the
+       // first of N replicas with identical timestamps" is
+       // predictable (helpful for testing) and well distributed
+       // across servers.
+       sort.Slice(slots, func(i, j int) bool {
+               si, sj := slots[i], slots[j]
+               if orderi, orderj := srvRendezvous[si.mnt.KeepService], srvRendezvous[sj.mnt.KeepService]; orderi != orderj {
+                       return orderi < orderj
+               } else {
+                       return rendezvousLess(si.mnt.UUID, sj.mnt.UUID, blkid)
+               }
+       })
+
+       var (
+               lost         bool
+               changes      []string
+               trashedMtime = make(map[int64]bool, len(slots))
+       )
        for _, slot := range slots {
                // TODO: request a Touch if Mtime is duplicated.
                var change int
                switch {
                case !slot.want && slot.repl != nil && slot.repl.Mtime < bal.MinMtime:
-                       slot.mnt.KeepService.AddTrash(Trash{
-                               SizedDigest: blkid,
-                               Mtime:       slot.repl.Mtime,
-                               From:        slot.mnt,
-                       })
-                       change = changeTrash
+                       if trashedMtime[slot.repl.Mtime] {
+                               // Don't trash multiple replicas with
+                               // identical timestamps. If they are
+                               // multiple views of the same backing
+                               // storage, asking both servers to
+                               // trash is redundant and can cause
+                               // races (see #20242). If they are
+                               // distinct replicas that happen to
+                               // have identical timestamps, we'll
+                               // get this one on the next sweep.
+                               change = changeNone
+                       } else {
+                               slot.mnt.KeepService.AddTrash(Trash{
+                                       SizedDigest: blkid,
+                                       Mtime:       slot.repl.Mtime,
+                                       From:        slot.mnt,
+                               })
+                               change = changeTrash
+                               trashedMtime[slot.repl.Mtime] = true
+                       }
                case slot.repl == nil && slot.want && len(blk.Replicas) == 0:
                        lost = true
                        change = changeNone
index 6626609b5769f55bdb7d32385afffc443df8712c..f9fca1431b65f4c944618a37f76a7c8cfddcb8a1 100644 (file)
@@ -321,6 +321,35 @@ func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
                desired:    map[string]int{"default": 2},
                current:    slots{0, 1, 2},
                timestamps: []int64{12345678, 10000000, 10000000}})
+       bal.try(c, tester{
+               desired:     map[string]int{"default": 0},
+               current:     slots{0, 1, 2},
+               timestamps:  []int64{12345678, 12345678, 12345678},
+               shouldTrash: slots{0},
+               shouldTrashMounts: []string{
+                       bal.srvs[bal.knownRendezvous[0][0]].mounts[0].UUID}})
+       bal.try(c, tester{
+               desired:     map[string]int{"default": 2},
+               current:     slots{0, 1, 2, 5, 6},
+               timestamps:  []int64{12345678, 12345679, 10000000, 10000000, 10000000},
+               shouldTrash: slots{2},
+               shouldTrashMounts: []string{
+                       bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID}})
+       bal.try(c, tester{
+               desired:     map[string]int{"default": 2},
+               current:     slots{0, 1, 2, 5, 6},
+               timestamps:  []int64{12345678, 12345679, 12345671, 10000000, 10000000},
+               shouldTrash: slots{2, 5},
+               shouldTrashMounts: []string{
+                       bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID,
+                       bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
+       bal.try(c, tester{
+               desired:     map[string]int{"default": 2},
+               current:     slots{0, 1, 2, 5, 6},
+               timestamps:  []int64{12345678, 12345679, 12345679, 10000000, 10000000},
+               shouldTrash: slots{5},
+               shouldTrashMounts: []string{
+                       bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
 }
 
 func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {