From e16866d0f398f6c61f11e2ecdf473d47100329c0 Mon Sep 17 00:00:00 2001
From: Tom Clegg
Date: Fri, 3 Dec 2021 10:54:59 -0500
Subject: [PATCH] 18547: Use volume UUID instead of DeviceID to deduplicate mounts.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg
---
 services/keep-balance/balance.go      | 30 ++++++++++++------------------
 services/keep-balance/balance_test.go | 27 ++++++++++++++-------------
 2 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/services/keep-balance/balance.go b/services/keep-balance/balance.go
index bb590e13b3..fa01d512bc 100644
--- a/services/keep-balance/balance.go
+++ b/services/keep-balance/balance.go
@@ -217,8 +217,8 @@ func (bal *Balancer) cleanupMounts() {
 	rwdev := map[string]*KeepService{}
 	for _, srv := range bal.KeepServices {
 		for _, mnt := range srv.mounts {
-			if !mnt.ReadOnly && mnt.DeviceID != "" {
-				rwdev[mnt.DeviceID] = srv
+			if !mnt.ReadOnly {
+				rwdev[mnt.UUID] = srv
 			}
 		}
 	}
@@ -227,8 +227,8 @@ func (bal *Balancer) cleanupMounts() {
 	for _, srv := range bal.KeepServices {
 		var dedup []*KeepMount
 		for _, mnt := range srv.mounts {
-			if mnt.ReadOnly && rwdev[mnt.DeviceID] != nil {
-				bal.logf("skipping srv %s readonly mount %q because same device %q is mounted read-write on srv %s", srv, mnt.UUID, mnt.DeviceID, rwdev[mnt.DeviceID])
+			if mnt.ReadOnly && rwdev[mnt.UUID] != nil {
+				bal.logf("skipping srv %s readonly mount %q because same volume is mounted read-write on srv %s", srv, mnt.UUID, rwdev[mnt.UUID])
 			} else {
 				dedup = append(dedup, mnt)
 			}
@@ -357,12 +357,10 @@ func (bal *Balancer) GetCurrentState(ctx context.Context, c *arvados.Client, pag
 	deviceMount := map[string]*KeepMount{}
 	for _, srv := range bal.KeepServices {
 		for _, mnt := range srv.mounts {
-			equiv := deviceMount[mnt.DeviceID]
+			equiv := deviceMount[mnt.UUID]
 			if equiv == nil {
 				equiv = mnt
-				if mnt.DeviceID != "" {
-					deviceMount[mnt.DeviceID] = equiv
-				}
+				deviceMount[mnt.UUID] = equiv
 			}
 			equivMount[equiv] = append(equivMount[equiv], mnt)
 		}
@@ -667,7 +665,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 			// new/remaining replicas uniformly
 			// across qualifying mounts on a given
 			// server.
-			return rendezvousLess(si.mnt.DeviceID, sj.mnt.DeviceID, blkid)
+			return rendezvousLess(si.mnt.UUID, sj.mnt.UUID, blkid)
 		}
 	})
 
@@ -692,7 +690,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 	// and returns true if all requirements are met.
 	trySlot := func(i int) bool {
 		slot := slots[i]
-		if wantMnt[slot.mnt] || wantDev[slot.mnt.DeviceID] {
+		if wantMnt[slot.mnt] || wantDev[slot.mnt.UUID] {
 			// Already allocated a replica to this
 			// backend device, possibly on a
 			// different server.
@@ -707,9 +705,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 			slots[i].want = true
 			wantSrv[slot.mnt.KeepService] = true
 			wantMnt[slot.mnt] = true
-			if slot.mnt.DeviceID != "" {
-				wantDev[slot.mnt.DeviceID] = true
-			}
+			wantDev[slot.mnt.UUID] = true
 			replWant += slot.mnt.Replication
 		}
 		return replProt >= desired && replWant >= desired
@@ -751,7 +747,7 @@ func (bal *Balancer) balanceBlock(blkid arvados.SizedDigest, blk *BlockState) ba
 	// haven't already been added to unsafeToDelete
 	// because the servers report different Mtimes.
 	for _, slot := range slots {
-		if slot.repl != nil && wantDev[slot.mnt.DeviceID] {
+		if slot.repl != nil && wantDev[slot.mnt.UUID] {
 			unsafeToDelete[slot.repl.Mtime] = true
 		}
 	}
@@ -834,7 +830,7 @@ func computeBlockState(slots []slot, onlyCount map[*KeepMount]bool, have, needRe
 		if onlyCount != nil && !onlyCount[slot.mnt] {
 			continue
 		}
-		if countedDev[slot.mnt.DeviceID] {
+		if countedDev[slot.mnt.UUID] {
 			continue
 		}
 		switch {
@@ -848,9 +844,7 @@ func computeBlockState(slots []slot, onlyCount map[*KeepMount]bool, have, needRe
 			bbs.pulling++
 			repl += slot.mnt.Replication
 		}
-		if slot.mnt.DeviceID != "" {
-			countedDev[slot.mnt.DeviceID] = true
-		}
+		countedDev[slot.mnt.UUID] = true
 	}
 	if repl < needRepl {
 		bbs.unachievable = true
diff --git a/services/keep-balance/balance_test.go b/services/keep-balance/balance_test.go
index c529ac150e..df04145b9d 100644
--- a/services/keep-balance/balance_test.go
+++ b/services/keep-balance/balance_test.go
@@ -167,8 +167,8 @@ func (bal *balancerSuite) testMultipleViews(c *check.C, readonly bool) {
 		srv.mounts[0].KeepMount.DeviceID = fmt.Sprintf("writable-by-srv-%x", i)
 		srv.mounts = append(srv.mounts, &KeepMount{
 			KeepMount: arvados.KeepMount{
-				DeviceID:       fmt.Sprintf("writable-by-srv-%x", (i+1)%len(bal.srvs)),
-				UUID:           fmt.Sprintf("zzzzz-mount-%015x", i<<16),
+				DeviceID:       bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.DeviceID,
+				UUID:           bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.UUID,
 				ReadOnly:       readonly,
 				Replication:    1,
 				StorageClasses: map[string]bool{"default": true},
@@ -347,6 +347,7 @@ func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
 func (bal *balancerSuite) TestCleanupMounts(c *check.C) {
 	bal.srvs[3].mounts[0].KeepMount.ReadOnly = true
 	bal.srvs[3].mounts[0].KeepMount.DeviceID = "abcdef"
+	bal.srvs[14].mounts[0].KeepMount.UUID = bal.srvs[3].mounts[0].KeepMount.UUID
 	bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
 	c.Check(len(bal.srvs[3].mounts), check.Equals, 1)
 	bal.cleanupMounts()
@@ -485,32 +486,32 @@ func (bal *balancerSuite) TestVolumeReplication(c *check.C) {
 }
 
 func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
-	bal.srvs[0].mounts[0].KeepMount.DeviceID = "abcdef"
-	bal.srvs[9].mounts[0].KeepMount.DeviceID = "abcdef"
-	bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
+	dupUUID := bal.srvs[0].mounts[0].KeepMount.UUID
+	bal.srvs[9].mounts[0].KeepMount.UUID = dupUUID
+	bal.srvs[14].mounts[0].KeepMount.UUID = dupUUID
 	// block 0 belongs on servers 3 and e, which have different
-	// device IDs.
+	// UUIDs.
 	bal.try(c, tester{
 		known:      0,
 		desired:    map[string]int{"default": 2},
 		current:    slots{1},
 		shouldPull: slots{0}})
 	// block 1 belongs on servers 0 and 9, which both report
-	// having a replica, but the replicas are on the same device
-	// ID -- so we should pull to the third position (7).
+	// having a replica, but the replicas are on the same volume
+	// -- so we should pull to the third position (7).
 	bal.try(c, tester{
 		known:      1,
 		desired:    map[string]int{"default": 2},
 		current:    slots{0, 1},
 		shouldPull: slots{2}})
-	// block 1 can be pulled to the doubly-mounted device, but the
+	// block 1 can be pulled to the doubly-mounted volume, but the
 	// pull should only be done on the first of the two servers.
 	bal.try(c, tester{
 		known:      1,
 		desired:    map[string]int{"default": 2},
 		current:    slots{2},
 		shouldPull: slots{0}})
-	// block 0 has one replica on a single device mounted on two
+	// block 0 has one replica on a single volume mounted on two
 	// servers (e,9 at positions 1,9). Trashing the replica on 9
 	// would lose the block.
 	bal.try(c, tester{
@@ -523,7 +524,7 @@ func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
 			pulling: 1,
 		}})
 	// block 0 is overreplicated, but the second and third
-	// replicas are the same replica according to DeviceID
+	// replicas are the same replica according to volume UUID
 	// (despite different Mtimes). Don't trash the third replica.
 	bal.try(c, tester{
 		known:   0,
@@ -595,7 +596,7 @@ func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
 		desired:          map[string]int{"default": 2, "special": 1},
 		current:          slots{0, 1},
 		shouldPull:       slots{9},
-		shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
+		shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
 	// If some storage classes are not satisfied, don't trash any
 	// excess replicas. (E.g., if someone desires repl=1 on
 	// class=durable, and we have two copies on class=volatile, we
@@ -605,7 +606,7 @@ func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
 		desired:          map[string]int{"special": 1},
 		current:          slots{0, 1},
 		shouldPull:       slots{9},
-		shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
+		shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
 	// Once storage classes are satisfied, trash excess replicas
 	// that appear earlier in probe order but aren't needed to
 	// satisfy the desired classes.
-- 
2.30.2
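
A note on the approach, with a standalone sketch (not part of the patch): the change keys every mount-deduplication map in balance.go (rwdev, deviceMount, wantDev, countedDev) on the mount's volume UUID instead of its DeviceID. Because a volume UUID is always present, the `!= ""` special cases that guarded the old DeviceID lookups can be dropped. The sketch below illustrates the cleanupMounts rule in isolation; the mount struct, server names, and sample UUIDs are simplified stand-ins for the real KeepMount/KeepService types, not the actual keep-balance API.

package main

import "fmt"

// mount is a simplified stand-in for arvados.KeepMount: just the
// fields that matter for deduplication.
type mount struct {
	UUID     string // volume UUID; always non-empty
	ReadOnly bool
	Server   string // stand-in for the owning KeepService
}

// dedupe mirrors the post-patch cleanupMounts rule: if the same volume
// (same UUID) is mounted read-write anywhere, its read-only mounts are
// dropped from the working set.
func dedupe(mounts []mount) []mount {
	// First pass: record which volume UUIDs have a read-write mount.
	rw := map[string]bool{}
	for _, m := range mounts {
		if !m.ReadOnly {
			rw[m.UUID] = true
		}
	}
	// Second pass: keep everything except read-only duplicates of a
	// volume that is writable elsewhere.
	var kept []mount
	for _, m := range mounts {
		if m.ReadOnly && rw[m.UUID] {
			fmt.Printf("skipping read-only mount %s on %s: volume is mounted read-write elsewhere\n", m.UUID, m.Server)
			continue
		}
		kept = append(kept, m)
	}
	return kept
}

func main() {
	// One volume mounted read-write on keep0 and read-only on keep1,
	// plus an unrelated read-only volume on keep2.
	mounts := []mount{
		{UUID: "zzzzz-mount-000000000000000", Server: "keep0"},
		{UUID: "zzzzz-mount-000000000000000", ReadOnly: true, Server: "keep1"},
		{UUID: "zzzzz-mount-000000000000001", ReadOnly: true, Server: "keep2"},
	}
	for _, m := range dedupe(mounts) {
		fmt.Println("keeping:", m.Server, m.UUID)
	}
}

Only the keep1 mount is dropped here; keep2's mount survives because its volume has no read-write mount anywhere. That distinction is also why the old code needed the empty-DeviceID guard: two distinct volumes that both reported an empty DeviceID must not collapse into a single map key, whereas UUIDs are unique by construction.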