- rendezvousOrder[srv] = i
- slots[i].srv = srv
- }
-
- // Sort readonly replicas ahead of trashable ones. This way,
- // if a single service has excessive replicas, the ones we
- // encounter last (and therefore choose to delete) will be on
- // the writable volumes, where possible.
- //
- // TODO: within the trashable set, prefer the oldest replica
- // that doesn't have a timestamp collision with others.
- sort.Slice(blk.Replicas, func(i, j int) bool {
- mnt := blk.Replicas[i].KeepMount
- return mnt.ReadOnly || mnt.KeepService.ReadOnly
- })
+ srvRendezvous[srv] = i
+ }
+
+ // Below we set underreplicated=true if we find any storage
+ // class that's currently underreplicated -- in that case we
+ // won't want to trash any replicas.
+ underreplicated := false
+
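+ // Mtimes of replicas that must not be trashed, because
+ // trashing them could reduce replication below the
+ // desired level for some storage class.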
+ unsafeToDelete := make(map[int64]bool, len(slots))
+ for _, class := range bal.classes {
+ desired := blk.Desired[class]
+ if desired == 0 {
+ continue
+ }
+
+ // Sort the slots by desirability.
+ sort.Slice(slots, func(i, j int) bool {
+ si, sj := slots[i], slots[j]
+ if classi, classj := bal.mountsByClass[class][si.mnt], bal.mountsByClass[class][sj.mnt]; classi != classj {
+ // Prefer a mount that satisfies the
+ // desired class.
+ return classi
+ } else if si.want != sj.want {
+ // Prefer a mount that will have a
+ // replica no matter what we do here
+ // -- either because it already has an
+ // untrashable replica, or because we
+ // already need it to satisfy a
+ // different storage class.
+ return si.want
+ } else if orderi, orderj := srvRendezvous[si.mnt.KeepService], srvRendezvous[sj.mnt.KeepService]; orderi != orderj {
+ // Prefer a better rendezvous
+ // position.
+ return orderi < orderj
+ } else if repli, replj := si.repl != nil, sj.repl != nil; repli != replj {
+ // Prefer a mount that already has a
+ // replica.
+ return repli
+ } else {
+ // If pull/trash turns out to be
+ // needed, distribute the
+ // new/remaining replicas uniformly
+ // across qualifying mounts on a given
+ // server.
+ return rendezvousLess(si.mnt.UUID, sj.mnt.UUID, blkid)
+ }
+ })
+
+ // Servers/mounts/devices (with or without existing
+ // replicas) that are part of the best achievable
+ // layout for this storage class.
+ wantSrv := map[*KeepService]bool{}
+ wantMnt := map[*KeepMount]bool{}
+ wantDev := map[string]bool{}
+ // Positions (with existing replicas) that have been
+ // protected (via unsafeToDelete) to ensure we don't
+ // reduce replication below the desired level when
+ // trashing replicas that aren't in optimal positions
+ // for any storage class.
+ protMnt := map[*KeepMount]bool{}
+ // Replication planned so far (corresponds to wantMnt).
+ replWant := 0
+ // Protected replication (corresponds to protMnt).
+ replProt := 0
+
+ // trySlot tries using a slot to meet requirements,
+ // and returns true if all requirements are met.
+ trySlot := func(i int) bool {
+ slot := slots[i]
+ if wantMnt[slot.mnt] || wantDev[slot.mnt.UUID] {
+ // Already allocated a replica to this
+ // backend device, possibly on a
+ // different server.
+ return false
+ }
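+ // If this class still needs protected replication and
+ // this slot already holds a replica, protect it: mark
+ // its timestamp unsafe to delete and count its mount's
+ // replication toward the protected total.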
+ if replProt < desired && slot.repl != nil && !protMnt[slot.mnt] {
+ unsafeToDelete[slot.repl.Mtime] = true
+ protMnt[slot.mnt] = true
+ replProt += slot.mnt.Replication
+ }
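+ // If this class still needs planned replication and
+ // this slot can hold a replica (it already has one, or
+ // its mount is writable), mark it as wanted.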
+ if replWant < desired && (slot.repl != nil || !slot.mnt.ReadOnly) {
+ slots[i].want = true
+ wantSrv[slot.mnt.KeepService] = true
+ wantMnt[slot.mnt] = true
+ wantDev[slot.mnt.UUID] = true
+ replWant += slot.mnt.Replication
+ }
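+ // All requirements are met once both protected and
+ // planned replication reach the desired level.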
+ return replProt >= desired && replWant >= desired
+ }