1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
15 "git.arvados.org/arvados.git/lib/config"
16 "git.arvados.org/arvados.git/sdk/go/arvados"
17 "git.arvados.org/arvados.git/sdk/go/ctxlog"
18 check "gopkg.in/check.v1"
22 func Test(t *testing.T) {
26 var _ = check.Suite(&balancerSuite{})
28 type balancerSuite struct {
30 config *arvados.Cluster
32 blks map[string]tester
33 knownRendezvous [][]int
38 // index into knownRendezvous
46 desired map[string]int
52 shouldPullMounts []string
53 shouldTrashMounts []string
55 expectBlockState *balancedBlockState
56 expectClassState map[string]balancedBlockState
// SetUpSuite builds the knownRendezvous table by parsing strings one
// character at a time as hex digits, sets the suite's signature TTL
// and logger, and loads a cluster config into bal.config.
59 func (bal *balancerSuite) SetUpSuite(c *check.C) {
60 bal.knownRendezvous = nil
61 for _, str := range []string{
// NOTE(review): this loop variable c shadows the *check.C parameter
// c for the duration of the loop.
68 for _, c := range []byte(str) {
// Each character is parsed as a single base-16 digit (bitSize 4, so
// the value is in 0..15). NOTE(review): the ParseUint error is
// discarded, so an invalid character would silently yield 0.
69 pos, _ := strconv.ParseUint(string(c), 16, 4)
70 slots = append(slots, int(pos))
72 bal.knownRendezvous = append(bal.knownRendezvous, slots)
// Elsewhere in this file signatureTTL is multiplied by 1e9 and
// subtracted from a UnixNano timestamp, i.e. it is in seconds.
75 bal.signatureTTL = 3600
76 bal.Logger = ctxlog.TestLogger(c)
78 cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
79 c.Assert(err, check.Equals, nil)
// GetCluster is called with an empty cluster ID.
80 bal.config, err = cfg.GetCluster("")
81 c.Assert(err, check.Equals, nil)
// SetUpTest replaces bal.srvs with 16 KeepServices, gives each one a
// mount in storage class "default", indexes the services by UUID in
// bal.KeepServices, and recomputes bal.MinMtime.
84 func (bal *balancerSuite) SetUpTest(c *check.C) {
85 bal.srvs = make([]*KeepService, 16)
86 bal.KeepServices = make(map[string]*KeepService)
87 for i := range bal.srvs {
89 KeepService: arvados.KeepService{
// Service and mount UUIDs both embed the index i as 15 zero-padded
// hex digits.
90 UUID: fmt.Sprintf("zzzzz-bi6l4-%015x", i),
93 srv.mounts = []*KeepMount{{
94 KeepMount: arvados.KeepMount{
95 UUID: fmt.Sprintf("zzzzz-mount-%015x", i),
96 StorageClasses: map[string]bool{"default": true},
103 bal.KeepServices[srv.UUID] = srv
// MinMtime is in nanoseconds: the current time minus signatureTTL
// seconds.
106 bal.MinMtime = time.Now().UnixNano() - bal.signatureTTL*1e9
110 func (bal *balancerSuite) TestPerfect(c *check.C) {
112 desired: map[string]int{"default": 2},
113 current: slots{0, 1},
116 expectBlockState: &balancedBlockState{
121 func (bal *balancerSuite) TestDecreaseRepl(c *check.C) {
123 desired: map[string]int{"default": 2},
124 current: slots{0, 2, 1},
125 shouldTrash: slots{2},
126 expectBlockState: &balancedBlockState{
132 func (bal *balancerSuite) TestDecreaseReplToZero(c *check.C) {
134 desired: map[string]int{"default": 0},
135 current: slots{0, 1, 3},
136 shouldTrash: slots{0, 1, 3},
137 expectBlockState: &balancedBlockState{
142 func (bal *balancerSuite) TestIncreaseRepl(c *check.C) {
144 desired: map[string]int{"default": 4},
145 current: slots{0, 1},
146 shouldPull: slots{2, 3},
147 expectBlockState: &balancedBlockState{
153 func (bal *balancerSuite) TestSkipReadonly(c *check.C) {
154 bal.srvList(0, slots{3})[0].ReadOnly = true
156 desired: map[string]int{"default": 4},
157 current: slots{0, 1},
158 shouldPull: slots{2, 4},
159 expectBlockState: &balancedBlockState{
165 func (bal *balancerSuite) TestAllowTrashWhenReadOnly(c *check.C) {
166 srvs := bal.srvList(0, slots{3})
167 srvs[0].mounts[0].KeepMount.AllowWrite = false
168 srvs[0].mounts[0].KeepMount.AllowTrash = true
169 // can't pull to slot 3, so pull to slot 4 instead
171 desired: map[string]int{"default": 4},
172 current: slots{0, 1},
173 shouldPull: slots{2, 4},
174 expectBlockState: &balancedBlockState{
178 // expect to be able to trash slot 3 in future, so pull to
181 desired: map[string]int{"default": 2},
182 current: slots{0, 3},
183 shouldPull: slots{1},
184 expectBlockState: &balancedBlockState{
188 // trash excess from slot 3
190 desired: map[string]int{"default": 2},
191 current: slots{0, 1, 3},
192 shouldTrash: slots{3},
193 expectBlockState: &balancedBlockState{
199 func (bal *balancerSuite) TestMultipleViewsReadOnly(c *check.C) {
200 bal.testMultipleViews(c, false, false)
203 func (bal *balancerSuite) TestMultipleViewsReadOnlyAllowTrash(c *check.C) {
204 bal.testMultipleViews(c, false, true)
207 func (bal *balancerSuite) TestMultipleViews(c *check.C) {
208 bal.testMultipleViews(c, true, true)
211 func (bal *balancerSuite) testMultipleViews(c *check.C, allowWrite, allowTrash bool) {
212 for i, srv := range bal.srvs {
213 // Add a mount to each service
214 srv.mounts[0].KeepMount.DeviceID = fmt.Sprintf("writable-by-srv-%x", i)
215 srv.mounts = append(srv.mounts, &KeepMount{
216 KeepMount: arvados.KeepMount{
217 DeviceID: bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.DeviceID,
218 UUID: bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.UUID,
219 AllowWrite: allowWrite,
220 AllowTrash: allowTrash,
222 StorageClasses: map[string]bool{"default": true},
227 for i := 1; i < len(bal.srvs); i++ {
230 // Timestamps are all different, but one of
231 // the mounts on srv[4] has the same device ID
232 // where the non-deletable replica is stored
233 // on srv[3], so only one replica is safe to
236 desired: map[string]int{"default": 1},
237 current: slots{0, i, i},
238 shouldTrash: slots{i}})
239 } else if !allowTrash {
240 // Timestamps are all different, and the third
241 // replica can't be trashed because it's on a
242 // read-only mount (with
243 // AllowTrashWhenReadOnly=false), so the first
244 // two replicas should be trashed.
246 desired: map[string]int{"default": 1},
247 current: slots{0, i, i},
248 shouldTrash: slots{0, i}})
250 // Timestamps are all different, so both
251 // replicas on the non-optimal server should
254 desired: map[string]int{"default": 1},
255 current: slots{0, i, i},
256 shouldTrash: slots{i, i}})
258 // If the three replicas have identical timestamps,
259 // none of them can be trashed safely.
261 desired: map[string]int{"default": 1},
262 current: slots{0, i, i},
263 timestamps: []int64{12345678, 12345678, 12345678}})
264 // If the first and third replicas have identical
265 // timestamps, only the second replica should be
268 desired: map[string]int{"default": 1},
269 current: slots{0, i, i},
270 timestamps: []int64{12345678, 12345679, 12345678},
271 shouldTrash: slots{i}})
275 func (bal *balancerSuite) TestFixUnbalanced(c *check.C) {
277 desired: map[string]int{"default": 2},
278 current: slots{2, 0},
279 shouldPull: slots{1}})
281 desired: map[string]int{"default": 2},
282 current: slots{2, 7},
283 shouldPull: slots{0, 1}})
284 // if only one of the pulls succeeds, we'll see this next:
286 desired: map[string]int{"default": 2},
287 current: slots{2, 1, 7},
288 shouldPull: slots{0},
289 shouldTrash: slots{7}})
290 // if both pulls succeed, we'll see this next:
292 desired: map[string]int{"default": 2},
293 current: slots{2, 0, 1, 7},
294 shouldTrash: slots{2, 7}})
296 // unbalanced + excessive replication => pull + trash
298 desired: map[string]int{"default": 2},
299 current: slots{2, 5, 7},
300 shouldPull: slots{0, 1},
301 shouldTrash: slots{7}})
304 func (bal *balancerSuite) TestMultipleReplicasPerService(c *check.C) {
305 for s, srv := range bal.srvs {
306 for i := 0; i < 3; i++ {
307 m := *(srv.mounts[0])
308 m.UUID = fmt.Sprintf("zzzzz-mount-%015x", (s<<10)+i)
309 srv.mounts = append(srv.mounts, &m)
313 desired: map[string]int{"default": 2},
314 current: slots{0, 0},
315 shouldPull: slots{1}})
317 desired: map[string]int{"default": 2},
318 current: slots{2, 2},
319 shouldPull: slots{0, 1}})
321 desired: map[string]int{"default": 2},
322 current: slots{0, 0, 1},
323 shouldTrash: slots{0}})
325 desired: map[string]int{"default": 2},
326 current: slots{1, 1, 0},
327 shouldTrash: slots{1}})
329 desired: map[string]int{"default": 2},
330 current: slots{1, 0, 1, 0, 2},
331 shouldTrash: slots{0, 1, 2}})
333 desired: map[string]int{"default": 2},
334 current: slots{1, 1, 1, 0, 2},
335 shouldTrash: slots{1, 1, 2}})
337 desired: map[string]int{"default": 2},
338 current: slots{1, 1, 2},
339 shouldPull: slots{0},
340 shouldTrash: slots{1}})
342 desired: map[string]int{"default": 2},
343 current: slots{1, 1, 0},
344 timestamps: []int64{12345678, 12345678, 12345679},
347 desired: map[string]int{"default": 2},
348 current: slots{1, 1},
349 shouldPull: slots{0}})
352 func (bal *balancerSuite) TestIncreaseReplTimestampCollision(c *check.C) {
353 // For purposes of increasing replication, we assume identical
354 // replicas are distinct.
356 desired: map[string]int{"default": 4},
357 current: slots{0, 1},
358 timestamps: []int64{12345678, 12345678},
359 shouldPull: slots{2, 3}})
362 func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
363 // For purposes of decreasing replication, we assume identical
364 // replicas are NOT distinct.
366 desired: map[string]int{"default": 2},
367 current: slots{0, 1, 2},
368 timestamps: []int64{12345678, 12345678, 12345678}})
370 desired: map[string]int{"default": 2},
371 current: slots{0, 1, 2},
372 timestamps: []int64{12345678, 10000000, 10000000}})
374 desired: map[string]int{"default": 0},
375 current: slots{0, 1, 2},
376 timestamps: []int64{12345678, 12345678, 12345678},
377 shouldTrash: slots{0},
378 shouldTrashMounts: []string{
379 bal.srvs[bal.knownRendezvous[0][0]].mounts[0].UUID}})
381 desired: map[string]int{"default": 2},
382 current: slots{0, 1, 2, 5, 6},
383 timestamps: []int64{12345678, 12345679, 10000000, 10000000, 10000000},
384 shouldTrash: slots{2},
385 shouldTrashMounts: []string{
386 bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID}})
388 desired: map[string]int{"default": 2},
389 current: slots{0, 1, 2, 5, 6},
390 timestamps: []int64{12345678, 12345679, 12345671, 10000000, 10000000},
391 shouldTrash: slots{2, 5},
392 shouldTrashMounts: []string{
393 bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID,
394 bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
396 desired: map[string]int{"default": 2},
397 current: slots{0, 1, 2, 5, 6},
398 timestamps: []int64{12345678, 12345679, 12345679, 10000000, 10000000},
399 shouldTrash: slots{5},
400 shouldTrashMounts: []string{
401 bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
// TestDecreaseReplBlockTooNew covers a block with three replicas and
// desired replication 2, where the replica mtimes fall on either side
// of bal.MinMtime.
404 func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
// NOTE(review): bal.MinMtime is derived from UnixNano in SetUpTest,
// so these offsets are 3600 nanoseconds, not one hour. Only the
// ordering relative to MinMtime appears to matter to the cases below.
405 oldTime := bal.MinMtime - 3600
406 newTime := bal.MinMtime + 3600
407 // The excess replica is too new to delete.
409 desired: map[string]int{"default": 2},
410 current: slots{0, 1, 2},
411 timestamps: []int64{oldTime, newTime, newTime + 1},
412 expectBlockState: &balancedBlockState{
416 // The best replicas are too new to delete, but the excess
417 // replica is old enough.
419 desired: map[string]int{"default": 2},
420 current: slots{0, 1, 2},
421 timestamps: []int64{newTime, newTime + 1, oldTime},
422 shouldTrash: slots{2}})
425 func (bal *balancerSuite) TestCleanupMounts(c *check.C) {
426 bal.srvs[3].mounts[0].KeepMount.AllowWrite = false
427 bal.srvs[3].mounts[0].KeepMount.DeviceID = "abcdef"
428 bal.srvs[14].mounts[0].KeepMount.UUID = bal.srvs[3].mounts[0].KeepMount.UUID
429 bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
430 c.Check(len(bal.srvs[3].mounts), check.Equals, 1)
432 c.Check(len(bal.srvs[3].mounts), check.Equals, 0)
435 desired: map[string]int{"default": 2},
437 shouldPull: slots{2}})
440 func (bal *balancerSuite) TestVolumeReplication(c *check.C) {
441 bal.srvs[0].mounts[0].KeepMount.Replication = 2 // srv 0
442 bal.srvs[14].mounts[0].KeepMount.Replication = 2 // srv e
444 // block 0 rendezvous is 3,e,a -- so slot 1 has repl=2
447 desired: map[string]int{"default": 2},
449 shouldPull: slots{0},
450 expectBlockState: &balancedBlockState{
456 desired: map[string]int{"default": 2},
457 current: slots{0, 1},
459 expectBlockState: &balancedBlockState{
464 desired: map[string]int{"default": 2},
465 current: slots{0, 1, 2},
466 shouldTrash: slots{2},
467 expectBlockState: &balancedBlockState{
473 desired: map[string]int{"default": 3},
474 current: slots{0, 2, 3, 4},
475 shouldPull: slots{1},
476 shouldTrash: slots{4},
477 expectBlockState: &balancedBlockState{
484 desired: map[string]int{"default": 3},
485 current: slots{0, 1, 2, 3, 4},
486 shouldTrash: slots{2, 3, 4},
487 expectBlockState: &balancedBlockState{
493 desired: map[string]int{"default": 4},
494 current: slots{0, 1, 2, 3, 4},
495 shouldTrash: slots{3, 4},
496 expectBlockState: &balancedBlockState{
500 // block 1 rendezvous is 0,9,7 -- so slot 0 has repl=2
503 desired: map[string]int{"default": 2},
505 expectBlockState: &balancedBlockState{
510 desired: map[string]int{"default": 3},
512 shouldPull: slots{1},
513 expectBlockState: &balancedBlockState{
519 desired: map[string]int{"default": 4},
521 shouldPull: slots{1, 2},
522 expectBlockState: &balancedBlockState{
528 desired: map[string]int{"default": 4},
530 shouldPull: slots{0, 1},
531 expectBlockState: &balancedBlockState{
537 desired: map[string]int{"default": 4},
539 shouldPull: slots{0, 1, 2},
540 expectBlockState: &balancedBlockState{
546 desired: map[string]int{"default": 2},
547 current: slots{1, 2, 3, 4},
548 shouldPull: slots{0},
549 shouldTrash: slots{3, 4},
550 expectBlockState: &balancedBlockState{
557 desired: map[string]int{"default": 2},
558 current: slots{0, 1, 2},
559 shouldTrash: slots{1, 2},
560 expectBlockState: &balancedBlockState{
566 func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
567 dupUUID := bal.srvs[0].mounts[0].KeepMount.UUID
568 bal.srvs[9].mounts[0].KeepMount.UUID = dupUUID
569 bal.srvs[14].mounts[0].KeepMount.UUID = dupUUID
570 // block 0 belongs on servers 3 and e, which have different
574 desired: map[string]int{"default": 2},
576 shouldPull: slots{0}})
577 // block 1 belongs on servers 0 and 9, which both report
578 // having a replica, but the replicas are on the same volume
579 // -- so we should pull to the third position (7).
582 desired: map[string]int{"default": 2},
583 current: slots{0, 1},
584 shouldPull: slots{2}})
585 // block 1 can be pulled to the doubly-mounted volume, but the
586 // pull should only be done on the first of the two servers.
589 desired: map[string]int{"default": 2},
591 shouldPull: slots{0}})
592 // block 0 has one replica on a single volume mounted on two
593 // servers (e,9 at positions 1,9). Trashing the replica on 9
594 // would lose the block.
597 desired: map[string]int{"default": 2},
598 current: slots{1, 9},
599 shouldPull: slots{0},
600 expectBlockState: &balancedBlockState{
604 // block 0 is overreplicated, but the second and third
605 // replicas are the same replica according to volume UUID
606 // (despite different Mtimes). Don't trash the third replica.
609 desired: map[string]int{"default": 2},
610 current: slots{0, 1, 9},
611 expectBlockState: &balancedBlockState{
614 // block 0 is overreplicated; the third and fifth replicas are
615 // extra, but the fourth is another view of the second and
616 // shouldn't be trashed.
619 desired: map[string]int{"default": 2},
620 current: slots{0, 1, 5, 9, 12},
621 shouldTrash: slots{5, 12},
622 expectBlockState: &balancedBlockState{
628 func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
629 // For known blocks 0/1/2/3, server 9 is slot 9/1/14/0 in
630 // probe order. For these tests we give it two mounts, one
631 // with classes=[special], one with
632 // classes=[special,special2].
633 bal.srvs[9].mounts = []*KeepMount{{
634 KeepMount: arvados.KeepMount{
638 StorageClasses: map[string]bool{"special": true},
639 UUID: "zzzzz-mount-special00000009",
640 DeviceID: "9-special",
642 KeepService: bal.srvs[9],
644 KeepMount: arvados.KeepMount{
648 StorageClasses: map[string]bool{"special": true, "special2": true},
649 UUID: "zzzzz-mount-special20000009",
650 DeviceID: "9-special-and-special2",
652 KeepService: bal.srvs[9],
654 // For known blocks 0/1/2/3, server 13 (d) is slot 5/3/11/1 in
655 // probe order. We give it two mounts, one with
656 // classes=[special2], one with classes=[default].
657 bal.srvs[13].mounts = []*KeepMount{{
658 KeepMount: arvados.KeepMount{
662 StorageClasses: map[string]bool{"special2": true},
663 UUID: "zzzzz-mount-special2000000d",
664 DeviceID: "13-special2",
666 KeepService: bal.srvs[13],
668 KeepMount: arvados.KeepMount{
672 StorageClasses: map[string]bool{"default": true},
673 UUID: "zzzzz-mount-00000000000000d",
674 DeviceID: "13-default",
676 KeepService: bal.srvs[13],
678 // Pull to slot 9 because that's the only server with the
679 // desired class "special".
682 desired: map[string]int{"default": 2, "special": 1},
683 current: slots{0, 1},
684 shouldPull: slots{9},
685 shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
686 // If some storage classes are not satisfied, don't trash any
687 // excess replicas. (E.g., if someone desires repl=1 on
688 // class=durable, and we have two copies on class=volatile, we
689 // should wait for pull to succeed before trashing anything).
692 desired: map[string]int{"special": 1},
693 current: slots{0, 1},
694 shouldPull: slots{9},
695 shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
696 // Once storage classes are satisfied, trash excess replicas
697 // that appear earlier in probe order but aren't needed to
698 // satisfy the desired classes.
701 desired: map[string]int{"special": 1},
702 current: slots{0, 1, 9},
703 shouldTrash: slots{0, 1}})
704 // Pull to slot 5, the best server with class "special2".
707 desired: map[string]int{"special2": 1},
708 current: slots{0, 1},
709 shouldPull: slots{5},
710 shouldPullMounts: []string{"zzzzz-mount-special2000000d"}})
711 // Pull to slot 5 and 9 to get replication 2 in desired class
715 desired: map[string]int{"special2": 2},
716 current: slots{0, 1},
717 shouldPull: slots{5, 9},
718 shouldPullMounts: []string{"zzzzz-mount-special20000009", "zzzzz-mount-special2000000d"}})
719 // Slot 0 has a replica in "default", slot 1 has a replica
720 // in "special"; we need another replica in "default", i.e.,
724 desired: map[string]int{"default": 2, "special": 1},
725 current: slots{0, 1},
726 shouldPull: slots{2}})
727 // Pull to best probe position 0 (despite wrong storage class)
728 // if it's impossible to achieve desired replication in the
729 // desired class (only slots 1 and 3 have special2).
732 desired: map[string]int{"special2": 3},
734 shouldPull: slots{0, 1}})
735 // Trash excess replica.
738 desired: map[string]int{"special": 1},
739 current: slots{0, 1},
740 shouldTrash: slots{1}})
741 // Leave one copy on slot 1 because slot 0 (server 9) only
745 desired: map[string]int{"special": 2},
746 current: slots{0, 1}})
749 // Clear all servers' changesets, balance a single block, and verify
750 // the appropriate changes for that block have been added to the
752 func (bal *balancerSuite) try(c *check.C, t tester) {
753 bal.setupLookupTables(bal.config)
755 Replicas: bal.replList(t.known, t.current),
// Overwrite the replica mtimes from replList with the test case's
// explicit timestamps, when given. The loop variable t shadows the
// tester parameter t inside this loop only.
758 for i, t := range t.timestamps {
759 blk.Replicas[i].Mtime = t
761 result := bal.balanceBlock(knownBlkid(t.known), blk)
763 var didPull, didTrash slots
764 var didPullMounts, didTrashMounts []string
// Walk every server's Pulls and Trashes, recording one slot and one
// mount UUID per entry and checking that each entry refers to the
// block under test.
765 for i, srv := range bal.srvs {
767 for probeOrder, srvNum := range bal.knownRendezvous[t.known] {
772 for _, pull := range srv.Pulls {
773 didPull = append(didPull, slot)
774 didPullMounts = append(didPullMounts, pull.To.UUID)
775 c.Check(pull.SizedDigest, check.Equals, knownBlkid(t.known))
777 for _, trash := range srv.Trashes {
778 didTrash = append(didTrash, slot)
779 didTrashMounts = append(didTrashMounts, trash.From.UUID)
780 c.Check(trash.SizedDigest, check.Equals, knownBlkid(t.known))
// Sort the actual and expected slot lists in place so the DeepEquals
// checks below do not depend on ordering.
784 for _, list := range []slots{didPull, didTrash, t.shouldPull, t.shouldTrash} {
785 sort.Sort(sort.IntSlice(list))
787 c.Check(didPull, check.DeepEquals, t.shouldPull)
788 c.Check(didTrash, check.DeepEquals, t.shouldTrash)
// Mount UUID expectations are optional. Only the actual list is
// sorted here, so a test case has to list its expected UUIDs in
// sorted order.
789 if t.shouldPullMounts != nil {
790 sort.Strings(didPullMounts)
791 c.Check(didPullMounts, check.DeepEquals, t.shouldPullMounts)
793 if t.shouldTrashMounts != nil {
794 sort.Strings(didTrashMounts)
795 c.Check(didTrashMounts, check.DeepEquals, t.shouldTrashMounts)
// Block state and per-class state are likewise compared only when
// the test case sets them.
797 if t.expectBlockState != nil {
798 c.Check(result.blockState, check.Equals, *t.expectBlockState)
800 if t.expectClassState != nil {
801 c.Check(result.classState, check.DeepEquals, t.expectClassState)
805 // srvList returns the KeepServices, sorted in rendezvous order and
806 // then selected by order. For example, srvList(3, slots{0, 1, 4})
807 // returns the first-, second-, and fifth-best servers for storing
808 // knownBlkid(3).
809 func (bal *balancerSuite) srvList(knownBlockID int, order slots) (srvs []*KeepService) {
810 for _, i := range order {
811 srvs = append(srvs, bal.srvs[bal.knownRendezvous[knownBlockID][i]])
816 // replList is like srvList but returns an "existing replicas" slice,
817 // suitable for a BlockState test fixture.
818 func (bal *balancerSuite) replList(knownBlockID int, order slots) (repls []Replica) {
// nextMnt tracks, per service, the index of the mount that receives
// the next replica placed on that service.
819 nextMnt := map[*KeepService]int{}
// Every replica gets the same default mtime: signatureTTL plus one
// day (86400 s) before now, in nanoseconds. That is about one day
// older than the MinMtime computed in SetUpTest -- presumably so the
// default replicas count as old enough to trash.
820 mtime := time.Now().UnixNano() - (bal.signatureTTL+86400)*1e9
821 for _, srv := range bal.srvList(knownBlockID, order) {
822 // round-robin repls onto each srv's mounts
824 nextMnt[srv] = (n + 1) % len(srv.mounts)
826 repls = append(repls, Replica{srv.mounts[n], mtime})
832 // knownBlkid generates the same data hashes that are tested in
833 // sdk/go/keepclient/root_sorter_test.go.
834 func knownBlkid(i int) arvados.SizedDigest {
835 return arvados.SizedDigest(fmt.Sprintf("%x+64", md5.Sum([]byte(fmt.Sprintf("%064x", i)))))