Merge branch '21654-express-upgrade'. Refs #21654
[arvados.git] / services / keep-balance / balance_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package keepbalance
6
7 import (
8         "crypto/md5"
9         "fmt"
10         "sort"
11         "strconv"
12         "testing"
13         "time"
14
15         "git.arvados.org/arvados.git/lib/config"
16         "git.arvados.org/arvados.git/sdk/go/arvados"
17         "git.arvados.org/arvados.git/sdk/go/ctxlog"
18         check "gopkg.in/check.v1"
19 )
20
// Test hooks gocheck into the standard "go test" runner by handing
// the *testing.T to check.TestingT.
func Test(t *testing.T) {
	check.TestingT(t)
}
25
// Register balancerSuite with gocheck.
var _ = check.Suite(&balancerSuite{})
27
// balancerSuite is the gocheck fixture for the tests in this file.
// It embeds Balancer, so fields such as KeepServices, MinMtime and
// Logger are set directly on the suite.
type balancerSuite struct {
	Balancer
	// config is the cluster configuration loaded in SetUpSuite.
	config          *arvados.Cluster
	// srvs holds the fake keep services created by SetUpTest,
	// indexed by server number.
	srvs            []*KeepService
	// blks is not referenced in the visible part of this file.
	blks            map[string]tester
	// knownRendezvous[k][slot] is the server number found at
	// probe position slot for known block k (see SetUpSuite).
	knownRendezvous [][]int
	// signatureTTL is in seconds; SetUpTest multiplies it by 1e9
	// when computing MinMtime.
	signatureTTL    int64
}
36
const (
	// known0 is an index into balancerSuite.knownRendezvous: it
	// selects the first known block.
	known0 = 0
)
41
// slots is a list of positions ("slots") in a block's rendezvous
// (probe) order, as used by the tester fields below.
type slots []int
43
// tester describes one balancing scenario for a single block: the
// starting state and the changes the balancer is expected to make.
type tester struct {
	// known is an index into knownRendezvous selecting the block.
	known       int
	// desired maps storage class name to desired replication.
	desired     map[string]int
	// current lists the slots that currently hold a replica.
	current     slots
	// timestamps, when given, are per-replica mtimes
	// (presumably parallel to current -- confirm in try()).
	timestamps  []int64
	// shouldPull and shouldTrash list the slots expected to
	// receive pull and trash requests.
	shouldPull  slots
	shouldTrash slots

	// shouldPullMounts and shouldTrashMounts list the mount
	// UUIDs expected in the pull and trash requests.
	shouldPullMounts  []string
	shouldTrashMounts []string

	// expectBlockState and expectClassState are the expected
	// balancing results; they are optional in the visible tests.
	expectBlockState *balancedBlockState
	expectClassState map[string]balancedBlockState
}
58
59 func (bal *balancerSuite) SetUpSuite(c *check.C) {
60         bal.knownRendezvous = nil
61         for _, str := range []string{
62                 "3eab2d5fc9681074",
63                 "097dba52e648f1c3",
64                 "c5b4e023f8a7d691",
65                 "9d81c02e76a3bf54",
66         } {
67                 var slots []int
68                 for _, c := range []byte(str) {
69                         pos, _ := strconv.ParseUint(string(c), 16, 4)
70                         slots = append(slots, int(pos))
71                 }
72                 bal.knownRendezvous = append(bal.knownRendezvous, slots)
73         }
74
75         bal.signatureTTL = 3600
76         bal.Logger = ctxlog.TestLogger(c)
77
78         cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
79         c.Assert(err, check.Equals, nil)
80         bal.config, err = cfg.GetCluster("")
81         c.Assert(err, check.Equals, nil)
82 }
83
84 func (bal *balancerSuite) SetUpTest(c *check.C) {
85         bal.srvs = make([]*KeepService, 16)
86         bal.KeepServices = make(map[string]*KeepService)
87         for i := range bal.srvs {
88                 srv := &KeepService{
89                         KeepService: arvados.KeepService{
90                                 UUID: fmt.Sprintf("zzzzz-bi6l4-%015x", i),
91                         },
92                 }
93                 srv.mounts = []*KeepMount{{
94                         KeepMount: arvados.KeepMount{
95                                 UUID:           fmt.Sprintf("zzzzz-mount-%015x", i),
96                                 StorageClasses: map[string]bool{"default": true},
97                                 AllowWrite:     true,
98                                 AllowTrash:     true,
99                         },
100                         KeepService: srv,
101                 }}
102                 bal.srvs[i] = srv
103                 bal.KeepServices[srv.UUID] = srv
104         }
105
106         bal.MinMtime = time.Now().UnixNano() - bal.signatureTTL*1e9
107         bal.cleanupMounts()
108 }
109
110 func (bal *balancerSuite) TestPerfect(c *check.C) {
111         bal.try(c, tester{
112                 desired:     map[string]int{"default": 2},
113                 current:     slots{0, 1},
114                 shouldPull:  nil,
115                 shouldTrash: nil,
116                 expectBlockState: &balancedBlockState{
117                         needed: 2,
118                 }})
119 }
120
121 func (bal *balancerSuite) TestDecreaseRepl(c *check.C) {
122         bal.try(c, tester{
123                 desired:     map[string]int{"default": 2},
124                 current:     slots{0, 2, 1},
125                 shouldTrash: slots{2},
126                 expectBlockState: &balancedBlockState{
127                         needed:   2,
128                         unneeded: 1,
129                 }})
130 }
131
132 func (bal *balancerSuite) TestDecreaseReplToZero(c *check.C) {
133         bal.try(c, tester{
134                 desired:     map[string]int{"default": 0},
135                 current:     slots{0, 1, 3},
136                 shouldTrash: slots{0, 1, 3},
137                 expectBlockState: &balancedBlockState{
138                         unneeded: 3,
139                 }})
140 }
141
142 func (bal *balancerSuite) TestIncreaseRepl(c *check.C) {
143         bal.try(c, tester{
144                 desired:    map[string]int{"default": 4},
145                 current:    slots{0, 1},
146                 shouldPull: slots{2, 3},
147                 expectBlockState: &balancedBlockState{
148                         needed:  2,
149                         pulling: 2,
150                 }})
151 }
152
153 func (bal *balancerSuite) TestSkipReadonly(c *check.C) {
154         bal.srvList(0, slots{3})[0].ReadOnly = true
155         bal.try(c, tester{
156                 desired:    map[string]int{"default": 4},
157                 current:    slots{0, 1},
158                 shouldPull: slots{2, 4},
159                 expectBlockState: &balancedBlockState{
160                         needed:  2,
161                         pulling: 2,
162                 }})
163 }
164
165 func (bal *balancerSuite) TestAllowTrashWhenReadOnly(c *check.C) {
166         srvs := bal.srvList(0, slots{3})
167         srvs[0].mounts[0].KeepMount.AllowWrite = false
168         srvs[0].mounts[0].KeepMount.AllowTrash = true
169         // can't pull to slot 3, so pull to slot 4 instead
170         bal.try(c, tester{
171                 desired:    map[string]int{"default": 4},
172                 current:    slots{0, 1},
173                 shouldPull: slots{2, 4},
174                 expectBlockState: &balancedBlockState{
175                         needed:  2,
176                         pulling: 2,
177                 }})
178         // expect to be able to trash slot 3 in future, so pull to
179         // slot 1
180         bal.try(c, tester{
181                 desired:    map[string]int{"default": 2},
182                 current:    slots{0, 3},
183                 shouldPull: slots{1},
184                 expectBlockState: &balancedBlockState{
185                         needed:  2,
186                         pulling: 1,
187                 }})
188         // trash excess from slot 3
189         bal.try(c, tester{
190                 desired:     map[string]int{"default": 2},
191                 current:     slots{0, 1, 3},
192                 shouldTrash: slots{3},
193                 expectBlockState: &balancedBlockState{
194                         needed:   2,
195                         unneeded: 1,
196                 }})
197 }
198
// TestMultipleViewsReadOnly runs testMultipleViews with the extra
// mounts having allowWrite=false and allowTrash=false.
func (bal *balancerSuite) TestMultipleViewsReadOnly(c *check.C) {
	bal.testMultipleViews(c, false, false)
}
202
// TestMultipleViewsReadOnlyAllowTrash runs testMultipleViews with
// the extra mounts having allowWrite=false and allowTrash=true.
func (bal *balancerSuite) TestMultipleViewsReadOnlyAllowTrash(c *check.C) {
	bal.testMultipleViews(c, false, true)
}
206
// TestMultipleViews runs testMultipleViews with the extra mounts
// having allowWrite=true and allowTrash=true.
func (bal *balancerSuite) TestMultipleViews(c *check.C) {
	bal.testMultipleViews(c, true, true)
}
210
211 func (bal *balancerSuite) testMultipleViews(c *check.C, allowWrite, allowTrash bool) {
212         for i, srv := range bal.srvs {
213                 // Add a mount to each service
214                 srv.mounts[0].KeepMount.DeviceID = fmt.Sprintf("writable-by-srv-%x", i)
215                 srv.mounts = append(srv.mounts, &KeepMount{
216                         KeepMount: arvados.KeepMount{
217                                 DeviceID:       bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.DeviceID,
218                                 UUID:           bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.UUID,
219                                 AllowWrite:     allowWrite,
220                                 AllowTrash:     allowTrash,
221                                 Replication:    1,
222                                 StorageClasses: map[string]bool{"default": true},
223                         },
224                         KeepService: srv,
225                 })
226         }
227         for i := 1; i < len(bal.srvs); i++ {
228                 c.Logf("i=%d", i)
229                 if i == 4 {
230                         // Timestamps are all different, but one of
231                         // the mounts on srv[4] has the same device ID
232                         // where the non-deletable replica is stored
233                         // on srv[3], so only one replica is safe to
234                         // trash.
235                         bal.try(c, tester{
236                                 desired:     map[string]int{"default": 1},
237                                 current:     slots{0, i, i},
238                                 shouldTrash: slots{i}})
239                 } else if !allowTrash {
240                         // Timestamps are all different, and the third
241                         // replica can't be trashed because it's on a
242                         // read-only mount (with
243                         // AllowTrashWhenReadOnly=false), so the first
244                         // two replicas should be trashed.
245                         bal.try(c, tester{
246                                 desired:     map[string]int{"default": 1},
247                                 current:     slots{0, i, i},
248                                 shouldTrash: slots{0, i}})
249                 } else {
250                         // Timestamps are all different, so both
251                         // replicas on the non-optimal server should
252                         // be trashed.
253                         bal.try(c, tester{
254                                 desired:     map[string]int{"default": 1},
255                                 current:     slots{0, i, i},
256                                 shouldTrash: slots{i, i}})
257                 }
258                 // If the three replicas have identical timestamps,
259                 // none of them can be trashed safely.
260                 bal.try(c, tester{
261                         desired:    map[string]int{"default": 1},
262                         current:    slots{0, i, i},
263                         timestamps: []int64{12345678, 12345678, 12345678}})
264                 // If the first and third replicas have identical
265                 // timestamps, only the second replica should be
266                 // trashed.
267                 bal.try(c, tester{
268                         desired:     map[string]int{"default": 1},
269                         current:     slots{0, i, i},
270                         timestamps:  []int64{12345678, 12345679, 12345678},
271                         shouldTrash: slots{i}})
272         }
273 }
274
275 func (bal *balancerSuite) TestFixUnbalanced(c *check.C) {
276         bal.try(c, tester{
277                 desired:    map[string]int{"default": 2},
278                 current:    slots{2, 0},
279                 shouldPull: slots{1}})
280         bal.try(c, tester{
281                 desired:    map[string]int{"default": 2},
282                 current:    slots{2, 7},
283                 shouldPull: slots{0, 1}})
284         // if only one of the pulls succeeds, we'll see this next:
285         bal.try(c, tester{
286                 desired:     map[string]int{"default": 2},
287                 current:     slots{2, 1, 7},
288                 shouldPull:  slots{0},
289                 shouldTrash: slots{7}})
290         // if both pulls succeed, we'll see this next:
291         bal.try(c, tester{
292                 desired:     map[string]int{"default": 2},
293                 current:     slots{2, 0, 1, 7},
294                 shouldTrash: slots{2, 7}})
295
296         // unbalanced + excessive replication => pull + trash
297         bal.try(c, tester{
298                 desired:     map[string]int{"default": 2},
299                 current:     slots{2, 5, 7},
300                 shouldPull:  slots{0, 1},
301                 shouldTrash: slots{7}})
302 }
303
304 func (bal *balancerSuite) TestMultipleReplicasPerService(c *check.C) {
305         for s, srv := range bal.srvs {
306                 for i := 0; i < 3; i++ {
307                         m := *(srv.mounts[0])
308                         m.UUID = fmt.Sprintf("zzzzz-mount-%015x", (s<<10)+i)
309                         srv.mounts = append(srv.mounts, &m)
310                 }
311         }
312         bal.try(c, tester{
313                 desired:    map[string]int{"default": 2},
314                 current:    slots{0, 0},
315                 shouldPull: slots{1}})
316         bal.try(c, tester{
317                 desired:    map[string]int{"default": 2},
318                 current:    slots{2, 2},
319                 shouldPull: slots{0, 1}})
320         bal.try(c, tester{
321                 desired:     map[string]int{"default": 2},
322                 current:     slots{0, 0, 1},
323                 shouldTrash: slots{0}})
324         bal.try(c, tester{
325                 desired:     map[string]int{"default": 2},
326                 current:     slots{1, 1, 0},
327                 shouldTrash: slots{1}})
328         bal.try(c, tester{
329                 desired:     map[string]int{"default": 2},
330                 current:     slots{1, 0, 1, 0, 2},
331                 shouldTrash: slots{0, 1, 2}})
332         bal.try(c, tester{
333                 desired:     map[string]int{"default": 2},
334                 current:     slots{1, 1, 1, 0, 2},
335                 shouldTrash: slots{1, 1, 2}})
336         bal.try(c, tester{
337                 desired:     map[string]int{"default": 2},
338                 current:     slots{1, 1, 2},
339                 shouldPull:  slots{0},
340                 shouldTrash: slots{1}})
341         bal.try(c, tester{
342                 desired:     map[string]int{"default": 2},
343                 current:     slots{1, 1, 0},
344                 timestamps:  []int64{12345678, 12345678, 12345679},
345                 shouldTrash: nil})
346         bal.try(c, tester{
347                 desired:    map[string]int{"default": 2},
348                 current:    slots{1, 1},
349                 shouldPull: slots{0}})
350 }
351
352 func (bal *balancerSuite) TestIncreaseReplTimestampCollision(c *check.C) {
353         // For purposes of increasing replication, we assume identical
354         // replicas are distinct.
355         bal.try(c, tester{
356                 desired:    map[string]int{"default": 4},
357                 current:    slots{0, 1},
358                 timestamps: []int64{12345678, 12345678},
359                 shouldPull: slots{2, 3}})
360 }
361
362 func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
363         // For purposes of decreasing replication, we assume identical
364         // replicas are NOT distinct.
365         bal.try(c, tester{
366                 desired:    map[string]int{"default": 2},
367                 current:    slots{0, 1, 2},
368                 timestamps: []int64{12345678, 12345678, 12345678}})
369         bal.try(c, tester{
370                 desired:    map[string]int{"default": 2},
371                 current:    slots{0, 1, 2},
372                 timestamps: []int64{12345678, 10000000, 10000000}})
373         bal.try(c, tester{
374                 desired:     map[string]int{"default": 0},
375                 current:     slots{0, 1, 2},
376                 timestamps:  []int64{12345678, 12345678, 12345678},
377                 shouldTrash: slots{0},
378                 shouldTrashMounts: []string{
379                         bal.srvs[bal.knownRendezvous[0][0]].mounts[0].UUID}})
380         bal.try(c, tester{
381                 desired:     map[string]int{"default": 2},
382                 current:     slots{0, 1, 2, 5, 6},
383                 timestamps:  []int64{12345678, 12345679, 10000000, 10000000, 10000000},
384                 shouldTrash: slots{2},
385                 shouldTrashMounts: []string{
386                         bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID}})
387         bal.try(c, tester{
388                 desired:     map[string]int{"default": 2},
389                 current:     slots{0, 1, 2, 5, 6},
390                 timestamps:  []int64{12345678, 12345679, 12345671, 10000000, 10000000},
391                 shouldTrash: slots{2, 5},
392                 shouldTrashMounts: []string{
393                         bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID,
394                         bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
395         bal.try(c, tester{
396                 desired:     map[string]int{"default": 2},
397                 current:     slots{0, 1, 2, 5, 6},
398                 timestamps:  []int64{12345678, 12345679, 12345679, 10000000, 10000000},
399                 shouldTrash: slots{5},
400                 shouldTrashMounts: []string{
401                         bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
402 }
403
404 func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
405         oldTime := bal.MinMtime - 3600
406         newTime := bal.MinMtime + 3600
407         // The excess replica is too new to delete.
408         bal.try(c, tester{
409                 desired:    map[string]int{"default": 2},
410                 current:    slots{0, 1, 2},
411                 timestamps: []int64{oldTime, newTime, newTime + 1},
412                 expectBlockState: &balancedBlockState{
413                         needed:   2,
414                         unneeded: 1,
415                 }})
416         // The best replicas are too new to delete, but the excess
417         // replica is old enough.
418         bal.try(c, tester{
419                 desired:     map[string]int{"default": 2},
420                 current:     slots{0, 1, 2},
421                 timestamps:  []int64{newTime, newTime + 1, oldTime},
422                 shouldTrash: slots{2}})
423 }
424
425 func (bal *balancerSuite) TestCleanupMounts(c *check.C) {
426         bal.srvs[3].mounts[0].KeepMount.AllowWrite = false
427         bal.srvs[3].mounts[0].KeepMount.DeviceID = "abcdef"
428         bal.srvs[14].mounts[0].KeepMount.UUID = bal.srvs[3].mounts[0].KeepMount.UUID
429         bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
430         c.Check(len(bal.srvs[3].mounts), check.Equals, 1)
431         bal.cleanupMounts()
432         c.Check(len(bal.srvs[3].mounts), check.Equals, 0)
433         bal.try(c, tester{
434                 known:      0,
435                 desired:    map[string]int{"default": 2},
436                 current:    slots{1},
437                 shouldPull: slots{2}})
438 }
439
440 func (bal *balancerSuite) TestVolumeReplication(c *check.C) {
441         bal.srvs[0].mounts[0].KeepMount.Replication = 2  // srv 0
442         bal.srvs[14].mounts[0].KeepMount.Replication = 2 // srv e
443         bal.cleanupMounts()
444         // block 0 rendezvous is 3,e,a -- so slot 1 has repl=2
445         bal.try(c, tester{
446                 known:      0,
447                 desired:    map[string]int{"default": 2},
448                 current:    slots{1},
449                 shouldPull: slots{0},
450                 expectBlockState: &balancedBlockState{
451                         needed:  1,
452                         pulling: 1,
453                 }})
454         bal.try(c, tester{
455                 known:      0,
456                 desired:    map[string]int{"default": 2},
457                 current:    slots{0, 1},
458                 shouldPull: nil,
459                 expectBlockState: &balancedBlockState{
460                         needed: 2,
461                 }})
462         bal.try(c, tester{
463                 known:       0,
464                 desired:     map[string]int{"default": 2},
465                 current:     slots{0, 1, 2},
466                 shouldTrash: slots{2},
467                 expectBlockState: &balancedBlockState{
468                         needed:   2,
469                         unneeded: 1,
470                 }})
471         bal.try(c, tester{
472                 known:       0,
473                 desired:     map[string]int{"default": 3},
474                 current:     slots{0, 2, 3, 4},
475                 shouldPull:  slots{1},
476                 shouldTrash: slots{4},
477                 expectBlockState: &balancedBlockState{
478                         needed:   3,
479                         unneeded: 1,
480                         pulling:  1,
481                 }})
482         bal.try(c, tester{
483                 known:       0,
484                 desired:     map[string]int{"default": 3},
485                 current:     slots{0, 1, 2, 3, 4},
486                 shouldTrash: slots{2, 3, 4},
487                 expectBlockState: &balancedBlockState{
488                         needed:   2,
489                         unneeded: 3,
490                 }})
491         bal.try(c, tester{
492                 known:       0,
493                 desired:     map[string]int{"default": 4},
494                 current:     slots{0, 1, 2, 3, 4},
495                 shouldTrash: slots{3, 4},
496                 expectBlockState: &balancedBlockState{
497                         needed:   3,
498                         unneeded: 2,
499                 }})
500         // block 1 rendezvous is 0,9,7 -- so slot 0 has repl=2
501         bal.try(c, tester{
502                 known:   1,
503                 desired: map[string]int{"default": 2},
504                 current: slots{0},
505                 expectBlockState: &balancedBlockState{
506                         needed: 1,
507                 }})
508         bal.try(c, tester{
509                 known:      1,
510                 desired:    map[string]int{"default": 3},
511                 current:    slots{0},
512                 shouldPull: slots{1},
513                 expectBlockState: &balancedBlockState{
514                         needed:  1,
515                         pulling: 1,
516                 }})
517         bal.try(c, tester{
518                 known:      1,
519                 desired:    map[string]int{"default": 4},
520                 current:    slots{0},
521                 shouldPull: slots{1, 2},
522                 expectBlockState: &balancedBlockState{
523                         needed:  1,
524                         pulling: 2,
525                 }})
526         bal.try(c, tester{
527                 known:      1,
528                 desired:    map[string]int{"default": 4},
529                 current:    slots{2},
530                 shouldPull: slots{0, 1},
531                 expectBlockState: &balancedBlockState{
532                         needed:  1,
533                         pulling: 2,
534                 }})
535         bal.try(c, tester{
536                 known:      1,
537                 desired:    map[string]int{"default": 4},
538                 current:    slots{7},
539                 shouldPull: slots{0, 1, 2},
540                 expectBlockState: &balancedBlockState{
541                         needed:  1,
542                         pulling: 3,
543                 }})
544         bal.try(c, tester{
545                 known:       1,
546                 desired:     map[string]int{"default": 2},
547                 current:     slots{1, 2, 3, 4},
548                 shouldPull:  slots{0},
549                 shouldTrash: slots{3, 4},
550                 expectBlockState: &balancedBlockState{
551                         needed:   2,
552                         unneeded: 2,
553                         pulling:  1,
554                 }})
555         bal.try(c, tester{
556                 known:       1,
557                 desired:     map[string]int{"default": 2},
558                 current:     slots{0, 1, 2},
559                 shouldTrash: slots{1, 2},
560                 expectBlockState: &balancedBlockState{
561                         needed:   1,
562                         unneeded: 2,
563                 }})
564 }
565
566 func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
567         dupUUID := bal.srvs[0].mounts[0].KeepMount.UUID
568         bal.srvs[9].mounts[0].KeepMount.UUID = dupUUID
569         bal.srvs[14].mounts[0].KeepMount.UUID = dupUUID
570         // block 0 belongs on servers 3 and e, which have different
571         // UUIDs.
572         bal.try(c, tester{
573                 known:      0,
574                 desired:    map[string]int{"default": 2},
575                 current:    slots{1},
576                 shouldPull: slots{0}})
577         // block 1 belongs on servers 0 and 9, which both report
578         // having a replica, but the replicas are on the same volume
579         // -- so we should pull to the third position (7).
580         bal.try(c, tester{
581                 known:      1,
582                 desired:    map[string]int{"default": 2},
583                 current:    slots{0, 1},
584                 shouldPull: slots{2}})
585         // block 1 can be pulled to the doubly-mounted volume, but the
586         // pull should only be done on the first of the two servers.
587         bal.try(c, tester{
588                 known:      1,
589                 desired:    map[string]int{"default": 2},
590                 current:    slots{2},
591                 shouldPull: slots{0}})
592         // block 0 has one replica on a single volume mounted on two
593         // servers (e,9 at positions 1,9). Trashing the replica on 9
594         // would lose the block.
595         bal.try(c, tester{
596                 known:      0,
597                 desired:    map[string]int{"default": 2},
598                 current:    slots{1, 9},
599                 shouldPull: slots{0},
600                 expectBlockState: &balancedBlockState{
601                         needed:  1,
602                         pulling: 1,
603                 }})
604         // block 0 is overreplicated, but the second and third
605         // replicas are the same replica according to volume UUID
606         // (despite different Mtimes). Don't trash the third replica.
607         bal.try(c, tester{
608                 known:   0,
609                 desired: map[string]int{"default": 2},
610                 current: slots{0, 1, 9},
611                 expectBlockState: &balancedBlockState{
612                         needed: 2,
613                 }})
614         // block 0 is overreplicated; the third and fifth replicas are
615         // extra, but the fourth is another view of the second and
616         // shouldn't be trashed.
617         bal.try(c, tester{
618                 known:       0,
619                 desired:     map[string]int{"default": 2},
620                 current:     slots{0, 1, 5, 9, 12},
621                 shouldTrash: slots{5, 12},
622                 expectBlockState: &balancedBlockState{
623                         needed:   2,
624                         unneeded: 2,
625                 }})
626 }
627
628 func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
629         // For known blocks 0/1/2/3, server 9 is slot 9/1/14/0 in
630         // probe order. For these tests we give it two mounts, one
631         // with classes=[special], one with
632         // classes=[special,special2].
633         bal.srvs[9].mounts = []*KeepMount{{
634                 KeepMount: arvados.KeepMount{
635                         AllowWrite:     true,
636                         AllowTrash:     true,
637                         Replication:    1,
638                         StorageClasses: map[string]bool{"special": true},
639                         UUID:           "zzzzz-mount-special00000009",
640                         DeviceID:       "9-special",
641                 },
642                 KeepService: bal.srvs[9],
643         }, {
644                 KeepMount: arvados.KeepMount{
645                         AllowWrite:     true,
646                         AllowTrash:     true,
647                         Replication:    1,
648                         StorageClasses: map[string]bool{"special": true, "special2": true},
649                         UUID:           "zzzzz-mount-special20000009",
650                         DeviceID:       "9-special-and-special2",
651                 },
652                 KeepService: bal.srvs[9],
653         }}
654         // For known blocks 0/1/2/3, server 13 (d) is slot 5/3/11/1 in
655         // probe order. We give it two mounts, one with
656         // classes=[special3], one with classes=[default].
657         bal.srvs[13].mounts = []*KeepMount{{
658                 KeepMount: arvados.KeepMount{
659                         AllowWrite:     true,
660                         AllowTrash:     true,
661                         Replication:    1,
662                         StorageClasses: map[string]bool{"special2": true},
663                         UUID:           "zzzzz-mount-special2000000d",
664                         DeviceID:       "13-special2",
665                 },
666                 KeepService: bal.srvs[13],
667         }, {
668                 KeepMount: arvados.KeepMount{
669                         AllowWrite:     true,
670                         AllowTrash:     true,
671                         Replication:    1,
672                         StorageClasses: map[string]bool{"default": true},
673                         UUID:           "zzzzz-mount-00000000000000d",
674                         DeviceID:       "13-default",
675                 },
676                 KeepService: bal.srvs[13],
677         }}
678         // Pull to slot 9 because that's the only server with the
679         // desired class "special".
680         bal.try(c, tester{
681                 known:            0,
682                 desired:          map[string]int{"default": 2, "special": 1},
683                 current:          slots{0, 1},
684                 shouldPull:       slots{9},
685                 shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
686         // If some storage classes are not satisfied, don't trash any
687         // excess replicas. (E.g., if someone desires repl=1 on
688         // class=durable, and we have two copies on class=volatile, we
689         // should wait for pull to succeed before trashing anything).
690         bal.try(c, tester{
691                 known:            0,
692                 desired:          map[string]int{"special": 1},
693                 current:          slots{0, 1},
694                 shouldPull:       slots{9},
695                 shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
696         // Once storage classes are satisfied, trash excess replicas
697         // that appear earlier in probe order but aren't needed to
698         // satisfy the desired classes.
699         bal.try(c, tester{
700                 known:       0,
701                 desired:     map[string]int{"special": 1},
702                 current:     slots{0, 1, 9},
703                 shouldTrash: slots{0, 1}})
704         // Pull to slot 5, the best server with class "special2".
705         bal.try(c, tester{
706                 known:            0,
707                 desired:          map[string]int{"special2": 1},
708                 current:          slots{0, 1},
709                 shouldPull:       slots{5},
710                 shouldPullMounts: []string{"zzzzz-mount-special2000000d"}})
711         // Pull to slot 5 and 9 to get replication 2 in desired class
712         // "special2".
713         bal.try(c, tester{
714                 known:            0,
715                 desired:          map[string]int{"special2": 2},
716                 current:          slots{0, 1},
717                 shouldPull:       slots{5, 9},
718                 shouldPullMounts: []string{"zzzzz-mount-special20000009", "zzzzz-mount-special2000000d"}})
719         // Slot 0 has a replica in "default", slot 1 has a replica
720         // in "special"; we need another replica in "default", i.e.,
721         // on slot 2.
722         bal.try(c, tester{
723                 known:      1,
724                 desired:    map[string]int{"default": 2, "special": 1},
725                 current:    slots{0, 1},
726                 shouldPull: slots{2}})
727         // Pull to best probe position 0 (despite wrong storage class)
728         // if it's impossible to achieve desired replication in the
729         // desired class (only slots 1 and 3 have special2).
730         bal.try(c, tester{
731                 known:      1,
732                 desired:    map[string]int{"special2": 3},
733                 current:    slots{3},
734                 shouldPull: slots{0, 1}})
735         // Trash excess replica.
736         bal.try(c, tester{
737                 known:       3,
738                 desired:     map[string]int{"special": 1},
739                 current:     slots{0, 1},
740                 shouldTrash: slots{1}})
741         // Leave one copy on slot 1 because slot 0 (server 9) only
742         // gives us repl=1.
743         bal.try(c, tester{
744                 known:   3,
745                 desired: map[string]int{"special": 2},
746                 current: slots{0, 1}})
747 }
748
749 // Clear all servers' changesets, balance a single block, and verify
750 // the appropriate changes for that block have been added to the
751 // changesets.
752 func (bal *balancerSuite) try(c *check.C, t tester) {
753         bal.setupLookupTables(bal.config)
754         blk := &BlockState{
755                 Replicas: bal.replList(t.known, t.current),
756                 Desired:  t.desired,
757         }
758         for i, t := range t.timestamps {
759                 blk.Replicas[i].Mtime = t
760         }
761         result := bal.balanceBlock(knownBlkid(t.known), blk)
762
763         var didPull, didTrash slots
764         var didPullMounts, didTrashMounts []string
765         for i, srv := range bal.srvs {
766                 var slot int
767                 for probeOrder, srvNum := range bal.knownRendezvous[t.known] {
768                         if srvNum == i {
769                                 slot = probeOrder
770                         }
771                 }
772                 for _, pull := range srv.Pulls {
773                         didPull = append(didPull, slot)
774                         didPullMounts = append(didPullMounts, pull.To.UUID)
775                         c.Check(pull.SizedDigest, check.Equals, knownBlkid(t.known))
776                 }
777                 for _, trash := range srv.Trashes {
778                         didTrash = append(didTrash, slot)
779                         didTrashMounts = append(didTrashMounts, trash.From.UUID)
780                         c.Check(trash.SizedDigest, check.Equals, knownBlkid(t.known))
781                 }
782         }
783
784         for _, list := range []slots{didPull, didTrash, t.shouldPull, t.shouldTrash} {
785                 sort.Sort(sort.IntSlice(list))
786         }
787         c.Check(didPull, check.DeepEquals, t.shouldPull)
788         c.Check(didTrash, check.DeepEquals, t.shouldTrash)
789         if t.shouldPullMounts != nil {
790                 sort.Strings(didPullMounts)
791                 c.Check(didPullMounts, check.DeepEquals, t.shouldPullMounts)
792         }
793         if t.shouldTrashMounts != nil {
794                 sort.Strings(didTrashMounts)
795                 c.Check(didTrashMounts, check.DeepEquals, t.shouldTrashMounts)
796         }
797         if t.expectBlockState != nil {
798                 c.Check(result.blockState, check.Equals, *t.expectBlockState)
799         }
800         if t.expectClassState != nil {
801                 c.Check(result.classState, check.DeepEquals, t.expectClassState)
802         }
803 }
804
805 // srvList returns the KeepServices, sorted in rendezvous order and
806 // then selected by idx. For example, srvList(3, slots{0, 1, 4})
807 // returns the first-, second-, and fifth-best servers for storing
808 // bal.knownBlkid(3).
809 func (bal *balancerSuite) srvList(knownBlockID int, order slots) (srvs []*KeepService) {
810         for _, i := range order {
811                 srvs = append(srvs, bal.srvs[bal.knownRendezvous[knownBlockID][i]])
812         }
813         return
814 }
815
816 // replList is like srvList but returns an "existing replicas" slice,
817 // suitable for a BlockState test fixture.
818 func (bal *balancerSuite) replList(knownBlockID int, order slots) (repls []Replica) {
819         nextMnt := map[*KeepService]int{}
820         mtime := time.Now().UnixNano() - (bal.signatureTTL+86400)*1e9
821         for _, srv := range bal.srvList(knownBlockID, order) {
822                 // round-robin repls onto each srv's mounts
823                 n := nextMnt[srv]
824                 nextMnt[srv] = (n + 1) % len(srv.mounts)
825
826                 repls = append(repls, Replica{srv.mounts[n], mtime})
827                 mtime++
828         }
829         return
830 }
831
832 // generate the same data hashes that are tested in
833 // sdk/go/keepclient/root_sorter_test.go
834 func knownBlkid(i int) arvados.SizedDigest {
835         return arvados.SizedDigest(fmt.Sprintf("%x+64", md5.Sum([]byte(fmt.Sprintf("%064x", i)))))
836 }