21126: Test keep-balance behavior with AllowTrashWhenReadOnly.
[arvados.git] / services / keep-balance / balance_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package keepbalance
6
7 import (
8         "crypto/md5"
9         "fmt"
10         "sort"
11         "strconv"
12         "testing"
13         "time"
14
15         "git.arvados.org/arvados.git/sdk/go/arvados"
16         "git.arvados.org/arvados.git/sdk/go/ctxlog"
17         check "gopkg.in/check.v1"
18 )
19
20 // Test with Gocheck
21 func Test(t *testing.T) {
22         check.TestingT(t)
23 }
24
25 var _ = check.Suite(&balancerSuite{})
26
27 type balancerSuite struct {
28         Balancer
29         srvs            []*KeepService
30         blks            map[string]tester
31         knownRendezvous [][]int
32         signatureTTL    int64
33 }
34
// known0 is an index into knownRendezvous: it selects the first known
// block.
const known0 = 0
39
40 type slots []int
41
42 type tester struct {
43         known       int
44         desired     map[string]int
45         current     slots
46         timestamps  []int64
47         shouldPull  slots
48         shouldTrash slots
49
50         shouldPullMounts  []string
51         shouldTrashMounts []string
52
53         expectBlockState *balancedBlockState
54         expectClassState map[string]balancedBlockState
55 }
56
57 func (bal *balancerSuite) SetUpSuite(c *check.C) {
58         bal.knownRendezvous = nil
59         for _, str := range []string{
60                 "3eab2d5fc9681074",
61                 "097dba52e648f1c3",
62                 "c5b4e023f8a7d691",
63                 "9d81c02e76a3bf54",
64         } {
65                 var slots []int
66                 for _, c := range []byte(str) {
67                         pos, _ := strconv.ParseUint(string(c), 16, 4)
68                         slots = append(slots, int(pos))
69                 }
70                 bal.knownRendezvous = append(bal.knownRendezvous, slots)
71         }
72
73         bal.signatureTTL = 3600
74         bal.Logger = ctxlog.TestLogger(c)
75 }
76
77 func (bal *balancerSuite) SetUpTest(c *check.C) {
78         bal.srvs = make([]*KeepService, 16)
79         bal.KeepServices = make(map[string]*KeepService)
80         for i := range bal.srvs {
81                 srv := &KeepService{
82                         KeepService: arvados.KeepService{
83                                 UUID: fmt.Sprintf("zzzzz-bi6l4-%015x", i),
84                         },
85                 }
86                 srv.mounts = []*KeepMount{{
87                         KeepMount: arvados.KeepMount{
88                                 UUID:           fmt.Sprintf("zzzzz-mount-%015x", i),
89                                 StorageClasses: map[string]bool{"default": true},
90                                 AllowWrite:     true,
91                                 AllowTrash:     true,
92                         },
93                         KeepService: srv,
94                 }}
95                 bal.srvs[i] = srv
96                 bal.KeepServices[srv.UUID] = srv
97         }
98
99         bal.MinMtime = time.Now().UnixNano() - bal.signatureTTL*1e9
100         bal.cleanupMounts()
101 }
102
103 func (bal *balancerSuite) TestPerfect(c *check.C) {
104         bal.try(c, tester{
105                 desired:     map[string]int{"default": 2},
106                 current:     slots{0, 1},
107                 shouldPull:  nil,
108                 shouldTrash: nil,
109                 expectBlockState: &balancedBlockState{
110                         needed: 2,
111                 }})
112 }
113
114 func (bal *balancerSuite) TestDecreaseRepl(c *check.C) {
115         bal.try(c, tester{
116                 desired:     map[string]int{"default": 2},
117                 current:     slots{0, 2, 1},
118                 shouldTrash: slots{2},
119                 expectBlockState: &balancedBlockState{
120                         needed:   2,
121                         unneeded: 1,
122                 }})
123 }
124
125 func (bal *balancerSuite) TestDecreaseReplToZero(c *check.C) {
126         bal.try(c, tester{
127                 desired:     map[string]int{"default": 0},
128                 current:     slots{0, 1, 3},
129                 shouldTrash: slots{0, 1, 3},
130                 expectBlockState: &balancedBlockState{
131                         unneeded: 3,
132                 }})
133 }
134
135 func (bal *balancerSuite) TestIncreaseRepl(c *check.C) {
136         bal.try(c, tester{
137                 desired:    map[string]int{"default": 4},
138                 current:    slots{0, 1},
139                 shouldPull: slots{2, 3},
140                 expectBlockState: &balancedBlockState{
141                         needed:  2,
142                         pulling: 2,
143                 }})
144 }
145
146 func (bal *balancerSuite) TestSkipReadonly(c *check.C) {
147         bal.srvList(0, slots{3})[0].ReadOnly = true
148         bal.try(c, tester{
149                 desired:    map[string]int{"default": 4},
150                 current:    slots{0, 1},
151                 shouldPull: slots{2, 4},
152                 expectBlockState: &balancedBlockState{
153                         needed:  2,
154                         pulling: 2,
155                 }})
156 }
157
158 func (bal *balancerSuite) TestAllowTrashWhenReadOnly(c *check.C) {
159         srvs := bal.srvList(0, slots{3})
160         srvs[0].mounts[0].KeepMount.AllowWrite = false
161         srvs[0].mounts[0].KeepMount.AllowTrash = true
162         // can't pull to slot 3, so pull to slot 4 instead
163         bal.try(c, tester{
164                 desired:    map[string]int{"default": 4},
165                 current:    slots{0, 1},
166                 shouldPull: slots{2, 4},
167                 expectBlockState: &balancedBlockState{
168                         needed:  2,
169                         pulling: 2,
170                 }})
171         // expect to be able to trash slot 3 in future, so pull to
172         // slot 1
173         bal.try(c, tester{
174                 desired:    map[string]int{"default": 2},
175                 current:    slots{0, 3},
176                 shouldPull: slots{1},
177                 expectBlockState: &balancedBlockState{
178                         needed:  2,
179                         pulling: 1,
180                 }})
181         // trash excess from slot 3
182         bal.try(c, tester{
183                 desired:     map[string]int{"default": 2},
184                 current:     slots{0, 1, 3},
185                 shouldTrash: slots{3},
186                 expectBlockState: &balancedBlockState{
187                         needed:   2,
188                         unneeded: 1,
189                 }})
190 }
191
192 func (bal *balancerSuite) TestMultipleViewsReadOnly(c *check.C) {
193         bal.testMultipleViews(c, false, false)
194 }
195
196 func (bal *balancerSuite) TestMultipleViewsReadOnlyAllowTrash(c *check.C) {
197         bal.testMultipleViews(c, false, true)
198 }
199
200 func (bal *balancerSuite) TestMultipleViews(c *check.C) {
201         bal.testMultipleViews(c, true, true)
202 }
203
204 func (bal *balancerSuite) testMultipleViews(c *check.C, allowWrite, allowTrash bool) {
205         for i, srv := range bal.srvs {
206                 // Add a mount to each service
207                 srv.mounts[0].KeepMount.DeviceID = fmt.Sprintf("writable-by-srv-%x", i)
208                 srv.mounts = append(srv.mounts, &KeepMount{
209                         KeepMount: arvados.KeepMount{
210                                 DeviceID:       bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.DeviceID,
211                                 UUID:           bal.srvs[(i+1)%len(bal.srvs)].mounts[0].KeepMount.UUID,
212                                 AllowWrite:     allowWrite,
213                                 AllowTrash:     allowTrash,
214                                 Replication:    1,
215                                 StorageClasses: map[string]bool{"default": true},
216                         },
217                         KeepService: srv,
218                 })
219         }
220         for i := 1; i < len(bal.srvs); i++ {
221                 c.Logf("i=%d", i)
222                 if i == 4 {
223                         // Timestamps are all different, but one of
224                         // the mounts on srv[4] has the same device ID
225                         // where the non-deletable replica is stored
226                         // on srv[3], so only one replica is safe to
227                         // trash.
228                         bal.try(c, tester{
229                                 desired:     map[string]int{"default": 1},
230                                 current:     slots{0, i, i},
231                                 shouldTrash: slots{i}})
232                 } else if !allowTrash {
233                         // Timestamps are all different, and the third
234                         // replica can't be trashed because it's on a
235                         // read-only mount (with
236                         // AllowTrashWhenReadOnly=false), so the first
237                         // two replicas should be trashed.
238                         bal.try(c, tester{
239                                 desired:     map[string]int{"default": 1},
240                                 current:     slots{0, i, i},
241                                 shouldTrash: slots{0, i}})
242                 } else {
243                         // Timestamps are all different, so both
244                         // replicas on the non-optimal server should
245                         // be trashed.
246                         bal.try(c, tester{
247                                 desired:     map[string]int{"default": 1},
248                                 current:     slots{0, i, i},
249                                 shouldTrash: slots{i, i}})
250                 }
251                 // If the three replicas have identical timestamps,
252                 // none of them can be trashed safely.
253                 bal.try(c, tester{
254                         desired:    map[string]int{"default": 1},
255                         current:    slots{0, i, i},
256                         timestamps: []int64{12345678, 12345678, 12345678}})
257                 // If the first and third replicas have identical
258                 // timestamps, only the second replica should be
259                 // trashed.
260                 bal.try(c, tester{
261                         desired:     map[string]int{"default": 1},
262                         current:     slots{0, i, i},
263                         timestamps:  []int64{12345678, 12345679, 12345678},
264                         shouldTrash: slots{i}})
265         }
266 }
267
268 func (bal *balancerSuite) TestFixUnbalanced(c *check.C) {
269         bal.try(c, tester{
270                 desired:    map[string]int{"default": 2},
271                 current:    slots{2, 0},
272                 shouldPull: slots{1}})
273         bal.try(c, tester{
274                 desired:    map[string]int{"default": 2},
275                 current:    slots{2, 7},
276                 shouldPull: slots{0, 1}})
277         // if only one of the pulls succeeds, we'll see this next:
278         bal.try(c, tester{
279                 desired:     map[string]int{"default": 2},
280                 current:     slots{2, 1, 7},
281                 shouldPull:  slots{0},
282                 shouldTrash: slots{7}})
283         // if both pulls succeed, we'll see this next:
284         bal.try(c, tester{
285                 desired:     map[string]int{"default": 2},
286                 current:     slots{2, 0, 1, 7},
287                 shouldTrash: slots{2, 7}})
288
289         // unbalanced + excessive replication => pull + trash
290         bal.try(c, tester{
291                 desired:     map[string]int{"default": 2},
292                 current:     slots{2, 5, 7},
293                 shouldPull:  slots{0, 1},
294                 shouldTrash: slots{7}})
295 }
296
297 func (bal *balancerSuite) TestMultipleReplicasPerService(c *check.C) {
298         for s, srv := range bal.srvs {
299                 for i := 0; i < 3; i++ {
300                         m := *(srv.mounts[0])
301                         m.UUID = fmt.Sprintf("zzzzz-mount-%015x", (s<<10)+i)
302                         srv.mounts = append(srv.mounts, &m)
303                 }
304         }
305         bal.try(c, tester{
306                 desired:    map[string]int{"default": 2},
307                 current:    slots{0, 0},
308                 shouldPull: slots{1}})
309         bal.try(c, tester{
310                 desired:    map[string]int{"default": 2},
311                 current:    slots{2, 2},
312                 shouldPull: slots{0, 1}})
313         bal.try(c, tester{
314                 desired:     map[string]int{"default": 2},
315                 current:     slots{0, 0, 1},
316                 shouldTrash: slots{0}})
317         bal.try(c, tester{
318                 desired:     map[string]int{"default": 2},
319                 current:     slots{1, 1, 0},
320                 shouldTrash: slots{1}})
321         bal.try(c, tester{
322                 desired:     map[string]int{"default": 2},
323                 current:     slots{1, 0, 1, 0, 2},
324                 shouldTrash: slots{0, 1, 2}})
325         bal.try(c, tester{
326                 desired:     map[string]int{"default": 2},
327                 current:     slots{1, 1, 1, 0, 2},
328                 shouldTrash: slots{1, 1, 2}})
329         bal.try(c, tester{
330                 desired:     map[string]int{"default": 2},
331                 current:     slots{1, 1, 2},
332                 shouldPull:  slots{0},
333                 shouldTrash: slots{1}})
334         bal.try(c, tester{
335                 desired:     map[string]int{"default": 2},
336                 current:     slots{1, 1, 0},
337                 timestamps:  []int64{12345678, 12345678, 12345679},
338                 shouldTrash: nil})
339         bal.try(c, tester{
340                 desired:    map[string]int{"default": 2},
341                 current:    slots{1, 1},
342                 shouldPull: slots{0}})
343 }
344
345 func (bal *balancerSuite) TestIncreaseReplTimestampCollision(c *check.C) {
346         // For purposes of increasing replication, we assume identical
347         // replicas are distinct.
348         bal.try(c, tester{
349                 desired:    map[string]int{"default": 4},
350                 current:    slots{0, 1},
351                 timestamps: []int64{12345678, 12345678},
352                 shouldPull: slots{2, 3}})
353 }
354
355 func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
356         // For purposes of decreasing replication, we assume identical
357         // replicas are NOT distinct.
358         bal.try(c, tester{
359                 desired:    map[string]int{"default": 2},
360                 current:    slots{0, 1, 2},
361                 timestamps: []int64{12345678, 12345678, 12345678}})
362         bal.try(c, tester{
363                 desired:    map[string]int{"default": 2},
364                 current:    slots{0, 1, 2},
365                 timestamps: []int64{12345678, 10000000, 10000000}})
366         bal.try(c, tester{
367                 desired:     map[string]int{"default": 0},
368                 current:     slots{0, 1, 2},
369                 timestamps:  []int64{12345678, 12345678, 12345678},
370                 shouldTrash: slots{0},
371                 shouldTrashMounts: []string{
372                         bal.srvs[bal.knownRendezvous[0][0]].mounts[0].UUID}})
373         bal.try(c, tester{
374                 desired:     map[string]int{"default": 2},
375                 current:     slots{0, 1, 2, 5, 6},
376                 timestamps:  []int64{12345678, 12345679, 10000000, 10000000, 10000000},
377                 shouldTrash: slots{2},
378                 shouldTrashMounts: []string{
379                         bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID}})
380         bal.try(c, tester{
381                 desired:     map[string]int{"default": 2},
382                 current:     slots{0, 1, 2, 5, 6},
383                 timestamps:  []int64{12345678, 12345679, 12345671, 10000000, 10000000},
384                 shouldTrash: slots{2, 5},
385                 shouldTrashMounts: []string{
386                         bal.srvs[bal.knownRendezvous[0][2]].mounts[0].UUID,
387                         bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
388         bal.try(c, tester{
389                 desired:     map[string]int{"default": 2},
390                 current:     slots{0, 1, 2, 5, 6},
391                 timestamps:  []int64{12345678, 12345679, 12345679, 10000000, 10000000},
392                 shouldTrash: slots{5},
393                 shouldTrashMounts: []string{
394                         bal.srvs[bal.knownRendezvous[0][5]].mounts[0].UUID}})
395 }
396
397 func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
398         oldTime := bal.MinMtime - 3600
399         newTime := bal.MinMtime + 3600
400         // The excess replica is too new to delete.
401         bal.try(c, tester{
402                 desired:    map[string]int{"default": 2},
403                 current:    slots{0, 1, 2},
404                 timestamps: []int64{oldTime, newTime, newTime + 1},
405                 expectBlockState: &balancedBlockState{
406                         needed:   2,
407                         unneeded: 1,
408                 }})
409         // The best replicas are too new to delete, but the excess
410         // replica is old enough.
411         bal.try(c, tester{
412                 desired:     map[string]int{"default": 2},
413                 current:     slots{0, 1, 2},
414                 timestamps:  []int64{newTime, newTime + 1, oldTime},
415                 shouldTrash: slots{2}})
416 }
417
418 func (bal *balancerSuite) TestCleanupMounts(c *check.C) {
419         bal.srvs[3].mounts[0].KeepMount.AllowWrite = false
420         bal.srvs[3].mounts[0].KeepMount.DeviceID = "abcdef"
421         bal.srvs[14].mounts[0].KeepMount.UUID = bal.srvs[3].mounts[0].KeepMount.UUID
422         bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
423         c.Check(len(bal.srvs[3].mounts), check.Equals, 1)
424         bal.cleanupMounts()
425         c.Check(len(bal.srvs[3].mounts), check.Equals, 0)
426         bal.try(c, tester{
427                 known:      0,
428                 desired:    map[string]int{"default": 2},
429                 current:    slots{1},
430                 shouldPull: slots{2}})
431 }
432
433 func (bal *balancerSuite) TestVolumeReplication(c *check.C) {
434         bal.srvs[0].mounts[0].KeepMount.Replication = 2  // srv 0
435         bal.srvs[14].mounts[0].KeepMount.Replication = 2 // srv e
436         bal.cleanupMounts()
437         // block 0 rendezvous is 3,e,a -- so slot 1 has repl=2
438         bal.try(c, tester{
439                 known:      0,
440                 desired:    map[string]int{"default": 2},
441                 current:    slots{1},
442                 shouldPull: slots{0},
443                 expectBlockState: &balancedBlockState{
444                         needed:  1,
445                         pulling: 1,
446                 }})
447         bal.try(c, tester{
448                 known:      0,
449                 desired:    map[string]int{"default": 2},
450                 current:    slots{0, 1},
451                 shouldPull: nil,
452                 expectBlockState: &balancedBlockState{
453                         needed: 2,
454                 }})
455         bal.try(c, tester{
456                 known:       0,
457                 desired:     map[string]int{"default": 2},
458                 current:     slots{0, 1, 2},
459                 shouldTrash: slots{2},
460                 expectBlockState: &balancedBlockState{
461                         needed:   2,
462                         unneeded: 1,
463                 }})
464         bal.try(c, tester{
465                 known:       0,
466                 desired:     map[string]int{"default": 3},
467                 current:     slots{0, 2, 3, 4},
468                 shouldPull:  slots{1},
469                 shouldTrash: slots{4},
470                 expectBlockState: &balancedBlockState{
471                         needed:   3,
472                         unneeded: 1,
473                         pulling:  1,
474                 }})
475         bal.try(c, tester{
476                 known:       0,
477                 desired:     map[string]int{"default": 3},
478                 current:     slots{0, 1, 2, 3, 4},
479                 shouldTrash: slots{2, 3, 4},
480                 expectBlockState: &balancedBlockState{
481                         needed:   2,
482                         unneeded: 3,
483                 }})
484         bal.try(c, tester{
485                 known:       0,
486                 desired:     map[string]int{"default": 4},
487                 current:     slots{0, 1, 2, 3, 4},
488                 shouldTrash: slots{3, 4},
489                 expectBlockState: &balancedBlockState{
490                         needed:   3,
491                         unneeded: 2,
492                 }})
493         // block 1 rendezvous is 0,9,7 -- so slot 0 has repl=2
494         bal.try(c, tester{
495                 known:   1,
496                 desired: map[string]int{"default": 2},
497                 current: slots{0},
498                 expectBlockState: &balancedBlockState{
499                         needed: 1,
500                 }})
501         bal.try(c, tester{
502                 known:      1,
503                 desired:    map[string]int{"default": 3},
504                 current:    slots{0},
505                 shouldPull: slots{1},
506                 expectBlockState: &balancedBlockState{
507                         needed:  1,
508                         pulling: 1,
509                 }})
510         bal.try(c, tester{
511                 known:      1,
512                 desired:    map[string]int{"default": 4},
513                 current:    slots{0},
514                 shouldPull: slots{1, 2},
515                 expectBlockState: &balancedBlockState{
516                         needed:  1,
517                         pulling: 2,
518                 }})
519         bal.try(c, tester{
520                 known:      1,
521                 desired:    map[string]int{"default": 4},
522                 current:    slots{2},
523                 shouldPull: slots{0, 1},
524                 expectBlockState: &balancedBlockState{
525                         needed:  1,
526                         pulling: 2,
527                 }})
528         bal.try(c, tester{
529                 known:      1,
530                 desired:    map[string]int{"default": 4},
531                 current:    slots{7},
532                 shouldPull: slots{0, 1, 2},
533                 expectBlockState: &balancedBlockState{
534                         needed:  1,
535                         pulling: 3,
536                 }})
537         bal.try(c, tester{
538                 known:       1,
539                 desired:     map[string]int{"default": 2},
540                 current:     slots{1, 2, 3, 4},
541                 shouldPull:  slots{0},
542                 shouldTrash: slots{3, 4},
543                 expectBlockState: &balancedBlockState{
544                         needed:   2,
545                         unneeded: 2,
546                         pulling:  1,
547                 }})
548         bal.try(c, tester{
549                 known:       1,
550                 desired:     map[string]int{"default": 2},
551                 current:     slots{0, 1, 2},
552                 shouldTrash: slots{1, 2},
553                 expectBlockState: &balancedBlockState{
554                         needed:   1,
555                         unneeded: 2,
556                 }})
557 }
558
559 func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
560         dupUUID := bal.srvs[0].mounts[0].KeepMount.UUID
561         bal.srvs[9].mounts[0].KeepMount.UUID = dupUUID
562         bal.srvs[14].mounts[0].KeepMount.UUID = dupUUID
563         // block 0 belongs on servers 3 and e, which have different
564         // UUIDs.
565         bal.try(c, tester{
566                 known:      0,
567                 desired:    map[string]int{"default": 2},
568                 current:    slots{1},
569                 shouldPull: slots{0}})
570         // block 1 belongs on servers 0 and 9, which both report
571         // having a replica, but the replicas are on the same volume
572         // -- so we should pull to the third position (7).
573         bal.try(c, tester{
574                 known:      1,
575                 desired:    map[string]int{"default": 2},
576                 current:    slots{0, 1},
577                 shouldPull: slots{2}})
578         // block 1 can be pulled to the doubly-mounted volume, but the
579         // pull should only be done on the first of the two servers.
580         bal.try(c, tester{
581                 known:      1,
582                 desired:    map[string]int{"default": 2},
583                 current:    slots{2},
584                 shouldPull: slots{0}})
585         // block 0 has one replica on a single volume mounted on two
586         // servers (e,9 at positions 1,9). Trashing the replica on 9
587         // would lose the block.
588         bal.try(c, tester{
589                 known:      0,
590                 desired:    map[string]int{"default": 2},
591                 current:    slots{1, 9},
592                 shouldPull: slots{0},
593                 expectBlockState: &balancedBlockState{
594                         needed:  1,
595                         pulling: 1,
596                 }})
597         // block 0 is overreplicated, but the second and third
598         // replicas are the same replica according to volume UUID
599         // (despite different Mtimes). Don't trash the third replica.
600         bal.try(c, tester{
601                 known:   0,
602                 desired: map[string]int{"default": 2},
603                 current: slots{0, 1, 9},
604                 expectBlockState: &balancedBlockState{
605                         needed: 2,
606                 }})
607         // block 0 is overreplicated; the third and fifth replicas are
608         // extra, but the fourth is another view of the second and
609         // shouldn't be trashed.
610         bal.try(c, tester{
611                 known:       0,
612                 desired:     map[string]int{"default": 2},
613                 current:     slots{0, 1, 5, 9, 12},
614                 shouldTrash: slots{5, 12},
615                 expectBlockState: &balancedBlockState{
616                         needed:   2,
617                         unneeded: 2,
618                 }})
619 }
620
621 func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
622         // For known blocks 0/1/2/3, server 9 is slot 9/1/14/0 in
623         // probe order. For these tests we give it two mounts, one
624         // with classes=[special], one with
625         // classes=[special,special2].
626         bal.srvs[9].mounts = []*KeepMount{{
627                 KeepMount: arvados.KeepMount{
628                         AllowWrite:     true,
629                         AllowTrash:     true,
630                         Replication:    1,
631                         StorageClasses: map[string]bool{"special": true},
632                         UUID:           "zzzzz-mount-special00000009",
633                         DeviceID:       "9-special",
634                 },
635                 KeepService: bal.srvs[9],
636         }, {
637                 KeepMount: arvados.KeepMount{
638                         AllowWrite:     true,
639                         AllowTrash:     true,
640                         Replication:    1,
641                         StorageClasses: map[string]bool{"special": true, "special2": true},
642                         UUID:           "zzzzz-mount-special20000009",
643                         DeviceID:       "9-special-and-special2",
644                 },
645                 KeepService: bal.srvs[9],
646         }}
647         // For known blocks 0/1/2/3, server 13 (d) is slot 5/3/11/1 in
648         // probe order. We give it two mounts, one with
649         // classes=[special3], one with classes=[default].
650         bal.srvs[13].mounts = []*KeepMount{{
651                 KeepMount: arvados.KeepMount{
652                         AllowWrite:     true,
653                         AllowTrash:     true,
654                         Replication:    1,
655                         StorageClasses: map[string]bool{"special2": true},
656                         UUID:           "zzzzz-mount-special2000000d",
657                         DeviceID:       "13-special2",
658                 },
659                 KeepService: bal.srvs[13],
660         }, {
661                 KeepMount: arvados.KeepMount{
662                         AllowWrite:     true,
663                         AllowTrash:     true,
664                         Replication:    1,
665                         StorageClasses: map[string]bool{"default": true},
666                         UUID:           "zzzzz-mount-00000000000000d",
667                         DeviceID:       "13-default",
668                 },
669                 KeepService: bal.srvs[13],
670         }}
671         // Pull to slot 9 because that's the only server with the
672         // desired class "special".
673         bal.try(c, tester{
674                 known:            0,
675                 desired:          map[string]int{"default": 2, "special": 1},
676                 current:          slots{0, 1},
677                 shouldPull:       slots{9},
678                 shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
679         // If some storage classes are not satisfied, don't trash any
680         // excess replicas. (E.g., if someone desires repl=1 on
681         // class=durable, and we have two copies on class=volatile, we
682         // should wait for pull to succeed before trashing anything).
683         bal.try(c, tester{
684                 known:            0,
685                 desired:          map[string]int{"special": 1},
686                 current:          slots{0, 1},
687                 shouldPull:       slots{9},
688                 shouldPullMounts: []string{"zzzzz-mount-special20000009"}})
689         // Once storage classes are satisfied, trash excess replicas
690         // that appear earlier in probe order but aren't needed to
691         // satisfy the desired classes.
692         bal.try(c, tester{
693                 known:       0,
694                 desired:     map[string]int{"special": 1},
695                 current:     slots{0, 1, 9},
696                 shouldTrash: slots{0, 1}})
697         // Pull to slot 5, the best server with class "special2".
698         bal.try(c, tester{
699                 known:            0,
700                 desired:          map[string]int{"special2": 1},
701                 current:          slots{0, 1},
702                 shouldPull:       slots{5},
703                 shouldPullMounts: []string{"zzzzz-mount-special2000000d"}})
704         // Pull to slot 5 and 9 to get replication 2 in desired class
705         // "special2".
706         bal.try(c, tester{
707                 known:            0,
708                 desired:          map[string]int{"special2": 2},
709                 current:          slots{0, 1},
710                 shouldPull:       slots{5, 9},
711                 shouldPullMounts: []string{"zzzzz-mount-special20000009", "zzzzz-mount-special2000000d"}})
712         // Slot 0 has a replica in "default", slot 1 has a replica
713         // in "special"; we need another replica in "default", i.e.,
714         // on slot 2.
715         bal.try(c, tester{
716                 known:      1,
717                 desired:    map[string]int{"default": 2, "special": 1},
718                 current:    slots{0, 1},
719                 shouldPull: slots{2}})
720         // Pull to best probe position 0 (despite wrong storage class)
721         // if it's impossible to achieve desired replication in the
722         // desired class (only slots 1 and 3 have special2).
723         bal.try(c, tester{
724                 known:      1,
725                 desired:    map[string]int{"special2": 3},
726                 current:    slots{3},
727                 shouldPull: slots{0, 1}})
728         // Trash excess replica.
729         bal.try(c, tester{
730                 known:       3,
731                 desired:     map[string]int{"special": 1},
732                 current:     slots{0, 1},
733                 shouldTrash: slots{1}})
734         // Leave one copy on slot 1 because slot 0 (server 9) only
735         // gives us repl=1.
736         bal.try(c, tester{
737                 known:   3,
738                 desired: map[string]int{"special": 2},
739                 current: slots{0, 1}})
740 }
741
742 // Clear all servers' changesets, balance a single block, and verify
743 // the appropriate changes for that block have been added to the
744 // changesets.
745 func (bal *balancerSuite) try(c *check.C, t tester) {
746         bal.setupLookupTables()
747         blk := &BlockState{
748                 Replicas: bal.replList(t.known, t.current),
749                 Desired:  t.desired,
750         }
751         for i, t := range t.timestamps {
752                 blk.Replicas[i].Mtime = t
753         }
754         for _, srv := range bal.srvs {
755                 srv.ChangeSet = &ChangeSet{}
756         }
757         result := bal.balanceBlock(knownBlkid(t.known), blk)
758
759         var didPull, didTrash slots
760         var didPullMounts, didTrashMounts []string
761         for i, srv := range bal.srvs {
762                 var slot int
763                 for probeOrder, srvNum := range bal.knownRendezvous[t.known] {
764                         if srvNum == i {
765                                 slot = probeOrder
766                         }
767                 }
768                 for _, pull := range srv.Pulls {
769                         didPull = append(didPull, slot)
770                         didPullMounts = append(didPullMounts, pull.To.UUID)
771                         c.Check(pull.SizedDigest, check.Equals, knownBlkid(t.known))
772                 }
773                 for _, trash := range srv.Trashes {
774                         didTrash = append(didTrash, slot)
775                         didTrashMounts = append(didTrashMounts, trash.From.UUID)
776                         c.Check(trash.SizedDigest, check.Equals, knownBlkid(t.known))
777                 }
778         }
779
780         for _, list := range []slots{didPull, didTrash, t.shouldPull, t.shouldTrash} {
781                 sort.Sort(sort.IntSlice(list))
782         }
783         c.Check(didPull, check.DeepEquals, t.shouldPull)
784         c.Check(didTrash, check.DeepEquals, t.shouldTrash)
785         if t.shouldPullMounts != nil {
786                 sort.Strings(didPullMounts)
787                 c.Check(didPullMounts, check.DeepEquals, t.shouldPullMounts)
788         }
789         if t.shouldTrashMounts != nil {
790                 sort.Strings(didTrashMounts)
791                 c.Check(didTrashMounts, check.DeepEquals, t.shouldTrashMounts)
792         }
793         if t.expectBlockState != nil {
794                 c.Check(result.blockState, check.Equals, *t.expectBlockState)
795         }
796         if t.expectClassState != nil {
797                 c.Check(result.classState, check.DeepEquals, t.expectClassState)
798         }
799 }
800
801 // srvList returns the KeepServices, sorted in rendezvous order and
802 // then selected by idx. For example, srvList(3, slots{0, 1, 4})
803 // returns the first-, second-, and fifth-best servers for storing
804 // bal.knownBlkid(3).
805 func (bal *balancerSuite) srvList(knownBlockID int, order slots) (srvs []*KeepService) {
806         for _, i := range order {
807                 srvs = append(srvs, bal.srvs[bal.knownRendezvous[knownBlockID][i]])
808         }
809         return
810 }
811
812 // replList is like srvList but returns an "existing replicas" slice,
813 // suitable for a BlockState test fixture.
814 func (bal *balancerSuite) replList(knownBlockID int, order slots) (repls []Replica) {
815         nextMnt := map[*KeepService]int{}
816         mtime := time.Now().UnixNano() - (bal.signatureTTL+86400)*1e9
817         for _, srv := range bal.srvList(knownBlockID, order) {
818                 // round-robin repls onto each srv's mounts
819                 n := nextMnt[srv]
820                 nextMnt[srv] = (n + 1) % len(srv.mounts)
821
822                 repls = append(repls, Replica{srv.mounts[n], mtime})
823                 mtime++
824         }
825         return
826 }
827
828 // generate the same data hashes that are tested in
829 // sdk/go/keepclient/root_sorter_test.go
830 func knownBlkid(i int) arvados.SizedDigest {
831         return arvados.SizedDigest(fmt.Sprintf("%x+64", md5.Sum([]byte(fmt.Sprintf("%064x", i)))))
832 }