13427: Handle same backend device mounted RW in multiple places.
[arvados.git] / services / keep-balance / balance_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
7 import (
8         "crypto/md5"
9         "fmt"
10         "sort"
11         "strconv"
12         "testing"
13         "time"
14
15         "git.curoverse.com/arvados.git/sdk/go/arvados"
16
17         check "gopkg.in/check.v1"
18 )
19
20 // Test with Gocheck
21 func Test(t *testing.T) {
22         check.TestingT(t)
23 }
24
25 var _ = check.Suite(&balancerSuite{})
26
27 type balancerSuite struct {
28         Balancer
29         srvs            []*KeepService
30         blks            map[string]tester
31         knownRendezvous [][]int
32         signatureTTL    int64
33 }
34
// known0 is an index into knownRendezvous.
const known0 = 0
39
// slots is a list of probe-order positions (indexes into one row of
// knownRendezvous), as opposed to server numbers.
type slots []int

// tester describes one balancing scenario for a single block: the
// state fed to the balancer and the changes it is expected to make.
type tester struct {
	// known selects the known block (row of knownRendezvous).
	known int
	// desired maps storage class name to desired replication.
	desired map[string]int
	// current lists the probe positions that already hold a
	// replica.
	current slots
	// timestamps, if given, overrides the Mtime of the replica
	// at the same index in current.
	timestamps []int64
	// shouldPull and shouldTrash are the probe positions expected
	// to receive pull and trash requests, respectively.
	shouldPull  slots
	shouldTrash slots

	// shouldPullMounts and shouldTrashMounts, when non-nil, are
	// the mount UUIDs expected in the pull/trash requests.
	shouldPullMounts  []string
	shouldTrashMounts []string
}
53
54 func (bal *balancerSuite) SetUpSuite(c *check.C) {
55         bal.knownRendezvous = nil
56         for _, str := range []string{
57                 "3eab2d5fc9681074",
58                 "097dba52e648f1c3",
59                 "c5b4e023f8a7d691",
60                 "9d81c02e76a3bf54",
61         } {
62                 var slots []int
63                 for _, c := range []byte(str) {
64                         pos, _ := strconv.ParseUint(string(c), 16, 4)
65                         slots = append(slots, int(pos))
66                 }
67                 bal.knownRendezvous = append(bal.knownRendezvous, slots)
68         }
69
70         bal.signatureTTL = 3600
71 }
72
73 func (bal *balancerSuite) SetUpTest(c *check.C) {
74         bal.srvs = make([]*KeepService, 16)
75         bal.KeepServices = make(map[string]*KeepService)
76         for i := range bal.srvs {
77                 srv := &KeepService{
78                         KeepService: arvados.KeepService{
79                                 UUID: fmt.Sprintf("zzzzz-bi6l4-%015x", i),
80                         },
81                 }
82                 srv.mounts = []*KeepMount{{
83                         KeepMount: arvados.KeepMount{
84                                 UUID: fmt.Sprintf("zzzzz-mount-%015x", i),
85                         },
86                         KeepService: srv,
87                 }}
88                 bal.srvs[i] = srv
89                 bal.KeepServices[srv.UUID] = srv
90         }
91
92         bal.MinMtime = time.Now().UnixNano() - bal.signatureTTL*1e9
93 }
94
95 func (bal *balancerSuite) TestPerfect(c *check.C) {
96         bal.try(c, tester{
97                 desired:     map[string]int{"default": 2},
98                 current:     slots{0, 1},
99                 shouldPull:  nil,
100                 shouldTrash: nil})
101 }
102
103 func (bal *balancerSuite) TestDecreaseRepl(c *check.C) {
104         bal.try(c, tester{
105                 desired:     map[string]int{"default": 2},
106                 current:     slots{0, 2, 1},
107                 shouldTrash: slots{2}})
108 }
109
110 func (bal *balancerSuite) TestDecreaseReplToZero(c *check.C) {
111         bal.try(c, tester{
112                 desired:     map[string]int{"default": 0},
113                 current:     slots{0, 1, 3},
114                 shouldTrash: slots{0, 1, 3}})
115 }
116
117 func (bal *balancerSuite) TestIncreaseRepl(c *check.C) {
118         bal.try(c, tester{
119                 desired:    map[string]int{"default": 4},
120                 current:    slots{0, 1},
121                 shouldPull: slots{2, 3}})
122 }
123
124 func (bal *balancerSuite) TestSkipReadonly(c *check.C) {
125         bal.srvList(0, slots{3})[0].ReadOnly = true
126         bal.try(c, tester{
127                 desired:    map[string]int{"default": 4},
128                 current:    slots{0, 1},
129                 shouldPull: slots{2, 4}})
130 }
131
132 func (bal *balancerSuite) TestFixUnbalanced(c *check.C) {
133         bal.try(c, tester{
134                 desired:    map[string]int{"default": 2},
135                 current:    slots{2, 0},
136                 shouldPull: slots{1}})
137         bal.try(c, tester{
138                 desired:    map[string]int{"default": 2},
139                 current:    slots{2, 7},
140                 shouldPull: slots{0, 1}})
141         // if only one of the pulls succeeds, we'll see this next:
142         bal.try(c, tester{
143                 desired:     map[string]int{"default": 2},
144                 current:     slots{2, 1, 7},
145                 shouldPull:  slots{0},
146                 shouldTrash: slots{7}})
147         // if both pulls succeed, we'll see this next:
148         bal.try(c, tester{
149                 desired:     map[string]int{"default": 2},
150                 current:     slots{2, 0, 1, 7},
151                 shouldTrash: slots{2, 7}})
152
153         // unbalanced + excessive replication => pull + trash
154         bal.try(c, tester{
155                 desired:     map[string]int{"default": 2},
156                 current:     slots{2, 5, 7},
157                 shouldPull:  slots{0, 1},
158                 shouldTrash: slots{7}})
159 }
160
161 func (bal *balancerSuite) TestMultipleReplicasPerService(c *check.C) {
162         for _, srv := range bal.srvs {
163                 for i := 0; i < 3; i++ {
164                         m := *(srv.mounts[0])
165                         srv.mounts = append(srv.mounts, &m)
166                 }
167         }
168         bal.try(c, tester{
169                 desired:    map[string]int{"default": 2},
170                 current:    slots{0, 0},
171                 shouldPull: slots{1}})
172         bal.try(c, tester{
173                 desired:    map[string]int{"default": 2},
174                 current:    slots{2, 2},
175                 shouldPull: slots{0, 1}})
176         bal.try(c, tester{
177                 desired:     map[string]int{"default": 2},
178                 current:     slots{0, 0, 1},
179                 shouldTrash: slots{0}})
180         bal.try(c, tester{
181                 desired:     map[string]int{"default": 2},
182                 current:     slots{1, 1, 0},
183                 shouldTrash: slots{1}})
184         bal.try(c, tester{
185                 desired:     map[string]int{"default": 2},
186                 current:     slots{1, 0, 1, 0, 2},
187                 shouldTrash: slots{0, 1, 2}})
188         bal.try(c, tester{
189                 desired:     map[string]int{"default": 2},
190                 current:     slots{1, 1, 1, 0, 2},
191                 shouldTrash: slots{1, 1, 2}})
192         bal.try(c, tester{
193                 desired:     map[string]int{"default": 2},
194                 current:     slots{1, 1, 2},
195                 shouldPull:  slots{0},
196                 shouldTrash: slots{1}})
197         bal.try(c, tester{
198                 desired:     map[string]int{"default": 2},
199                 current:     slots{1, 1, 0},
200                 timestamps:  []int64{12345678, 12345678, 12345679},
201                 shouldTrash: nil})
202         bal.try(c, tester{
203                 desired:    map[string]int{"default": 2},
204                 current:    slots{1, 1},
205                 shouldPull: slots{0}})
206 }
207
208 func (bal *balancerSuite) TestIncreaseReplTimestampCollision(c *check.C) {
209         // For purposes of increasing replication, we assume identical
210         // replicas are distinct.
211         bal.try(c, tester{
212                 desired:    map[string]int{"default": 4},
213                 current:    slots{0, 1},
214                 timestamps: []int64{12345678, 12345678},
215                 shouldPull: slots{2, 3}})
216 }
217
218 func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
219         // For purposes of decreasing replication, we assume identical
220         // replicas are NOT distinct.
221         bal.try(c, tester{
222                 desired:    map[string]int{"default": 2},
223                 current:    slots{0, 1, 2},
224                 timestamps: []int64{12345678, 12345678, 12345678}})
225         bal.try(c, tester{
226                 desired:    map[string]int{"default": 2},
227                 current:    slots{0, 1, 2},
228                 timestamps: []int64{12345678, 10000000, 10000000}})
229 }
230
231 func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
232         oldTime := bal.MinMtime - 3600
233         newTime := bal.MinMtime + 3600
234         // The excess replica is too new to delete.
235         bal.try(c, tester{
236                 desired:    map[string]int{"default": 2},
237                 current:    slots{0, 1, 2},
238                 timestamps: []int64{oldTime, newTime, newTime + 1}})
239         // The best replicas are too new to delete, but the excess
240         // replica is old enough.
241         bal.try(c, tester{
242                 desired:     map[string]int{"default": 2},
243                 current:     slots{0, 1, 2},
244                 timestamps:  []int64{newTime, newTime + 1, oldTime},
245                 shouldTrash: slots{2}})
246 }
247
248 func (bal *balancerSuite) TestDedupDevices(c *check.C) {
249         bal.srvs[3].mounts[0].KeepMount.ReadOnly = true
250         bal.srvs[3].mounts[0].KeepMount.DeviceID = "abcdef"
251         bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
252         c.Check(len(bal.srvs[3].mounts), check.Equals, 1)
253         bal.dedupDevices()
254         c.Check(len(bal.srvs[3].mounts), check.Equals, 0)
255         bal.try(c, tester{
256                 known:      0,
257                 desired:    map[string]int{"default": 2},
258                 current:    slots{1},
259                 shouldPull: slots{2}})
260 }
261
262 func (bal *balancerSuite) TestDeviceRWMountedByMultipleServers(c *check.C) {
263         bal.srvs[0].mounts[0].KeepMount.DeviceID = "abcdef"
264         bal.srvs[9].mounts[0].KeepMount.DeviceID = "abcdef"
265         bal.srvs[14].mounts[0].KeepMount.DeviceID = "abcdef"
266         // block 0 belongs on servers 3 and e, which have different
267         // device IDs.
268         bal.try(c, tester{
269                 known:      0,
270                 desired:    map[string]int{"default": 2},
271                 current:    slots{1},
272                 shouldPull: slots{0}})
273         // block 1 belongs on servers 0 and 9, which both report
274         // having a replica, but the replicas are on the same device
275         // ID -- so we should pull to the third position (7).
276         bal.try(c, tester{
277                 known:      1,
278                 desired:    map[string]int{"default": 2},
279                 current:    slots{0, 1},
280                 shouldPull: slots{2}})
281         // block 1 can be pulled to the doubly-mounted device, but the
282         // pull should only be done on the first of the two servers.
283         bal.try(c, tester{
284                 known:      1,
285                 desired:    map[string]int{"default": 2},
286                 current:    slots{2},
287                 shouldPull: slots{0}})
288         // block 0 has one replica on a single device mounted on two
289         // servers (e,9 at positions 1,9). Trashing the replica on 9
290         // would lose the block.
291         bal.try(c, tester{
292                 known:      0,
293                 desired:    map[string]int{"default": 2},
294                 current:    slots{1, 9},
295                 shouldPull: slots{0}})
296         // block 0 is overreplicated, but the second and third
297         // replicas are the same replica according to DeviceID
298         // (despite different Mtimes). Don't trash the third replica.
299         bal.try(c, tester{
300                 known:   0,
301                 desired: map[string]int{"default": 2},
302                 current: slots{0, 1, 9}})
303         // block 0 is overreplicated; the third and fifth replicas are
304         // extra, but the fourth is another view of the second and
305         // shouldn't be trashed.
306         bal.try(c, tester{
307                 known:       0,
308                 desired:     map[string]int{"default": 2},
309                 current:     slots{0, 1, 5, 9, 12},
310                 shouldTrash: slots{5, 12}})
311 }
312
313 func (bal *balancerSuite) TestChangeStorageClasses(c *check.C) {
314         // For known blocks 0/1/2/3, server 9 is slot 9/1/14/0 in
315         // probe order. For these tests we give it two mounts, one
316         // with classes=[special], one with
317         // classes=[special,special2].
318         bal.srvs[9].mounts = []*KeepMount{{
319                 KeepMount: arvados.KeepMount{
320                         Replication:    1,
321                         StorageClasses: []string{"special"},
322                         UUID:           "zzzzz-mount-special00000009",
323                         DeviceID:       "9-special",
324                 },
325                 KeepService: bal.srvs[9],
326         }, {
327                 KeepMount: arvados.KeepMount{
328                         Replication:    1,
329                         StorageClasses: []string{"special", "special2"},
330                         UUID:           "zzzzz-mount-special20000009",
331                         DeviceID:       "9-special-and-special2",
332                 },
333                 KeepService: bal.srvs[9],
334         }}
335         // For known blocks 0/1/2/3, server 13 (d) is slot 5/3/11/1 in
336         // probe order. We give it two mounts, one with
337         // classes=[special3], one with classes=[default].
338         bal.srvs[13].mounts = []*KeepMount{{
339                 KeepMount: arvados.KeepMount{
340                         Replication:    1,
341                         StorageClasses: []string{"special2"},
342                         UUID:           "zzzzz-mount-special2000000d",
343                         DeviceID:       "13-special2",
344                 },
345                 KeepService: bal.srvs[13],
346         }, {
347                 KeepMount: arvados.KeepMount{
348                         Replication:    1,
349                         StorageClasses: []string{"default"},
350                         UUID:           "zzzzz-mount-00000000000000d",
351                         DeviceID:       "13-default",
352                 },
353                 KeepService: bal.srvs[13],
354         }}
355         // Pull to slot 9 because that's the only server with the
356         // desired class "special".
357         bal.try(c, tester{
358                 known:            0,
359                 desired:          map[string]int{"default": 2, "special": 1},
360                 current:          slots{0, 1},
361                 shouldPull:       slots{9},
362                 shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
363         // If some storage classes are not satisfied, don't trash any
364         // excess replicas. (E.g., if someone desires repl=1 on
365         // class=durable, and we have two copies on class=volatile, we
366         // should wait for pull to succeed before trashing anything).
367         bal.try(c, tester{
368                 known:            0,
369                 desired:          map[string]int{"special": 1},
370                 current:          slots{0, 1},
371                 shouldPull:       slots{9},
372                 shouldPullMounts: []string{"zzzzz-mount-special00000009"}})
373         // Once storage classes are satisfied, trash excess replicas
374         // that appear earlier in probe order but aren't needed to
375         // satisfy the desired classes.
376         bal.try(c, tester{
377                 known:       0,
378                 desired:     map[string]int{"special": 1},
379                 current:     slots{0, 1, 9},
380                 shouldTrash: slots{0, 1}})
381         // Pull to slot 5, the best server with class "special2".
382         bal.try(c, tester{
383                 known:            0,
384                 desired:          map[string]int{"special2": 1},
385                 current:          slots{0, 1},
386                 shouldPull:       slots{5},
387                 shouldPullMounts: []string{"zzzzz-mount-special2000000d"}})
388         // Pull to slot 5 and 9 to get replication 2 in desired class
389         // "special2".
390         bal.try(c, tester{
391                 known:            0,
392                 desired:          map[string]int{"special2": 2},
393                 current:          slots{0, 1},
394                 shouldPull:       slots{5, 9},
395                 shouldPullMounts: []string{"zzzzz-mount-special20000009", "zzzzz-mount-special2000000d"}})
396         // Slot 0 has a replica in "default", slot 1 has a replica
397         // in "special"; we need another replica in "default", i.e.,
398         // on slot 2.
399         bal.try(c, tester{
400                 known:      1,
401                 desired:    map[string]int{"default": 2, "special": 1},
402                 current:    slots{0, 1},
403                 shouldPull: slots{2}})
404         // Pull to best probe position 0 (despite wrong storage class)
405         // if it's impossible to achieve desired replication in the
406         // desired class (only slots 1 and 3 have special2).
407         bal.try(c, tester{
408                 known:      1,
409                 desired:    map[string]int{"special2": 3},
410                 current:    slots{3},
411                 shouldPull: slots{0, 1}})
412         // Trash excess replica.
413         bal.try(c, tester{
414                 known:       3,
415                 desired:     map[string]int{"special": 1},
416                 current:     slots{0, 1},
417                 shouldTrash: slots{1}})
418         // Leave one copy on slot 1 because slot 0 (server 9) only
419         // gives us repl=1.
420         bal.try(c, tester{
421                 known:   3,
422                 desired: map[string]int{"special": 2},
423                 current: slots{0, 1}})
424 }
425
426 // Clear all servers' changesets, balance a single block, and verify
427 // the appropriate changes for that block have been added to the
428 // changesets.
429 func (bal *balancerSuite) try(c *check.C, t tester) {
430         bal.setupLookupTables()
431         blk := &BlockState{
432                 Replicas: bal.replList(t.known, t.current),
433                 Desired:  t.desired,
434         }
435         for i, t := range t.timestamps {
436                 blk.Replicas[i].Mtime = t
437         }
438         for _, srv := range bal.srvs {
439                 srv.ChangeSet = &ChangeSet{}
440         }
441         bal.balanceBlock(knownBlkid(t.known), blk)
442
443         var didPull, didTrash slots
444         var didPullMounts, didTrashMounts []string
445         for i, srv := range bal.srvs {
446                 var slot int
447                 for probeOrder, srvNum := range bal.knownRendezvous[t.known] {
448                         if srvNum == i {
449                                 slot = probeOrder
450                         }
451                 }
452                 for _, pull := range srv.Pulls {
453                         didPull = append(didPull, slot)
454                         didPullMounts = append(didPullMounts, pull.To.UUID)
455                         c.Check(pull.SizedDigest, check.Equals, knownBlkid(t.known))
456                 }
457                 for _, trash := range srv.Trashes {
458                         didTrash = append(didTrash, slot)
459                         didTrashMounts = append(didTrashMounts, trash.From.UUID)
460                         c.Check(trash.SizedDigest, check.Equals, knownBlkid(t.known))
461                 }
462         }
463
464         for _, list := range []slots{didPull, didTrash, t.shouldPull, t.shouldTrash} {
465                 sort.Sort(sort.IntSlice(list))
466         }
467         c.Check(didPull, check.DeepEquals, t.shouldPull)
468         c.Check(didTrash, check.DeepEquals, t.shouldTrash)
469         if t.shouldPullMounts != nil {
470                 sort.Strings(didPullMounts)
471                 c.Check(didPullMounts, check.DeepEquals, t.shouldPullMounts)
472         }
473         if t.shouldTrashMounts != nil {
474                 sort.Strings(didTrashMounts)
475                 c.Check(didTrashMounts, check.DeepEquals, t.shouldTrashMounts)
476         }
477 }
478
479 // srvList returns the KeepServices, sorted in rendezvous order and
480 // then selected by idx. For example, srvList(3, slots{0, 1, 4})
481 // returns the the first-, second-, and fifth-best servers for storing
482 // bal.knownBlkid(3).
483 func (bal *balancerSuite) srvList(knownBlockID int, order slots) (srvs []*KeepService) {
484         for _, i := range order {
485                 srvs = append(srvs, bal.srvs[bal.knownRendezvous[knownBlockID][i]])
486         }
487         return
488 }
489
490 // replList is like srvList but returns an "existing replicas" slice,
491 // suitable for a BlockState test fixture.
492 func (bal *balancerSuite) replList(knownBlockID int, order slots) (repls []Replica) {
493         nextMnt := map[*KeepService]int{}
494         mtime := time.Now().UnixNano() - (bal.signatureTTL+86400)*1e9
495         for _, srv := range bal.srvList(knownBlockID, order) {
496                 // round-robin repls onto each srv's mounts
497                 n := nextMnt[srv]
498                 nextMnt[srv] = (n + 1) % len(srv.mounts)
499
500                 repls = append(repls, Replica{srv.mounts[n], mtime})
501                 mtime++
502         }
503         return
504 }
505
506 // generate the same data hashes that are tested in
507 // sdk/go/keepclient/root_sorter_test.go
508 func knownBlkid(i int) arvados.SizedDigest {
509         return arvados.SizedDigest(fmt.Sprintf("%x+64", md5.Sum([]byte(fmt.Sprintf("%064x", i)))))
510 }