Merge branch '13212-unavailable-output-workunit' closes #13212
[arvados.git] / services / keep-balance / balance_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
7 import (
8         "crypto/md5"
9         "fmt"
10         "sort"
11         "strconv"
12         "testing"
13         "time"
14
15         "git.curoverse.com/arvados.git/sdk/go/arvados"
16
17         check "gopkg.in/check.v1"
18 )
19
// Test is the "go test" entry point for this package: it hands the
// *testing.T to gocheck, which then runs the suites registered with
// check.Suite.
func Test(t *testing.T) {
	check.TestingT(t)
}
24
// Register balancerSuite with gocheck so that its fixture and Test*
// methods are run by Test.
var _ = check.Suite(&balancerSuite{})

// balancerSuite is the gocheck fixture for the single-block balancing
// tests. It embeds the Balancer under test, so Balancer members used
// in this file (KeepServices, MinMtime, setupServiceRoots,
// balanceBlock) are reached directly through the suite value.
type balancerSuite struct {
	Balancer
	// srvs holds the fake keep services, indexed by server number.
	// SetUpTest rebuilds it before every test.
	srvs            []*KeepService
	// NOTE(review): blks is not referenced by any code visible in
	// this file view -- confirm whether it is still needed.
	blks            map[string]tester
	// knownRendezvous[k] lists server numbers (indexes into srvs)
	// in rendezvous (probe) order for known block k. SetUpSuite
	// fills it in.
	knownRendezvous [][]int
	// signatureTTL is a duration in seconds: it is multiplied by
	// 1e9 before being combined with UnixNano timestamps.
	signatureTTL    int64
}
34
const (
	// known0 is an index into balancerSuite.knownRendezvous naming
	// the first known block. It equals the zero value of
	// tester.known, so a tester that does not set known uses this
	// block.
	known0 = 0
)
39
// slots is a list of positions in a block's rendezvous (probe) order:
// 0 is the first server in that order, 1 the second, and so on. The
// same position may appear more than once.
type slots []int
41
// tester describes one single-block balancing scenario for
// balancerSuite.try: the starting state of the block and the pulls
// and trashes the balancer is expected to schedule.
type tester struct {
	// known is the index into knownRendezvous (and the argument to
	// knownBlkid) selecting which block is balanced.
	known       int
	// desired is the replication level requested for the block
	// (copied to BlockState.Desired).
	desired     int
	// current lists the rendezvous slots of the servers that hold
	// a replica before balancing.
	current     slots
	// timestamps, when set, overrides the Mtime of the replicas
	// built from current, matched up by index.
	timestamps  []int64
	// shouldPull lists the rendezvous slots of the servers expected
	// to receive a pull request. It is sorted in place by try.
	shouldPull  slots
	// shouldTrash lists the rendezvous slots of the servers expected
	// to receive a trash request. It is sorted in place by try.
	shouldTrash slots
}
50
51 func (bal *balancerSuite) SetUpSuite(c *check.C) {
52         bal.knownRendezvous = nil
53         for _, str := range []string{
54                 "3eab2d5fc9681074",
55                 "097dba52e648f1c3",
56                 "c5b4e023f8a7d691",
57                 "9d81c02e76a3bf54",
58         } {
59                 var slots []int
60                 for _, c := range []byte(str) {
61                         pos, _ := strconv.ParseUint(string(c), 16, 4)
62                         slots = append(slots, int(pos))
63                 }
64                 bal.knownRendezvous = append(bal.knownRendezvous, slots)
65         }
66
67         bal.signatureTTL = 3600
68 }
69
70 func (bal *balancerSuite) SetUpTest(c *check.C) {
71         bal.srvs = make([]*KeepService, 16)
72         bal.KeepServices = make(map[string]*KeepService)
73         for i := range bal.srvs {
74                 srv := &KeepService{
75                         KeepService: arvados.KeepService{
76                                 UUID: fmt.Sprintf("zzzzz-bi6l4-%015x", i),
77                         },
78                 }
79                 srv.mounts = []*KeepMount{{KeepMount: arvados.KeepMount{UUID: fmt.Sprintf("mount-%015x", i)}, KeepService: srv}}
80                 bal.srvs[i] = srv
81                 bal.KeepServices[srv.UUID] = srv
82         }
83
84         bal.MinMtime = time.Now().UnixNano() - bal.signatureTTL*1e9
85 }
86
87 func (bal *balancerSuite) TestPerfect(c *check.C) {
88         bal.try(c, tester{
89                 desired:     2,
90                 current:     slots{0, 1},
91                 shouldPull:  nil,
92                 shouldTrash: nil})
93 }
94
95 func (bal *balancerSuite) TestDecreaseRepl(c *check.C) {
96         bal.try(c, tester{
97                 desired:     2,
98                 current:     slots{0, 2, 1},
99                 shouldTrash: slots{2}})
100 }
101
102 func (bal *balancerSuite) TestDecreaseReplToZero(c *check.C) {
103         bal.try(c, tester{
104                 desired:     0,
105                 current:     slots{0, 1, 3},
106                 shouldTrash: slots{0, 1, 3}})
107 }
108
109 func (bal *balancerSuite) TestIncreaseRepl(c *check.C) {
110         bal.try(c, tester{
111                 desired:    4,
112                 current:    slots{0, 1},
113                 shouldPull: slots{2, 3}})
114 }
115
116 func (bal *balancerSuite) TestSkipReadonly(c *check.C) {
117         bal.srvList(0, slots{3})[0].ReadOnly = true
118         bal.try(c, tester{
119                 desired:    4,
120                 current:    slots{0, 1},
121                 shouldPull: slots{2, 4}})
122 }
123
124 func (bal *balancerSuite) TestFixUnbalanced(c *check.C) {
125         bal.try(c, tester{
126                 desired:    2,
127                 current:    slots{2, 0},
128                 shouldPull: slots{1}})
129         bal.try(c, tester{
130                 desired:    2,
131                 current:    slots{2, 7},
132                 shouldPull: slots{0, 1}})
133         // if only one of the pulls succeeds, we'll see this next:
134         bal.try(c, tester{
135                 desired:     2,
136                 current:     slots{2, 1, 7},
137                 shouldPull:  slots{0},
138                 shouldTrash: slots{7}})
139         // if both pulls succeed, we'll see this next:
140         bal.try(c, tester{
141                 desired:     2,
142                 current:     slots{2, 0, 1, 7},
143                 shouldTrash: slots{2, 7}})
144
145         // unbalanced + excessive replication => pull + trash
146         bal.try(c, tester{
147                 desired:     2,
148                 current:     slots{2, 5, 7},
149                 shouldPull:  slots{0, 1},
150                 shouldTrash: slots{7}})
151 }
152
153 func (bal *balancerSuite) TestMultipleReplicasPerService(c *check.C) {
154         bal.try(c, tester{
155                 desired:    2,
156                 current:    slots{0, 0},
157                 shouldPull: slots{1}})
158         bal.try(c, tester{
159                 desired:    2,
160                 current:    slots{2, 2},
161                 shouldPull: slots{0, 1}})
162         bal.try(c, tester{
163                 desired:     2,
164                 current:     slots{0, 0, 1},
165                 shouldTrash: slots{0}})
166         bal.try(c, tester{
167                 desired:     2,
168                 current:     slots{1, 1, 0},
169                 shouldTrash: slots{1}})
170         bal.try(c, tester{
171                 desired:     2,
172                 current:     slots{1, 0, 1, 0, 2},
173                 shouldTrash: slots{0, 1, 2}})
174         bal.try(c, tester{
175                 desired:     2,
176                 current:     slots{1, 1, 1, 0, 2},
177                 shouldTrash: slots{1, 1, 2}})
178         bal.try(c, tester{
179                 desired:     2,
180                 current:     slots{1, 1, 2},
181                 shouldPull:  slots{0},
182                 shouldTrash: slots{1}})
183         bal.try(c, tester{
184                 desired:     2,
185                 current:     slots{1, 1, 0},
186                 timestamps:  []int64{12345678, 12345678, 12345679},
187                 shouldTrash: nil})
188         bal.try(c, tester{
189                 desired:    2,
190                 current:    slots{1, 1},
191                 shouldPull: slots{0}})
192 }
193
194 func (bal *balancerSuite) TestIncreaseReplTimestampCollision(c *check.C) {
195         // For purposes of increasing replication, we assume identical
196         // replicas are distinct.
197         bal.try(c, tester{
198                 desired:    4,
199                 current:    slots{0, 1},
200                 timestamps: []int64{12345678, 12345678},
201                 shouldPull: slots{2, 3}})
202 }
203
204 func (bal *balancerSuite) TestDecreaseReplTimestampCollision(c *check.C) {
205         // For purposes of decreasing replication, we assume identical
206         // replicas are NOT distinct.
207         bal.try(c, tester{
208                 desired:    2,
209                 current:    slots{0, 1, 2},
210                 timestamps: []int64{12345678, 12345678, 12345678}})
211         bal.try(c, tester{
212                 desired:    2,
213                 current:    slots{0, 1, 2},
214                 timestamps: []int64{12345678, 10000000, 10000000}})
215 }
216
217 func (bal *balancerSuite) TestDecreaseReplBlockTooNew(c *check.C) {
218         oldTime := bal.MinMtime - 3600
219         newTime := bal.MinMtime + 3600
220         // The excess replica is too new to delete.
221         bal.try(c, tester{
222                 desired:    2,
223                 current:    slots{0, 1, 2},
224                 timestamps: []int64{oldTime, newTime, newTime + 1}})
225         // The best replicas are too new to delete, but the excess
226         // replica is old enough.
227         bal.try(c, tester{
228                 desired:     2,
229                 current:     slots{0, 1, 2},
230                 timestamps:  []int64{newTime, newTime + 1, oldTime},
231                 shouldTrash: slots{2}})
232 }
233
234 // Clear all servers' changesets, balance a single block, and verify
235 // the appropriate changes for that block have been added to the
236 // changesets.
237 func (bal *balancerSuite) try(c *check.C, t tester) {
238         bal.setupServiceRoots()
239         blk := &BlockState{
240                 Desired:  t.desired,
241                 Replicas: bal.replList(t.known, t.current)}
242         for i, t := range t.timestamps {
243                 blk.Replicas[i].Mtime = t
244         }
245         for _, srv := range bal.srvs {
246                 srv.ChangeSet = &ChangeSet{}
247         }
248         bal.balanceBlock(knownBlkid(t.known), blk)
249
250         var didPull, didTrash slots
251         for i, srv := range bal.srvs {
252                 var slot int
253                 for probeOrder, srvNum := range bal.knownRendezvous[t.known] {
254                         if srvNum == i {
255                                 slot = probeOrder
256                         }
257                 }
258                 for _, pull := range srv.Pulls {
259                         didPull = append(didPull, slot)
260                         c.Check(pull.SizedDigest, check.Equals, knownBlkid(t.known))
261                 }
262                 for _, trash := range srv.Trashes {
263                         didTrash = append(didTrash, slot)
264                         c.Check(trash.SizedDigest, check.Equals, knownBlkid(t.known))
265                 }
266         }
267
268         for _, list := range []slots{didPull, didTrash, t.shouldPull, t.shouldTrash} {
269                 sort.Sort(sort.IntSlice(list))
270         }
271         c.Check(didPull, check.DeepEquals, t.shouldPull)
272         c.Check(didTrash, check.DeepEquals, t.shouldTrash)
273 }
274
275 // srvList returns the KeepServices, sorted in rendezvous order and
276 // then selected by idx. For example, srvList(3, slots{0, 1, 4})
277 // returns the the first-, second-, and fifth-best servers for storing
278 // bal.knownBlkid(3).
279 func (bal *balancerSuite) srvList(knownBlockID int, order slots) (srvs []*KeepService) {
280         for _, i := range order {
281                 srvs = append(srvs, bal.srvs[bal.knownRendezvous[knownBlockID][i]])
282         }
283         return
284 }
285
286 // replList is like srvList but returns an "existing replicas" slice,
287 // suitable for a BlockState test fixture.
288 func (bal *balancerSuite) replList(knownBlockID int, order slots) (repls []Replica) {
289         mtime := time.Now().UnixNano() - (bal.signatureTTL+86400)*1e9
290         for _, srv := range bal.srvList(knownBlockID, order) {
291                 repls = append(repls, Replica{srv.mounts[0], mtime})
292                 mtime++
293         }
294         return
295 }
296
297 // generate the same data hashes that are tested in
298 // sdk/go/keepclient/root_sorter_test.go
299 func knownBlkid(i int) arvados.SizedDigest {
300         return arvados.SizedDigest(fmt.Sprintf("%x+64", md5.Sum([]byte(fmt.Sprintf("%064x", i)))))
301 }