20425: Test response status for mixed-status backend errors.
[arvados.git] / services / crunch-dispatch-slurm / squeue_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package dispatchslurm
6
7 import (
8         "time"
9
10         "github.com/sirupsen/logrus"
11         . "gopkg.in/check.v1"
12 )
13
14 var _ = Suite(&SqueueSuite{})
15
16 type SqueueSuite struct{}
17
18 func (s *SqueueSuite) TestReleasePending(c *C) {
19         uuids := []string{
20                 "zzzzz-dz642-fake0fake0fake0",
21                 "zzzzz-dz642-fake1fake1fake1",
22                 "zzzzz-dz642-fake2fake2fake2",
23         }
24         slurm := &slurmFake{
25                 queue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING BadConstraints\n",
26         }
27         sqc := &SqueueChecker{
28                 Logger: logrus.StandardLogger(),
29                 Slurm:  slurm,
30                 Period: time.Hour,
31         }
32         sqc.startOnce.Do(sqc.start)
33         defer sqc.Stop()
34
35         done := make(chan struct{})
36         go func() {
37                 for _, u := range uuids {
38                         sqc.SetPriority(u, 1)
39                 }
40                 close(done)
41         }()
42         callUntilReady(sqc.check, done)
43
44         slurm.didRelease = nil
45         sqc.check()
46         c.Check(slurm.didRelease, DeepEquals, []string{uuids[2]})
47 }
48
49 func (s *SqueueSuite) TestReniceAll(c *C) {
50         uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
51         for _, test := range []struct {
52                 spread int64
53                 squeue string
54                 want   map[string]int64
55                 expect [][]string
56         }{
57                 {
58                         spread: 1,
59                         squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n",
60                         want:   map[string]int64{uuids[0]: 1},
61                         expect: [][]string{{uuids[0], "0"}},
62                 },
63                 { // fake0 priority is too high
64                         spread: 1,
65                         squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
66                         want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
67                         expect: [][]string{{uuids[1], "0"}, {uuids[0], "334"}},
68                 },
69                 { // specify spread
70                         spread: 100,
71                         squeue: uuids[0] + " 10000 4294000777 PENDING Resources\n" + uuids[1] + " 10000 4294000444 PENDING Resources\n",
72                         want:   map[string]int64{uuids[0]: 1, uuids[1]: 999},
73                         expect: [][]string{{uuids[1], "0"}, {uuids[0], "433"}},
74                 },
75                 { // ignore fake2 because SetPriority() not called
76                         spread: 1,
77                         squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 4294000222 PENDING Resources\n",
78                         want:   map[string]int64{uuids[0]: 999, uuids[1]: 1},
79                         expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
80                 },
81                 { // ignore fake2 because slurm priority=0
82                         spread: 1,
83                         squeue: uuids[0] + " 10000 4294000000 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n" + uuids[2] + " 10000 0 PENDING Resources\n",
84                         want:   map[string]int64{uuids[0]: 999, uuids[1]: 1, uuids[2]: 997},
85                         expect: [][]string{{uuids[0], "0"}, {uuids[1], "112"}},
86                 },
87         } {
88                 c.Logf("spread=%d squeue=%q want=%v -> expect=%v", test.spread, test.squeue, test.want, test.expect)
89                 slurm := &slurmFake{
90                         queue: test.squeue,
91                 }
92                 sqc := &SqueueChecker{
93                         Logger:         logrus.StandardLogger(),
94                         Slurm:          slurm,
95                         PrioritySpread: test.spread,
96                         Period:         time.Hour,
97                 }
98                 sqc.startOnce.Do(sqc.start)
99                 sqc.check()
100                 for uuid, pri := range test.want {
101                         sqc.SetPriority(uuid, pri)
102                 }
103                 sqc.reniceAll()
104                 c.Check(slurm.didRenice, DeepEquals, test.expect)
105                 sqc.Stop()
106         }
107 }
108
109 // If a limited nice range prevents desired priority adjustments, give
110 // up and clamp nice to 10K.
111 func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
112         uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
113         slurm := &slurmFake{
114                 queue:         uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
115                 rejectNice10K: true,
116         }
117         sqc := &SqueueChecker{
118                 Logger:         logrus.StandardLogger(),
119                 Slurm:          slurm,
120                 PrioritySpread: 1,
121                 Period:         time.Hour,
122         }
123         sqc.startOnce.Do(sqc.start)
124         sqc.check()
125         sqc.SetPriority(uuids[0], 2)
126         sqc.SetPriority(uuids[1], 1)
127
128         // First attempt should renice to 555001, which will fail
129         sqc.reniceAll()
130         c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
131
132         // Next attempt should renice to 10K, which will succeed
133         sqc.reniceAll()
134         c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
135         // ...so we'll change the squeue response to reflect the
136         // updated priority+nice, and make sure sqc sees that...
137         slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
138         sqc.check()
139
140         // Next attempt should leave nice alone because it's already
141         // at the 10K limit
142         sqc.reniceAll()
143         c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
144
145         // Back to normal if desired nice value falls below 10K
146         slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
147         sqc.check()
148         sqc.reniceAll()
149         c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
150
151         sqc.Stop()
152 }
153
154 // If the given UUID isn't in the slurm queue yet, SetPriority()
155 // should wait for it to appear on the very next poll, then give up.
156 func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
157         uuidGood := "zzzzz-dz642-fake0fake0fake0"
158         uuidBad := "zzzzz-dz642-fake1fake1fake1"
159
160         slurm := &slurmFake{}
161         sqc := &SqueueChecker{
162                 Logger: logrus.StandardLogger(),
163                 Slurm:  slurm,
164                 Period: time.Hour,
165         }
166         sqc.startOnce.Do(sqc.start)
167         sqc.Stop()
168         sqc.check()
169
170         done := make(chan struct{})
171         go func() {
172                 sqc.SetPriority(uuidGood, 123)
173                 sqc.SetPriority(uuidBad, 345)
174                 close(done)
175         }()
176         c.Check(sqc.queue[uuidGood], IsNil)
177         c.Check(sqc.queue[uuidBad], IsNil)
178         timeout := time.NewTimer(time.Second)
179         defer timeout.Stop()
180         tick := time.NewTicker(time.Millisecond)
181         defer tick.Stop()
182         for {
183                 select {
184                 case <-tick.C:
185                         slurm.queue = uuidGood + " 0 12345 PENDING Resources\n"
186                         sqc.check()
187
188                         // Avoid immediately selecting this case again
189                         // on the next iteration if check() took
190                         // longer than one tick.
191                         select {
192                         case <-tick.C:
193                         default:
194                         }
195                 case <-timeout.C:
196                         c.Fatal("timed out")
197                 case <-done:
198                         c.Assert(sqc.queue[uuidGood], NotNil)
199                         c.Check(sqc.queue[uuidGood].wantPriority, Equals, int64(123))
200                         c.Check(sqc.queue[uuidBad], IsNil)
201                         return
202                 }
203         }
204 }
205
// callUntilReady invokes fn once per millisecond tick until done is
// closed (or receives a value), then returns. fn runs on the calling
// goroutine, so a slow fn delays both the next tick and the check of
// done.
func callUntilReady(fn func(), done <-chan struct{}) {
	ticker := time.NewTicker(time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			fn()
			continue
		case <-done:
		}
		// Only the done case falls through to here.
		return
	}
}