12199: Merge branch 'master' into 12199-dispatch-to-node-type
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/httptest"
	"os"
	"os/exec"
	"strings"
	"sync"
	"testing"
	"time"

	"git.curoverse.com/arvados.git/sdk/go/arvados"
	"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
	"git.curoverse.com/arvados.git/sdk/go/arvadostest"
	"git.curoverse.com/arvados.git/sdk/go/dispatch"
	"git.curoverse.com/arvados.git/services/dispatchcloud"
	. "gopkg.in/check.v1"
)
30
31 // Gocheck boilerplate
32 func Test(t *testing.T) {
33         TestingT(t)
34 }
35
36 var _ = Suite(&IntegrationSuite{})
37 var _ = Suite(&StubbedSuite{})
38
39 type IntegrationSuite struct {
40         disp  Dispatcher
41         slurm slurmFake
42 }
43
44 func (s *IntegrationSuite) SetUpTest(c *C) {
45         arvadostest.StartAPI()
46         os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
47         s.disp = Dispatcher{}
48         s.disp.setup()
49         s.slurm = slurmFake{}
50 }
51
52 func (s *IntegrationSuite) TearDownTest(c *C) {
53         arvadostest.ResetEnv()
54         arvadostest.StopAPI()
55 }
56
// slurmFake is an in-memory stand-in for slurm. It records the calls
// it receives and returns canned results.
type slurmFake struct {
	// didBatch holds the args of every Batch() call, in order.
	didBatch [][]string
	// didCancel holds the job name of every Cancel() call, in order.
	didCancel []string
	// didRenice holds a {name, nice} pair for every Renice() call.
	didRenice [][]string
	// queue is the text echoed by the command QueueCommand() returns.
	queue string
	// onCancel, if non-nil, is invoked on the second and every
	// later call to Cancel().
	onCancel func()
	// errBatch is the error returned by Batch().
	errBatch error
}
67
68 func (sf *slurmFake) Batch(script io.Reader, args []string) error {
69         sf.didBatch = append(sf.didBatch, args)
70         return sf.errBatch
71 }
72
73 func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
74         return exec.Command("echo", sf.queue)
75 }
76
77 func (sf *slurmFake) Renice(name string, nice int) error {
78         sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
79         return nil
80 }
81
82 func (sf *slurmFake) Cancel(name string) error {
83         sf.didCancel = append(sf.didCancel, name)
84         if len(sf.didCancel) == 1 {
85                 // simulate error on first attempt
86                 return errors.New("something terrible happened")
87         }
88         if sf.onCancel != nil {
89                 sf.onCancel()
90         }
91         return nil
92 }
93
94 func (s *IntegrationSuite) integrationTest(c *C,
95         expectBatch [][]string,
96         runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
97         arvadostest.ResetEnv()
98
99         arv, err := arvadosclient.MakeArvadosClient()
100         c.Assert(err, IsNil)
101
102         // There should be one queued container
103         params := arvadosclient.Dict{
104                 "filters": [][]string{{"state", "=", "Queued"}},
105         }
106         var containers arvados.ContainerList
107         err = arv.List("containers", params, &containers)
108         c.Check(err, IsNil)
109         c.Check(len(containers.Items), Equals, 1)
110
111         s.disp.CrunchRunCommand = []string{"echo"}
112
113         ctx, cancel := context.WithCancel(context.Background())
114         doneRun := make(chan struct{})
115
116         s.disp.Dispatcher = &dispatch.Dispatcher{
117                 Arv:        arv,
118                 PollPeriod: time.Duration(1) * time.Second,
119                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
120                         go func() {
121                                 runContainer(disp, ctr)
122                                 s.slurm.queue = ""
123                                 doneRun <- struct{}{}
124                         }()
125                         s.disp.runContainer(disp, ctr, status)
126                         cancel()
127                 },
128         }
129
130         s.disp.slurm = &s.slurm
131         s.disp.sqCheck = &SqueueChecker{Period: 500 * time.Millisecond, Slurm: s.disp.slurm}
132
133         err = s.disp.Dispatcher.Run(ctx)
134         <-doneRun
135         c.Assert(err, Equals, context.Canceled)
136
137         s.disp.sqCheck.Stop()
138
139         c.Check(s.slurm.didBatch, DeepEquals, expectBatch)
140
141         // There should be no queued containers now
142         err = arv.List("containers", params, &containers)
143         c.Check(err, IsNil)
144         c.Check(len(containers.Items), Equals, 0)
145
146         // Previously "Queued" container should now be in "Complete" state
147         var container arvados.Container
148         err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
149         c.Check(err, IsNil)
150         return container
151 }
152
153 func (s *IntegrationSuite) TestNormal(c *C) {
154         s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
155         container := s.integrationTest(c,
156                 nil,
157                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
158                         dispatcher.UpdateState(container.UUID, dispatch.Running)
159                         time.Sleep(3 * time.Second)
160                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
161                 })
162         c.Check(container.State, Equals, arvados.ContainerStateComplete)
163 }
164
165 func (s *IntegrationSuite) TestCancel(c *C) {
166         s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
167         readyToCancel := make(chan bool)
168         s.slurm.onCancel = func() { <-readyToCancel }
169         container := s.integrationTest(c,
170                 nil,
171                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
172                         dispatcher.UpdateState(container.UUID, dispatch.Running)
173                         time.Sleep(time.Second)
174                         dispatcher.Arv.Update("containers", container.UUID,
175                                 arvadosclient.Dict{
176                                         "container": arvadosclient.Dict{"priority": 0}},
177                                 nil)
178                         readyToCancel <- true
179                         close(readyToCancel)
180                 })
181         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
182         c.Check(len(s.slurm.didCancel) > 1, Equals, true)
183         c.Check(s.slurm.didCancel[:2], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "zzzzz-dz642-queuedcontainer"})
184 }
185
186 func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
187         container := s.integrationTest(c,
188                 [][]string{{
189                         fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
190                         fmt.Sprintf("--mem=%d", 11445),
191                         fmt.Sprintf("--cpus-per-task=%d", 4),
192                         fmt.Sprintf("--tmp=%d", 45777),
193                         fmt.Sprintf("--nice=%d", 9990)}},
194                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
195                         dispatcher.UpdateState(container.UUID, dispatch.Running)
196                         time.Sleep(3 * time.Second)
197                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
198                 })
199         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
200 }
201
202 func (s *IntegrationSuite) TestSbatchFail(c *C) {
203         s.slurm = slurmFake{errBatch: errors.New("something terrible happened")}
204         container := s.integrationTest(c,
205                 [][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--mem=11445", "--cpus-per-task=4", "--tmp=45777", "--nice=9990"}},
206                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
207                         dispatcher.UpdateState(container.UUID, dispatch.Running)
208                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
209                 })
210         c.Check(container.State, Equals, arvados.ContainerStateComplete)
211
212         arv, err := arvadosclient.MakeArvadosClient()
213         c.Assert(err, IsNil)
214
215         var ll arvados.LogList
216         err = arv.List("logs", arvadosclient.Dict{"filters": [][]string{
217                 {"object_uuid", "=", container.UUID},
218                 {"event_type", "=", "dispatch"},
219         }}, &ll)
220         c.Assert(len(ll.Items), Equals, 1)
221 }
222
223 func (s *IntegrationSuite) TestChangePriority(c *C) {
224         s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
225         container := s.integrationTest(c, nil,
226                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
227                         dispatcher.UpdateState(container.UUID, dispatch.Running)
228                         time.Sleep(time.Second)
229                         dispatcher.Arv.Update("containers", container.UUID,
230                                 arvadosclient.Dict{
231                                         "container": arvadosclient.Dict{"priority": 600}},
232                                 nil)
233                         time.Sleep(time.Second)
234                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
235                 })
236         c.Check(container.State, Equals, arvados.ContainerStateComplete)
237         c.Assert(len(s.slurm.didRenice), Not(Equals), 0)
238         c.Check(s.slurm.didRenice[len(s.slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
239 }
240
241 type StubbedSuite struct {
242         disp Dispatcher
243 }
244
245 func (s *StubbedSuite) SetUpTest(c *C) {
246         s.disp = Dispatcher{}
247         s.disp.setup()
248 }
249
250 func (s *StubbedSuite) TestAPIErrorGettingContainers(c *C) {
251         apiStubResponses := make(map[string]arvadostest.StubResponse)
252         apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
253         apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
254
255         s.testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
256 }
257
258 func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
259         apiStub := arvadostest.ServerStub{apiStubResponses}
260
261         api := httptest.NewServer(&apiStub)
262         defer api.Close()
263
264         arv := &arvadosclient.ArvadosClient{
265                 Scheme:    "http",
266                 ApiServer: api.URL[7:],
267                 ApiToken:  "abc123",
268                 Client:    &http.Client{Transport: &http.Transport{}},
269                 Retries:   0,
270         }
271
272         buf := bytes.NewBuffer(nil)
273         log.SetOutput(io.MultiWriter(buf, os.Stderr))
274         defer log.SetOutput(os.Stderr)
275
276         s.disp.CrunchRunCommand = []string{crunchCmd}
277
278         ctx, cancel := context.WithCancel(context.Background())
279         dispatcher := dispatch.Dispatcher{
280                 Arv:        arv,
281                 PollPeriod: time.Duration(1) * time.Second,
282                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
283                         go func() {
284                                 time.Sleep(1 * time.Second)
285                                 disp.UpdateState(ctr.UUID, dispatch.Running)
286                                 disp.UpdateState(ctr.UUID, dispatch.Complete)
287                         }()
288                         s.disp.runContainer(disp, ctr, status)
289                         cancel()
290                 },
291         }
292
293         go func() {
294                 for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
295                         time.Sleep(100 * time.Millisecond)
296                 }
297                 cancel()
298         }()
299
300         err := dispatcher.Run(ctx)
301         c.Assert(err, Equals, context.Canceled)
302
303         c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
304 }
305
306 func (s *StubbedSuite) TestNoSuchConfigFile(c *C) {
307         err := s.disp.readConfig("/nosuchdir89j7879/8hjwr7ojgyy7")
308         c.Assert(err, NotNil)
309 }
310
311 func (s *StubbedSuite) TestBadSbatchArgsConfig(c *C) {
312         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
313         c.Check(err, IsNil)
314         defer os.Remove(tmpfile.Name())
315
316         _, err = tmpfile.Write([]byte(`{"SbatchArguments": "oops this is not a string array"}`))
317         c.Check(err, IsNil)
318
319         err = s.disp.readConfig(tmpfile.Name())
320         c.Assert(err, NotNil)
321 }
322
323 func (s *StubbedSuite) TestNoSuchArgInConfigIgnored(c *C) {
324         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
325         c.Check(err, IsNil)
326         defer os.Remove(tmpfile.Name())
327
328         _, err = tmpfile.Write([]byte(`{"NoSuchArg": "Nobody loves me, not one tiny hunk."}`))
329         c.Check(err, IsNil)
330
331         err = s.disp.readConfig(tmpfile.Name())
332         c.Assert(err, IsNil)
333         c.Check(0, Equals, len(s.disp.SbatchArguments))
334 }
335
336 func (s *StubbedSuite) TestReadConfig(c *C) {
337         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
338         c.Check(err, IsNil)
339         defer os.Remove(tmpfile.Name())
340
341         args := []string{"--arg1=v1", "--arg2", "--arg3=v3"}
342         argsS := `{"SbatchArguments": ["--arg1=v1",  "--arg2", "--arg3=v3"]}`
343         _, err = tmpfile.Write([]byte(argsS))
344         c.Check(err, IsNil)
345
346         err = s.disp.readConfig(tmpfile.Name())
347         c.Assert(err, IsNil)
348         c.Check(args, DeepEquals, s.disp.SbatchArguments)
349 }
350
351 func (s *StubbedSuite) TestSbatchArgs(c *C) {
352         container := arvados.Container{
353                 UUID:               "123",
354                 RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
355                 Priority:           1,
356         }
357
358         for _, defaults := range [][]string{
359                 nil,
360                 {},
361                 {"--arg1=v1", "--arg2"},
362         } {
363                 c.Logf("%#v", defaults)
364                 s.disp.SbatchArguments = defaults
365
366                 args, err := s.disp.sbatchArgs(container)
367                 c.Check(args, DeepEquals, append(defaults, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990"))
368                 c.Check(err, IsNil)
369         }
370 }
371
372 func (s *StubbedSuite) TestSbatchInstanceTypeConstraint(c *C) {
373         container := arvados.Container{
374                 UUID:               "123",
375                 RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
376                 Priority:           1,
377         }
378
379         for _, trial := range []struct {
380                 types      []arvados.InstanceType
381                 sbatchArgs []string
382                 err        error
383         }{
384                 // Choose node type => use --constraint arg
385                 {
386                         types: []arvados.InstanceType{
387                                 {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
388                                 {Name: "a1.small", Price: 0.04, RAM: 256000000, VCPUs: 2},
389                                 {Name: "a1.medium", Price: 0.08, RAM: 512000000, VCPUs: 4},
390                                 {Name: "a1.large", Price: 0.16, RAM: 1024000000, VCPUs: 8},
391                         },
392                         sbatchArgs: []string{"--constraint=instancetype=a1.medium"},
393                 },
394                 // No node types configured => no slurm constraint
395                 {
396                         types:      nil,
397                         sbatchArgs: nil,
398                 },
399                 // No node type is big enough => error
400                 {
401                         types: []arvados.InstanceType{
402                                 {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
403                         },
404                         err: dispatchcloud.ErrConstraintsNotSatisfiable,
405                 },
406         } {
407                 c.Logf("%#v", trial)
408                 s.disp.cluster = &arvados.Cluster{InstanceTypes: trial.types}
409
410                 args, err := s.disp.sbatchArgs(container)
411                 c.Check(err, Equals, trial.err)
412                 if trial.err == nil {
413                         c.Check(args, DeepEquals, append([]string{"--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990"}, trial.sbatchArgs...))
414                 }
415         }
416 }
417
418 func (s *StubbedSuite) TestSbatchPartition(c *C) {
419         container := arvados.Container{
420                 UUID:                 "123",
421                 RuntimeConstraints:   arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 1},
422                 SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
423                 Priority:             1,
424         }
425
426         args, err := s.disp.sbatchArgs(container)
427         c.Check(args, DeepEquals, []string{
428                 "--job-name=123", "--mem=239", "--cpus-per-task=1", "--tmp=0", "--nice=9990",
429                 "--partition=blurb,b2",
430         })
431         c.Check(err, IsNil)
432 }