Merge branch 'master' into 11454-wb-federated-search
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/httptest"
	"os"
	"os/exec"
	"strings"
	"sync"
	"testing"
	"time"

	"git.curoverse.com/arvados.git/sdk/go/arvados"
	"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
	"git.curoverse.com/arvados.git/sdk/go/arvadostest"
	"git.curoverse.com/arvados.git/sdk/go/dispatch"
	. "gopkg.in/check.v1"
)
29
30 // Gocheck boilerplate
31 func Test(t *testing.T) {
32         TestingT(t)
33 }
34
35 var _ = Suite(&TestSuite{})
36 var _ = Suite(&MockArvadosServerSuite{})
37
38 type TestSuite struct{}
39 type MockArvadosServerSuite struct{}
40
41 var initialArgs []string
42
43 func (s *TestSuite) SetUpSuite(c *C) {
44         initialArgs = os.Args
45 }
46
47 func (s *TestSuite) TearDownSuite(c *C) {
48 }
49
50 func (s *TestSuite) SetUpTest(c *C) {
51         args := []string{"crunch-dispatch-slurm"}
52         os.Args = args
53
54         arvadostest.StartAPI()
55         os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
56 }
57
58 func (s *TestSuite) TearDownTest(c *C) {
59         os.Args = initialArgs
60         arvadostest.ResetEnv()
61         arvadostest.StopAPI()
62 }
63
64 func (s *MockArvadosServerSuite) TearDownTest(c *C) {
65         arvadostest.ResetEnv()
66 }
67
// slurmFake is a test double for the SLURM command wrappers. It records
// every call it receives so tests can assert on them afterwards.
type slurmFake struct {
	// Argument lists passed to Batch(), one entry per call.
	didBatch [][]string
	// Job names passed to Cancel(), one entry per call.
	didCancel []string
	// {name, nice} pairs passed to Renice(), one entry per call.
	didRenice [][]string
	// Text echoed by the command that QueueCommand() returns.
	queue string
	// If non-nil, run this func during the 2nd+ call to Cancel()
	onCancel func()
	// Error returned by Batch()
	errBatch error
}

// Batch records args and returns the preconfigured errBatch. The script
// is ignored.
func (sf *slurmFake) Batch(script io.Reader, args []string) error {
	sf.didBatch = append(sf.didBatch, args)
	return sf.errBatch
}

// QueueCommand returns an "echo" command whose single argument is the
// current fake queue text. The supplied args are ignored.
func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
	return exec.Command("echo", sf.queue)
}

// Renice records the job name together with the requested nice value
// (formatted as a decimal string) and always succeeds.
func (sf *slurmFake) Renice(name string, nice int) error {
	call := []string{name, fmt.Sprint(nice)}
	sf.didRenice = append(sf.didRenice, call)
	return nil
}

// Cancel records name. The very first call fails, simulating a transient
// scancel error; every later call runs onCancel (if set) and succeeds.
func (sf *slurmFake) Cancel(name string) error {
	sf.didCancel = append(sf.didCancel, name)
	switch {
	case len(sf.didCancel) == 1:
		// simulate error on first attempt
		return errors.New("something terrible happened")
	case sf.onCancel != nil:
		sf.onCancel()
	}
	return nil
}
104
105 func (s *TestSuite) integrationTest(c *C, slurm *slurmFake,
106         expectBatch [][]string,
107         runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
108         arvadostest.ResetEnv()
109
110         arv, err := arvadosclient.MakeArvadosClient()
111         c.Assert(err, IsNil)
112
113         defer func(orig Slurm) {
114                 theConfig.slurm = orig
115         }(theConfig.slurm)
116         theConfig.slurm = slurm
117
118         // There should be one queued container
119         params := arvadosclient.Dict{
120                 "filters": [][]string{{"state", "=", "Queued"}},
121         }
122         var containers arvados.ContainerList
123         err = arv.List("containers", params, &containers)
124         c.Check(err, IsNil)
125         c.Check(len(containers.Items), Equals, 1)
126
127         theConfig.CrunchRunCommand = []string{"echo"}
128
129         ctx, cancel := context.WithCancel(context.Background())
130         doneRun := make(chan struct{})
131
132         dispatcher := dispatch.Dispatcher{
133                 Arv:        arv,
134                 PollPeriod: time.Duration(1) * time.Second,
135                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
136                         go func() {
137                                 runContainer(disp, ctr)
138                                 slurm.queue = ""
139                                 doneRun <- struct{}{}
140                         }()
141                         run(disp, ctr, status)
142                         cancel()
143                 },
144         }
145
146         sqCheck = &SqueueChecker{Period: 500 * time.Millisecond}
147
148         err = dispatcher.Run(ctx)
149         <-doneRun
150         c.Assert(err, Equals, context.Canceled)
151
152         sqCheck.Stop()
153
154         c.Check(slurm.didBatch, DeepEquals, expectBatch)
155
156         // There should be no queued containers now
157         err = arv.List("containers", params, &containers)
158         c.Check(err, IsNil)
159         c.Check(len(containers.Items), Equals, 0)
160
161         // Previously "Queued" container should now be in "Complete" state
162         var container arvados.Container
163         err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
164         c.Check(err, IsNil)
165         return container
166 }
167
168 func (s *TestSuite) TestIntegrationNormal(c *C) {
169         container := s.integrationTest(c,
170                 &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"},
171                 nil,
172                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
173                         dispatcher.UpdateState(container.UUID, dispatch.Running)
174                         time.Sleep(3 * time.Second)
175                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
176                 })
177         c.Check(container.State, Equals, arvados.ContainerStateComplete)
178 }
179
180 func (s *TestSuite) TestIntegrationCancel(c *C) {
181         slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
182         readyToCancel := make(chan bool)
183         slurm.onCancel = func() { <-readyToCancel }
184         container := s.integrationTest(c,
185                 slurm,
186                 nil,
187                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
188                         dispatcher.UpdateState(container.UUID, dispatch.Running)
189                         time.Sleep(time.Second)
190                         dispatcher.Arv.Update("containers", container.UUID,
191                                 arvadosclient.Dict{
192                                         "container": arvadosclient.Dict{"priority": 0}},
193                                 nil)
194                         readyToCancel <- true
195                         close(readyToCancel)
196                 })
197         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
198         c.Check(len(slurm.didCancel) > 1, Equals, true)
199         c.Check(slurm.didCancel[:2], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "zzzzz-dz642-queuedcontainer"})
200 }
201
202 func (s *TestSuite) TestIntegrationMissingFromSqueue(c *C) {
203         container := s.integrationTest(c, &slurmFake{},
204                 [][]string{{
205                         fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
206                         fmt.Sprintf("--mem=%d", 11445),
207                         fmt.Sprintf("--cpus-per-task=%d", 4),
208                         fmt.Sprintf("--tmp=%d", 45777),
209                         fmt.Sprintf("--nice=%d", 9990)}},
210                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
211                         dispatcher.UpdateState(container.UUID, dispatch.Running)
212                         time.Sleep(3 * time.Second)
213                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
214                 })
215         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
216 }
217
218 func (s *TestSuite) TestSbatchFail(c *C) {
219         container := s.integrationTest(c,
220                 &slurmFake{errBatch: errors.New("something terrible happened")},
221                 [][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--mem=11445", "--cpus-per-task=4", "--tmp=45777", "--nice=9990"}},
222                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
223                         dispatcher.UpdateState(container.UUID, dispatch.Running)
224                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
225                 })
226         c.Check(container.State, Equals, arvados.ContainerStateComplete)
227
228         arv, err := arvadosclient.MakeArvadosClient()
229         c.Assert(err, IsNil)
230
231         var ll arvados.LogList
232         err = arv.List("logs", arvadosclient.Dict{"filters": [][]string{
233                 {"object_uuid", "=", container.UUID},
234                 {"event_type", "=", "dispatch"},
235         }}, &ll)
236         c.Assert(len(ll.Items), Equals, 1)
237 }
238
239 func (s *MockArvadosServerSuite) TestAPIErrorGettingContainers(c *C) {
240         apiStubResponses := make(map[string]arvadostest.StubResponse)
241         apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
242         apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
243
244         testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
245 }
246
247 func testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
248         apiStub := arvadostest.ServerStub{apiStubResponses}
249
250         api := httptest.NewServer(&apiStub)
251         defer api.Close()
252
253         arv := &arvadosclient.ArvadosClient{
254                 Scheme:    "http",
255                 ApiServer: api.URL[7:],
256                 ApiToken:  "abc123",
257                 Client:    &http.Client{Transport: &http.Transport{}},
258                 Retries:   0,
259         }
260
261         buf := bytes.NewBuffer(nil)
262         log.SetOutput(io.MultiWriter(buf, os.Stderr))
263         defer log.SetOutput(os.Stderr)
264
265         theConfig.CrunchRunCommand = []string{crunchCmd}
266
267         ctx, cancel := context.WithCancel(context.Background())
268         dispatcher := dispatch.Dispatcher{
269                 Arv:        arv,
270                 PollPeriod: time.Duration(1) * time.Second,
271                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
272                         go func() {
273                                 time.Sleep(1 * time.Second)
274                                 disp.UpdateState(ctr.UUID, dispatch.Running)
275                                 disp.UpdateState(ctr.UUID, dispatch.Complete)
276                         }()
277                         run(disp, ctr, status)
278                         cancel()
279                 },
280         }
281
282         go func() {
283                 for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
284                         time.Sleep(100 * time.Millisecond)
285                 }
286                 cancel()
287         }()
288
289         err := dispatcher.Run(ctx)
290         c.Assert(err, Equals, context.Canceled)
291
292         c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
293 }
294
295 func (s *MockArvadosServerSuite) TestNoSuchConfigFile(c *C) {
296         var config Config
297         err := readConfig(&config, "/nosuchdir89j7879/8hjwr7ojgyy7")
298         c.Assert(err, NotNil)
299 }
300
301 func (s *MockArvadosServerSuite) TestBadSbatchArgsConfig(c *C) {
302         var config Config
303
304         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
305         c.Check(err, IsNil)
306         defer os.Remove(tmpfile.Name())
307
308         _, err = tmpfile.Write([]byte(`{"SbatchArguments": "oops this is not a string array"}`))
309         c.Check(err, IsNil)
310
311         err = readConfig(&config, tmpfile.Name())
312         c.Assert(err, NotNil)
313 }
314
315 func (s *MockArvadosServerSuite) TestNoSuchArgInConfigIgnored(c *C) {
316         var config Config
317
318         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
319         c.Check(err, IsNil)
320         defer os.Remove(tmpfile.Name())
321
322         _, err = tmpfile.Write([]byte(`{"NoSuchArg": "Nobody loves me, not one tiny hunk."}`))
323         c.Check(err, IsNil)
324
325         err = readConfig(&config, tmpfile.Name())
326         c.Assert(err, IsNil)
327         c.Check(0, Equals, len(config.SbatchArguments))
328 }
329
330 func (s *MockArvadosServerSuite) TestReadConfig(c *C) {
331         var config Config
332
333         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
334         c.Check(err, IsNil)
335         defer os.Remove(tmpfile.Name())
336
337         args := []string{"--arg1=v1", "--arg2", "--arg3=v3"}
338         argsS := `{"SbatchArguments": ["--arg1=v1",  "--arg2", "--arg3=v3"]}`
339         _, err = tmpfile.Write([]byte(argsS))
340         c.Check(err, IsNil)
341
342         err = readConfig(&config, tmpfile.Name())
343         c.Assert(err, IsNil)
344         c.Check(3, Equals, len(config.SbatchArguments))
345         c.Check(args, DeepEquals, config.SbatchArguments)
346 }
347
348 func (s *MockArvadosServerSuite) TestSbatchFuncWithNoConfigArgs(c *C) {
349         testSbatchFuncWithArgs(c, nil)
350 }
351
352 func (s *MockArvadosServerSuite) TestSbatchFuncWithEmptyConfigArgs(c *C) {
353         testSbatchFuncWithArgs(c, []string{})
354 }
355
356 func (s *MockArvadosServerSuite) TestSbatchFuncWithConfigArgs(c *C) {
357         testSbatchFuncWithArgs(c, []string{"--arg1=v1", "--arg2"})
358 }
359
360 func testSbatchFuncWithArgs(c *C, args []string) {
361         defer func() { theConfig.SbatchArguments = nil }()
362         theConfig.SbatchArguments = append(theConfig.SbatchArguments, args...)
363
364         container := arvados.Container{
365                 UUID:               "123",
366                 RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
367                 Priority:           1}
368
369         var expected []string
370         expected = append(expected, theConfig.SbatchArguments...)
371         expected = append(expected, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=9990")
372         c.Check(sbatchArgs(container), DeepEquals, expected)
373 }
374
375 func (s *MockArvadosServerSuite) TestSbatchPartition(c *C) {
376         container := arvados.Container{
377                 UUID:                 "123",
378                 RuntimeConstraints:   arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 1},
379                 SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
380                 Priority:             1}
381
382         c.Check(sbatchArgs(container), DeepEquals, []string{
383                 "--job-name=123", "--mem=239", "--cpus-per-task=1", "--tmp=0", "--nice=9990",
384                 "--partition=blurb,b2",
385         })
386 }
387
388 func (s *TestSuite) TestIntegrationChangePriority(c *C) {
389         slurm := &slurmFake{queue: "zzzzz-dz642-queuedcontainer 9990 100\n"}
390         container := s.integrationTest(c, slurm, nil,
391                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
392                         dispatcher.UpdateState(container.UUID, dispatch.Running)
393                         time.Sleep(time.Second)
394                         dispatcher.Arv.Update("containers", container.UUID,
395                                 arvadosclient.Dict{
396                                         "container": arvadosclient.Dict{"priority": 600}},
397                                 nil)
398                         time.Sleep(time.Second)
399                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
400                 })
401         c.Check(container.State, Equals, arvados.ContainerStateComplete)
402         c.Assert(len(slurm.didRenice), Not(Equals), 0)
403         c.Check(slurm.didRenice[len(slurm.didRenice)-1], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "4000"})
404 }