13964: Trim space
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
7 import (
8         "bytes"
9         "context"
10         "errors"
11         "fmt"
12         "io"
13         "io/ioutil"
14         "net/http"
15         "net/http/httptest"
16         "os"
17         "os/exec"
18         "strings"
19         "testing"
20         "time"
21
22         "git.curoverse.com/arvados.git/lib/dispatchcloud"
23         "git.curoverse.com/arvados.git/sdk/go/arvados"
24         "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
25         "git.curoverse.com/arvados.git/sdk/go/arvadostest"
26         "git.curoverse.com/arvados.git/sdk/go/dispatch"
27         "github.com/Sirupsen/logrus"
28         . "gopkg.in/check.v1"
29 )
30
31 // Gocheck boilerplate
32 func Test(t *testing.T) {
33         TestingT(t)
34 }
35
36 var _ = Suite(&IntegrationSuite{})
37 var _ = Suite(&StubbedSuite{})
38
39 type IntegrationSuite struct {
40         disp  Dispatcher
41         slurm slurmFake
42 }
43
44 func (s *IntegrationSuite) SetUpTest(c *C) {
45         arvadostest.StartAPI()
46         os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
47         s.disp = Dispatcher{}
48         s.disp.setup()
49         s.slurm = slurmFake{}
50 }
51
52 func (s *IntegrationSuite) TearDownTest(c *C) {
53         arvadostest.ResetEnv()
54         arvadostest.StopAPI()
55 }
56
// slurmFake is an in-memory stand-in for the slurm command wrappers
// used by the dispatcher. It records the arguments of every call so
// tests can inspect them afterwards.
type slurmFake struct {
	// Arguments of each Batch(), Cancel(), Release() and Renice()
	// call, in call order.
	didBatch   [][]string
	didCancel  []string
	didRelease []string
	didRenice  [][]string
	// Text passed to "echo" by the command QueueCommand() returns.
	queue string
	// If true, Renice() fails for nice values above 10000.
	rejectNice10K bool
	// If non-nil, run this func during the 2nd+ call to Cancel()
	onCancel func()
	// Error returned by Batch()
	errBatch error
}

// Batch records args and returns the configured errBatch. The script
// is not read.
func (sf *slurmFake) Batch(script io.Reader, args []string) error {
	sf.didBatch = append(sf.didBatch, args)
	return sf.errBatch
}

// QueueCommand returns a command that echoes the configured queue
// text. The args are ignored.
func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
	cmd := exec.Command("echo", sf.queue)
	return cmd
}

// Release records name and always succeeds.
func (sf *slurmFake) Release(name string) error {
	sf.didRelease = append(sf.didRelease, name)
	return nil
}

// Renice records the {name, nice} pair, then fails if rejectNice10K
// is set and nice is greater than 10000.
func (sf *slurmFake) Renice(name string, nice int64) error {
	call := []string{name, fmt.Sprintf("%d", nice)}
	sf.didRenice = append(sf.didRenice, call)
	if !sf.rejectNice10K || nice <= 10000 {
		return nil
	}
	return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
}

// Cancel records name. The first call ever made returns an error;
// every later call runs onCancel (if set) and succeeds.
func (sf *slurmFake) Cancel(name string) error {
	sf.didCancel = append(sf.didCancel, name)
	switch {
	case len(sf.didCancel) == 1:
		// simulate error on first attempt
		return errors.New("something terrible happened")
	case sf.onCancel != nil:
		sf.onCancel()
	}
	return nil
}
103
104 func (s *IntegrationSuite) integrationTest(c *C,
105         expectBatch [][]string,
106         runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
107         arvadostest.ResetEnv()
108
109         arv, err := arvadosclient.MakeArvadosClient()
110         c.Assert(err, IsNil)
111
112         // There should be one queued container
113         params := arvadosclient.Dict{
114                 "filters": [][]string{{"state", "=", "Queued"}},
115         }
116         var containers arvados.ContainerList
117         err = arv.List("containers", params, &containers)
118         c.Check(err, IsNil)
119         c.Assert(len(containers.Items), Equals, 1)
120
121         s.disp.CrunchRunCommand = []string{"echo"}
122
123         ctx, cancel := context.WithCancel(context.Background())
124         doneRun := make(chan struct{})
125
126         s.disp.Dispatcher = &dispatch.Dispatcher{
127                 Arv:        arv,
128                 PollPeriod: time.Second,
129                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
130                         go func() {
131                                 runContainer(disp, ctr)
132                                 s.slurm.queue = ""
133                                 doneRun <- struct{}{}
134                         }()
135                         s.disp.runContainer(disp, ctr, status)
136                         cancel()
137                 },
138         }
139
140         s.disp.slurm = &s.slurm
141         s.disp.sqCheck = &SqueueChecker{
142                 Logger: logrus.StandardLogger(),
143                 Period: 500 * time.Millisecond,
144                 Slurm:  s.disp.slurm,
145         }
146
147         err = s.disp.Dispatcher.Run(ctx)
148         <-doneRun
149         c.Assert(err, Equals, context.Canceled)
150
151         s.disp.sqCheck.Stop()
152
153         c.Check(s.slurm.didBatch, DeepEquals, expectBatch)
154
155         // There should be no queued containers now
156         err = arv.List("containers", params, &containers)
157         c.Check(err, IsNil)
158         c.Check(len(containers.Items), Equals, 0)
159
160         // Previously "Queued" container should now be in "Complete" state
161         var container arvados.Container
162         err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
163         c.Check(err, IsNil)
164         return container
165 }
166
167 func (s *IntegrationSuite) TestNormal(c *C) {
168         s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
169         container := s.integrationTest(c,
170                 nil,
171                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
172                         dispatcher.UpdateState(container.UUID, dispatch.Running)
173                         time.Sleep(3 * time.Second)
174                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
175                 })
176         c.Check(container.State, Equals, arvados.ContainerStateComplete)
177 }
178
179 func (s *IntegrationSuite) TestCancel(c *C) {
180         s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
181         readyToCancel := make(chan bool)
182         s.slurm.onCancel = func() { <-readyToCancel }
183         container := s.integrationTest(c,
184                 nil,
185                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
186                         dispatcher.UpdateState(container.UUID, dispatch.Running)
187                         time.Sleep(time.Second)
188                         dispatcher.Arv.Update("containers", container.UUID,
189                                 arvadosclient.Dict{
190                                         "container": arvadosclient.Dict{"priority": 0}},
191                                 nil)
192                         readyToCancel <- true
193                         close(readyToCancel)
194                 })
195         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
196         c.Check(len(s.slurm.didCancel) > 1, Equals, true)
197         c.Check(s.slurm.didCancel[:2], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "zzzzz-dz642-queuedcontainer"})
198 }
199
200 func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
201         container := s.integrationTest(c,
202                 [][]string{{
203                         fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
204                         fmt.Sprintf("--nice=%d", 10000),
205                         "--no-requeue",
206                         fmt.Sprintf("--mem=%d", 11445),
207                         fmt.Sprintf("--cpus-per-task=%d", 4),
208                         fmt.Sprintf("--tmp=%d", 45777),
209                 }},
210                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
211                         dispatcher.UpdateState(container.UUID, dispatch.Running)
212                         time.Sleep(3 * time.Second)
213                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
214                 })
215         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
216 }
217
218 func (s *IntegrationSuite) TestSbatchFail(c *C) {
219         s.slurm = slurmFake{errBatch: errors.New("something terrible happened")}
220         container := s.integrationTest(c,
221                 [][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--nice=10000", "--no-requeue", "--mem=11445", "--cpus-per-task=4", "--tmp=45777"}},
222                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
223                         dispatcher.UpdateState(container.UUID, dispatch.Running)
224                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
225                 })
226         c.Check(container.State, Equals, arvados.ContainerStateComplete)
227
228         arv, err := arvadosclient.MakeArvadosClient()
229         c.Assert(err, IsNil)
230
231         var ll arvados.LogList
232         err = arv.List("logs", arvadosclient.Dict{"filters": [][]string{
233                 {"object_uuid", "=", container.UUID},
234                 {"event_type", "=", "dispatch"},
235         }}, &ll)
236         c.Assert(err, IsNil)
237         c.Assert(len(ll.Items), Equals, 1)
238 }
239
240 type StubbedSuite struct {
241         disp Dispatcher
242 }
243
244 func (s *StubbedSuite) SetUpTest(c *C) {
245         s.disp = Dispatcher{}
246         s.disp.setup()
247 }
248
249 func (s *StubbedSuite) TestAPIErrorGettingContainers(c *C) {
250         apiStubResponses := make(map[string]arvadostest.StubResponse)
251         apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
252         apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
253
254         s.testWithServerStub(c, apiStubResponses, "echo", "error getting count of containers")
255 }
256
257 func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
258         apiStub := arvadostest.ServerStub{apiStubResponses}
259
260         api := httptest.NewServer(&apiStub)
261         defer api.Close()
262
263         arv := &arvadosclient.ArvadosClient{
264                 Scheme:    "http",
265                 ApiServer: api.URL[7:],
266                 ApiToken:  "abc123",
267                 Client:    &http.Client{Transport: &http.Transport{}},
268                 Retries:   0,
269         }
270
271         buf := bytes.NewBuffer(nil)
272         logrus.SetOutput(io.MultiWriter(buf, os.Stderr))
273         defer logrus.SetOutput(os.Stderr)
274
275         s.disp.CrunchRunCommand = []string{crunchCmd}
276
277         ctx, cancel := context.WithCancel(context.Background())
278         dispatcher := dispatch.Dispatcher{
279                 Arv:        arv,
280                 PollPeriod: time.Second,
281                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
282                         go func() {
283                                 time.Sleep(time.Second)
284                                 disp.UpdateState(ctr.UUID, dispatch.Running)
285                                 disp.UpdateState(ctr.UUID, dispatch.Complete)
286                         }()
287                         s.disp.runContainer(disp, ctr, status)
288                         cancel()
289                 },
290         }
291
292         go func() {
293                 for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
294                         time.Sleep(100 * time.Millisecond)
295                 }
296                 cancel()
297         }()
298
299         err := dispatcher.Run(ctx)
300         c.Assert(err, Equals, context.Canceled)
301
302         c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
303 }
304
305 func (s *StubbedSuite) TestNoSuchConfigFile(c *C) {
306         err := s.disp.readConfig("/nosuchdir89j7879/8hjwr7ojgyy7")
307         c.Assert(err, NotNil)
308 }
309
310 func (s *StubbedSuite) TestBadSbatchArgsConfig(c *C) {
311         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
312         c.Check(err, IsNil)
313         defer os.Remove(tmpfile.Name())
314
315         _, err = tmpfile.Write([]byte(`{"SbatchArguments": "oops this is not a string array"}`))
316         c.Check(err, IsNil)
317
318         err = s.disp.readConfig(tmpfile.Name())
319         c.Assert(err, NotNil)
320 }
321
322 func (s *StubbedSuite) TestNoSuchArgInConfigIgnored(c *C) {
323         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
324         c.Check(err, IsNil)
325         defer os.Remove(tmpfile.Name())
326
327         _, err = tmpfile.Write([]byte(`{"NoSuchArg": "Nobody loves me, not one tiny hunk."}`))
328         c.Check(err, IsNil)
329
330         err = s.disp.readConfig(tmpfile.Name())
331         c.Assert(err, IsNil)
332         c.Check(0, Equals, len(s.disp.SbatchArguments))
333 }
334
335 func (s *StubbedSuite) TestReadConfig(c *C) {
336         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
337         c.Check(err, IsNil)
338         defer os.Remove(tmpfile.Name())
339
340         args := []string{"--arg1=v1", "--arg2", "--arg3=v3"}
341         argsS := `{"SbatchArguments": ["--arg1=v1",  "--arg2", "--arg3=v3"]}`
342         _, err = tmpfile.Write([]byte(argsS))
343         c.Check(err, IsNil)
344
345         err = s.disp.readConfig(tmpfile.Name())
346         c.Assert(err, IsNil)
347         c.Check(args, DeepEquals, s.disp.SbatchArguments)
348 }
349
350 func (s *StubbedSuite) TestSbatchArgs(c *C) {
351         container := arvados.Container{
352                 UUID:               "123",
353                 RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
354                 Priority:           1,
355         }
356
357         for _, defaults := range [][]string{
358                 nil,
359                 {},
360                 {"--arg1=v1", "--arg2"},
361         } {
362                 c.Logf("%#v", defaults)
363                 s.disp.SbatchArguments = defaults
364
365                 args, err := s.disp.sbatchArgs(container)
366                 c.Check(args, DeepEquals, append(defaults, "--job-name=123", "--nice=10000", "--no-requeue", "--mem=239", "--cpus-per-task=2", "--tmp=0"))
367                 c.Check(err, IsNil)
368         }
369 }
370
371 func (s *StubbedSuite) TestSbatchInstanceTypeConstraint(c *C) {
372         container := arvados.Container{
373                 UUID:               "123",
374                 RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
375                 Priority:           1,
376         }
377
378         for _, trial := range []struct {
379                 types      map[string]arvados.InstanceType
380                 sbatchArgs []string
381                 err        error
382         }{
383                 // Choose node type => use --constraint arg
384                 {
385                         types: map[string]arvados.InstanceType{
386                                 "a1.tiny":   {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
387                                 "a1.small":  {Name: "a1.small", Price: 0.04, RAM: 256000000, VCPUs: 2},
388                                 "a1.medium": {Name: "a1.medium", Price: 0.08, RAM: 512000000, VCPUs: 4},
389                                 "a1.large":  {Name: "a1.large", Price: 0.16, RAM: 1024000000, VCPUs: 8},
390                         },
391                         sbatchArgs: []string{"--constraint=instancetype=a1.medium"},
392                 },
393                 // No node types configured => no slurm constraint
394                 {
395                         types:      nil,
396                         sbatchArgs: []string{"--mem=239", "--cpus-per-task=2", "--tmp=0"},
397                 },
398                 // No node type is big enough => error
399                 {
400                         types: map[string]arvados.InstanceType{
401                                 "a1.tiny": {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
402                         },
403                         err: dispatchcloud.ConstraintsNotSatisfiableError{},
404                 },
405         } {
406                 c.Logf("%#v", trial)
407                 s.disp.cluster = &arvados.Cluster{InstanceTypes: trial.types}
408
409                 args, err := s.disp.sbatchArgs(container)
410                 c.Check(err == nil, Equals, trial.err == nil)
411                 if trial.err == nil {
412                         c.Check(args, DeepEquals, append([]string{"--job-name=123", "--nice=10000", "--no-requeue"}, trial.sbatchArgs...))
413                 } else {
414                         c.Check(len(err.(dispatchcloud.ConstraintsNotSatisfiableError).AvailableTypes), Equals, len(trial.types))
415                 }
416         }
417 }
418
419 func (s *StubbedSuite) TestSbatchPartition(c *C) {
420         container := arvados.Container{
421                 UUID:                 "123",
422                 RuntimeConstraints:   arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 1},
423                 SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
424                 Priority:             1,
425         }
426
427         args, err := s.disp.sbatchArgs(container)
428         c.Check(args, DeepEquals, []string{
429                 "--job-name=123", "--nice=10000", "--no-requeue",
430                 "--mem=239", "--cpus-per-task=1", "--tmp=0",
431                 "--partition=blurb,b2",
432         })
433         c.Check(err, IsNil)
434 }