Merge branch '13446-keepstore-tls'
[arvados.git] / services / crunch-dispatch-slurm / crunch-dispatch-slurm_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package main
6
7 import (
8         "bytes"
9         "context"
10         "errors"
11         "fmt"
12         "io"
13         "io/ioutil"
14         "log"
15         "net/http"
16         "net/http/httptest"
17         "os"
18         "os/exec"
19         "strings"
20         "testing"
21         "time"
22
23         "git.curoverse.com/arvados.git/lib/dispatchcloud"
24         "git.curoverse.com/arvados.git/sdk/go/arvados"
25         "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
26         "git.curoverse.com/arvados.git/sdk/go/arvadostest"
27         "git.curoverse.com/arvados.git/sdk/go/dispatch"
28         . "gopkg.in/check.v1"
29 )
30
31 // Gocheck boilerplate
32 func Test(t *testing.T) {
33         TestingT(t)
34 }
35
36 var _ = Suite(&IntegrationSuite{})
37 var _ = Suite(&StubbedSuite{})
38
39 type IntegrationSuite struct {
40         disp  Dispatcher
41         slurm slurmFake
42 }
43
44 func (s *IntegrationSuite) SetUpTest(c *C) {
45         arvadostest.StartAPI()
46         os.Setenv("ARVADOS_API_TOKEN", arvadostest.Dispatch1Token)
47         s.disp = Dispatcher{}
48         s.disp.setup()
49         s.slurm = slurmFake{}
50 }
51
52 func (s *IntegrationSuite) TearDownTest(c *C) {
53         arvadostest.ResetEnv()
54         arvadostest.StopAPI()
55 }
56
// slurmFake is a stand-in for the real slurm command wrappers. It
// records the arguments of every call so tests can inspect them, and
// serves a canned queue listing.
type slurmFake struct {
	// didBatch holds the args slice of each Batch() call, in order.
	didBatch [][]string
	// didCancel holds the job name of each Cancel() call, in order.
	didCancel []string
	// didRelease holds the job name of each Release() call, in order.
	didRelease []string
	// didRenice holds a {name, nice} pair for each Renice() call.
	didRenice [][]string
	// queue is the text echoed by the command QueueCommand() returns.
	queue string
	// onCancel, if non-nil, is invoked during the second and every
	// later call to Cancel().
	onCancel func()
	// errBatch is returned verbatim by Batch().
	errBatch error
}
68
69 func (sf *slurmFake) Batch(script io.Reader, args []string) error {
70         sf.didBatch = append(sf.didBatch, args)
71         return sf.errBatch
72 }
73
74 func (sf *slurmFake) QueueCommand(args []string) *exec.Cmd {
75         return exec.Command("echo", sf.queue)
76 }
77
78 func (sf *slurmFake) Release(name string) error {
79         sf.didRelease = append(sf.didRelease, name)
80         return nil
81 }
82
83 func (sf *slurmFake) Renice(name string, nice int64) error {
84         sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
85         return nil
86 }
87
88 func (sf *slurmFake) Cancel(name string) error {
89         sf.didCancel = append(sf.didCancel, name)
90         if len(sf.didCancel) == 1 {
91                 // simulate error on first attempt
92                 return errors.New("something terrible happened")
93         }
94         if sf.onCancel != nil {
95                 sf.onCancel()
96         }
97         return nil
98 }
99
100 func (s *IntegrationSuite) integrationTest(c *C,
101         expectBatch [][]string,
102         runContainer func(*dispatch.Dispatcher, arvados.Container)) arvados.Container {
103         arvadostest.ResetEnv()
104
105         arv, err := arvadosclient.MakeArvadosClient()
106         c.Assert(err, IsNil)
107
108         // There should be one queued container
109         params := arvadosclient.Dict{
110                 "filters": [][]string{{"state", "=", "Queued"}},
111         }
112         var containers arvados.ContainerList
113         err = arv.List("containers", params, &containers)
114         c.Check(err, IsNil)
115         c.Check(len(containers.Items), Equals, 1)
116
117         s.disp.CrunchRunCommand = []string{"echo"}
118
119         ctx, cancel := context.WithCancel(context.Background())
120         doneRun := make(chan struct{})
121
122         s.disp.Dispatcher = &dispatch.Dispatcher{
123                 Arv:        arv,
124                 PollPeriod: time.Second,
125                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
126                         go func() {
127                                 runContainer(disp, ctr)
128                                 s.slurm.queue = ""
129                                 doneRun <- struct{}{}
130                         }()
131                         s.disp.runContainer(disp, ctr, status)
132                         cancel()
133                 },
134         }
135
136         s.disp.slurm = &s.slurm
137         s.disp.sqCheck = &SqueueChecker{Period: 500 * time.Millisecond, Slurm: s.disp.slurm}
138
139         err = s.disp.Dispatcher.Run(ctx)
140         <-doneRun
141         c.Assert(err, Equals, context.Canceled)
142
143         s.disp.sqCheck.Stop()
144
145         c.Check(s.slurm.didBatch, DeepEquals, expectBatch)
146
147         // There should be no queued containers now
148         err = arv.List("containers", params, &containers)
149         c.Check(err, IsNil)
150         c.Check(len(containers.Items), Equals, 0)
151
152         // Previously "Queued" container should now be in "Complete" state
153         var container arvados.Container
154         err = arv.Get("containers", "zzzzz-dz642-queuedcontainer", nil, &container)
155         c.Check(err, IsNil)
156         return container
157 }
158
159 func (s *IntegrationSuite) TestNormal(c *C) {
160         s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
161         container := s.integrationTest(c,
162                 nil,
163                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
164                         dispatcher.UpdateState(container.UUID, dispatch.Running)
165                         time.Sleep(3 * time.Second)
166                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
167                 })
168         c.Check(container.State, Equals, arvados.ContainerStateComplete)
169 }
170
171 func (s *IntegrationSuite) TestCancel(c *C) {
172         s.slurm = slurmFake{queue: "zzzzz-dz642-queuedcontainer 10000 100 PENDING Resources\n"}
173         readyToCancel := make(chan bool)
174         s.slurm.onCancel = func() { <-readyToCancel }
175         container := s.integrationTest(c,
176                 nil,
177                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
178                         dispatcher.UpdateState(container.UUID, dispatch.Running)
179                         time.Sleep(time.Second)
180                         dispatcher.Arv.Update("containers", container.UUID,
181                                 arvadosclient.Dict{
182                                         "container": arvadosclient.Dict{"priority": 0}},
183                                 nil)
184                         readyToCancel <- true
185                         close(readyToCancel)
186                 })
187         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
188         c.Check(len(s.slurm.didCancel) > 1, Equals, true)
189         c.Check(s.slurm.didCancel[:2], DeepEquals, []string{"zzzzz-dz642-queuedcontainer", "zzzzz-dz642-queuedcontainer"})
190 }
191
192 func (s *IntegrationSuite) TestMissingFromSqueue(c *C) {
193         container := s.integrationTest(c,
194                 [][]string{{
195                         fmt.Sprintf("--job-name=%s", "zzzzz-dz642-queuedcontainer"),
196                         fmt.Sprintf("--mem=%d", 11445),
197                         fmt.Sprintf("--cpus-per-task=%d", 4),
198                         fmt.Sprintf("--tmp=%d", 45777),
199                         fmt.Sprintf("--nice=%d", 10000)}},
200                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
201                         dispatcher.UpdateState(container.UUID, dispatch.Running)
202                         time.Sleep(3 * time.Second)
203                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
204                 })
205         c.Check(container.State, Equals, arvados.ContainerStateCancelled)
206 }
207
208 func (s *IntegrationSuite) TestSbatchFail(c *C) {
209         s.slurm = slurmFake{errBatch: errors.New("something terrible happened")}
210         container := s.integrationTest(c,
211                 [][]string{{"--job-name=zzzzz-dz642-queuedcontainer", "--mem=11445", "--cpus-per-task=4", "--tmp=45777", "--nice=10000"}},
212                 func(dispatcher *dispatch.Dispatcher, container arvados.Container) {
213                         dispatcher.UpdateState(container.UUID, dispatch.Running)
214                         dispatcher.UpdateState(container.UUID, dispatch.Complete)
215                 })
216         c.Check(container.State, Equals, arvados.ContainerStateComplete)
217
218         arv, err := arvadosclient.MakeArvadosClient()
219         c.Assert(err, IsNil)
220
221         var ll arvados.LogList
222         err = arv.List("logs", arvadosclient.Dict{"filters": [][]string{
223                 {"object_uuid", "=", container.UUID},
224                 {"event_type", "=", "dispatch"},
225         }}, &ll)
226         c.Assert(err, IsNil)
227         c.Assert(len(ll.Items), Equals, 1)
228 }
229
230 type StubbedSuite struct {
231         disp Dispatcher
232 }
233
234 func (s *StubbedSuite) SetUpTest(c *C) {
235         s.disp = Dispatcher{}
236         s.disp.setup()
237 }
238
239 func (s *StubbedSuite) TestAPIErrorGettingContainers(c *C) {
240         apiStubResponses := make(map[string]arvadostest.StubResponse)
241         apiStubResponses["/arvados/v1/api_client_authorizations/current"] = arvadostest.StubResponse{200, `{"uuid":"` + arvadostest.Dispatch1AuthUUID + `"}`}
242         apiStubResponses["/arvados/v1/containers"] = arvadostest.StubResponse{500, string(`{}`)}
243
244         s.testWithServerStub(c, apiStubResponses, "echo", "Error getting list of containers")
245 }
246
247 func (s *StubbedSuite) testWithServerStub(c *C, apiStubResponses map[string]arvadostest.StubResponse, crunchCmd string, expected string) {
248         apiStub := arvadostest.ServerStub{apiStubResponses}
249
250         api := httptest.NewServer(&apiStub)
251         defer api.Close()
252
253         arv := &arvadosclient.ArvadosClient{
254                 Scheme:    "http",
255                 ApiServer: api.URL[7:],
256                 ApiToken:  "abc123",
257                 Client:    &http.Client{Transport: &http.Transport{}},
258                 Retries:   0,
259         }
260
261         buf := bytes.NewBuffer(nil)
262         log.SetOutput(io.MultiWriter(buf, os.Stderr))
263         defer log.SetOutput(os.Stderr)
264
265         s.disp.CrunchRunCommand = []string{crunchCmd}
266
267         ctx, cancel := context.WithCancel(context.Background())
268         dispatcher := dispatch.Dispatcher{
269                 Arv:        arv,
270                 PollPeriod: time.Second,
271                 RunContainer: func(disp *dispatch.Dispatcher, ctr arvados.Container, status <-chan arvados.Container) {
272                         go func() {
273                                 time.Sleep(time.Second)
274                                 disp.UpdateState(ctr.UUID, dispatch.Running)
275                                 disp.UpdateState(ctr.UUID, dispatch.Complete)
276                         }()
277                         s.disp.runContainer(disp, ctr, status)
278                         cancel()
279                 },
280         }
281
282         go func() {
283                 for i := 0; i < 80 && !strings.Contains(buf.String(), expected); i++ {
284                         time.Sleep(100 * time.Millisecond)
285                 }
286                 cancel()
287         }()
288
289         err := dispatcher.Run(ctx)
290         c.Assert(err, Equals, context.Canceled)
291
292         c.Check(buf.String(), Matches, `(?ms).*`+expected+`.*`)
293 }
294
295 func (s *StubbedSuite) TestNoSuchConfigFile(c *C) {
296         err := s.disp.readConfig("/nosuchdir89j7879/8hjwr7ojgyy7")
297         c.Assert(err, NotNil)
298 }
299
300 func (s *StubbedSuite) TestBadSbatchArgsConfig(c *C) {
301         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
302         c.Check(err, IsNil)
303         defer os.Remove(tmpfile.Name())
304
305         _, err = tmpfile.Write([]byte(`{"SbatchArguments": "oops this is not a string array"}`))
306         c.Check(err, IsNil)
307
308         err = s.disp.readConfig(tmpfile.Name())
309         c.Assert(err, NotNil)
310 }
311
312 func (s *StubbedSuite) TestNoSuchArgInConfigIgnored(c *C) {
313         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
314         c.Check(err, IsNil)
315         defer os.Remove(tmpfile.Name())
316
317         _, err = tmpfile.Write([]byte(`{"NoSuchArg": "Nobody loves me, not one tiny hunk."}`))
318         c.Check(err, IsNil)
319
320         err = s.disp.readConfig(tmpfile.Name())
321         c.Assert(err, IsNil)
322         c.Check(0, Equals, len(s.disp.SbatchArguments))
323 }
324
325 func (s *StubbedSuite) TestReadConfig(c *C) {
326         tmpfile, err := ioutil.TempFile(os.TempDir(), "config")
327         c.Check(err, IsNil)
328         defer os.Remove(tmpfile.Name())
329
330         args := []string{"--arg1=v1", "--arg2", "--arg3=v3"}
331         argsS := `{"SbatchArguments": ["--arg1=v1",  "--arg2", "--arg3=v3"]}`
332         _, err = tmpfile.Write([]byte(argsS))
333         c.Check(err, IsNil)
334
335         err = s.disp.readConfig(tmpfile.Name())
336         c.Assert(err, IsNil)
337         c.Check(args, DeepEquals, s.disp.SbatchArguments)
338 }
339
340 func (s *StubbedSuite) TestSbatchArgs(c *C) {
341         container := arvados.Container{
342                 UUID:               "123",
343                 RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
344                 Priority:           1,
345         }
346
347         for _, defaults := range [][]string{
348                 nil,
349                 {},
350                 {"--arg1=v1", "--arg2"},
351         } {
352                 c.Logf("%#v", defaults)
353                 s.disp.SbatchArguments = defaults
354
355                 args, err := s.disp.sbatchArgs(container)
356                 c.Check(args, DeepEquals, append(defaults, "--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=10000"))
357                 c.Check(err, IsNil)
358         }
359 }
360
361 func (s *StubbedSuite) TestSbatchInstanceTypeConstraint(c *C) {
362         container := arvados.Container{
363                 UUID:               "123",
364                 RuntimeConstraints: arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 2},
365                 Priority:           1,
366         }
367
368         for _, trial := range []struct {
369                 types      []arvados.InstanceType
370                 sbatchArgs []string
371                 err        error
372         }{
373                 // Choose node type => use --constraint arg
374                 {
375                         types: []arvados.InstanceType{
376                                 {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
377                                 {Name: "a1.small", Price: 0.04, RAM: 256000000, VCPUs: 2},
378                                 {Name: "a1.medium", Price: 0.08, RAM: 512000000, VCPUs: 4},
379                                 {Name: "a1.large", Price: 0.16, RAM: 1024000000, VCPUs: 8},
380                         },
381                         sbatchArgs: []string{"--constraint=instancetype=a1.medium"},
382                 },
383                 // No node types configured => no slurm constraint
384                 {
385                         types:      nil,
386                         sbatchArgs: nil,
387                 },
388                 // No node type is big enough => error
389                 {
390                         types: []arvados.InstanceType{
391                                 {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
392                         },
393                         err: dispatchcloud.ErrConstraintsNotSatisfiable,
394                 },
395         } {
396                 c.Logf("%#v", trial)
397                 s.disp.cluster = &arvados.Cluster{InstanceTypes: trial.types}
398
399                 args, err := s.disp.sbatchArgs(container)
400                 c.Check(err, Equals, trial.err)
401                 if trial.err == nil {
402                         c.Check(args, DeepEquals, append([]string{"--job-name=123", "--mem=239", "--cpus-per-task=2", "--tmp=0", "--nice=10000"}, trial.sbatchArgs...))
403                 }
404         }
405 }
406
407 func (s *StubbedSuite) TestSbatchPartition(c *C) {
408         container := arvados.Container{
409                 UUID:                 "123",
410                 RuntimeConstraints:   arvados.RuntimeConstraints{RAM: 250000000, VCPUs: 1},
411                 SchedulingParameters: arvados.SchedulingParameters{Partitions: []string{"blurb", "b2"}},
412                 Priority:             1,
413         }
414
415         args, err := s.disp.sbatchArgs(container)
416         c.Check(args, DeepEquals, []string{
417                 "--job-name=123", "--mem=239", "--cpus-per-task=1", "--tmp=0", "--nice=10000",
418                 "--partition=blurb,b2",
419         })
420         c.Check(err, IsNil)
421 }