Merge branch '18324-lsf-gpu' refs #18324
[arvados.git] / lib / lsf / dispatch_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lsf
6
7 import (
8         "context"
9         "encoding/json"
10         "fmt"
11         "math/rand"
12         "os/exec"
13         "strconv"
14         "sync"
15         "testing"
16         "time"
17
18         "git.arvados.org/arvados.git/lib/config"
19         "git.arvados.org/arvados.git/sdk/go/arvados"
20         "git.arvados.org/arvados.git/sdk/go/arvadostest"
21         "git.arvados.org/arvados.git/sdk/go/ctxlog"
22         "github.com/prometheus/client_golang/prometheus"
23         "gopkg.in/check.v1"
24 )
25
26 func Test(t *testing.T) {
27         check.TestingT(t)
28 }
29
30 var _ = check.Suite(&suite{})
31
32 type suite struct {
33         disp          *dispatcher
34         crTooBig      arvados.ContainerRequest
35         crCUDARequest arvados.ContainerRequest
36 }
37
38 func (s *suite) TearDownTest(c *check.C) {
39         arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
40 }
41
42 func (s *suite) SetUpTest(c *check.C) {
43         cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
44         c.Assert(err, check.IsNil)
45         cluster, err := cfg.GetCluster("")
46         c.Assert(err, check.IsNil)
47         cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second)
48         s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher)
49         s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd {
50                 return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false")
51         }
52         err = arvados.NewClientFromEnv().RequestAndDecode(&s.crTooBig, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
53                 "container_request": map[string]interface{}{
54                         "runtime_constraints": arvados.RuntimeConstraints{
55                                 RAM:   1000000000000,
56                                 VCPUs: 1,
57                         },
58                         "container_image":     arvadostest.DockerImage112PDH,
59                         "command":             []string{"sleep", "1"},
60                         "mounts":              map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
61                         "output_path":         "/mnt/out",
62                         "state":               arvados.ContainerRequestStateCommitted,
63                         "priority":            1,
64                         "container_count_max": 1,
65                 },
66         })
67         c.Assert(err, check.IsNil)
68
69         err = arvados.NewClientFromEnv().RequestAndDecode(&s.crCUDARequest, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
70                 "container_request": map[string]interface{}{
71                         "runtime_constraints": arvados.RuntimeConstraints{
72                                 RAM:   16000000,
73                                 VCPUs: 1,
74                                 CUDA: arvados.CUDARuntimeConstraints{
75                                         DeviceCount:        1,
76                                         DriverVersion:      "11.0",
77                                         HardwareCapability: "8.0",
78                                 },
79                         },
80                         "container_image":     arvadostest.DockerImage112PDH,
81                         "command":             []string{"sleep", "1"},
82                         "mounts":              map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
83                         "output_path":         "/mnt/out",
84                         "state":               arvados.ContainerRequestStateCommitted,
85                         "priority":            1,
86                         "container_count_max": 1,
87                 },
88         })
89         c.Assert(err, check.IsNil)
90
91 }
92
// lsfstub configures the fake LSF command line tools produced by its
// stubCommand method.
type lsfstub struct {
	// sudoUser, when non-empty, is the user name the stub expects
	// after a leading "sudo -E -u" wrapper, which it strips before
	// dispatching on the real program name.
	sudoUser string
	// errorRate is the probability (compared against
	// rand.Float64) that any single stubbed command fails.
	errorRate float64
}
97
98 func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...string) *exec.Cmd {
99         mtx := sync.Mutex{}
100         nextjobid := 100
101         fakejobq := map[int]string{}
102         return func(prog string, args ...string) *exec.Cmd {
103                 c.Logf("stubCommand: %q %q", prog, args)
104                 if rand.Float64() < stub.errorRate {
105                         return exec.Command("bash", "-c", "echo >&2 'stub random failure' && false")
106                 }
107                 if stub.sudoUser != "" && len(args) > 3 &&
108                         prog == "sudo" &&
109                         args[0] == "-E" &&
110                         args[1] == "-u" &&
111                         args[2] == stub.sudoUser {
112                         prog, args = args[3], args[4:]
113                 }
114                 switch prog {
115                 case "bsub":
116                         defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList
117                         if args[5] == s.crCUDARequest.ContainerUUID {
118                                 c.Assert(len(args), check.Equals, len(defaultArgs)+len(s.disp.Cluster.Containers.LSF.BsubCUDAArguments))
119                         } else {
120                                 c.Assert(len(args), check.Equals, len(defaultArgs))
121                         }
122                         // %%J must have been rewritten to %J
123                         c.Check(args[1], check.Equals, "/tmp/crunch-run.%J.out")
124                         args = args[4:]
125                         switch args[1] {
126                         case arvadostest.LockedContainerUUID:
127                                 c.Check(args, check.DeepEquals, []string{
128                                         "-J", arvadostest.LockedContainerUUID,
129                                         "-n", "4",
130                                         "-D", "11701MB",
131                                         "-R", "rusage[mem=11701MB:tmp=0MB] span[hosts=1]",
132                                         "-R", "select[mem>=11701MB]",
133                                         "-R", "select[tmp>=0MB]",
134                                         "-R", "select[ncpus>=4]"})
135                                 mtx.Lock()
136                                 fakejobq[nextjobid] = args[1]
137                                 nextjobid++
138                                 mtx.Unlock()
139                         case arvadostest.QueuedContainerUUID:
140                                 c.Check(args, check.DeepEquals, []string{
141                                         "-J", arvadostest.QueuedContainerUUID,
142                                         "-n", "4",
143                                         "-D", "11701MB",
144                                         "-R", "rusage[mem=11701MB:tmp=45777MB] span[hosts=1]",
145                                         "-R", "select[mem>=11701MB]",
146                                         "-R", "select[tmp>=45777MB]",
147                                         "-R", "select[ncpus>=4]"})
148                                 mtx.Lock()
149                                 fakejobq[nextjobid] = args[1]
150                                 nextjobid++
151                                 mtx.Unlock()
152                         case s.crTooBig.ContainerUUID:
153                                 c.Check(args, check.DeepEquals, []string{
154                                         "-J", s.crTooBig.ContainerUUID,
155                                         "-n", "1",
156                                         "-D", "954187MB",
157                                         "-R", "rusage[mem=954187MB:tmp=256MB] span[hosts=1]",
158                                         "-R", "select[mem>=954187MB]",
159                                         "-R", "select[tmp>=256MB]",
160                                         "-R", "select[ncpus>=1]"})
161                                 mtx.Lock()
162                                 fakejobq[nextjobid] = args[1]
163                                 nextjobid++
164                                 mtx.Unlock()
165                         case s.crCUDARequest.ContainerUUID:
166                                 c.Check(args, check.DeepEquals, []string{
167                                         "-J", s.crCUDARequest.ContainerUUID,
168                                         "-n", "1",
169                                         "-D", "528MB",
170                                         "-R", "rusage[mem=528MB:tmp=256MB] span[hosts=1]",
171                                         "-R", "select[mem>=528MB]",
172                                         "-R", "select[tmp>=256MB]",
173                                         "-R", "select[ncpus>=1]",
174                                         "-gpu", "num=1"})
175                                 mtx.Lock()
176                                 fakejobq[nextjobid] = args[1]
177                                 nextjobid++
178                                 mtx.Unlock()
179                         default:
180                                 c.Errorf("unexpected uuid passed to bsub: args %q", args)
181                                 return exec.Command("false")
182                         }
183                         return exec.Command("echo", "submitted job")
184                 case "bjobs":
185                         c.Check(args, check.DeepEquals, []string{"-u", "all", "-o", "jobid stat job_name pend_reason", "-json"})
186                         var records []map[string]interface{}
187                         for jobid, uuid := range fakejobq {
188                                 stat, reason := "RUN", ""
189                                 if uuid == s.crTooBig.ContainerUUID {
190                                         // The real bjobs output includes a trailing ';' here:
191                                         stat, reason = "PEND", "There are no suitable hosts for the job;"
192                                 }
193                                 records = append(records, map[string]interface{}{
194                                         "JOBID":       fmt.Sprintf("%d", jobid),
195                                         "STAT":        stat,
196                                         "JOB_NAME":    uuid,
197                                         "PEND_REASON": reason,
198                                 })
199                         }
200                         out, err := json.Marshal(map[string]interface{}{
201                                 "COMMAND": "bjobs",
202                                 "JOBS":    len(fakejobq),
203                                 "RECORDS": records,
204                         })
205                         if err != nil {
206                                 panic(err)
207                         }
208                         c.Logf("bjobs out: %s", out)
209                         return exec.Command("printf", string(out))
210                 case "bkill":
211                         killid, _ := strconv.Atoi(args[0])
212                         if uuid, ok := fakejobq[killid]; !ok {
213                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: No matching job found\n'", killid))
214                         } else if uuid == "" {
215                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: Job has already finished\n'", killid))
216                         } else {
217                                 go func() {
218                                         time.Sleep(time.Millisecond)
219                                         mtx.Lock()
220                                         delete(fakejobq, killid)
221                                         mtx.Unlock()
222                                 }()
223                                 return exec.Command("bash", "-c", fmt.Sprintf("printf 'Job <%d> is being terminated\n'", killid))
224                         }
225                 default:
226                         return exec.Command("bash", "-c", fmt.Sprintf("echo >&2 'stub: command not found: %+q'", prog))
227                 }
228         }
229 }
230
231 func (s *suite) TestSubmit(c *check.C) {
232         s.disp.lsfcli.stubCommand = lsfstub{
233                 errorRate: 0.1,
234                 sudoUser:  s.disp.Cluster.Containers.LSF.BsubSudoUser,
235         }.stubCommand(s, c)
236         s.disp.Start()
237
238         deadline := time.Now().Add(20 * time.Second)
239         for range time.NewTicker(time.Second).C {
240                 if time.Now().After(deadline) {
241                         c.Error("timed out")
242                         break
243                 }
244                 // "queuedcontainer" should be running
245                 if _, ok := s.disp.lsfqueue.Lookup(arvadostest.QueuedContainerUUID); !ok {
246                         continue
247                 }
248                 // "lockedcontainer" should be cancelled because it
249                 // has priority 0 (no matching container requests)
250                 if _, ok := s.disp.lsfqueue.Lookup(arvadostest.LockedContainerUUID); ok {
251                         continue
252                 }
253                 // "crTooBig" should be cancelled because lsf stub
254                 // reports there is no suitable instance type
255                 if _, ok := s.disp.lsfqueue.Lookup(s.crTooBig.ContainerUUID); ok {
256                         continue
257                 }
258                 var ctr arvados.Container
259                 if err := s.disp.arvDispatcher.Arv.Get("containers", arvadostest.LockedContainerUUID, nil, &ctr); err != nil {
260                         c.Logf("error getting container state for %s: %s", arvadostest.LockedContainerUUID, err)
261                         continue
262                 } else if ctr.State != arvados.ContainerStateQueued {
263                         c.Logf("LockedContainer is not in the LSF queue but its arvados record has not been updated to state==Queued (state is %q)", ctr.State)
264                         continue
265                 }
266
267                 if err := s.disp.arvDispatcher.Arv.Get("containers", s.crTooBig.ContainerUUID, nil, &ctr); err != nil {
268                         c.Logf("error getting container state for %s: %s", s.crTooBig.ContainerUUID, err)
269                         continue
270                 } else if ctr.State != arvados.ContainerStateCancelled {
271                         c.Logf("container %s is not in the LSF queue but its arvados record has not been updated to state==Cancelled (state is %q)", s.crTooBig.ContainerUUID, ctr.State)
272                         continue
273                 } else {
274                         c.Check(ctr.RuntimeStatus["error"], check.Equals, "There are no suitable hosts for the job;")
275                 }
276                 c.Log("reached desired state")
277                 break
278         }
279 }