19686: Introduce low-level api_client constructor
[arvados.git] / lib / lsf / dispatch_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lsf
6
7 import (
8         "context"
9         "encoding/json"
10         "fmt"
11         "math/rand"
12         "os/exec"
13         "strconv"
14         "sync"
15         "testing"
16         "time"
17
18         "git.arvados.org/arvados.git/lib/config"
19         "git.arvados.org/arvados.git/sdk/go/arvados"
20         "git.arvados.org/arvados.git/sdk/go/arvadostest"
21         "git.arvados.org/arvados.git/sdk/go/ctxlog"
22         "github.com/prometheus/client_golang/prometheus"
23         "gopkg.in/check.v1"
24 )
25
26 func Test(t *testing.T) {
27         check.TestingT(t)
28 }
29
30 var _ = check.Suite(&suite{})
31
32 type suite struct {
33         disp          *dispatcher
34         crTooBig      arvados.ContainerRequest
35         crPending     arvados.ContainerRequest
36         crCUDARequest arvados.ContainerRequest
37 }
38
39 func (s *suite) TearDownTest(c *check.C) {
40         arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
41 }
42
43 func (s *suite) SetUpTest(c *check.C) {
44         cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
45         c.Assert(err, check.IsNil)
46         cluster, err := cfg.GetCluster("")
47         c.Assert(err, check.IsNil)
48         cluster.Containers.ReserveExtraRAM = 256 << 20
49         cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second / 4)
50         cluster.Containers.MinRetryPeriod = arvados.Duration(time.Second / 4)
51         cluster.InstanceTypes = arvados.InstanceTypeMap{
52                 "biggest_available_node": arvados.InstanceType{
53                         RAM:             100 << 30, // 100 GiB
54                         VCPUs:           4,
55                         IncludedScratch: 100 << 30,
56                         Scratch:         100 << 30,
57                 }}
58         s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher)
59         s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd {
60                 return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false")
61         }
62         err = arvados.NewClientFromEnv().RequestAndDecode(&s.crTooBig, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
63                 "container_request": map[string]interface{}{
64                         "runtime_constraints": arvados.RuntimeConstraints{
65                                 RAM:   1000000000000,
66                                 VCPUs: 1,
67                         },
68                         "container_image":     arvadostest.DockerImage112PDH,
69                         "command":             []string{"sleep", "1"},
70                         "mounts":              map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
71                         "output_path":         "/mnt/out",
72                         "state":               arvados.ContainerRequestStateCommitted,
73                         "priority":            1,
74                         "container_count_max": 1,
75                 },
76         })
77         c.Assert(err, check.IsNil)
78
79         err = arvados.NewClientFromEnv().RequestAndDecode(&s.crPending, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
80                 "container_request": map[string]interface{}{
81                         "runtime_constraints": arvados.RuntimeConstraints{
82                                 RAM:   100000000,
83                                 VCPUs: 2,
84                         },
85                         "container_image":     arvadostest.DockerImage112PDH,
86                         "command":             []string{"sleep", "1"},
87                         "mounts":              map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
88                         "output_path":         "/mnt/out",
89                         "state":               arvados.ContainerRequestStateCommitted,
90                         "priority":            1,
91                         "container_count_max": 1,
92                 },
93         })
94         c.Assert(err, check.IsNil)
95
96         err = arvados.NewClientFromEnv().RequestAndDecode(&s.crCUDARequest, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
97                 "container_request": map[string]interface{}{
98                         "runtime_constraints": arvados.RuntimeConstraints{
99                                 RAM:   16000000,
100                                 VCPUs: 1,
101                                 CUDA: arvados.CUDARuntimeConstraints{
102                                         DeviceCount:        1,
103                                         DriverVersion:      "11.0",
104                                         HardwareCapability: "8.0",
105                                 },
106                         },
107                         "container_image":     arvadostest.DockerImage112PDH,
108                         "command":             []string{"sleep", "1"},
109                         "mounts":              map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
110                         "output_path":         "/mnt/out",
111                         "state":               arvados.ContainerRequestStateCommitted,
112                         "priority":            1,
113                         "container_count_max": 1,
114                 },
115         })
116         c.Assert(err, check.IsNil)
117
118 }
119
// lsfstub configures the fake LSF command-line tools produced by its
// stubCommand method.
type lsfstub struct {
	// sudoUser, when non-empty, is the user name expected in a
	// leading "sudo -E -u <user>" wrapper; the wrapper is stripped
	// before the wrapped command is emulated.
	sudoUser string

	// errorRate is compared against rand.Float64() on every call;
	// when the random value is lower, the stub returns a command
	// that fails instead of emulating the requested program.
	errorRate float64
}
124
125 func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...string) *exec.Cmd {
126         mtx := sync.Mutex{}
127         nextjobid := 100
128         fakejobq := map[int]string{}
129         return func(prog string, args ...string) *exec.Cmd {
130                 c.Logf("stubCommand: %q %q", prog, args)
131                 if rand.Float64() < stub.errorRate {
132                         return exec.Command("bash", "-c", "echo >&2 'stub random failure' && false")
133                 }
134                 if stub.sudoUser != "" && len(args) > 3 &&
135                         prog == "sudo" &&
136                         args[0] == "-E" &&
137                         args[1] == "-u" &&
138                         args[2] == stub.sudoUser {
139                         prog, args = args[3], args[4:]
140                 }
141                 switch prog {
142                 case "bsub":
143                         defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList
144                         if args[5] == s.crCUDARequest.ContainerUUID {
145                                 c.Assert(len(args), check.Equals, len(defaultArgs)+len(s.disp.Cluster.Containers.LSF.BsubCUDAArguments))
146                         } else {
147                                 c.Assert(len(args), check.Equals, len(defaultArgs))
148                         }
149                         // %%J must have been rewritten to %J
150                         c.Check(args[1], check.Equals, "/tmp/crunch-run.%J.out")
151                         args = args[4:]
152                         switch args[1] {
153                         case arvadostest.LockedContainerUUID:
154                                 c.Check(args, check.DeepEquals, []string{
155                                         "-J", arvadostest.LockedContainerUUID,
156                                         "-n", "4",
157                                         "-D", "11701MB",
158                                         "-R", "rusage[mem=11701MB:tmp=0MB] span[hosts=1]",
159                                         "-R", "select[mem>=11701MB]",
160                                         "-R", "select[tmp>=0MB]",
161                                         "-R", "select[ncpus>=4]"})
162                                 mtx.Lock()
163                                 fakejobq[nextjobid] = args[1]
164                                 nextjobid++
165                                 mtx.Unlock()
166                         case arvadostest.QueuedContainerUUID:
167                                 c.Check(args, check.DeepEquals, []string{
168                                         "-J", arvadostest.QueuedContainerUUID,
169                                         "-n", "4",
170                                         "-D", "11701MB",
171                                         "-R", "rusage[mem=11701MB:tmp=45777MB] span[hosts=1]",
172                                         "-R", "select[mem>=11701MB]",
173                                         "-R", "select[tmp>=45777MB]",
174                                         "-R", "select[ncpus>=4]"})
175                                 mtx.Lock()
176                                 fakejobq[nextjobid] = args[1]
177                                 nextjobid++
178                                 mtx.Unlock()
179                         case s.crPending.ContainerUUID:
180                                 c.Check(args, check.DeepEquals, []string{
181                                         "-J", s.crPending.ContainerUUID,
182                                         "-n", "2",
183                                         "-D", "352MB",
184                                         "-R", "rusage[mem=352MB:tmp=8448MB] span[hosts=1]",
185                                         "-R", "select[mem>=352MB]",
186                                         "-R", "select[tmp>=8448MB]",
187                                         "-R", "select[ncpus>=2]"})
188                                 mtx.Lock()
189                                 fakejobq[nextjobid] = args[1]
190                                 nextjobid++
191                                 mtx.Unlock()
192                         case s.crCUDARequest.ContainerUUID:
193                                 c.Check(args, check.DeepEquals, []string{
194                                         "-J", s.crCUDARequest.ContainerUUID,
195                                         "-n", "1",
196                                         "-D", "528MB",
197                                         "-R", "rusage[mem=528MB:tmp=256MB] span[hosts=1]",
198                                         "-R", "select[mem>=528MB]",
199                                         "-R", "select[tmp>=256MB]",
200                                         "-R", "select[ncpus>=1]",
201                                         "-gpu", "num=1"})
202                                 mtx.Lock()
203                                 fakejobq[nextjobid] = args[1]
204                                 nextjobid++
205                                 mtx.Unlock()
206                         default:
207                                 c.Errorf("unexpected uuid passed to bsub: args %q", args)
208                                 return exec.Command("false")
209                         }
210                         return exec.Command("echo", "submitted job")
211                 case "bjobs":
212                         c.Check(args, check.DeepEquals, []string{"-u", "all", "-o", "jobid stat job_name pend_reason", "-json"})
213                         var records []map[string]interface{}
214                         for jobid, uuid := range fakejobq {
215                                 stat, reason := "RUN", ""
216                                 if uuid == s.crPending.ContainerUUID {
217                                         // The real bjobs output includes a trailing ';' here:
218                                         stat, reason = "PEND", "There are no suitable hosts for the job;"
219                                 }
220                                 records = append(records, map[string]interface{}{
221                                         "JOBID":       fmt.Sprintf("%d", jobid),
222                                         "STAT":        stat,
223                                         "JOB_NAME":    uuid,
224                                         "PEND_REASON": reason,
225                                 })
226                         }
227                         out, err := json.Marshal(map[string]interface{}{
228                                 "COMMAND": "bjobs",
229                                 "JOBS":    len(fakejobq),
230                                 "RECORDS": records,
231                         })
232                         if err != nil {
233                                 panic(err)
234                         }
235                         c.Logf("bjobs out: %s", out)
236                         return exec.Command("printf", string(out))
237                 case "bkill":
238                         killid, _ := strconv.Atoi(args[0])
239                         if uuid, ok := fakejobq[killid]; !ok {
240                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: No matching job found\n'", killid))
241                         } else if uuid == "" {
242                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: Job has already finished\n'", killid))
243                         } else {
244                                 go func() {
245                                         time.Sleep(time.Millisecond)
246                                         mtx.Lock()
247                                         delete(fakejobq, killid)
248                                         mtx.Unlock()
249                                 }()
250                                 return exec.Command("bash", "-c", fmt.Sprintf("printf 'Job <%d> is being terminated\n'", killid))
251                         }
252                 default:
253                         return exec.Command("bash", "-c", fmt.Sprintf("echo >&2 'stub: command not found: %+q'", prog))
254                 }
255         }
256 }
257
258 func (s *suite) TestSubmit(c *check.C) {
259         s.disp.lsfcli.stubCommand = lsfstub{
260                 errorRate: 0.1,
261                 sudoUser:  s.disp.Cluster.Containers.LSF.BsubSudoUser,
262         }.stubCommand(s, c)
263         s.disp.Start()
264
265         deadline := time.Now().Add(20 * time.Second)
266         for range time.NewTicker(time.Second).C {
267                 if time.Now().After(deadline) {
268                         c.Error("timed out")
269                         break
270                 }
271                 // "crTooBig" should never be submitted to lsf because
272                 // it is bigger than any configured instance type
273                 if ent, ok := s.disp.lsfqueue.Lookup(s.crTooBig.ContainerUUID); ok {
274                         c.Errorf("Lookup(crTooBig) == true, ent = %#v", ent)
275                         break
276                 }
277                 // "queuedcontainer" should be running
278                 if _, ok := s.disp.lsfqueue.Lookup(arvadostest.QueuedContainerUUID); !ok {
279                         c.Log("Lookup(queuedcontainer) == false")
280                         continue
281                 }
282                 // "crPending" should be pending
283                 if ent, ok := s.disp.lsfqueue.Lookup(s.crPending.ContainerUUID); !ok {
284                         c.Logf("Lookup(crPending) == false", ent)
285                         continue
286                 }
287                 // "lockedcontainer" should be cancelled because it
288                 // has priority 0 (no matching container requests)
289                 if ent, ok := s.disp.lsfqueue.Lookup(arvadostest.LockedContainerUUID); ok {
290                         c.Logf("Lookup(lockedcontainer) == true, ent = %#v", ent)
291                         continue
292                 }
293                 var ctr arvados.Container
294                 if err := s.disp.arvDispatcher.Arv.Get("containers", arvadostest.LockedContainerUUID, nil, &ctr); err != nil {
295                         c.Logf("error getting container state for %s: %s", arvadostest.LockedContainerUUID, err)
296                         continue
297                 } else if ctr.State != arvados.ContainerStateQueued {
298                         c.Logf("LockedContainer is not in the LSF queue but its arvados record has not been updated to state==Queued (state is %q)", ctr.State)
299                         continue
300                 }
301
302                 if err := s.disp.arvDispatcher.Arv.Get("containers", s.crTooBig.ContainerUUID, nil, &ctr); err != nil {
303                         c.Logf("error getting container state for %s: %s", s.crTooBig.ContainerUUID, err)
304                         continue
305                 } else if ctr.State != arvados.ContainerStateCancelled {
306                         c.Logf("container %s is not in the LSF queue but its arvados record has not been updated to state==Cancelled (state is %q)", s.crTooBig.ContainerUUID, ctr.State)
307                         continue
308                 } else {
309                         c.Check(ctr.RuntimeStatus["error"], check.Equals, "constraints not satisfiable by any configured instance type")
310                 }
311                 c.Log("reached desired state")
312                 break
313         }
314 }