17755: Merge branch 'main' into 17755-add-singularity-to-compute-image
[arvados.git] / lib / lsf / dispatch_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lsf
6
7 import (
8         "context"
9         "fmt"
10         "math/rand"
11         "os/exec"
12         "strconv"
13         "sync"
14         "testing"
15         "time"
16
17         "git.arvados.org/arvados.git/lib/config"
18         "git.arvados.org/arvados.git/sdk/go/arvados"
19         "git.arvados.org/arvados.git/sdk/go/arvadostest"
20         "git.arvados.org/arvados.git/sdk/go/ctxlog"
21         "github.com/prometheus/client_golang/prometheus"
22         "gopkg.in/check.v1"
23 )
24
// Test is the "go test" entry point. It hands control to gocheck,
// which runs the suites registered with check.Suite.
func Test(t *testing.T) {
        check.TestingT(t)
}
28
// Register the LSF dispatcher test suite with gocheck.
var _ = check.Suite(&suite{})
30
// suite is the gocheck test suite for the LSF dispatcher.
type suite struct {
        // disp is the dispatcher under test; SetUpTest assigns a
        // new one before each test.
        disp *dispatcher
}
34
35 func (s *suite) TearDownTest(c *check.C) {
36         arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
37 }
38
39 func (s *suite) SetUpTest(c *check.C) {
40         cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
41         c.Assert(err, check.IsNil)
42         cluster, err := cfg.GetCluster("")
43         c.Assert(err, check.IsNil)
44         cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second)
45         s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher)
46         s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd {
47                 return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false")
48         }
49 }
50
// lsfstub holds the settings for the fake LSF command runner built
// by its stubCommand method.
type lsfstub struct {
        // sudoUser, if non-empty, is the user name expected in a
        // "sudo -E -u {user}" prefix; a matching prefix is stripped
        // before the command is interpreted.
        sudoUser  string
        // errorRate is the probability, compared against
        // rand.Float64(), that a stubbed command fails at random.
        errorRate float64
}
55
56 func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...string) *exec.Cmd {
57         mtx := sync.Mutex{}
58         nextjobid := 100
59         fakejobq := map[int]string{}
60         return func(prog string, args ...string) *exec.Cmd {
61                 c.Logf("stubCommand: %q %q", prog, args)
62                 if rand.Float64() < stub.errorRate {
63                         return exec.Command("bash", "-c", "echo >&2 'stub random failure' && false")
64                 }
65                 if stub.sudoUser != "" && len(args) > 3 &&
66                         prog == "sudo" &&
67                         args[0] == "-E" &&
68                         args[1] == "-u" &&
69                         args[2] == stub.sudoUser {
70                         prog, args = args[3], args[4:]
71                 }
72                 switch prog {
73                 case "bsub":
74                         defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList
75                         c.Assert(args, check.HasLen, 4+len(defaultArgs))
76                         c.Check(args[:len(defaultArgs)], check.DeepEquals, defaultArgs)
77                         args = args[len(defaultArgs):]
78
79                         c.Check(args[0], check.Equals, "-J")
80                         switch args[1] {
81                         case arvadostest.LockedContainerUUID:
82                                 c.Check(args, check.DeepEquals, []string{"-J", arvadostest.LockedContainerUUID, "-R", "rusage[mem=11701MB:tmp=0MB] affinity[core(4)]"})
83                                 mtx.Lock()
84                                 fakejobq[nextjobid] = args[1]
85                                 nextjobid++
86                                 mtx.Unlock()
87                         case arvadostest.QueuedContainerUUID:
88                                 c.Check(args, check.DeepEquals, []string{"-J", arvadostest.QueuedContainerUUID, "-R", "rusage[mem=11701MB:tmp=45777MB] affinity[core(4)]"})
89                                 mtx.Lock()
90                                 fakejobq[nextjobid] = args[1]
91                                 nextjobid++
92                                 mtx.Unlock()
93                         default:
94                                 c.Errorf("unexpected uuid passed to bsub: args %q", args)
95                                 return exec.Command("false")
96                         }
97                         return exec.Command("echo", "submitted job")
98                 case "bjobs":
99                         c.Check(args, check.DeepEquals, []string{"-u", "all", "-noheader", "-o", "jobid stat job_name:30"})
100                         out := ""
101                         for jobid, uuid := range fakejobq {
102                                 out += fmt.Sprintf(`%d %s %s\n`, jobid, "RUN", uuid)
103                         }
104                         c.Logf("bjobs out: %q", out)
105                         return exec.Command("printf", out)
106                 case "bkill":
107                         killid, _ := strconv.Atoi(args[0])
108                         if uuid, ok := fakejobq[killid]; !ok {
109                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: No matching job found\n'", killid))
110                         } else if uuid == "" {
111                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: Job has already finished\n'", killid))
112                         } else {
113                                 go func() {
114                                         time.Sleep(time.Millisecond)
115                                         mtx.Lock()
116                                         delete(fakejobq, killid)
117                                         mtx.Unlock()
118                                 }()
119                                 return exec.Command("bash", "-c", fmt.Sprintf("printf 'Job <%d> is being terminated\n'", killid))
120                         }
121                 default:
122                         return exec.Command("bash", "-c", fmt.Sprintf("echo >&2 'stub: command not found: %+q'", prog))
123                 }
124         }
125 }
126
127 func (s *suite) TestSubmit(c *check.C) {
128         s.disp.lsfcli.stubCommand = lsfstub{
129                 errorRate: 0.1,
130                 sudoUser:  s.disp.Cluster.Containers.LSF.BsubSudoUser,
131         }.stubCommand(s, c)
132         s.disp.Start()
133         deadline := time.Now().Add(20 * time.Second)
134         for range time.NewTicker(time.Second).C {
135                 if time.Now().After(deadline) {
136                         c.Error("timed out")
137                         break
138                 }
139                 // "queuedcontainer" should be running
140                 if _, ok := s.disp.lsfqueue.JobID(arvadostest.QueuedContainerUUID); !ok {
141                         continue
142                 }
143                 // "lockedcontainer" should be cancelled because it
144                 // has priority 0 (no matching container requests)
145                 if _, ok := s.disp.lsfqueue.JobID(arvadostest.LockedContainerUUID); ok {
146                         continue
147                 }
148                 var ctr arvados.Container
149                 if err := s.disp.arvDispatcher.Arv.Get("containers", arvadostest.LockedContainerUUID, nil, &ctr); err != nil {
150                         c.Logf("error getting container state for %s: %s", arvadostest.LockedContainerUUID, err)
151                         continue
152                 }
153                 if ctr.State != arvados.ContainerStateQueued {
154                         c.Logf("LockedContainer is not in the LSF queue but its arvados record has not been updated to state==Queued (state is %q)", ctr.State)
155                         continue
156                 }
157                 c.Log("reached desired state")
158                 break
159         }
160 }