17756: Add lsf dispatcher.
[arvados.git] / lib / lsf / dispatch_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lsf
6
7 import (
8         "context"
9         "fmt"
10         "math/rand"
11         "os/exec"
12         "strconv"
13         "sync"
14         "testing"
15         "time"
16
17         "git.arvados.org/arvados.git/lib/config"
18         "git.arvados.org/arvados.git/sdk/go/arvados"
19         "git.arvados.org/arvados.git/sdk/go/arvadostest"
20         "git.arvados.org/arvados.git/sdk/go/ctxlog"
21         "github.com/prometheus/client_golang/prometheus"
22         "gopkg.in/check.v1"
23 )
24
25 func Test(t *testing.T) {
26         check.TestingT(t)
27 }
28
29 var _ = check.Suite(&suite{})
30
31 type suite struct {
32         disp *dispatcher
33 }
34
35 func (s *suite) TearDownTest(c *check.C) {
36         arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil)
37 }
38
39 func (s *suite) SetUpTest(c *check.C) {
40         cfg, err := config.NewLoader(nil, ctxlog.TestLogger(c)).Load()
41         c.Assert(err, check.IsNil)
42         cluster, err := cfg.GetCluster("")
43         c.Assert(err, check.IsNil)
44         cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second)
45         s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher)
46         s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd {
47                 return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false")
48         }
49 }
50
// lsfstub holds the settings for the fake LSF command-line tools
// produced by its stubCommand method.
type lsfstub struct {
	// errorRate is the threshold compared against rand.Float64() on
	// each stubbed invocation: when the random value is below it, the
	// stub returns a command that fails instead of emulating LSF.
	errorRate float64
}
54
55 func (stub lsfstub) stubCommand(c *check.C) func(prog string, args ...string) *exec.Cmd {
56         mtx := sync.Mutex{}
57         nextjobid := 100
58         fakejobq := map[int]string{}
59         return func(prog string, args ...string) *exec.Cmd {
60                 c.Logf("stubCommand: %q %q", prog, args)
61                 if rand.Float64() < stub.errorRate {
62                         return exec.Command("bash", "-c", "echo >&2 'stub random failure' && false")
63                 }
64                 switch prog {
65                 case "bsub":
66                         c.Assert(args, check.HasLen, 4)
67                         c.Check(args[0], check.Equals, "-J")
68                         switch args[1] {
69                         case arvadostest.LockedContainerUUID:
70                                 c.Check(args, check.DeepEquals, []string{"-J", arvadostest.LockedContainerUUID, "-R", "rusage[mem=11701MB:tmp=0MB] affinity[core(4)]"})
71                                 mtx.Lock()
72                                 fakejobq[nextjobid] = args[1]
73                                 nextjobid++
74                                 mtx.Unlock()
75                         case arvadostest.QueuedContainerUUID:
76                                 c.Check(args, check.DeepEquals, []string{"-J", arvadostest.QueuedContainerUUID, "-R", "rusage[mem=11701MB:tmp=45777MB] affinity[core(4)]"})
77                                 mtx.Lock()
78                                 fakejobq[nextjobid] = args[1]
79                                 nextjobid++
80                                 mtx.Unlock()
81                         default:
82                                 c.Errorf("unexpected uuid passed to bsub: args %q", args)
83                                 return exec.Command("false")
84                         }
85                         return exec.Command("echo", "submitted job")
86                 case "bjobs":
87                         c.Check(args, check.DeepEquals, []string{"-noheader", "-o", "jobid stat job_name:30"})
88                         out := ""
89                         for jobid, uuid := range fakejobq {
90                                 out += fmt.Sprintf(`%d %s %s\n`, jobid, "RUN", uuid)
91                         }
92                         c.Logf("bjobs out: %q", out)
93                         return exec.Command("printf", out)
94                 case "bkill":
95                         killid, _ := strconv.Atoi(args[0])
96                         if uuid, ok := fakejobq[killid]; !ok {
97                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: No matching job found\n'", killid))
98                         } else if uuid == "" {
99                                 return exec.Command("bash", "-c", fmt.Sprintf("printf >&2 'Job <%d>: Job has already finished\n'", killid))
100                         } else {
101                                 go func() {
102                                         time.Sleep(time.Millisecond)
103                                         mtx.Lock()
104                                         delete(fakejobq, killid)
105                                         mtx.Unlock()
106                                 }()
107                                 return exec.Command("bash", "-c", fmt.Sprintf("printf 'Job <%d> is being terminated\n'", killid))
108                         }
109                 default:
110                         return exec.Command("bash", "-c", fmt.Sprintf("echo >&2 'stub: command not found: %+q'", prog))
111                 }
112         }
113 }
114
115 func (s *suite) TestSubmit(c *check.C) {
116         s.disp.lsfcli.stubCommand = lsfstub{errorRate: 0.1}.stubCommand(c)
117         s.disp.Start()
118         deadline := time.Now().Add(20 * time.Second)
119         for range time.NewTicker(time.Second).C {
120                 if time.Now().After(deadline) {
121                         c.Error("timed out")
122                         break
123                 }
124                 // "queuedcontainer" should be running
125                 if _, ok := s.disp.lsfqueue.JobID(arvadostest.QueuedContainerUUID); !ok {
126                         continue
127                 }
128                 // "lockedcontainer" should be cancelled because it
129                 // has priority 0 (no matching container requests)
130                 if _, ok := s.disp.lsfqueue.JobID(arvadostest.LockedContainerUUID); ok {
131                         continue
132                 }
133                 var ctr arvados.Container
134                 if err := s.disp.arvDispatcher.Arv.Get("containers", arvadostest.LockedContainerUUID, nil, &ctr); err != nil {
135                         c.Logf("error getting container state for %s: %s", arvadostest.LockedContainerUUID, err)
136                         continue
137                 }
138                 if ctr.State != arvados.ContainerStateQueued {
139                         c.Logf("LockedContainer is not in the LSF queue but its arvados record has not been updated to state==Queued (state is %q)", ctr.State)
140                         continue
141                 }
142                 c.Log("reached desired state")
143                 break
144         }
145 }