Merge branch 'main' into 19385-cwl-fast-pack
[arvados.git] / lib / dispatchcloud / worker / worker_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "bytes"
9         "crypto/md5"
10         "errors"
11         "fmt"
12         "io"
13         "strings"
14         "time"
15
16         "git.arvados.org/arvados.git/lib/cloud"
17         "git.arvados.org/arvados.git/lib/config"
18         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
19         "git.arvados.org/arvados.git/sdk/go/arvados"
20         "git.arvados.org/arvados.git/sdk/go/ctxlog"
21         "github.com/prometheus/client_golang/prometheus"
22         "github.com/sirupsen/logrus"
23         check "gopkg.in/check.v1"
24 )
25
26 var _ = check.Suite(&WorkerSuite{})
27
28 type WorkerSuite struct {
29         logger      logrus.FieldLogger
30         testCluster *arvados.Cluster
31 }
32
33 func (suite *WorkerSuite) SetUpTest(c *check.C) {
34         suite.logger = ctxlog.TestLogger(c)
35         cfg, err := config.NewLoader(nil, suite.logger).Load()
36         c.Assert(err, check.IsNil)
37         suite.testCluster, err = cfg.GetCluster("")
38         c.Assert(err, check.IsNil)
39 }
40
41 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
42         bootTimeout := time.Minute
43         probeTimeout := time.Second
44
45         ac := arvados.NewClientFromEnv()
46         is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
47         c.Assert(err, check.IsNil)
48         inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
49         c.Assert(err, check.IsNil)
50
51         type trialT struct {
52                 testCaseComment string // displayed in test output to help identify failure case
53                 age             time.Duration
54                 state           State
55                 running         int
56                 starting        int
57                 respBoot        stubResp // zero value is success
58                 respDeploy      stubResp // zero value is success
59                 respRun         stubResp // zero value is success + nothing running
60                 respRunDeployed stubResp
61                 deployRunner    []byte
62                 expectStdin     []byte
63                 expectState     State
64                 expectRunning   int
65         }
66
67         errFail := errors.New("failed")
68         respFail := stubResp{"", "command failed\n", errFail}
69         respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
70         for idx, trial := range []trialT{
71                 {
72                         testCaseComment: "Unknown, probes fail",
73                         state:           StateUnknown,
74                         respBoot:        respFail,
75                         respRun:         respFail,
76                         expectState:     StateUnknown,
77                 },
78                 {
79                         testCaseComment: "Unknown, boot probe fails, but one container is running",
80                         state:           StateUnknown,
81                         respBoot:        respFail,
82                         respRun:         respContainerRunning,
83                         expectState:     StateUnknown,
84                         expectRunning:   1,
85                 },
86                 {
87                         testCaseComment: "Unknown, boot probe fails, previously running container has exited",
88                         state:           StateUnknown,
89                         running:         1,
90                         respBoot:        respFail,
91                         expectState:     StateUnknown,
92                         expectRunning:   0,
93                 },
94                 {
95                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
96                         state:           StateUnknown,
97                         age:             bootTimeout + time.Second,
98                         respBoot:        respFail,
99                         respRun:         respFail,
100                         expectState:     StateShutdown,
101                 },
102                 {
103                         testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
104                         state:           StateUnknown,
105                         age:             bootTimeout * 2,
106                         respRun:         respFail,
107                         expectState:     StateShutdown,
108                 },
109                 {
110                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
111                         state:           StateUnknown,
112                         age:             bootTimeout * 2,
113                         respBoot:        respFail,
114                         expectState:     StateShutdown,
115                 },
116                 {
117                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
118                         state:           StateUnknown,
119                         age:             bootTimeout * 2,
120                         respBoot:        respFail,
121                         respRun:         respContainerRunning,
122                         expectState:     StateUnknown,
123                         expectRunning:   1,
124                 },
125                 {
126                         testCaseComment: "Booting, boot probe fails, run probe fails",
127                         state:           StateBooting,
128                         respBoot:        respFail,
129                         respRun:         respFail,
130                         expectState:     StateBooting,
131                 },
132                 {
133                         testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
134                         state:           StateBooting,
135                         respBoot:        respFail,
136                         expectState:     StateBooting,
137                 },
138                 {
139                         testCaseComment: "Booting, boot probe succeeds, run probe fails",
140                         state:           StateBooting,
141                         respRun:         respFail,
142                         expectState:     StateBooting,
143                 },
144                 {
145                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
146                         state:           StateBooting,
147                         expectState:     StateIdle,
148                 },
149                 {
150                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
151                         state:           StateBooting,
152                         respRun:         respContainerRunning,
153                         expectState:     StateRunning,
154                         expectRunning:   1,
155                 },
156                 {
157                         testCaseComment: "Booting, boot timeout exceeded",
158                         state:           StateBooting,
159                         age:             bootTimeout * 2,
160                         respRun:         respFail,
161                         expectState:     StateShutdown,
162                 },
163                 {
164                         testCaseComment: "Idle, probe timeout exceeded, one container running",
165                         state:           StateIdle,
166                         age:             probeTimeout * 2,
167                         respRun:         respContainerRunning,
168                         expectState:     StateRunning,
169                         expectRunning:   1,
170                 },
171                 {
172                         testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
173                         state:           StateIdle,
174                         age:             probeTimeout * 2,
175                         running:         1,
176                         respRun:         respFail,
177                         expectState:     StateShutdown,
178                         expectRunning:   1,
179                 },
180                 {
181                         testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
182                         state:           StateIdle,
183                         age:             probeTimeout * 2,
184                         respRun:         respFail,
185                         expectState:     StateShutdown,
186                 },
187                 {
188                         testCaseComment: "Running, one container still running",
189                         state:           StateRunning,
190                         running:         1,
191                         respRun:         respContainerRunning,
192                         expectState:     StateRunning,
193                         expectRunning:   1,
194                 },
195                 {
196                         testCaseComment: "Running, container has exited",
197                         state:           StateRunning,
198                         running:         1,
199                         expectState:     StateIdle,
200                         expectRunning:   0,
201                 },
202                 {
203                         testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
204                         state:           StateRunning,
205                         age:             probeTimeout * 2,
206                         starting:        1,
207                         expectState:     StateRunning,
208                 },
209                 {
210                         testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
211                         state:           StateBooting,
212                         deployRunner:    []byte("ELF"),
213                         expectStdin:     []byte("ELF"),
214                         respRun:         respFail,
215                         respRunDeployed: respContainerRunning,
216                         expectRunning:   1,
217                         expectState:     StateRunning,
218                 },
219                 {
220                         testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
221                         state:           StateBooting,
222                         deployRunner:    []byte("ELF"),
223                         respDeploy:      respFail,
224                         expectStdin:     []byte("ELF"),
225                         expectState:     StateBooting,
226                 },
227                 {
228                         testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
229                         state:           StateBooting,
230                         deployRunner:    nil,
231                         respDeploy:      respFail,
232                         expectState:     StateIdle,
233                 },
234         } {
235                 c.Logf("------- trial %d: %#v", idx, trial)
236                 ctime := time.Now().Add(-trial.age)
237                 exr := &stubExecutor{
238                         response: map[string]stubResp{
239                                 "bootprobe":         trial.respBoot,
240                                 "crunch-run --list": trial.respRun,
241                                 "{deploy}":          trial.respDeploy,
242                         },
243                 }
244                 wp := &Pool{
245                         arvClient:        ac,
246                         newExecutor:      func(cloud.Instance) Executor { return exr },
247                         cluster:          suite.testCluster,
248                         bootProbeCommand: "bootprobe",
249                         timeoutBooting:   bootTimeout,
250                         timeoutProbe:     probeTimeout,
251                         exited:           map[string]time.Time{},
252                         runnerCmdDefault: "crunch-run",
253                         runnerArgs:       []string{"--args=not used with --list"},
254                         runnerCmd:        "crunch-run",
255                         runnerData:       trial.deployRunner,
256                         runnerMD5:        md5.Sum(trial.deployRunner),
257                 }
258                 wp.registerMetrics(prometheus.NewRegistry())
259                 if trial.deployRunner != nil {
260                         svHash := md5.Sum(trial.deployRunner)
261                         wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
262                         exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
263                 }
264                 wkr := &worker{
265                         logger:   suite.logger,
266                         executor: exr,
267                         wp:       wp,
268                         mtx:      &wp.mtx,
269                         state:    trial.state,
270                         instance: inst,
271                         appeared: ctime,
272                         busy:     ctime,
273                         probed:   ctime,
274                         updated:  ctime,
275                         running:  map[string]*remoteRunner{},
276                         starting: map[string]*remoteRunner{},
277                         probing:  make(chan struct{}, 1),
278                 }
279                 if trial.running > 0 {
280                         uuid := "zzzzz-dz642-abcdefghijklmno"
281                         wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
282                 }
283                 if trial.starting > 0 {
284                         uuid := "zzzzz-dz642-bcdefghijklmnop"
285                         wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
286                 }
287                 wkr.probeAndUpdate()
288                 c.Check(wkr.state, check.Equals, trial.expectState)
289                 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
290                 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
291         }
292 }
293
// stubResp is the canned result stubExecutor returns for a command:
// the bytes to hand back as stdout and stderr, plus the error value.
// The zero value is an empty, successful response.
//
// Field order matters: callers build values positionally.
type stubResp struct {
	stdout, stderr string
	err            error
}
299
300 type stubExecutor struct {
301         response map[string]stubResp
302         stdin    bytes.Buffer
303 }
304
305 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
306 func (se *stubExecutor) Close()                         {}
307 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
308         if stdin != nil {
309                 _, err = io.Copy(&se.stdin, stdin)
310                 if err != nil {
311                         return nil, []byte(err.Error()), err
312                 }
313         }
314         resp, ok := se.response[cmd]
315         if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
316                 resp, ok = se.response["{deploy}"]
317         }
318         if !ok {
319                 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
320         }
321         return []byte(resp.stdout), []byte(resp.stderr), resp.err
322 }