Merge branch '20531-cwl-log-tail' refs #20531
[arvados.git] / lib / dispatchcloud / worker / worker_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "bytes"
9         "crypto/md5"
10         "errors"
11         "fmt"
12         "io"
13         "strings"
14         "time"
15
16         "git.arvados.org/arvados.git/lib/cloud"
17         "git.arvados.org/arvados.git/lib/config"
18         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
19         "git.arvados.org/arvados.git/sdk/go/arvados"
20         "git.arvados.org/arvados.git/sdk/go/ctxlog"
21         "github.com/prometheus/client_golang/prometheus"
22         "github.com/sirupsen/logrus"
23         check "gopkg.in/check.v1"
24 )
25
26 var _ = check.Suite(&WorkerSuite{})
27
28 type WorkerSuite struct {
29         logger      logrus.FieldLogger
30         testCluster *arvados.Cluster
31 }
32
33 func (suite *WorkerSuite) SetUpTest(c *check.C) {
34         suite.logger = ctxlog.TestLogger(c)
35         cfg, err := config.NewLoader(nil, suite.logger).Load()
36         c.Assert(err, check.IsNil)
37         suite.testCluster, err = cfg.GetCluster("")
38         c.Assert(err, check.IsNil)
39 }
40
41 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
42         bootTimeout := time.Minute
43         probeTimeout := time.Second
44
45         ac := arvados.NewClientFromEnv()
46         is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, suite.logger)
47         c.Assert(err, check.IsNil)
48         inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
49         c.Assert(err, check.IsNil)
50
51         type trialT struct {
52                 testCaseComment string // displayed in test output to help identify failure case
53                 age             time.Duration
54                 state           State
55                 running         int
56                 starting        int
57                 respBoot        stubResp // zero value is success
58                 respDeploy      stubResp // zero value is success
59                 respRun         stubResp // zero value is success + nothing running
60                 respRunDeployed stubResp
61                 deployRunner    []byte
62                 expectStdin     []byte
63                 expectState     State
64                 expectRunning   int
65         }
66
67         errFail := errors.New("failed")
68         respFail := stubResp{"", "command failed\n", errFail}
69         respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
70         for idx, trial := range []trialT{
71                 {
72                         testCaseComment: "Unknown, probes fail",
73                         state:           StateUnknown,
74                         respBoot:        respFail,
75                         respRun:         respFail,
76                         expectState:     StateUnknown,
77                 },
78                 {
79                         testCaseComment: "Unknown, boot probe fails, but one container is running",
80                         state:           StateUnknown,
81                         respBoot:        respFail,
82                         respRun:         respContainerRunning,
83                         expectState:     StateUnknown,
84                         expectRunning:   1,
85                 },
86                 {
87                         testCaseComment: "Unknown, boot probe fails, previously running container has exited",
88                         state:           StateUnknown,
89                         running:         1,
90                         respBoot:        respFail,
91                         expectState:     StateUnknown,
92                         expectRunning:   0,
93                 },
94                 {
95                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
96                         state:           StateUnknown,
97                         age:             bootTimeout + time.Second,
98                         respBoot:        respFail,
99                         respRun:         respFail,
100                         expectState:     StateShutdown,
101                 },
102                 {
103                         testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
104                         state:           StateUnknown,
105                         age:             bootTimeout * 2,
106                         respRun:         respFail,
107                         expectState:     StateShutdown,
108                 },
109                 {
110                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
111                         state:           StateUnknown,
112                         age:             bootTimeout * 2,
113                         respBoot:        respFail,
114                         expectState:     StateShutdown,
115                 },
116                 {
117                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
118                         state:           StateUnknown,
119                         age:             bootTimeout * 2,
120                         respBoot:        respFail,
121                         respRun:         respContainerRunning,
122                         expectState:     StateUnknown,
123                         expectRunning:   1,
124                 },
125                 {
126                         testCaseComment: "Unknown, boot probe fails, deployRunner succeeds, container is running",
127                         state:           StateUnknown,
128                         respBoot:        respFail,
129                         respRun:         respFail,
130                         respRunDeployed: respContainerRunning,
131                         deployRunner:    []byte("ELF"),
132                         expectStdin:     []byte("ELF"),
133                         expectState:     StateUnknown,
134                         expectRunning:   1,
135                 },
136                 {
137                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and container is running",
138                         state:           StateUnknown,
139                         age:             bootTimeout * 2,
140                         respBoot:        respFail,
141                         respRun:         respFail,
142                         respRunDeployed: respContainerRunning,
143                         deployRunner:    []byte("ELF"),
144                         expectStdin:     []byte("ELF"),
145                         expectState:     StateUnknown,
146                         expectRunning:   1,
147                 },
148                 {
149                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but deployRunner succeeds and no container is running",
150                         state:           StateUnknown,
151                         age:             bootTimeout * 2,
152                         respBoot:        respFail,
153                         respRun:         respFail,
154                         deployRunner:    []byte("ELF"),
155                         expectStdin:     []byte("ELF"),
156                         expectState:     StateShutdown,
157                 },
158                 {
159                         testCaseComment: "Booting, boot probe fails, run probe fails",
160                         state:           StateBooting,
161                         respBoot:        respFail,
162                         respRun:         respFail,
163                         expectState:     StateBooting,
164                 },
165                 {
166                         testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
167                         state:           StateBooting,
168                         respBoot:        respFail,
169                         expectState:     StateBooting,
170                 },
171                 {
172                         testCaseComment: "Booting, boot probe succeeds, run probe fails",
173                         state:           StateBooting,
174                         respRun:         respFail,
175                         expectState:     StateBooting,
176                 },
177                 {
178                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
179                         state:           StateBooting,
180                         expectState:     StateIdle,
181                 },
182                 {
183                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
184                         state:           StateBooting,
185                         respRun:         respContainerRunning,
186                         expectState:     StateRunning,
187                         expectRunning:   1,
188                 },
189                 {
190                         testCaseComment: "Booting, boot timeout exceeded",
191                         state:           StateBooting,
192                         age:             bootTimeout * 2,
193                         respRun:         respFail,
194                         expectState:     StateShutdown,
195                 },
196                 {
197                         testCaseComment: "Idle, probe timeout exceeded, one container running",
198                         state:           StateIdle,
199                         age:             probeTimeout * 2,
200                         respRun:         respContainerRunning,
201                         expectState:     StateRunning,
202                         expectRunning:   1,
203                 },
204                 {
205                         testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
206                         state:           StateIdle,
207                         age:             probeTimeout * 2,
208                         running:         1,
209                         respRun:         respFail,
210                         expectState:     StateShutdown,
211                         expectRunning:   1,
212                 },
213                 {
214                         testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
215                         state:           StateIdle,
216                         age:             probeTimeout * 2,
217                         respRun:         respFail,
218                         expectState:     StateShutdown,
219                 },
220                 {
221                         testCaseComment: "Running, one container still running",
222                         state:           StateRunning,
223                         running:         1,
224                         respRun:         respContainerRunning,
225                         expectState:     StateRunning,
226                         expectRunning:   1,
227                 },
228                 {
229                         testCaseComment: "Running, container has exited",
230                         state:           StateRunning,
231                         running:         1,
232                         expectState:     StateIdle,
233                         expectRunning:   0,
234                 },
235                 {
236                         testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
237                         state:           StateRunning,
238                         age:             probeTimeout * 2,
239                         starting:        1,
240                         expectState:     StateRunning,
241                 },
242                 {
243                         testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
244                         state:           StateBooting,
245                         deployRunner:    []byte("ELF"),
246                         expectStdin:     []byte("ELF"),
247                         respRun:         respFail,
248                         respRunDeployed: respContainerRunning,
249                         expectRunning:   1,
250                         expectState:     StateRunning,
251                 },
252                 {
253                         testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
254                         state:           StateBooting,
255                         deployRunner:    []byte("ELF"),
256                         respDeploy:      respFail,
257                         expectStdin:     []byte("ELF"),
258                         expectState:     StateBooting,
259                 },
260                 {
261                         testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
262                         state:           StateBooting,
263                         deployRunner:    nil,
264                         respDeploy:      respFail,
265                         expectState:     StateIdle,
266                 },
267         } {
268                 c.Logf("------- trial %d: %#v", idx, trial)
269                 ctime := time.Now().Add(-trial.age)
270                 exr := &stubExecutor{
271                         response: map[string]stubResp{
272                                 "bootprobe":         trial.respBoot,
273                                 "crunch-run --list": trial.respRun,
274                                 "{deploy}":          trial.respDeploy,
275                         },
276                 }
277                 wp := &Pool{
278                         arvClient:        ac,
279                         newExecutor:      func(cloud.Instance) Executor { return exr },
280                         cluster:          suite.testCluster,
281                         bootProbeCommand: "bootprobe",
282                         timeoutBooting:   bootTimeout,
283                         timeoutProbe:     probeTimeout,
284                         exited:           map[string]time.Time{},
285                         runnerCmdDefault: "crunch-run",
286                         runnerArgs:       []string{"--args=not used with --list"},
287                         runnerCmd:        "crunch-run",
288                         runnerData:       trial.deployRunner,
289                         runnerMD5:        md5.Sum(trial.deployRunner),
290                 }
291                 wp.registerMetrics(prometheus.NewRegistry())
292                 if trial.deployRunner != nil {
293                         svHash := md5.Sum(trial.deployRunner)
294                         wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
295                         exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
296                 }
297                 wkr := &worker{
298                         logger:   suite.logger,
299                         executor: exr,
300                         wp:       wp,
301                         mtx:      &wp.mtx,
302                         state:    trial.state,
303                         instance: inst,
304                         appeared: ctime,
305                         busy:     ctime,
306                         probed:   ctime,
307                         updated:  ctime,
308                         running:  map[string]*remoteRunner{},
309                         starting: map[string]*remoteRunner{},
310                         probing:  make(chan struct{}, 1),
311                 }
312                 if trial.running > 0 {
313                         uuid := "zzzzz-dz642-abcdefghijklmno"
314                         wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
315                 }
316                 if trial.starting > 0 {
317                         uuid := "zzzzz-dz642-bcdefghijklmnop"
318                         wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
319                 }
320                 wkr.probeAndUpdate()
321                 c.Check(wkr.state, check.Equals, trial.expectState)
322                 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
323                 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
324         }
325 }
326
// stubResp is one canned command result: the text to return as
// stdout and stderr, and the error to return alongside them. The
// zero value is an empty, successful result.
type stubResp struct {
	stdout, stderr string
	err            error
}
332
333 type stubExecutor struct {
334         response map[string]stubResp
335         stdin    bytes.Buffer
336 }
337
338 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
339 func (se *stubExecutor) Close()                         {}
340 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
341         if stdin != nil {
342                 _, err = io.Copy(&se.stdin, stdin)
343                 if err != nil {
344                         return nil, []byte(err.Error()), err
345                 }
346         }
347         resp, ok := se.response[cmd]
348         if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
349                 resp, ok = se.response["{deploy}"]
350         }
351         if !ok {
352                 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
353         }
354         return []byte(resp.stdout), []byte(resp.stderr), resp.err
355 }