16636: Merge branch 'master' into 16636-more-metrics
[arvados.git] / lib / dispatchcloud / worker / worker_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "bytes"
9         "crypto/md5"
10         "errors"
11         "fmt"
12         "io"
13         "strings"
14         "time"
15
16         "git.arvados.org/arvados.git/lib/cloud"
17         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
18         "git.arvados.org/arvados.git/sdk/go/arvados"
19         "git.arvados.org/arvados.git/sdk/go/ctxlog"
20         check "gopkg.in/check.v1"
21 )
22
23 var _ = check.Suite(&WorkerSuite{})
24
25 type WorkerSuite struct{}
26
27 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
28         logger := ctxlog.TestLogger(c)
29         bootTimeout := time.Minute
30         probeTimeout := time.Second
31
32         ac := arvados.NewClientFromEnv()
33         is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, logger)
34         c.Assert(err, check.IsNil)
35         inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
36         c.Assert(err, check.IsNil)
37
38         type trialT struct {
39                 testCaseComment string // displayed in test output to help identify failure case
40                 age             time.Duration
41                 state           State
42                 running         int
43                 starting        int
44                 respBoot        stubResp // zero value is success
45                 respDeploy      stubResp // zero value is success
46                 respRun         stubResp // zero value is success + nothing running
47                 respRunDeployed stubResp
48                 deployRunner    []byte
49                 expectStdin     []byte
50                 expectState     State
51                 expectRunning   int
52         }
53
54         errFail := errors.New("failed")
55         respFail := stubResp{"", "command failed\n", errFail}
56         respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
57         for idx, trial := range []trialT{
58                 {
59                         testCaseComment: "Unknown, probes fail",
60                         state:           StateUnknown,
61                         respBoot:        respFail,
62                         respRun:         respFail,
63                         expectState:     StateUnknown,
64                 },
65                 {
66                         testCaseComment: "Unknown, boot probe fails, but one container is running",
67                         state:           StateUnknown,
68                         respBoot:        respFail,
69                         respRun:         respContainerRunning,
70                         expectState:     StateUnknown,
71                         expectRunning:   1,
72                 },
73                 {
74                         testCaseComment: "Unknown, boot probe fails, previously running container has exited",
75                         state:           StateUnknown,
76                         running:         1,
77                         respBoot:        respFail,
78                         expectState:     StateUnknown,
79                         expectRunning:   0,
80                 },
81                 {
82                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
83                         state:           StateUnknown,
84                         age:             bootTimeout + time.Second,
85                         respBoot:        respFail,
86                         respRun:         respFail,
87                         expectState:     StateShutdown,
88                 },
89                 {
90                         testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
91                         state:           StateUnknown,
92                         age:             bootTimeout * 2,
93                         respRun:         respFail,
94                         expectState:     StateShutdown,
95                 },
96                 {
97                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
98                         state:           StateUnknown,
99                         age:             bootTimeout * 2,
100                         respBoot:        respFail,
101                         expectState:     StateShutdown,
102                 },
103                 {
104                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
105                         state:           StateUnknown,
106                         age:             bootTimeout * 2,
107                         respBoot:        respFail,
108                         respRun:         respContainerRunning,
109                         expectState:     StateUnknown,
110                         expectRunning:   1,
111                 },
112                 {
113                         testCaseComment: "Booting, boot probe fails, run probe fails",
114                         state:           StateBooting,
115                         respBoot:        respFail,
116                         respRun:         respFail,
117                         expectState:     StateBooting,
118                 },
119                 {
120                         testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
121                         state:           StateBooting,
122                         respBoot:        respFail,
123                         expectState:     StateBooting,
124                 },
125                 {
126                         testCaseComment: "Booting, boot probe succeeds, run probe fails",
127                         state:           StateBooting,
128                         respRun:         respFail,
129                         expectState:     StateBooting,
130                 },
131                 {
132                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
133                         state:           StateBooting,
134                         expectState:     StateIdle,
135                 },
136                 {
137                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
138                         state:           StateBooting,
139                         respRun:         respContainerRunning,
140                         expectState:     StateRunning,
141                         expectRunning:   1,
142                 },
143                 {
144                         testCaseComment: "Booting, boot timeout exceeded",
145                         state:           StateBooting,
146                         age:             bootTimeout * 2,
147                         respRun:         respFail,
148                         expectState:     StateShutdown,
149                 },
150                 {
151                         testCaseComment: "Idle, probe timeout exceeded, one container running",
152                         state:           StateIdle,
153                         age:             probeTimeout * 2,
154                         respRun:         respContainerRunning,
155                         expectState:     StateRunning,
156                         expectRunning:   1,
157                 },
158                 {
159                         testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
160                         state:           StateIdle,
161                         age:             probeTimeout * 2,
162                         running:         1,
163                         respRun:         respFail,
164                         expectState:     StateShutdown,
165                         expectRunning:   1,
166                 },
167                 {
168                         testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
169                         state:           StateIdle,
170                         age:             probeTimeout * 2,
171                         respRun:         respFail,
172                         expectState:     StateShutdown,
173                 },
174                 {
175                         testCaseComment: "Running, one container still running",
176                         state:           StateRunning,
177                         running:         1,
178                         respRun:         respContainerRunning,
179                         expectState:     StateRunning,
180                         expectRunning:   1,
181                 },
182                 {
183                         testCaseComment: "Running, container has exited",
184                         state:           StateRunning,
185                         running:         1,
186                         expectState:     StateIdle,
187                         expectRunning:   0,
188                 },
189                 {
190                         testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
191                         state:           StateRunning,
192                         age:             probeTimeout * 2,
193                         starting:        1,
194                         expectState:     StateRunning,
195                 },
196                 {
197                         testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
198                         state:           StateBooting,
199                         deployRunner:    []byte("ELF"),
200                         expectStdin:     []byte("ELF"),
201                         respRun:         respFail,
202                         respRunDeployed: respContainerRunning,
203                         expectRunning:   1,
204                         expectState:     StateRunning,
205                 },
206                 {
207                         testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
208                         state:           StateBooting,
209                         deployRunner:    []byte("ELF"),
210                         respDeploy:      respFail,
211                         expectStdin:     []byte("ELF"),
212                         expectState:     StateBooting,
213                 },
214                 {
215                         testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
216                         state:           StateBooting,
217                         deployRunner:    nil,
218                         respDeploy:      respFail,
219                         expectState:     StateIdle,
220                 },
221         } {
222                 c.Logf("------- trial %d: %#v", idx, trial)
223                 ctime := time.Now().Add(-trial.age)
224                 exr := &stubExecutor{
225                         response: map[string]stubResp{
226                                 "bootprobe":         trial.respBoot,
227                                 "crunch-run --list": trial.respRun,
228                                 "{deploy}":          trial.respDeploy,
229                         },
230                 }
231                 wp := &Pool{
232                         arvClient:        ac,
233                         newExecutor:      func(cloud.Instance) Executor { return exr },
234                         bootProbeCommand: "bootprobe",
235                         timeoutBooting:   bootTimeout,
236                         timeoutProbe:     probeTimeout,
237                         exited:           map[string]time.Time{},
238                         runnerCmd:        "crunch-run",
239                         runnerData:       trial.deployRunner,
240                         runnerMD5:        md5.Sum(trial.deployRunner),
241                 }
242                 if trial.deployRunner != nil {
243                         svHash := md5.Sum(trial.deployRunner)
244                         wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
245                         exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
246                 }
247                 wkr := &worker{
248                         logger:   logger,
249                         executor: exr,
250                         wp:       wp,
251                         mtx:      &wp.mtx,
252                         state:    trial.state,
253                         instance: inst,
254                         appeared: ctime,
255                         busy:     ctime,
256                         probed:   ctime,
257                         updated:  ctime,
258                         running:  map[string]*remoteRunner{},
259                         starting: map[string]*remoteRunner{},
260                         probing:  make(chan struct{}, 1),
261                 }
262                 if trial.running > 0 {
263                         uuid := "zzzzz-dz642-abcdefghijklmno"
264                         wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
265                 }
266                 if trial.starting > 0 {
267                         uuid := "zzzzz-dz642-bcdefghijklmnop"
268                         wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
269                 }
270                 wkr.probeAndUpdate()
271                 c.Check(wkr.state, check.Equals, trial.expectState)
272                 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
273                 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
274         }
275 }
276
// stubResp is a canned command result: the stdout and stderr text
// plus the error to report. Field order is significant because
// positional literals of this type are used.
type stubResp struct {
	stdout, stderr string
	err            error
}
282
283 type stubExecutor struct {
284         response map[string]stubResp
285         stdin    bytes.Buffer
286 }
287
288 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
289 func (se *stubExecutor) Close()                         {}
290 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
291         if stdin != nil {
292                 _, err = io.Copy(&se.stdin, stdin)
293                 if err != nil {
294                         return nil, []byte(err.Error()), err
295                 }
296         }
297         resp, ok := se.response[cmd]
298         if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
299                 resp, ok = se.response["{deploy}"]
300         }
301         if !ok {
302                 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
303         }
304         return []byte(resp.stdout), []byte(resp.stderr), resp.err
305 }