17384: Respect CrunchRunCommand and CrunchRunArgumentsList in a-d-c.
[arvados.git] / lib / dispatchcloud / worker / worker_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package worker
6
7 import (
8         "bytes"
9         "crypto/md5"
10         "errors"
11         "fmt"
12         "io"
13         "strings"
14         "time"
15
16         "git.arvados.org/arvados.git/lib/cloud"
17         "git.arvados.org/arvados.git/lib/dispatchcloud/test"
18         "git.arvados.org/arvados.git/sdk/go/arvados"
19         "git.arvados.org/arvados.git/sdk/go/ctxlog"
20         "github.com/prometheus/client_golang/prometheus"
21         check "gopkg.in/check.v1"
22 )
23
24 var _ = check.Suite(&WorkerSuite{})
25
26 type WorkerSuite struct{}
27
28 func (suite *WorkerSuite) TestProbeAndUpdate(c *check.C) {
29         logger := ctxlog.TestLogger(c)
30         bootTimeout := time.Minute
31         probeTimeout := time.Second
32
33         ac := arvados.NewClientFromEnv()
34         is, err := (&test.StubDriver{}).InstanceSet(nil, "test-instance-set-id", nil, logger)
35         c.Assert(err, check.IsNil)
36         inst, err := is.Create(arvados.InstanceType{}, "", nil, "echo InitCommand", nil)
37         c.Assert(err, check.IsNil)
38
39         type trialT struct {
40                 testCaseComment string // displayed in test output to help identify failure case
41                 age             time.Duration
42                 state           State
43                 running         int
44                 starting        int
45                 respBoot        stubResp // zero value is success
46                 respDeploy      stubResp // zero value is success
47                 respRun         stubResp // zero value is success + nothing running
48                 respRunDeployed stubResp
49                 deployRunner    []byte
50                 expectStdin     []byte
51                 expectState     State
52                 expectRunning   int
53         }
54
55         errFail := errors.New("failed")
56         respFail := stubResp{"", "command failed\n", errFail}
57         respContainerRunning := stubResp{"zzzzz-dz642-abcdefghijklmno\n", "", nil}
58         for idx, trial := range []trialT{
59                 {
60                         testCaseComment: "Unknown, probes fail",
61                         state:           StateUnknown,
62                         respBoot:        respFail,
63                         respRun:         respFail,
64                         expectState:     StateUnknown,
65                 },
66                 {
67                         testCaseComment: "Unknown, boot probe fails, but one container is running",
68                         state:           StateUnknown,
69                         respBoot:        respFail,
70                         respRun:         respContainerRunning,
71                         expectState:     StateUnknown,
72                         expectRunning:   1,
73                 },
74                 {
75                         testCaseComment: "Unknown, boot probe fails, previously running container has exited",
76                         state:           StateUnknown,
77                         running:         1,
78                         respBoot:        respFail,
79                         expectState:     StateUnknown,
80                         expectRunning:   0,
81                 },
82                 {
83                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails",
84                         state:           StateUnknown,
85                         age:             bootTimeout + time.Second,
86                         respBoot:        respFail,
87                         respRun:         respFail,
88                         expectState:     StateShutdown,
89                 },
90                 {
91                         testCaseComment: "Unknown, boot timeout exceeded, boot probe succeeds but crunch-run fails",
92                         state:           StateUnknown,
93                         age:             bootTimeout * 2,
94                         respRun:         respFail,
95                         expectState:     StateShutdown,
96                 },
97                 {
98                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but crunch-run succeeds",
99                         state:           StateUnknown,
100                         age:             bootTimeout * 2,
101                         respBoot:        respFail,
102                         expectState:     StateShutdown,
103                 },
104                 {
105                         testCaseComment: "Unknown, boot timeout exceeded, boot probe fails but container is running",
106                         state:           StateUnknown,
107                         age:             bootTimeout * 2,
108                         respBoot:        respFail,
109                         respRun:         respContainerRunning,
110                         expectState:     StateUnknown,
111                         expectRunning:   1,
112                 },
113                 {
114                         testCaseComment: "Booting, boot probe fails, run probe fails",
115                         state:           StateBooting,
116                         respBoot:        respFail,
117                         respRun:         respFail,
118                         expectState:     StateBooting,
119                 },
120                 {
121                         testCaseComment: "Booting, boot probe fails, run probe succeeds (but isn't expected to be called)",
122                         state:           StateBooting,
123                         respBoot:        respFail,
124                         expectState:     StateBooting,
125                 },
126                 {
127                         testCaseComment: "Booting, boot probe succeeds, run probe fails",
128                         state:           StateBooting,
129                         respRun:         respFail,
130                         expectState:     StateBooting,
131                 },
132                 {
133                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds",
134                         state:           StateBooting,
135                         expectState:     StateIdle,
136                 },
137                 {
138                         testCaseComment: "Booting, boot probe succeeds, run probe succeeds, container is running",
139                         state:           StateBooting,
140                         respRun:         respContainerRunning,
141                         expectState:     StateRunning,
142                         expectRunning:   1,
143                 },
144                 {
145                         testCaseComment: "Booting, boot timeout exceeded",
146                         state:           StateBooting,
147                         age:             bootTimeout * 2,
148                         respRun:         respFail,
149                         expectState:     StateShutdown,
150                 },
151                 {
152                         testCaseComment: "Idle, probe timeout exceeded, one container running",
153                         state:           StateIdle,
154                         age:             probeTimeout * 2,
155                         respRun:         respContainerRunning,
156                         expectState:     StateRunning,
157                         expectRunning:   1,
158                 },
159                 {
160                         testCaseComment: "Idle, probe timeout exceeded, one container running, probe fails",
161                         state:           StateIdle,
162                         age:             probeTimeout * 2,
163                         running:         1,
164                         respRun:         respFail,
165                         expectState:     StateShutdown,
166                         expectRunning:   1,
167                 },
168                 {
169                         testCaseComment: "Idle, probe timeout exceeded, nothing running, probe fails",
170                         state:           StateIdle,
171                         age:             probeTimeout * 2,
172                         respRun:         respFail,
173                         expectState:     StateShutdown,
174                 },
175                 {
176                         testCaseComment: "Running, one container still running",
177                         state:           StateRunning,
178                         running:         1,
179                         respRun:         respContainerRunning,
180                         expectState:     StateRunning,
181                         expectRunning:   1,
182                 },
183                 {
184                         testCaseComment: "Running, container has exited",
185                         state:           StateRunning,
186                         running:         1,
187                         expectState:     StateIdle,
188                         expectRunning:   0,
189                 },
190                 {
191                         testCaseComment: "Running, probe timeout exceeded, nothing running, new container being started",
192                         state:           StateRunning,
193                         age:             probeTimeout * 2,
194                         starting:        1,
195                         expectState:     StateRunning,
196                 },
197                 {
198                         testCaseComment: "Booting, boot probe succeeds, deployRunner succeeds, run probe succeeds",
199                         state:           StateBooting,
200                         deployRunner:    []byte("ELF"),
201                         expectStdin:     []byte("ELF"),
202                         respRun:         respFail,
203                         respRunDeployed: respContainerRunning,
204                         expectRunning:   1,
205                         expectState:     StateRunning,
206                 },
207                 {
208                         testCaseComment: "Booting, boot probe succeeds, deployRunner fails",
209                         state:           StateBooting,
210                         deployRunner:    []byte("ELF"),
211                         respDeploy:      respFail,
212                         expectStdin:     []byte("ELF"),
213                         expectState:     StateBooting,
214                 },
215                 {
216                         testCaseComment: "Booting, boot probe succeeds, deployRunner skipped, run probe succeeds",
217                         state:           StateBooting,
218                         deployRunner:    nil,
219                         respDeploy:      respFail,
220                         expectState:     StateIdle,
221                 },
222         } {
223                 c.Logf("------- trial %d: %#v", idx, trial)
224                 ctime := time.Now().Add(-trial.age)
225                 exr := &stubExecutor{
226                         response: map[string]stubResp{
227                                 "bootprobe":         trial.respBoot,
228                                 "crunch-run --list": trial.respRun,
229                                 "{deploy}":          trial.respDeploy,
230                         },
231                 }
232                 wp := &Pool{
233                         arvClient:        ac,
234                         newExecutor:      func(cloud.Instance) Executor { return exr },
235                         bootProbeCommand: "bootprobe",
236                         timeoutBooting:   bootTimeout,
237                         timeoutProbe:     probeTimeout,
238                         exited:           map[string]time.Time{},
239                         runnerCmdDefault: "crunch-run",
240                         runnerArgs:       []string{"--args=not used with --list"},
241                         runnerCmd:        "crunch-run",
242                         runnerData:       trial.deployRunner,
243                         runnerMD5:        md5.Sum(trial.deployRunner),
244                 }
245                 wp.registerMetrics(prometheus.NewRegistry())
246                 if trial.deployRunner != nil {
247                         svHash := md5.Sum(trial.deployRunner)
248                         wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", svHash)
249                         exr.response[wp.runnerCmd+" --list"] = trial.respRunDeployed
250                 }
251                 wkr := &worker{
252                         logger:   logger,
253                         executor: exr,
254                         wp:       wp,
255                         mtx:      &wp.mtx,
256                         state:    trial.state,
257                         instance: inst,
258                         appeared: ctime,
259                         busy:     ctime,
260                         probed:   ctime,
261                         updated:  ctime,
262                         running:  map[string]*remoteRunner{},
263                         starting: map[string]*remoteRunner{},
264                         probing:  make(chan struct{}, 1),
265                 }
266                 if trial.running > 0 {
267                         uuid := "zzzzz-dz642-abcdefghijklmno"
268                         wkr.running = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
269                 }
270                 if trial.starting > 0 {
271                         uuid := "zzzzz-dz642-bcdefghijklmnop"
272                         wkr.starting = map[string]*remoteRunner{uuid: newRemoteRunner(uuid, wkr)}
273                 }
274                 wkr.probeAndUpdate()
275                 c.Check(wkr.state, check.Equals, trial.expectState)
276                 c.Check(len(wkr.running), check.Equals, trial.expectRunning)
277                 c.Check(exr.stdin.String(), check.Equals, string(trial.expectStdin))
278         }
279 }
280
// stubResp is one canned command result for stubExecutor: the text to
// hand back as stdout and stderr, and the error to return. The field
// order matters because tests build values with positional literals.
type stubResp struct {
	stdout, stderr string
	err            error
}
286
287 type stubExecutor struct {
288         response map[string]stubResp
289         stdin    bytes.Buffer
290 }
291
292 func (se *stubExecutor) SetTarget(cloud.ExecutorTarget) {}
293 func (se *stubExecutor) Close()                         {}
294 func (se *stubExecutor) Execute(env map[string]string, cmd string, stdin io.Reader) (stdout, stderr []byte, err error) {
295         if stdin != nil {
296                 _, err = io.Copy(&se.stdin, stdin)
297                 if err != nil {
298                         return nil, []byte(err.Error()), err
299                 }
300         }
301         resp, ok := se.response[cmd]
302         if !ok && strings.Contains(cmd, `; cat >"$dstfile"`) {
303                 resp, ok = se.response["{deploy}"]
304         }
305         if !ok {
306                 return nil, []byte(fmt.Sprintf("%s: command not found\n", cmd)), errors.New("command not found")
307         }
308         return []byte(resp.stdout), []byte(resp.stderr), resp.err
309 }