Merge branch '20688-wb1-to-wb2-redirects' refs #20688
[arvados.git] / lib / cloud / cloudtest / tester.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package cloudtest
6
7 import (
8         "crypto/rand"
9         "encoding/json"
10         "errors"
11         "fmt"
12         "time"
13
14         "git.arvados.org/arvados.git/lib/cloud"
15         "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
16         "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
17         "git.arvados.org/arvados.git/sdk/go/arvados"
18         "github.com/sirupsen/logrus"
19         "golang.org/x/crypto/ssh"
20 )
21
22 var (
23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
24 )
25
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
29 type tester struct {
30         Logger              logrus.FieldLogger
31         Tags                cloud.SharedResourceTags
32         TagKeyPrefix        string
33         SetID               cloud.InstanceSetID
34         DestroyExisting     bool
35         ProbeInterval       time.Duration
36         SyncInterval        time.Duration
37         TimeoutBooting      time.Duration
38         Driver              cloud.Driver
39         DriverParameters    json.RawMessage
40         InstanceType        arvados.InstanceType
41         ImageID             cloud.ImageID
42         SSHKey              ssh.Signer
43         SSHPort             string
44         BootProbeCommand    string
45         InstanceInitCommand cloud.InitCommand
46         ShellCommand        string
47         PauseBeforeDestroy  func()
48
49         is              cloud.InstanceSet
50         testInstance    *worker.TagVerifier
51         secret          string
52         executor        *sshexecutor.Executor
53         showedLoginInfo bool
54
55         failed bool
56 }
57
58 // Run the test suite once for each applicable permutation of
59 // DriverParameters.  Return true if everything worked.
60 //
61 // Currently this means run once for each configured SubnetID.
62 func (t *tester) Run() bool {
63         var dp map[string]interface{}
64         if len(t.DriverParameters) > 0 {
65                 err := json.Unmarshal(t.DriverParameters, &dp)
66                 if err != nil {
67                         t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
68                         return false
69                 }
70         }
71         subnets, ok := dp["SubnetID"].([]interface{})
72         if !ok || len(subnets) <= 1 {
73                 // Easy, only one SubnetID to test.
74                 return t.runWithDriverParameters(t.DriverParameters)
75         }
76
77         deferredError := false
78         for i, subnet := range subnets {
79                 subnet, ok := subnet.(string)
80                 if !ok {
81                         t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
82                         deferredError = true
83                         continue
84                 }
85                 dp["SubnetID"] = subnet
86                 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
87                 dpjson, err := json.Marshal(dp)
88                 if err != nil {
89                         t.Logger.WithError(err).Error("error encoding driver parameters")
90                         deferredError = true
91                         continue
92                 }
93                 ok = t.runWithDriverParameters(dpjson)
94                 if !ok {
95                         t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
96                         deferredError = true
97                 }
98         }
99         return !deferredError
100 }
101
102 // Run the test suite as specified, clean up as needed, and return
103 // true (everything is OK) or false (something went wrong).
104 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
105         // This flag gets set when we encounter a non-fatal error, so
106         // we can continue doing more tests but remember to return
107         // false (failure) at the end.
108         deferredError := false
109
110         var err error
111         t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
112         if err != nil {
113                 t.Logger.WithError(err).Info("error initializing driver")
114                 return false
115         }
116
117         for {
118                 // Don't send the driver any filters when getting the
119                 // initial instance list. This way we can log an
120                 // instance count (N=...)  that includes all instances
121                 // in this service account, even if they don't have
122                 // the same InstanceSetID.
123                 insts, err := t.getInstances(nil)
124                 if err != nil {
125                         t.Logger.WithError(err).Info("error getting list of instances")
126                         return false
127                 }
128
129                 foundExisting := false
130                 for _, i := range insts {
131                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
132                                 continue
133                         }
134                         lgr := t.Logger.WithFields(logrus.Fields{
135                                 "Instance":      i.ID(),
136                                 "InstanceSetID": t.SetID,
137                         })
138                         foundExisting = true
139                         if t.DestroyExisting {
140                                 lgr.Info("destroying existing instance with our InstanceSetID")
141                                 t0 := time.Now()
142                                 err := i.Destroy()
143                                 lgr := lgr.WithField("Duration", time.Since(t0))
144                                 if err != nil {
145                                         lgr.WithError(err).Error("error destroying existing instance")
146                                 } else {
147                                         lgr.Info("Destroy() call succeeded")
148                                 }
149                         } else {
150                                 lgr.Error("found existing instance with our InstanceSetID")
151                         }
152                 }
153                 if !foundExisting {
154                         break
155                 } else if t.DestroyExisting {
156                         t.sleepSyncInterval()
157                 } else {
158                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
159                         return false
160                 }
161         }
162
163         t.secret = randomHex(40)
164
165         tags := cloud.InstanceTags{}
166         for k, v := range t.Tags {
167                 tags[k] = v
168         }
169         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
170         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
171
172         defer t.destroyTestInstance()
173
174         bootDeadline := time.Now().Add(t.TimeoutBooting)
175         initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
176
177         t.Logger.WithFields(logrus.Fields{
178                 "InstanceType":         t.InstanceType.Name,
179                 "ProviderInstanceType": t.InstanceType.ProviderType,
180                 "ImageID":              t.ImageID,
181                 "Tags":                 tags,
182                 "InitCommand":          initCommand,
183         }).Info("creating instance")
184         t0 := time.Now()
185         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
186         lgrC := t.Logger.WithField("Duration", time.Since(t0))
187         if err != nil {
188                 // Create() might have failed due to a bug or network
189                 // error even though the creation was successful, so
190                 // it's safer to wait a bit for an instance to appear.
191                 deferredError = true
192                 lgrC.WithError(err).Error("error creating test instance")
193                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
194                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
195                         if time.Now().After(bootDeadline) {
196                                 t.Logger.Error("timed out")
197                                 return false
198                         }
199                         t.sleepSyncInterval()
200                 }
201                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
202                 t.showLoginInfo()
203         } else {
204                 // Create() succeeded. Make sure the new instance
205                 // appears right away in the Instances() list.
206                 lgrC.WithField("Instance", inst.ID()).Info("created instance")
207                 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
208                 t.showLoginInfo()
209                 err = t.refreshTestInstance()
210                 if err == errTestInstanceNotFound {
211                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
212                         deferredError = true
213                 } else if err != nil {
214                         t.Logger.WithError(err).Error("error getting list of instances")
215                         return false
216                 }
217         }
218
219         if !t.checkTags() {
220                 // checkTags() already logged the errors
221                 deferredError = true
222         }
223
224         if !t.waitForBoot(bootDeadline) {
225                 deferredError = true
226         }
227
228         if t.ShellCommand != "" {
229                 err = t.runShellCommand(t.ShellCommand)
230                 if err != nil {
231                         t.Logger.WithError(err).Error("shell command failed")
232                         deferredError = true
233                 }
234         }
235
236         if fn := t.PauseBeforeDestroy; fn != nil {
237                 t.showLoginInfo()
238                 fn()
239         }
240
241         return !deferredError
242 }
243
244 // If the test instance has an address, log an "ssh user@host" command
245 // line that the operator can paste into another terminal, and set
246 // t.showedLoginInfo.
247 //
248 // If the test instance doesn't have an address yet, do nothing.
249 func (t *tester) showLoginInfo() {
250         t.updateExecutor()
251         host, port := t.executor.TargetHostPort()
252         if host == "" {
253                 return
254         }
255         user := t.testInstance.RemoteUser()
256         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
257         t.showedLoginInfo = true
258 }
259
260 // Get the latest instance list from the driver. If our test instance
261 // is found, assign it to t.testIntance.
262 func (t *tester) refreshTestInstance() error {
263         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
264         if err != nil {
265                 return err
266         }
267         for _, i := range insts {
268                 if t.testInstance == nil {
269                         // Filter by InstanceSetID tag value
270                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
271                                 continue
272                         }
273                 } else {
274                         // Filter by instance ID
275                         if i.ID() != t.testInstance.ID() {
276                                 continue
277                         }
278                 }
279                 t.Logger.WithFields(logrus.Fields{
280                         "Instance": i.ID(),
281                         "Address":  i.Address(),
282                 }).Info("found our instance in returned list")
283                 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
284                 if !t.showedLoginInfo {
285                         t.showLoginInfo()
286                 }
287                 return nil
288         }
289         return errTestInstanceNotFound
290 }
291
292 // Get the list of instances, passing the given tags to the cloud
293 // driver to filter results.
294 //
295 // Return only the instances that have our InstanceSetID tag.
296 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
297         var ret []cloud.Instance
298         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
299         t0 := time.Now()
300         insts, err := t.is.Instances(tags)
301         if err != nil {
302                 return nil, err
303         }
304         t.Logger.WithFields(logrus.Fields{
305                 "Duration": time.Since(t0),
306                 "N":        len(insts),
307         }).Info("got instance list")
308         for _, i := range insts {
309                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
310                         ret = append(ret, i)
311                 }
312         }
313         return ret, nil
314 }
315
316 // Check that t.testInstance has every tag in t.Tags. If not, log an
317 // error and return false.
318 func (t *tester) checkTags() bool {
319         ok := true
320         for k, v := range t.Tags {
321                 if got := t.testInstance.Tags()[k]; got != v {
322                         ok = false
323                         t.Logger.WithFields(logrus.Fields{
324                                 "Key":           k,
325                                 "ExpectedValue": v,
326                                 "GotValue":      got,
327                         }).Error("tag is missing from test instance")
328                 }
329         }
330         if ok {
331                 t.Logger.Info("all expected tags are present")
332         }
333         return ok
334 }
335
336 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
337 // deadline arrives.
338 func (t *tester) waitForBoot(deadline time.Time) bool {
339         for time.Now().Before(deadline) {
340                 err := t.runShellCommand(t.BootProbeCommand)
341                 if err == nil {
342                         return true
343                 }
344                 t.sleepProbeInterval()
345                 t.refreshTestInstance()
346         }
347         t.Logger.Error("timed out")
348         return false
349 }
350
351 // Create t.executor and/or update its target to t.testInstance's
352 // current address.
353 func (t *tester) updateExecutor() {
354         if t.executor == nil {
355                 t.executor = sshexecutor.New(t.testInstance)
356                 t.executor.SetTargetPort(t.SSHPort)
357                 t.executor.SetSigners(t.SSHKey)
358         } else {
359                 t.executor.SetTarget(t.testInstance)
360         }
361 }
362
363 func (t *tester) runShellCommand(cmd string) error {
364         t.updateExecutor()
365         t.Logger.WithFields(logrus.Fields{
366                 "Command": cmd,
367         }).Info("executing remote command")
368         t0 := time.Now()
369         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
370         lgr := t.Logger.WithFields(logrus.Fields{
371                 "Duration": time.Since(t0),
372                 "Command":  cmd,
373                 "stdout":   string(stdout),
374                 "stderr":   string(stderr),
375         })
376         if err != nil {
377                 lgr.WithError(err).Info("remote command failed")
378         } else {
379                 lgr.Info("remote command succeeded")
380         }
381         return err
382 }
383
384 // currently, this tries forever until it can return true (success).
385 func (t *tester) destroyTestInstance() bool {
386         if t.testInstance == nil {
387                 return true
388         }
389         for {
390                 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
391                 lgr.Info("destroying instance")
392                 t0 := time.Now()
393
394                 err := t.testInstance.Destroy()
395                 lgrDur := lgr.WithField("Duration", time.Since(t0))
396                 if err != nil {
397                         lgrDur.WithError(err).Error("error destroying instance")
398                 } else {
399                         lgrDur.Info("destroyed instance")
400                 }
401
402                 err = t.refreshTestInstance()
403                 if err == errTestInstanceNotFound {
404                         lgr.Info("instance no longer appears in list")
405                         t.testInstance = nil
406                         return true
407                 } else if err == nil {
408                         lgr.Info("instance still exists after calling Destroy")
409                         t.sleepSyncInterval()
410                         continue
411                 } else {
412                         t.Logger.WithError(err).Error("error getting list of instances")
413                         continue
414                 }
415         }
416 }
417
418 func (t *tester) sleepSyncInterval() {
419         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
420         time.Sleep(t.SyncInterval)
421 }
422
423 func (t *tester) sleepProbeInterval() {
424         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
425         time.Sleep(t.ProbeInterval)
426 }
427
428 // Return a random string of n hexadecimal digits (n*4 random bits). n
429 // must be even.
430 func randomHex(n int) string {
431         buf := make([]byte, n/2)
432         _, err := rand.Read(buf)
433         if err != nil {
434                 panic(err)
435         }
436         return fmt.Sprintf("%x", buf)
437 }