Merge branch '22316-test-fixes'
[arvados.git] / lib / cloud / cloudtest / tester.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package cloudtest
6
7 import (
8         "crypto/rand"
9         "encoding/json"
10         "errors"
11         "fmt"
12         "time"
13
14         "git.arvados.org/arvados.git/lib/cloud"
15         "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
16         "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
17         "git.arvados.org/arvados.git/sdk/go/arvados"
18         "github.com/sirupsen/logrus"
19         "golang.org/x/crypto/ssh"
20 )
21
22 var (
23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
24 )
25
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
29 type tester struct {
30         Logger              logrus.FieldLogger
31         Tags                cloud.SharedResourceTags
32         TagKeyPrefix        string
33         SetID               cloud.InstanceSetID
34         DestroyExisting     bool
35         ProbeInterval       time.Duration
36         SyncInterval        time.Duration
37         TimeoutBooting      time.Duration
38         Driver              cloud.Driver
39         DriverParameters    json.RawMessage
40         InstanceType        arvados.InstanceType
41         ImageID             cloud.ImageID
42         SSHKey              ssh.Signer
43         SSHPort             string
44         DeployPublicKey     bool
45         BootProbeCommand    string
46         InstanceInitCommand cloud.InitCommand
47         ShellCommand        string
48         PauseBeforeDestroy  func()
49
50         is              cloud.InstanceSet
51         testInstance    *worker.TagVerifier
52         secret          string
53         executor        *sshexecutor.Executor
54         showedLoginInfo bool
55
56         failed bool
57 }
58
59 // Run the test suite once for each applicable permutation of
60 // DriverParameters.  Return true if everything worked.
61 //
62 // Currently this means run once for each configured SubnetID.
63 func (t *tester) Run() bool {
64         var dp map[string]interface{}
65         if len(t.DriverParameters) > 0 {
66                 err := json.Unmarshal(t.DriverParameters, &dp)
67                 if err != nil {
68                         t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
69                         return false
70                 }
71         }
72         subnets, ok := dp["SubnetID"].([]interface{})
73         if !ok || len(subnets) <= 1 {
74                 // Easy, only one SubnetID to test.
75                 return t.runWithDriverParameters(t.DriverParameters)
76         }
77
78         deferredError := false
79         for i, subnet := range subnets {
80                 subnet, ok := subnet.(string)
81                 if !ok {
82                         t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
83                         deferredError = true
84                         continue
85                 }
86                 dp["SubnetID"] = subnet
87                 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
88                 dpjson, err := json.Marshal(dp)
89                 if err != nil {
90                         t.Logger.WithError(err).Error("error encoding driver parameters")
91                         deferredError = true
92                         continue
93                 }
94                 ok = t.runWithDriverParameters(dpjson)
95                 if !ok {
96                         t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
97                         deferredError = true
98                 }
99         }
100         return !deferredError
101 }
102
103 // Run the test suite as specified, clean up as needed, and return
104 // true (everything is OK) or false (something went wrong).
105 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
106         // This flag gets set when we encounter a non-fatal error, so
107         // we can continue doing more tests but remember to return
108         // false (failure) at the end.
109         deferredError := false
110
111         var err error
112         t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
113         if err != nil {
114                 t.Logger.WithError(err).Info("error initializing driver")
115                 return false
116         }
117
118         for {
119                 // Don't send the driver any filters when getting the
120                 // initial instance list. This way we can log an
121                 // instance count (N=...)  that includes all instances
122                 // in this service account, even if they don't have
123                 // the same InstanceSetID.
124                 insts, err := t.getInstances(nil)
125                 if err != nil {
126                         t.Logger.WithError(err).Info("error getting list of instances")
127                         return false
128                 }
129
130                 foundExisting := false
131                 for _, i := range insts {
132                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
133                                 continue
134                         }
135                         lgr := t.Logger.WithFields(logrus.Fields{
136                                 "Instance":      i.ID(),
137                                 "InstanceSetID": t.SetID,
138                         })
139                         foundExisting = true
140                         if t.DestroyExisting {
141                                 lgr.Info("destroying existing instance with our InstanceSetID")
142                                 t0 := time.Now()
143                                 err := i.Destroy()
144                                 lgr := lgr.WithField("Duration", time.Since(t0))
145                                 if err != nil {
146                                         lgr.WithError(err).Error("error destroying existing instance")
147                                 } else {
148                                         lgr.Info("Destroy() call succeeded")
149                                 }
150                         } else {
151                                 lgr.Error("found existing instance with our InstanceSetID")
152                         }
153                 }
154                 if !foundExisting {
155                         break
156                 } else if t.DestroyExisting {
157                         t.sleepSyncInterval()
158                 } else {
159                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
160                         return false
161                 }
162         }
163
164         t.secret = randomHex(40)
165
166         tags := cloud.InstanceTags{}
167         for k, v := range t.Tags {
168                 tags[k] = v
169         }
170         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
171         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
172
173         defer t.destroyTestInstance()
174
175         bootDeadline := time.Now().Add(t.TimeoutBooting)
176         initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
177
178         installPublicKey := t.SSHKey.PublicKey()
179         if !t.DeployPublicKey {
180                 installPublicKey = nil
181         }
182
183         t.Logger.WithFields(logrus.Fields{
184                 "InstanceType":         t.InstanceType.Name,
185                 "ProviderInstanceType": t.InstanceType.ProviderType,
186                 "ImageID":              t.ImageID,
187                 "Tags":                 tags,
188                 "InitCommand":          initCommand,
189                 "DeployPublicKey":      installPublicKey != nil,
190         }).Info("creating instance")
191         t0 := time.Now()
192         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, installPublicKey)
193         lgrC := t.Logger.WithField("Duration", time.Since(t0))
194         if err != nil {
195                 // Create() might have failed due to a bug or network
196                 // error even though the creation was successful, so
197                 // it's safer to wait a bit for an instance to appear.
198                 deferredError = true
199                 lgrC.WithError(err).Error("error creating test instance")
200                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
201                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
202                         if time.Now().After(bootDeadline) {
203                                 t.Logger.Error("timed out")
204                                 return false
205                         }
206                         t.sleepSyncInterval()
207                 }
208                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
209                 t.showLoginInfo()
210         } else {
211                 // Create() succeeded. Make sure the new instance
212                 // appears right away in the Instances() list.
213                 lgrC.WithField("Instance", inst.ID()).Info("created instance")
214                 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
215                 t.showLoginInfo()
216                 err = t.refreshTestInstance()
217                 if err == errTestInstanceNotFound {
218                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
219                         deferredError = true
220                 } else if err != nil {
221                         t.Logger.WithError(err).Error("error getting list of instances")
222                         return false
223                 }
224         }
225
226         if !t.checkTags() {
227                 // checkTags() already logged the errors
228                 deferredError = true
229         }
230
231         if !t.waitForBoot(bootDeadline) {
232                 deferredError = true
233         }
234
235         if t.ShellCommand != "" {
236                 err = t.runShellCommand(t.ShellCommand)
237                 if err != nil {
238                         t.Logger.WithError(err).Error("shell command failed")
239                         deferredError = true
240                 }
241         }
242
243         if fn := t.PauseBeforeDestroy; fn != nil {
244                 t.showLoginInfo()
245                 fn()
246         }
247
248         return !deferredError
249 }
250
251 // If the test instance has an address, log an "ssh user@host" command
252 // line that the operator can paste into another terminal, and set
253 // t.showedLoginInfo.
254 //
255 // If the test instance doesn't have an address yet, do nothing.
256 func (t *tester) showLoginInfo() {
257         t.updateExecutor()
258         host, port := t.executor.TargetHostPort()
259         if host == "" {
260                 return
261         }
262         user := t.testInstance.RemoteUser()
263         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
264         t.showedLoginInfo = true
265 }
266
267 // Get the latest instance list from the driver. If our test instance
268 // is found, assign it to t.testIntance.
269 func (t *tester) refreshTestInstance() error {
270         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
271         if err != nil {
272                 return err
273         }
274         for _, i := range insts {
275                 if t.testInstance == nil {
276                         // Filter by InstanceSetID tag value
277                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
278                                 continue
279                         }
280                 } else {
281                         // Filter by instance ID
282                         if i.ID() != t.testInstance.ID() {
283                                 continue
284                         }
285                 }
286                 t.Logger.WithFields(logrus.Fields{
287                         "Instance": i.ID(),
288                         "Address":  i.Address(),
289                 }).Info("found our instance in returned list")
290                 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
291                 if !t.showedLoginInfo {
292                         t.showLoginInfo()
293                 }
294                 return nil
295         }
296         return errTestInstanceNotFound
297 }
298
299 // Get the list of instances, passing the given tags to the cloud
300 // driver to filter results.
301 //
302 // Return only the instances that have our InstanceSetID tag.
303 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
304         var ret []cloud.Instance
305         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
306         t0 := time.Now()
307         insts, err := t.is.Instances(tags)
308         if err != nil {
309                 return nil, err
310         }
311         t.Logger.WithFields(logrus.Fields{
312                 "Duration": time.Since(t0),
313                 "N":        len(insts),
314         }).Info("got instance list")
315         for _, i := range insts {
316                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
317                         ret = append(ret, i)
318                 }
319         }
320         return ret, nil
321 }
322
323 // Check that t.testInstance has every tag in t.Tags. If not, log an
324 // error and return false.
325 func (t *tester) checkTags() bool {
326         ok := true
327         for k, v := range t.Tags {
328                 if got := t.testInstance.Tags()[k]; got != v {
329                         ok = false
330                         t.Logger.WithFields(logrus.Fields{
331                                 "Key":           k,
332                                 "ExpectedValue": v,
333                                 "GotValue":      got,
334                         }).Error("tag is missing from test instance")
335                 }
336         }
337         if ok {
338                 t.Logger.Info("all expected tags are present")
339         }
340         return ok
341 }
342
343 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
344 // deadline arrives.
345 func (t *tester) waitForBoot(deadline time.Time) bool {
346         for time.Now().Before(deadline) {
347                 err := t.runShellCommand(t.BootProbeCommand)
348                 if err == nil {
349                         return true
350                 }
351                 t.sleepProbeInterval()
352                 t.refreshTestInstance()
353         }
354         t.Logger.Error("timed out")
355         return false
356 }
357
358 // Create t.executor and/or update its target to t.testInstance's
359 // current address.
360 func (t *tester) updateExecutor() {
361         if t.executor == nil {
362                 t.executor = sshexecutor.New(t.testInstance)
363                 t.executor.SetTargetPort(t.SSHPort)
364                 t.executor.SetSigners(t.SSHKey)
365         } else {
366                 t.executor.SetTarget(t.testInstance)
367         }
368 }
369
370 func (t *tester) runShellCommand(cmd string) error {
371         t.updateExecutor()
372         t.Logger.WithFields(logrus.Fields{
373                 "Command": cmd,
374         }).Info("executing remote command")
375         t0 := time.Now()
376         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
377         lgr := t.Logger.WithFields(logrus.Fields{
378                 "Duration": time.Since(t0),
379                 "Command":  cmd,
380                 "stdout":   string(stdout),
381                 "stderr":   string(stderr),
382         })
383         if err != nil {
384                 lgr.WithError(err).Info("remote command failed")
385         } else {
386                 lgr.Info("remote command succeeded")
387         }
388         return err
389 }
390
391 // currently, this tries forever until it can return true (success).
392 func (t *tester) destroyTestInstance() bool {
393         if t.testInstance == nil {
394                 return true
395         }
396         for {
397                 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
398                 lgr.Info("destroying instance")
399                 t0 := time.Now()
400
401                 err := t.testInstance.Destroy()
402                 lgrDur := lgr.WithField("Duration", time.Since(t0))
403                 if err != nil {
404                         lgrDur.WithError(err).Error("error destroying instance")
405                 } else {
406                         lgrDur.Info("destroyed instance")
407                 }
408
409                 err = t.refreshTestInstance()
410                 if err == errTestInstanceNotFound {
411                         lgr.Info("instance no longer appears in list")
412                         t.testInstance = nil
413                         return true
414                 } else if err == nil {
415                         lgr.Info("instance still exists after calling Destroy")
416                         t.sleepSyncInterval()
417                         continue
418                 } else {
419                         t.Logger.WithError(err).Error("error getting list of instances")
420                         continue
421                 }
422         }
423 }
424
425 func (t *tester) sleepSyncInterval() {
426         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
427         time.Sleep(t.SyncInterval)
428 }
429
430 func (t *tester) sleepProbeInterval() {
431         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
432         time.Sleep(t.ProbeInterval)
433 }
434
435 // Return a random string of n hexadecimal digits (n*4 random bits). n
436 // must be even.
437 func randomHex(n int) string {
438         buf := make([]byte, n/2)
439         _, err := rand.Read(buf)
440         if err != nil {
441                 panic(err)
442         }
443         return fmt.Sprintf("%x", buf)
444 }