lib/cloud/cloudtest/tester.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package cloudtest
   6
   7 import (
   8         "crypto/rand"
   9         "encoding/json"
  10         "errors"
  11         "fmt"
  12         "time"
  13
  14         "git.arvados.org/arvados.git/lib/cloud"
  15         "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
  16         "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
  17         "git.arvados.org/arvados.git/sdk/go/arvados"
  18         "github.com/sirupsen/logrus"
  19         "golang.org/x/crypto/ssh"
  20 )
  21
  22 var (
  23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
  24 )
  25
  26 // A tester does a sequence of operations to test a cloud driver and
  27 // configuration. Run() should be called only once, after assigning
  28 // suitable values to public fields.
  29 type tester struct {
  30         Logger             logrus.FieldLogger
  31         Tags               cloud.SharedResourceTags
  32         TagKeyPrefix       string
  33         SetID              cloud.InstanceSetID
  34         DestroyExisting    bool
  35         ProbeInterval      time.Duration
  36         SyncInterval       time.Duration
  37         TimeoutBooting     time.Duration
  38         Driver             cloud.Driver
  39         DriverParameters   json.RawMessage
  40         InstanceType       arvados.InstanceType
  41         ImageID            cloud.ImageID
  42         SSHKey             ssh.Signer
  43         SSHPort            string
  44         BootProbeCommand   string
  45         ShellCommand       string
  46         PauseBeforeDestroy func()
  47
  48         is              cloud.InstanceSet
  49         testInstance    *worker.TagVerifier
  50         secret          string
  51         executor        *sshexecutor.Executor
  52         showedLoginInfo bool
  53
  54         failed bool
  55 }
  56
  57 // Run the test suite as specified, clean up as needed, and return
  58 // true (everything is OK) or false (something went wrong).
  59 func (t *tester) Run() bool {
  60         // This flag gets set when we encounter a non-fatal error, so
  61         // we can continue doing more tests but remember to return
  62         // false (failure) at the end.
  63         deferredError := false
  64
  65         var err error
  66         t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
  67         if err != nil {
  68                 t.Logger.WithError(err).Info("error initializing driver")
  69                 return false
  70         }
  71
  72         for {
  73                 // Don't send the driver any filters when getting the
  74                 // initial instance list. This way we can log an
  75                 // instance count (N=...)  that includes all instances
  76                 // in this service account, even if they don't have
  77                 // the same InstanceSetID.
  78                 insts, err := t.getInstances(nil)
  79                 if err != nil {
  80                         t.Logger.WithError(err).Info("error getting list of instances")
  81                         return false
  82                 }
  83
  84                 foundExisting := false
  85                 for _, i := range insts {
  86                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
  87                                 continue
  88                         }
  89                         lgr := t.Logger.WithFields(logrus.Fields{
  90                                 "Instance":      i.ID(),
  91                                 "InstanceSetID": t.SetID,
  92                         })
  93                         foundExisting = true
  94                         if t.DestroyExisting {
  95                                 lgr.Info("destroying existing instance with our InstanceSetID")
  96                                 t0 := time.Now()
  97                                 err := i.Destroy()
  98                                 lgr := lgr.WithField("Duration", time.Since(t0))
  99                                 if err != nil {
 100                                         lgr.WithError(err).Error("error destroying existing instance")
 101                                 } else {
 102                                         lgr.Info("Destroy() call succeeded")
 103                                 }
 104                         } else {
 105                                 lgr.Error("found existing instance with our InstanceSetID")
 106                         }
 107                 }
 108                 if !foundExisting {
 109                         break
 110                 } else if t.DestroyExisting {
 111                         t.sleepSyncInterval()
 112                 } else {
 113                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
 114                         return false
 115                 }
 116         }
 117
 118         t.secret = randomHex(40)
 119
 120         tags := cloud.InstanceTags{}
 121         for k, v := range t.Tags {
 122                 tags[k] = v
 123         }
 124         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
 125         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
 126
 127         defer t.destroyTestInstance()
 128
 129         bootDeadline := time.Now().Add(t.TimeoutBooting)
 130         initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand()
 131
 132         t.Logger.WithFields(logrus.Fields{
 133                 "InstanceType":         t.InstanceType.Name,
 134                 "ProviderInstanceType": t.InstanceType.ProviderType,
 135                 "ImageID":              t.ImageID,
 136                 "Tags":                 tags,
 137                 "InitCommand":          initCommand,
 138         }).Info("creating instance")
 139         t0 := time.Now()
 140         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
 141         lgrC := t.Logger.WithField("Duration", time.Since(t0))
 142         if err != nil {
 143                 // Create() might have failed due to a bug or network
 144                 // error even though the creation was successful, so
 145                 // it's safer to wait a bit for an instance to appear.
 146                 deferredError = true
 147                 lgrC.WithError(err).Error("error creating test instance")
 148                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
 149                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
 150                         if time.Now().After(bootDeadline) {
 151                                 t.Logger.Error("timed out")
 152                                 return false
 153                         }
 154                         t.sleepSyncInterval()
 155                 }
 156                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
 157                 t.showLoginInfo()
 158         } else {
 159                 // Create() succeeded. Make sure the new instance
 160                 // appears right away in the Instances() list.
 161                 lgrC.WithField("Instance", inst.ID()).Info("created instance")
 162                 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
 163                 t.showLoginInfo()
 164                 err = t.refreshTestInstance()
 165                 if err == errTestInstanceNotFound {
 166                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
 167                         deferredError = true
 168                 } else if err != nil {
 169                         t.Logger.WithError(err).Error("error getting list of instances")
 170                         return false
 171                 }
 172         }
 173
 174         if !t.checkTags() {
 175                 // checkTags() already logged the errors
 176                 deferredError = true
 177         }
 178
 179         if !t.waitForBoot(bootDeadline) {
 180                 deferredError = true
 181         }
 182
 183         if t.ShellCommand != "" {
 184                 err = t.runShellCommand(t.ShellCommand)
 185                 if err != nil {
 186                         t.Logger.WithError(err).Error("shell command failed")
 187                         deferredError = true
 188                 }
 189         }
 190
 191         if fn := t.PauseBeforeDestroy; fn != nil {
 192                 t.showLoginInfo()
 193                 fn()
 194         }
 195
 196         return !deferredError
 197 }
 198
 199 // If the test instance has an address, log an "ssh user@host" command
 200 // line that the operator can paste into another terminal, and set
 201 // t.showedLoginInfo.
 202 //
 203 // If the test instance doesn't have an address yet, do nothing.
 204 func (t *tester) showLoginInfo() {
 205         t.updateExecutor()
 206         host, port := t.executor.TargetHostPort()
 207         if host == "" {
 208                 return
 209         }
 210         user := t.testInstance.RemoteUser()
 211         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
 212         t.showedLoginInfo = true
 213 }
 214
 215 // Get the latest instance list from the driver. If our test instance
 216 // is found, assign it to t.testIntance.
 217 func (t *tester) refreshTestInstance() error {
 218         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
 219         if err != nil {
 220                 return err
 221         }
 222         for _, i := range insts {
 223                 if t.testInstance == nil {
 224                         // Filter by InstanceSetID tag value
 225                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 226                                 continue
 227                         }
 228                 } else {
 229                         // Filter by instance ID
 230                         if i.ID() != t.testInstance.ID() {
 231                                 continue
 232                         }
 233                 }
 234                 t.Logger.WithFields(logrus.Fields{
 235                         "Instance": i.ID(),
 236                         "Address":  i.Address(),
 237                 }).Info("found our instance in returned list")
 238                 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
 239                 if !t.showedLoginInfo {
 240                         t.showLoginInfo()
 241                 }
 242                 return nil
 243         }
 244         return errTestInstanceNotFound
 245 }
 246
 247 // Get the list of instances, passing the given tags to the cloud
 248 // driver to filter results.
 249 //
 250 // Return only the instances that have our InstanceSetID tag.
 251 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
 252         var ret []cloud.Instance
 253         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
 254         t0 := time.Now()
 255         insts, err := t.is.Instances(tags)
 256         if err != nil {
 257                 return nil, err
 258         }
 259         t.Logger.WithFields(logrus.Fields{
 260                 "Duration": time.Since(t0),
 261                 "N":        len(insts),
 262         }).Info("got instance list")
 263         for _, i := range insts {
 264                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
 265                         ret = append(ret, i)
 266                 }
 267         }
 268         return ret, nil
 269 }
 270
 271 // Check that t.testInstance has every tag in t.Tags. If not, log an
 272 // error and return false.
 273 func (t *tester) checkTags() bool {
 274         ok := true
 275         for k, v := range t.Tags {
 276                 if got := t.testInstance.Tags()[k]; got != v {
 277                         ok = false
 278                         t.Logger.WithFields(logrus.Fields{
 279                                 "Key":           k,
 280                                 "ExpectedValue": v,
 281                                 "GotValue":      got,
 282                         }).Error("tag is missing from test instance")
 283                 }
 284         }
 285         if ok {
 286                 t.Logger.Info("all expected tags are present")
 287         }
 288         return ok
 289 }
 290
 291 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
 292 // deadline arrives.
 293 func (t *tester) waitForBoot(deadline time.Time) bool {
 294         for time.Now().Before(deadline) {
 295                 err := t.runShellCommand(t.BootProbeCommand)
 296                 if err == nil {
 297                         return true
 298                 }
 299                 t.sleepProbeInterval()
 300                 t.refreshTestInstance()
 301         }
 302         t.Logger.Error("timed out")
 303         return false
 304 }
 305
 306 // Create t.executor and/or update its target to t.testInstance's
 307 // current address.
 308 func (t *tester) updateExecutor() {
 309         if t.executor == nil {
 310                 t.executor = sshexecutor.New(t.testInstance)
 311                 t.executor.SetTargetPort(t.SSHPort)
 312                 t.executor.SetSigners(t.SSHKey)
 313         } else {
 314                 t.executor.SetTarget(t.testInstance)
 315         }
 316 }
 317
 318 func (t *tester) runShellCommand(cmd string) error {
 319         t.updateExecutor()
 320         t.Logger.WithFields(logrus.Fields{
 321                 "Command": cmd,
 322         }).Info("executing remote command")
 323         t0 := time.Now()
 324         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
 325         lgr := t.Logger.WithFields(logrus.Fields{
 326                 "Duration": time.Since(t0),
 327                 "Command":  cmd,
 328                 "stdout":   string(stdout),
 329                 "stderr":   string(stderr),
 330         })
 331         if err != nil {
 332                 lgr.WithError(err).Info("remote command failed")
 333         } else {
 334                 lgr.Info("remote command succeeded")
 335         }
 336         return err
 337 }
 338
 339 // currently, this tries forever until it can return true (success).
 340 func (t *tester) destroyTestInstance() bool {
 341         if t.testInstance == nil {
 342                 return true
 343         }
 344         for {
 345                 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
 346                 lgr.Info("destroying instance")
 347                 t0 := time.Now()
 348
 349                 err := t.testInstance.Destroy()
 350                 lgrDur := lgr.WithField("Duration", time.Since(t0))
 351                 if err != nil {
 352                         lgrDur.WithError(err).Error("error destroying instance")
 353                 } else {
 354                         lgrDur.Info("destroyed instance")
 355                 }
 356
 357                 err = t.refreshTestInstance()
 358                 if err == errTestInstanceNotFound {
 359                         lgr.Info("instance no longer appears in list")
 360                         t.testInstance = nil
 361                         return true
 362                 } else if err == nil {
 363                         lgr.Info("instance still exists after calling Destroy")
 364                         t.sleepSyncInterval()
 365                         continue
 366                 } else {
 367                         t.Logger.WithError(err).Error("error getting list of instances")
 368                         continue
 369                 }
 370         }
 371 }
 372
 373 func (t *tester) sleepSyncInterval() {
 374         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
 375         time.Sleep(t.SyncInterval)
 376 }
 377
 378 func (t *tester) sleepProbeInterval() {
 379         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
 380         time.Sleep(t.ProbeInterval)
 381 }
 382
 383 // Return a random string of n hexadecimal digits (n*4 random bits). n
 384 // must be even.
 385 func randomHex(n int) string {
 386         buf := make([]byte, n/2)
 387         _, err := rand.Read(buf)
 388         if err != nil {
 389                 panic(err)
 390         }
 391         return fmt.Sprintf("%x", buf)
 392 }