lib/cloud/cloudtest/tester.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package cloudtest
   6
   7 import (
   8         "crypto/rand"
   9         "encoding/json"
  10         "errors"
  11         "fmt"
  12         "time"
  13
  14         "git.curoverse.com/arvados.git/lib/cloud"
  15         "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
  16         "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
  17         "git.curoverse.com/arvados.git/sdk/go/arvados"
  18         "github.com/sirupsen/logrus"
  19         "golang.org/x/crypto/ssh"
  20 )
  21
  22 var (
  23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
  24 )
  25
  26 // A tester does a sequence of operations to test a cloud driver and
  27 // configuration. Run() should be called only once, after assigning
  28 // suitable values to public fields.
  29 type tester struct {
  30         Logger             logrus.FieldLogger
  31         Tags               cloud.SharedResourceTags
  32         TagKeyPrefix       string
  33         SetID              cloud.InstanceSetID
  34         DestroyExisting    bool
  35         ProbeInterval      time.Duration
  36         SyncInterval       time.Duration
  37         TimeoutBooting     time.Duration
  38         Driver             cloud.Driver
  39         DriverParameters   json.RawMessage
  40         InstanceType       arvados.InstanceType
  41         ImageID            cloud.ImageID
  42         SSHKey             ssh.Signer
  43         SSHPort            string
  44         BootProbeCommand   string
  45         ShellCommand       string
  46         PauseBeforeDestroy func()
  47
  48         is              cloud.InstanceSet
  49         testInstance    *worker.TagVerifier
  50         secret          string
  51         executor        *ssh_executor.Executor
  52         showedLoginInfo bool
  53
  54         failed bool
  55 }
  56
  57 // Run the test suite as specified, clean up as needed, and return
  58 // true (everything is OK) or false (something went wrong).
  59 func (t *tester) Run() bool {
  60         // This flag gets set when we encounter a non-fatal error, so
  61         // we can continue doing more tests but remember to return
  62         // false (failure) at the end.
  63         deferredError := false
  64
  65         var err error
  66         t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
  67         if err != nil {
  68                 t.Logger.WithError(err).Info("error initializing driver")
  69                 return false
  70         }
  71
  72         // Don't send the driver any filters the first time we get the
  73         // instance list. This way we can log an instance count
  74         // (N=...)  that includes all instances in this service
  75         // account, even if they don't have the same InstanceSetID.
  76         insts, err := t.getInstances(nil)
  77         if err != nil {
  78                 t.Logger.WithError(err).Info("error getting initial list of instances")
  79                 return false
  80         }
  81
  82         for {
  83                 foundExisting := false
  84                 for _, i := range insts {
  85                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
  86                                 continue
  87                         }
  88                         lgr := t.Logger.WithFields(logrus.Fields{
  89                                 "Instance":      i.ID(),
  90                                 "InstanceSetID": t.SetID,
  91                         })
  92                         foundExisting = true
  93                         if t.DestroyExisting {
  94                                 lgr.Info("destroying existing instance with our InstanceSetID")
  95                                 err := i.Destroy()
  96                                 if err != nil {
  97                                         lgr.WithError(err).Error("error destroying existing instance")
  98                                 } else {
  99                                         lgr.Info("Destroy() call succeeded")
 100                                 }
 101                         } else {
 102                                 lgr.Error("found existing instance with our InstanceSetID")
 103                         }
 104                 }
 105                 if !foundExisting {
 106                         break
 107                 } else if t.DestroyExisting {
 108                         t.sleepSyncInterval()
 109                 } else {
 110                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
 111                         return false
 112                 }
 113         }
 114
 115         t.secret = randomHex(40)
 116
 117         tags := cloud.InstanceTags{}
 118         for k, v := range t.Tags {
 119                 tags[k] = v
 120         }
 121         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
 122         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
 123
 124         defer t.destroyTestInstance()
 125
 126         bootDeadline := time.Now().Add(t.TimeoutBooting)
 127         initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
 128
 129         t.Logger.WithFields(logrus.Fields{
 130                 "InstanceType":         t.InstanceType.Name,
 131                 "ProviderInstanceType": t.InstanceType.ProviderType,
 132                 "ImageID":              t.ImageID,
 133                 "Tags":                 tags,
 134                 "InitCommand":          initCommand,
 135         }).Info("creating instance")
 136         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
 137         if err != nil {
 138                 // Create() might have failed due to a bug or network
 139                 // error even though the creation was successful, so
 140                 // it's safer to wait a bit for an instance to appear.
 141                 deferredError = true
 142                 t.Logger.WithError(err).Error("error creating test instance")
 143                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
 144                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
 145                         if time.Now().After(bootDeadline) {
 146                                 t.Logger.Error("timed out")
 147                                 return false
 148                         } else {
 149                                 t.sleepSyncInterval()
 150                         }
 151                 }
 152                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
 153                 t.showLoginInfo()
 154         } else {
 155                 // Create() succeeded. Make sure the new instance
 156                 // appears right away in the Instances() list.
 157                 t.Logger.WithField("Instance", inst.ID()).Info("created instance")
 158                 t.testInstance = &worker.TagVerifier{inst, t.secret}
 159                 t.showLoginInfo()
 160                 err = t.refreshTestInstance()
 161                 if err == errTestInstanceNotFound {
 162                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
 163                         deferredError = true
 164                 } else if err != nil {
 165                         t.Logger.WithError(err).Error("error getting list of instances")
 166                         return false
 167                 }
 168         }
 169
 170         if !t.checkTags() {
 171                 // checkTags() already logged the errors
 172                 deferredError = true
 173         }
 174
 175         if !t.waitForBoot(bootDeadline) {
 176                 deferredError = true
 177         }
 178
 179         if t.ShellCommand != "" {
 180                 err = t.runShellCommand(t.ShellCommand)
 181                 if err != nil {
 182                         t.Logger.WithError(err).Error("shell command failed")
 183                         deferredError = true
 184                 }
 185         }
 186
 187         if fn := t.PauseBeforeDestroy; fn != nil {
 188                 t.showLoginInfo()
 189                 fn()
 190         }
 191
 192         return !deferredError
 193 }
 194
 195 // If the test instance has an address, log an "ssh user@host" command
 196 // line that the operator can paste into another terminal, and set
 197 // t.showedLoginInfo.
 198 //
 199 // If the test instance doesn't have an address yet, do nothing.
 200 func (t *tester) showLoginInfo() {
 201         t.updateExecutor()
 202         host, port := t.executor.TargetHostPort()
 203         if host == "" {
 204                 return
 205         }
 206         user := t.testInstance.RemoteUser()
 207         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
 208         t.showedLoginInfo = true
 209 }
 210
 211 // Get the latest instance list from the driver. If our test instance
 212 // is found, assign it to t.testIntance.
 213 func (t *tester) refreshTestInstance() error {
 214         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
 215         if err != nil {
 216                 return err
 217         }
 218         for _, i := range insts {
 219                 if t.testInstance == nil {
 220                         // Filter by InstanceSetID tag value
 221                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 222                                 continue
 223                         }
 224                 } else {
 225                         // Filter by instance ID
 226                         if i.ID() != t.testInstance.ID() {
 227                                 continue
 228                         }
 229                 }
 230                 t.Logger.WithFields(logrus.Fields{
 231                         "Instance": i.ID(),
 232                         "Address":  i.Address(),
 233                 }).Info("found our instance in returned list")
 234                 t.testInstance = &worker.TagVerifier{i, t.secret}
 235                 if !t.showedLoginInfo {
 236                         t.showLoginInfo()
 237                 }
 238                 return nil
 239         }
 240         return errTestInstanceNotFound
 241 }
 242
 243 // Get the list of instances, passing the given tags to the cloud
 244 // driver to filter results.
 245 //
 246 // Return only the instances that have our InstanceSetID tag.
 247 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
 248         var ret []cloud.Instance
 249         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
 250         insts, err := t.is.Instances(tags)
 251         if err != nil {
 252                 return nil, err
 253         }
 254         t.Logger.WithField("N", len(insts)).Info("got instance list")
 255         for _, i := range insts {
 256                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
 257                         ret = append(ret, i)
 258                 }
 259         }
 260         return ret, nil
 261 }
 262
 263 // Check that t.testInstance has every tag in t.Tags. If not, log an
 264 // error and return false.
 265 func (t *tester) checkTags() bool {
 266         ok := true
 267         for k, v := range t.Tags {
 268                 if got := t.testInstance.Tags()[k]; got != v {
 269                         ok = false
 270                         t.Logger.WithFields(logrus.Fields{
 271                                 "Key":           k,
 272                                 "ExpectedValue": v,
 273                                 "GotValue":      got,
 274                         }).Error("tag is missing from test instance")
 275                 }
 276         }
 277         if ok {
 278                 t.Logger.Info("all expected tags are present")
 279         }
 280         return ok
 281 }
 282
 283 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
 284 // deadline arrives.
 285 func (t *tester) waitForBoot(deadline time.Time) bool {
 286         for time.Now().Before(deadline) {
 287                 err := t.runShellCommand(t.BootProbeCommand)
 288                 if err == nil {
 289                         return true
 290                 }
 291                 t.sleepProbeInterval()
 292                 t.refreshTestInstance()
 293         }
 294         t.Logger.Error("timed out")
 295         return false
 296 }
 297
 298 // Create t.executor and/or update its target to t.testInstance's
 299 // current address.
 300 func (t *tester) updateExecutor() {
 301         if t.executor == nil {
 302                 t.executor = ssh_executor.New(t.testInstance)
 303                 t.executor.SetTargetPort(t.SSHPort)
 304                 t.executor.SetSigners(t.SSHKey)
 305         } else {
 306                 t.executor.SetTarget(t.testInstance)
 307         }
 308 }
 309
 310 func (t *tester) runShellCommand(cmd string) error {
 311         t.updateExecutor()
 312         t.Logger.WithFields(logrus.Fields{
 313                 "Command": cmd,
 314         }).Info("executing remote command")
 315         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
 316         lgr := t.Logger.WithFields(logrus.Fields{
 317                 "Command": cmd,
 318                 "stdout":  string(stdout),
 319                 "stderr":  string(stderr),
 320         })
 321         if err != nil {
 322                 lgr.WithError(err).Info("remote command failed")
 323         } else {
 324                 lgr.Info("remote command succeeded")
 325         }
 326         return err
 327 }
 328
 329 // currently, this tries forever until it can return true (success).
 330 func (t *tester) destroyTestInstance() bool {
 331         if t.testInstance == nil {
 332                 return true
 333         }
 334         for {
 335                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
 336                 err := t.testInstance.Destroy()
 337                 if err != nil {
 338                         t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
 339                 } else {
 340                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
 341                 }
 342                 err = t.refreshTestInstance()
 343                 if err == errTestInstanceNotFound {
 344                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
 345                         t.testInstance = nil
 346                         return true
 347                 } else if err == nil {
 348                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
 349                         t.sleepSyncInterval()
 350                         continue
 351                 } else {
 352                         t.Logger.WithError(err).Error("error getting list of instances")
 353                         continue
 354                 }
 355         }
 356 }
 357
 358 func (t *tester) sleepSyncInterval() {
 359         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
 360         time.Sleep(t.SyncInterval)
 361 }
 362
 363 func (t *tester) sleepProbeInterval() {
 364         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
 365         time.Sleep(t.ProbeInterval)
 366 }
 367
 368 // Return a random string of n hexadecimal digits (n*4 random bits). n
 369 // must be even.
 370 func randomHex(n int) string {
 371         buf := make([]byte, n/2)
 372         _, err := rand.Read(buf)
 373         if err != nil {
 374                 panic(err)
 375         }
 376         return fmt.Sprintf("%x", buf)
 377 }