lib/cloud/cloudtest/tester.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package cloudtest
   6
   7 import (
   8         "crypto/rand"
   9         "encoding/json"
  10         "errors"
  11         "fmt"
  12         "time"
  13
  14         "git.curoverse.com/arvados.git/lib/cloud"
  15         "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
  16         "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
  17         "git.curoverse.com/arvados.git/sdk/go/arvados"
  18         "github.com/sirupsen/logrus"
  19         "golang.org/x/crypto/ssh"
  20 )
  21
  22 var (
  23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
  24 )
  25
  26 // A tester does a sequence of operations to test a cloud driver and
  27 // configuration. Run() should be called only once, after assigning
  28 // suitable values to public fields.
  29 type tester struct {
  30         Logger             logrus.FieldLogger
  31         Tags               cloud.SharedResourceTags
  32         TagKeyPrefix       string
  33         SetID              cloud.InstanceSetID
  34         DestroyExisting    bool
  35         ProbeInterval      time.Duration
  36         SyncInterval       time.Duration
  37         TimeoutBooting     time.Duration
  38         Driver             cloud.Driver
  39         DriverParameters   json.RawMessage
  40         InstanceType       arvados.InstanceType
  41         ImageID            cloud.ImageID
  42         SSHKey             ssh.Signer
  43         SSHPort            string
  44         BootProbeCommand   string
  45         ShellCommand       string
  46         PauseBeforeDestroy func()
  47
  48         is              cloud.InstanceSet
  49         testInstance    cloud.Instance
  50         secret          string
  51         executor        *ssh_executor.Executor
  52         showedLoginInfo bool
  53
  54         failed bool
  55 }
  56
  57 // Run the test suite as specified, clean up as needed, and return
  58 // true (everything is OK) or false (something went wrong).
  59 func (t *tester) Run() bool {
  60         // This flag gets set when we encounter a non-fatal error, so
  61         // we can continue doing more tests but remember to return
  62         // false (failure) at the end.
  63         deferredError := false
  64
  65         var err error
  66         t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
  67         if err != nil {
  68                 t.Logger.WithError(err).Info("error initializing driver")
  69                 return false
  70         }
  71         insts, err := t.getInstances(nil)
  72         if err != nil {
  73                 t.Logger.WithError(err).Info("error getting initial list of instances")
  74                 return false
  75         }
  76         for {
  77                 foundExisting := false
  78                 for _, i := range insts {
  79                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
  80                                 continue
  81                         }
  82                         lgr := t.Logger.WithFields(logrus.Fields{
  83                                 "Instance":      i.ID(),
  84                                 "InstanceSetID": t.SetID,
  85                         })
  86                         foundExisting = true
  87                         if t.DestroyExisting {
  88                                 lgr.Info("destroying existing instance with our InstanceSetID")
  89                                 err := i.Destroy()
  90                                 if err != nil {
  91                                         lgr.WithError(err).Error("error destroying existing instance")
  92                                 } else {
  93                                         lgr.Info("Destroy() call succeeded")
  94                                 }
  95                         } else {
  96                                 lgr.Error("found existing instance with our InstanceSetID")
  97                         }
  98                 }
  99                 if !foundExisting {
 100                         break
 101                 } else if t.DestroyExisting {
 102                         t.sleepSyncInterval()
 103                 } else {
 104                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
 105                         return false
 106                 }
 107         }
 108
 109         t.secret = randomHex(40)
 110
 111         tags := cloud.InstanceTags{}
 112         for k, v := range t.Tags {
 113                 tags[k] = v
 114         }
 115         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
 116         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
 117
 118         defer t.destroyTestInstance()
 119
 120         bootDeadline := time.Now().Add(t.TimeoutBooting)
 121         initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
 122
 123         t.Logger.WithFields(logrus.Fields{
 124                 "InstanceType":         t.InstanceType.Name,
 125                 "ProviderInstanceType": t.InstanceType.ProviderType,
 126                 "ImageID":              t.ImageID,
 127                 "Tags":                 tags,
 128                 "InitCommand":          initCommand,
 129         }).Info("creating instance")
 130         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
 131         if err != nil {
 132                 deferredError = true
 133                 t.Logger.WithError(err).Error("error creating test instance")
 134                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
 135                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
 136                         if time.Now().After(bootDeadline) {
 137                                 t.Logger.Error("timed out")
 138                                 return false
 139                         } else {
 140                                 t.sleepSyncInterval()
 141                         }
 142                 }
 143                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
 144                 t.showLoginInfo()
 145         } else {
 146                 t.Logger.WithField("Instance", inst.ID()).Info("created instance")
 147                 t.testInstance = inst
 148                 t.showLoginInfo()
 149                 err = t.refreshTestInstance()
 150                 if err == errTestInstanceNotFound {
 151                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
 152                         deferredError = true
 153                 } else if err != nil {
 154                         t.Logger.WithError(err).Error("error getting list of instances")
 155                         return false
 156                 }
 157         }
 158         t.testInstance = worker.TagVerifier{t.testInstance, t.secret}
 159
 160         if !t.checkTags() {
 161                 // checkTags() already logged the errors
 162                 return false
 163         }
 164
 165         if !t.waitForBoot(bootDeadline) {
 166                 return false
 167         }
 168
 169         if t.ShellCommand != "" {
 170                 err = t.runShellCommand(t.ShellCommand)
 171                 if err != nil {
 172                         t.Logger.WithError(err).Error("shell command failed")
 173                         deferredError = true
 174                 }
 175         }
 176
 177         if fn := t.PauseBeforeDestroy; fn != nil {
 178                 t.showLoginInfo()
 179                 fn()
 180         }
 181
 182         return !deferredError
 183 }
 184
 185 func (t *tester) showLoginInfo() {
 186         t.updateExecutor()
 187         host, port := t.executor.TargetHostPort()
 188         if host == "" {
 189                 return
 190         }
 191         user := t.testInstance.RemoteUser()
 192         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
 193         t.showedLoginInfo = true
 194 }
 195
 196 // Get the latest instance list from the driver. If our test instance
 197 // is found, assign it to t.testIntance.
 198 func (t *tester) refreshTestInstance() error {
 199         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
 200         if err != nil {
 201                 return err
 202         }
 203         for _, i := range insts {
 204                 if t.testInstance == nil {
 205                         // Filter by InstanceSetID tag value
 206                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 207                                 continue
 208                         }
 209                 } else {
 210                         // Filter by instance ID
 211                         if i.ID() != t.testInstance.ID() {
 212                                 continue
 213                         }
 214                 }
 215                 t.Logger.WithFields(logrus.Fields{
 216                         "Instance": i.ID(),
 217                         "Address":  i.Address(),
 218                 }).Info("found our instance in returned list")
 219                 t.testInstance = worker.TagVerifier{i, t.secret}
 220                 if !t.showedLoginInfo {
 221                         t.showLoginInfo()
 222                 }
 223                 return nil
 224         }
 225         return errTestInstanceNotFound
 226 }
 227
 228 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
 229         var ret []cloud.Instance
 230         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
 231         insts, err := t.is.Instances(tags)
 232         if err != nil {
 233                 return nil, err
 234         }
 235         t.Logger.WithField("N", len(insts)).Info("got instance list")
 236         for _, i := range insts {
 237                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
 238                         ret = append(ret, i)
 239                 }
 240         }
 241         return ret, nil
 242 }
 243
 244 func (t *tester) checkTags() bool {
 245         ok := true
 246         for k, v := range t.Tags {
 247                 if got := t.testInstance.Tags()[k]; got != v {
 248                         ok = false
 249                         t.Logger.WithFields(logrus.Fields{
 250                                 "Key":           k,
 251                                 "ExpectedValue": v,
 252                                 "GotValue":      got,
 253                         }).Error("tag is missing from test instance")
 254                 }
 255         }
 256         if ok {
 257                 t.Logger.Info("all expected tags are present")
 258         }
 259         return ok
 260 }
 261
 262 func (t *tester) waitForBoot(deadline time.Time) bool {
 263         for time.Now().Before(deadline) {
 264                 err := t.runShellCommand(t.BootProbeCommand)
 265                 if err == nil {
 266                         return true
 267                 }
 268                 t.sleepProbeInterval()
 269                 t.refreshTestInstance()
 270         }
 271         t.Logger.Error("timed out")
 272         return false
 273 }
 274
 275 func (t *tester) updateExecutor() {
 276         if t.executor == nil {
 277                 t.executor = ssh_executor.New(t.testInstance)
 278                 t.executor.SetTargetPort(t.SSHPort)
 279                 t.executor.SetSigners(t.SSHKey)
 280         } else {
 281                 t.executor.SetTarget(t.testInstance)
 282         }
 283 }
 284
 285 func (t *tester) runShellCommand(cmd string) error {
 286         t.updateExecutor()
 287         t.Logger.WithFields(logrus.Fields{
 288                 "Command": cmd,
 289         }).Info("executing remote command")
 290         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
 291         lgr := t.Logger.WithFields(logrus.Fields{
 292                 "Command": cmd,
 293                 "stdout":  string(stdout),
 294                 "stderr":  string(stderr),
 295         })
 296         if err != nil {
 297                 lgr.WithError(err).Info("remote command failed")
 298         } else {
 299                 lgr.Info("remote command succeeded")
 300         }
 301         return err
 302 }
 303
 304 // currently, this tries forever until it can return true (success).
 305 func (t *tester) destroyTestInstance() bool {
 306         if t.testInstance == nil {
 307                 return true
 308         }
 309         for {
 310                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
 311                 err := t.testInstance.Destroy()
 312                 if err != nil {
 313                         t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
 314                 } else {
 315                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
 316                 }
 317                 err = t.refreshTestInstance()
 318                 if err == errTestInstanceNotFound {
 319                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
 320                         t.testInstance = nil
 321                         return true
 322                 } else if err == nil {
 323                         t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
 324                         t.sleepSyncInterval()
 325                         continue
 326                 } else {
 327                         t.Logger.WithError(err).Error("error getting list of instances")
 328                         continue
 329                 }
 330         }
 331 }
 332
 333 func (t *tester) sleepSyncInterval() {
 334         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
 335         time.Sleep(t.SyncInterval)
 336 }
 337
 338 func (t *tester) sleepProbeInterval() {
 339         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
 340         time.Sleep(t.ProbeInterval)
 341 }
 342
 343 // Return a random string of n hexadecimal digits (n*4 random bits). n
 344 // must be even.
 345 func randomHex(n int) string {
 346         buf := make([]byte, n/2)
 347         _, err := rand.Read(buf)
 348         if err != nil {
 349                 panic(err)
 350         }
 351         return fmt.Sprintf("%x", buf)
 352 }