lib/cloud/cloudtest/tester.go

   1 // Copyright (C) The Arvados Authors. All rights reserved.
   2 //
   3 // SPDX-License-Identifier: AGPL-3.0
   4
   5 package cloudtest
   6
   7 import (
   8         "crypto/rand"
   9         "encoding/json"
  10         "errors"
  11         "fmt"
  12         "time"
  13
  14         "git.arvados.org/arvados.git/lib/cloud"
  15         "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
  16         "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
  17         "git.arvados.org/arvados.git/sdk/go/arvados"
  18         "github.com/sirupsen/logrus"
  19         "golang.org/x/crypto/ssh"
  20 )
  21
  22 var (
  23         errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
  24 )
  25
  26 // A tester does a sequence of operations to test a cloud driver and
  27 // configuration. Run() should be called only once, after assigning
  28 // suitable values to public fields.
  29 type tester struct {
  30         Logger              logrus.FieldLogger
  31         Tags                cloud.SharedResourceTags
  32         TagKeyPrefix        string
  33         SetID               cloud.InstanceSetID
  34         DestroyExisting     bool
  35         ProbeInterval       time.Duration
  36         SyncInterval        time.Duration
  37         TimeoutBooting      time.Duration
  38         Driver              cloud.Driver
  39         DriverParameters    json.RawMessage
  40         InstanceType        arvados.InstanceType
  41         ImageID             cloud.ImageID
  42         SSHKey              ssh.Signer
  43         SSHPort             string
  44         BootProbeCommand    string
  45         InstanceInitCommand cloud.InitCommand
  46         ShellCommand        string
  47         PauseBeforeDestroy  func()
  48
  49         is              cloud.InstanceSet
  50         testInstance    *worker.TagVerifier
  51         secret          string
  52         executor        *sshexecutor.Executor
  53         showedLoginInfo bool
  54
  55         failed bool
  56 }
  57
  58 // Run the test suite once for each applicable permutation of
  59 // DriverParameters.  Return true if everything worked.
  60 //
  61 // Currently this means run once for each configured SubnetID.
  62 func (t *tester) Run() bool {
  63         var dp map[string]interface{}
  64         if len(t.DriverParameters) > 0 {
  65                 err := json.Unmarshal(t.DriverParameters, &dp)
  66                 if err != nil {
  67                         t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
  68                         return false
  69                 }
  70         }
  71         subnets, ok := dp["SubnetID"].([]interface{})
  72         if !ok || len(subnets) <= 1 {
  73                 // Easy, only one SubnetID to test.
  74                 return t.runWithDriverParameters(t.DriverParameters)
  75         }
  76
  77         deferredError := false
  78         for i, subnet := range subnets {
  79                 subnet, ok := subnet.(string)
  80                 if !ok {
  81                         t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
  82                         deferredError = true
  83                         continue
  84                 }
  85                 dp["SubnetID"] = subnet
  86                 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
  87                 dpjson, err := json.Marshal(dp)
  88                 if err != nil {
  89                         t.Logger.WithError(err).Error("error encoding driver parameters")
  90                         deferredError = true
  91                         continue
  92                 }
  93                 ok = t.runWithDriverParameters(dpjson)
  94                 if !ok {
  95                         t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
  96                         deferredError = true
  97                 }
  98         }
  99         return !deferredError
 100 }
 101
 102 // Run the test suite as specified, clean up as needed, and return
 103 // true (everything is OK) or false (something went wrong).
 104 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
 105         // This flag gets set when we encounter a non-fatal error, so
 106         // we can continue doing more tests but remember to return
 107         // false (failure) at the end.
 108         deferredError := false
 109
 110         var err error
 111         t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
 112         if err != nil {
 113                 t.Logger.WithError(err).Info("error initializing driver")
 114                 return false
 115         }
 116
 117         for {
 118                 // Don't send the driver any filters when getting the
 119                 // initial instance list. This way we can log an
 120                 // instance count (N=...)  that includes all instances
 121                 // in this service account, even if they don't have
 122                 // the same InstanceSetID.
 123                 insts, err := t.getInstances(nil)
 124                 if err != nil {
 125                         t.Logger.WithError(err).Info("error getting list of instances")
 126                         return false
 127                 }
 128
 129                 foundExisting := false
 130                 for _, i := range insts {
 131                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 132                                 continue
 133                         }
 134                         lgr := t.Logger.WithFields(logrus.Fields{
 135                                 "Instance":      i.ID(),
 136                                 "InstanceSetID": t.SetID,
 137                         })
 138                         foundExisting = true
 139                         if t.DestroyExisting {
 140                                 lgr.Info("destroying existing instance with our InstanceSetID")
 141                                 t0 := time.Now()
 142                                 err := i.Destroy()
 143                                 lgr := lgr.WithField("Duration", time.Since(t0))
 144                                 if err != nil {
 145                                         lgr.WithError(err).Error("error destroying existing instance")
 146                                 } else {
 147                                         lgr.Info("Destroy() call succeeded")
 148                                 }
 149                         } else {
 150                                 lgr.Error("found existing instance with our InstanceSetID")
 151                         }
 152                 }
 153                 if !foundExisting {
 154                         break
 155                 } else if t.DestroyExisting {
 156                         t.sleepSyncInterval()
 157                 } else {
 158                         t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
 159                         return false
 160                 }
 161         }
 162
 163         t.secret = randomHex(40)
 164
 165         tags := cloud.InstanceTags{}
 166         for k, v := range t.Tags {
 167                 tags[k] = v
 168         }
 169         tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
 170         tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
 171
 172         defer t.destroyTestInstance()
 173
 174         bootDeadline := time.Now().Add(t.TimeoutBooting)
 175         initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
 176
 177         t.Logger.WithFields(logrus.Fields{
 178                 "InstanceType":         t.InstanceType.Name,
 179                 "ProviderInstanceType": t.InstanceType.ProviderType,
 180                 "ImageID":              t.ImageID,
 181                 "Tags":                 tags,
 182                 "InitCommand":          initCommand,
 183         }).Info("creating instance")
 184         t0 := time.Now()
 185         inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
 186         lgrC := t.Logger.WithField("Duration", time.Since(t0))
 187         if err != nil {
 188                 // Create() might have failed due to a bug or network
 189                 // error even though the creation was successful, so
 190                 // it's safer to wait a bit for an instance to appear.
 191                 deferredError = true
 192                 lgrC.WithError(err).Error("error creating test instance")
 193                 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
 194                 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
 195                         if time.Now().After(bootDeadline) {
 196                                 t.Logger.Error("timed out")
 197                                 return false
 198                         }
 199                         t.sleepSyncInterval()
 200                 }
 201                 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
 202                 t.showLoginInfo()
 203         } else {
 204                 // Create() succeeded. Make sure the new instance
 205                 // appears right away in the Instances() list.
 206                 lgrC.WithField("Instance", inst.ID()).Info("created instance")
 207                 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
 208                 t.showLoginInfo()
 209                 err = t.refreshTestInstance()
 210                 if err == errTestInstanceNotFound {
 211                         t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
 212                         deferredError = true
 213                 } else if err != nil {
 214                         t.Logger.WithError(err).Error("error getting list of instances")
 215                         return false
 216                 }
 217         }
 218
 219         if !t.checkTags() {
 220                 // checkTags() already logged the errors
 221                 deferredError = true
 222         }
 223
 224         if !t.waitForBoot(bootDeadline) {
 225                 deferredError = true
 226         }
 227
 228         if t.ShellCommand != "" {
 229                 err = t.runShellCommand(t.ShellCommand)
 230                 if err != nil {
 231                         t.Logger.WithError(err).Error("shell command failed")
 232                         deferredError = true
 233                 }
 234         }
 235
 236         if fn := t.PauseBeforeDestroy; fn != nil {
 237                 t.showLoginInfo()
 238                 fn()
 239         }
 240
 241         return !deferredError
 242 }
 243
 244 // If the test instance has an address, log an "ssh user@host" command
 245 // line that the operator can paste into another terminal, and set
 246 // t.showedLoginInfo.
 247 //
 248 // If the test instance doesn't have an address yet, do nothing.
 249 func (t *tester) showLoginInfo() {
 250         t.updateExecutor()
 251         host, port := t.executor.TargetHostPort()
 252         if host == "" {
 253                 return
 254         }
 255         user := t.testInstance.RemoteUser()
 256         t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
 257         t.showedLoginInfo = true
 258 }
 259
 260 // Get the latest instance list from the driver. If our test instance
 261 // is found, assign it to t.testIntance.
 262 func (t *tester) refreshTestInstance() error {
 263         insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
 264         if err != nil {
 265                 return err
 266         }
 267         for _, i := range insts {
 268                 if t.testInstance == nil {
 269                         // Filter by InstanceSetID tag value
 270                         if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
 271                                 continue
 272                         }
 273                 } else {
 274                         // Filter by instance ID
 275                         if i.ID() != t.testInstance.ID() {
 276                                 continue
 277                         }
 278                 }
 279                 t.Logger.WithFields(logrus.Fields{
 280                         "Instance": i.ID(),
 281                         "Address":  i.Address(),
 282                 }).Info("found our instance in returned list")
 283                 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
 284                 if !t.showedLoginInfo {
 285                         t.showLoginInfo()
 286                 }
 287                 return nil
 288         }
 289         return errTestInstanceNotFound
 290 }
 291
 292 // Get the list of instances, passing the given tags to the cloud
 293 // driver to filter results.
 294 //
 295 // Return only the instances that have our InstanceSetID tag.
 296 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
 297         var ret []cloud.Instance
 298         t.Logger.WithField("FilterTags", tags).Info("getting instance list")
 299         t0 := time.Now()
 300         insts, err := t.is.Instances(tags)
 301         if err != nil {
 302                 return nil, err
 303         }
 304         t.Logger.WithFields(logrus.Fields{
 305                 "Duration": time.Since(t0),
 306                 "N":        len(insts),
 307         }).Info("got instance list")
 308         for _, i := range insts {
 309                 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
 310                         ret = append(ret, i)
 311                 }
 312         }
 313         return ret, nil
 314 }
 315
 316 // Check that t.testInstance has every tag in t.Tags. If not, log an
 317 // error and return false.
 318 func (t *tester) checkTags() bool {
 319         ok := true
 320         for k, v := range t.Tags {
 321                 if got := t.testInstance.Tags()[k]; got != v {
 322                         ok = false
 323                         t.Logger.WithFields(logrus.Fields{
 324                                 "Key":           k,
 325                                 "ExpectedValue": v,
 326                                 "GotValue":      got,
 327                         }).Error("tag is missing from test instance")
 328                 }
 329         }
 330         if ok {
 331                 t.Logger.Info("all expected tags are present")
 332         }
 333         return ok
 334 }
 335
 336 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
 337 // deadline arrives.
 338 func (t *tester) waitForBoot(deadline time.Time) bool {
 339         for time.Now().Before(deadline) {
 340                 err := t.runShellCommand(t.BootProbeCommand)
 341                 if err == nil {
 342                         return true
 343                 }
 344                 t.sleepProbeInterval()
 345                 t.refreshTestInstance()
 346         }
 347         t.Logger.Error("timed out")
 348         return false
 349 }
 350
 351 // Create t.executor and/or update its target to t.testInstance's
 352 // current address.
 353 func (t *tester) updateExecutor() {
 354         if t.executor == nil {
 355                 t.executor = sshexecutor.New(t.testInstance)
 356                 t.executor.SetTargetPort(t.SSHPort)
 357                 t.executor.SetSigners(t.SSHKey)
 358         } else {
 359                 t.executor.SetTarget(t.testInstance)
 360         }
 361 }
 362
 363 func (t *tester) runShellCommand(cmd string) error {
 364         t.updateExecutor()
 365         t.Logger.WithFields(logrus.Fields{
 366                 "Command": cmd,
 367         }).Info("executing remote command")
 368         t0 := time.Now()
 369         stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
 370         lgr := t.Logger.WithFields(logrus.Fields{
 371                 "Duration": time.Since(t0),
 372                 "Command":  cmd,
 373                 "stdout":   string(stdout),
 374                 "stderr":   string(stderr),
 375         })
 376         if err != nil {
 377                 lgr.WithError(err).Info("remote command failed")
 378         } else {
 379                 lgr.Info("remote command succeeded")
 380         }
 381         return err
 382 }
 383
 384 // currently, this tries forever until it can return true (success).
 385 func (t *tester) destroyTestInstance() bool {
 386         if t.testInstance == nil {
 387                 return true
 388         }
 389         for {
 390                 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
 391                 lgr.Info("destroying instance")
 392                 t0 := time.Now()
 393
 394                 err := t.testInstance.Destroy()
 395                 lgrDur := lgr.WithField("Duration", time.Since(t0))
 396                 if err != nil {
 397                         lgrDur.WithError(err).Error("error destroying instance")
 398                 } else {
 399                         lgrDur.Info("destroyed instance")
 400                 }
 401
 402                 err = t.refreshTestInstance()
 403                 if err == errTestInstanceNotFound {
 404                         lgr.Info("instance no longer appears in list")
 405                         t.testInstance = nil
 406                         return true
 407                 } else if err == nil {
 408                         lgr.Info("instance still exists after calling Destroy")
 409                         t.sleepSyncInterval()
 410                         continue
 411                 } else {
 412                         t.Logger.WithError(err).Error("error getting list of instances")
 413                         continue
 414                 }
 415         }
 416 }
 417
 418 func (t *tester) sleepSyncInterval() {
 419         t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
 420         time.Sleep(t.SyncInterval)
 421 }
 422
 423 func (t *tester) sleepProbeInterval() {
 424         t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
 425         time.Sleep(t.ProbeInterval)
 426 }
 427
 428 // Return a random string of n hexadecimal digits (n*4 random bits). n
 429 // must be even.
 430 func randomHex(n int) string {
 431         buf := make([]byte, n/2)
 432         _, err := rand.Read(buf)
 433         if err != nil {
 434                 panic(err)
 435         }
 436         return fmt.Sprintf("%x", buf)
 437 }