X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0684f84ec50069b24b4564474db67d6b44266699..1efba8f3b728a3b8aa3c64c5aa09f441318ff2a8:/lib/cloud/cloudtest/tester.go

diff --git a/lib/cloud/cloudtest/tester.go b/lib/cloud/cloudtest/tester.go
index 33ff264bcc..a335278ed6 100644
--- a/lib/cloud/cloudtest/tester.go
+++ b/lib/cloud/cloudtest/tester.go
@@ -11,10 +11,10 @@ import (
 	"fmt"
 	"time"

-	"git.curoverse.com/arvados.git/lib/cloud"
-	"git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
-	"git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
-	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/lib/cloud"
+	"git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
+	"git.arvados.org/arvados.git/lib/dispatchcloud/worker"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/crypto/ssh"
 )
@@ -27,59 +27,106 @@ var (
 // configuration. Run() should be called only once, after assigning
 // suitable values to public fields.
 type tester struct {
-	Logger              logrus.FieldLogger
-	Tags                cloud.SharedResourceTags
-	TagKeyPrefix        string
-	SetID               cloud.InstanceSetID
-	DestroyExisting     bool
-	ProbeInterval       time.Duration
-	SyncInterval        time.Duration
-	TimeoutBooting      time.Duration
-	Driver              cloud.Driver
-	DriverParameters    json.RawMessage
-	InstanceType        arvados.InstanceType
-	ImageID             cloud.ImageID
-	SSHKey              ssh.Signer
-	SSHPort             string
-	BootProbeCommand    string
-	ShellCommand        string
-	PauseBeforeDestroy  func()
+	Logger              logrus.FieldLogger
+	Tags                cloud.SharedResourceTags
+	TagKeyPrefix        string
+	SetID               cloud.InstanceSetID
+	DestroyExisting     bool
+	ProbeInterval       time.Duration
+	SyncInterval        time.Duration
+	TimeoutBooting      time.Duration
+	Driver              cloud.Driver
+	DriverParameters    json.RawMessage
+	InstanceType        arvados.InstanceType
+	ImageID             cloud.ImageID
+	SSHKey              ssh.Signer
+	SSHPort             string
+	DeployPublicKey     bool
+	BootProbeCommand    string
+	InstanceInitCommand cloud.InitCommand
+	ShellCommand        string
+	PauseBeforeDestroy  func()

 	is                  cloud.InstanceSet
-	testInstance        cloud.Instance
+	testInstance        *worker.TagVerifier
 	secret              string
-	executor            *ssh_executor.Executor
+	executor            *sshexecutor.Executor
 	showedLoginInfo     bool
 	failed              bool
 }

+// Run the test suite once for each applicable permutation of
+// DriverParameters. Return true if everything worked.
+//
+// Currently this means run once for each configured SubnetID.
+func (t *tester) Run() bool {
+	var dp map[string]interface{}
+	if len(t.DriverParameters) > 0 {
+		err := json.Unmarshal(t.DriverParameters, &dp)
+		if err != nil {
+			t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
+			return false
+		}
+	}
+	subnets, ok := dp["SubnetID"].([]interface{})
+	if !ok || len(subnets) <= 1 {
+		// Easy, only one SubnetID to test.
+		return t.runWithDriverParameters(t.DriverParameters)
+	}
+
+	deferredError := false
+	for i, subnet := range subnets {
+		subnet, ok := subnet.(string)
+		if !ok {
+			t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
+			deferredError = true
+			continue
+		}
+		dp["SubnetID"] = subnet
+		t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
+		dpjson, err := json.Marshal(dp)
+		if err != nil {
+			t.Logger.WithError(err).Error("error encoding driver parameters")
+			deferredError = true
+			continue
+		}
+		ok = t.runWithDriverParameters(dpjson)
+		if !ok {
+			t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
+			deferredError = true
+		}
+	}
+	return !deferredError
+}
+
 // Run the test suite as specified, clean up as needed, and return
 // true (everything is OK) or false (something went wrong).
-func (t *tester) Run() bool {
+func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
 	// This flag gets set when we encounter a non-fatal error, so
 	// we can continue doing more tests but remember to return
 	// false (failure) at the end.
 	deferredError := false

 	var err error
-	t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
+	t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
 	if err != nil {
 		t.Logger.WithError(err).Info("error initializing driver")
 		return false
 	}

-	// Don't send the driver any filters the first time we get the
-	// instance list. This way we can log an instance count
-	// (N=...) that includes all instances in this service
-	// account, even if they don't have the same InstanceSetID.
-	insts, err := t.getInstances(nil)
-	if err != nil {
-		t.Logger.WithError(err).Info("error getting initial list of instances")
-		return false
-	}
-
 	for {
+		// Don't send the driver any filters when getting the
+		// initial instance list. This way we can log an
+		// instance count (N=...) that includes all instances
+		// in this service account, even if they don't have
+		// the same InstanceSetID.
+		insts, err := t.getInstances(nil)
+		if err != nil {
+			t.Logger.WithError(err).Info("error getting list of instances")
+			return false
+		}
+
 		foundExisting := false
 		for _, i := range insts {
 			if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
@@ -92,7 +139,9 @@ func (t *tester) Run() bool {
 			foundExisting = true
 			if t.DestroyExisting {
 				lgr.Info("destroying existing instance with our InstanceSetID")
+				t0 := time.Now()
 				err := i.Destroy()
+				lgr := lgr.WithField("Duration", time.Since(t0))
 				if err != nil {
 					lgr.WithError(err).Error("error destroying existing instance")
 				} else {
@@ -124,7 +173,12 @@ func (t *tester) Run() bool {
 	defer t.destroyTestInstance()

 	bootDeadline := time.Now().Add(t.TimeoutBooting)
-	initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
+	initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
+
+	installPublicKey := t.SSHKey.PublicKey()
+	if !t.DeployPublicKey {
+		installPublicKey = nil
+	}

 	t.Logger.WithFields(logrus.Fields{
 		"InstanceType":    t.InstanceType.Name,
@@ -132,30 +186,32 @@ func (t *tester) Run() bool {
 		"ImageID":         t.ImageID,
 		"Tags":            tags,
 		"InitCommand":     initCommand,
+		"DeployPublicKey": installPublicKey != nil,
 	}).Info("creating instance")
-	inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
+	t0 := time.Now()
+	inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, installPublicKey)
+	lgrC := t.Logger.WithField("Duration", time.Since(t0))
 	if err != nil {
 		// Create() might have failed due to a bug or network
 		// error even though the creation was successful, so
 		// it's safer to wait a bit for an instance to appear.
 		deferredError = true
-		t.Logger.WithError(err).Error("error creating test instance")
+		lgrC.WithError(err).Error("error creating test instance")
 		t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
 		for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
 			if time.Now().After(bootDeadline) {
 				t.Logger.Error("timed out")
 				return false
-			} else {
-				t.sleepSyncInterval()
 			}
+			t.sleepSyncInterval()
 		}
 		t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
 		t.showLoginInfo()
 	} else {
 		// Create() succeeded. Make sure the new instance
 		// appears right away in the Instances() list.
-		t.Logger.WithField("Instance", inst.ID()).Info("created instance")
-		t.testInstance = inst
+		lgrC.WithField("Instance", inst.ID()).Info("created instance")
+		t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
 		t.showLoginInfo()
 		err = t.refreshTestInstance()
 		if err == errTestInstanceNotFound {
@@ -166,15 +222,14 @@ func (t *tester) Run() bool {
 			return false
 		}
 	}
-	t.testInstance = worker.TagVerifier{t.testInstance, t.secret}

 	if !t.checkTags() {
 		// checkTags() already logged the errors
-		return false
+		deferredError = true
 	}

 	if !t.waitForBoot(bootDeadline) {
-		return false
+		deferredError = true
 	}

 	if t.ShellCommand != "" {
@@ -232,7 +287,7 @@ func (t *tester) refreshTestInstance() error {
 			"Instance": i.ID(),
 			"Address":  i.Address(),
 		}).Info("found our instance in returned list")
-		t.testInstance = worker.TagVerifier{i, t.secret}
+		t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
 		if !t.showedLoginInfo {
 			t.showLoginInfo()
 		}
@@ -248,11 +303,15 @@ func (t *tester) refreshTestInstance() error {
 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
 	var ret []cloud.Instance
 	t.Logger.WithField("FilterTags", tags).Info("getting instance list")
+	t0 := time.Now()
 	insts, err := t.is.Instances(tags)
 	if err != nil {
 		return nil, err
 	}
-	t.Logger.WithField("N", len(insts)).Info("got instance list")
+	t.Logger.WithFields(logrus.Fields{
+		"Duration": time.Since(t0),
+		"N":        len(insts),
+	}).Info("got instance list")
 	for _, i := range insts {
 		if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
 			ret = append(ret, i)
@@ -300,7 +359,7 @@ func (t *tester) waitForBoot(deadline time.Time) bool {
 // current address.
 func (t *tester) updateExecutor() {
 	if t.executor == nil {
-		t.executor = ssh_executor.New(t.testInstance)
+		t.executor = sshexecutor.New(t.testInstance)
 		t.executor.SetTargetPort(t.SSHPort)
 		t.executor.SetSigners(t.SSHKey)
 	} else {
@@ -313,11 +372,13 @@ func (t *tester) runShellCommand(cmd string) error {
 	t.Logger.WithFields(logrus.Fields{
 		"Command": cmd,
 	}).Info("executing remote command")
+	t0 := time.Now()
 	stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
 	lgr := t.Logger.WithFields(logrus.Fields{
-		"Command": cmd,
-		"stdout":  string(stdout),
-		"stderr":  string(stderr),
+		"Duration": time.Since(t0),
+		"Command":  cmd,
+		"stdout":   string(stdout),
+		"stderr":   string(stderr),
 	})
 	if err != nil {
 		lgr.WithError(err).Info("remote command failed")
@@ -333,20 +394,25 @@ func (t *tester) destroyTestInstance() bool {
 		return true
 	}
 	for {
-		t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
+		lgr := t.Logger.WithField("Instance", t.testInstance.ID())
+		lgr.Info("destroying instance")
+
+		t0 := time.Now()
 		err := t.testInstance.Destroy()
+		lgrDur := lgr.WithField("Duration", time.Since(t0))
 		if err != nil {
-			t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
+			lgrDur.WithError(err).Error("error destroying instance")
 		} else {
-			t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
+			lgrDur.Info("destroyed instance")
 		}
+
 		err = t.refreshTestInstance()
 		if err == errTestInstanceNotFound {
-			t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
+			lgr.Info("instance no longer appears in list")
 			t.testInstance = nil
 			return true
 		} else if err == nil {
-			t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
+			lgr.Info("instance still exists after calling Destroy")
lgr.Info("instance still exists after calling Destroy") t.sleepSyncInterval() continue } else {