1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.arvados.org/arvados.git/lib/cloud"
15 "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
16 "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
17 "git.arvados.org/arvados.git/sdk/go/arvados"
18 "github.com/sirupsen/logrus"
19 "golang.org/x/crypto/ssh"
23 errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
30 Logger logrus.FieldLogger
31 Tags cloud.SharedResourceTags
33 SetID cloud.InstanceSetID
35 ProbeInterval time.Duration
36 SyncInterval time.Duration
37 TimeoutBooting time.Duration
39 DriverParameters json.RawMessage
40 InstanceType arvados.InstanceType
45 BootProbeCommand string
46 InstanceInitCommand cloud.InitCommand
48 PauseBeforeDestroy func()
51 testInstance *worker.TagVerifier
53 executor *sshexecutor.Executor
59 // Run the test suite once for each applicable permutation of
60 // DriverParameters. Return true if everything worked.
62 // Currently this means run once for each configured SubnetID.
63 func (t *tester) Run() bool {
64 var dp map[string]interface{}
65 if len(t.DriverParameters) > 0 {
66 err := json.Unmarshal(t.DriverParameters, &dp)
68 t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
72 subnets, ok := dp["SubnetID"].([]interface{})
73 if !ok || len(subnets) <= 1 {
74 // Easy, only one SubnetID to test.
75 return t.runWithDriverParameters(t.DriverParameters)
78 deferredError := false
79 for i, subnet := range subnets {
80 subnet, ok := subnet.(string)
82 t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
86 dp["SubnetID"] = subnet
87 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
88 dpjson, err := json.Marshal(dp)
90 t.Logger.WithError(err).Error("error encoding driver parameters")
94 ok = t.runWithDriverParameters(dpjson)
96 t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
100 return !deferredError
103 // Run the test suite as specified, clean up as needed, and return
104 // true (everything is OK) or false (something went wrong).
105 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
106 // This flag gets set when we encounter a non-fatal error, so
107 // we can continue doing more tests but remember to return
108 // false (failure) at the end.
109 deferredError := false
112 t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
114 t.Logger.WithError(err).Info("error initializing driver")
119 // Don't send the driver any filters when getting the
120 // initial instance list. This way we can log an
121 // instance count (N=...) that includes all instances
122 // in this service account, even if they don't have
123 // the same InstanceSetID.
124 insts, err := t.getInstances(nil)
126 t.Logger.WithError(err).Info("error getting list of instances")
130 foundExisting := false
131 for _, i := range insts {
132 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
135 lgr := t.Logger.WithFields(logrus.Fields{
137 "InstanceSetID": t.SetID,
140 if t.DestroyExisting {
141 lgr.Info("destroying existing instance with our InstanceSetID")
144 lgr := lgr.WithField("Duration", time.Since(t0))
146 lgr.WithError(err).Error("error destroying existing instance")
148 lgr.Info("Destroy() call succeeded")
151 lgr.Error("found existing instance with our InstanceSetID")
156 } else if t.DestroyExisting {
157 t.sleepSyncInterval()
159 t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
164 t.secret = randomHex(40)
166 tags := cloud.InstanceTags{}
167 for k, v := range t.Tags {
170 tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
171 tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
173 defer t.destroyTestInstance()
175 bootDeadline := time.Now().Add(t.TimeoutBooting)
176 initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
178 installPublicKey := t.SSHKey.PublicKey()
179 if !t.DeployPublicKey {
180 installPublicKey = nil
183 t.Logger.WithFields(logrus.Fields{
184 "InstanceType": t.InstanceType.Name,
185 "ProviderInstanceType": t.InstanceType.ProviderType,
186 "ImageID": t.ImageID,
188 "InitCommand": initCommand,
189 "DeployPublicKey": installPublicKey != nil,
190 }).Info("creating instance")
192 inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, installPublicKey)
193 lgrC := t.Logger.WithField("Duration", time.Since(t0))
195 // Create() might have failed due to a bug or network
196 // error even though the creation was successful, so
197 // it's safer to wait a bit for an instance to appear.
199 lgrC.WithError(err).Error("error creating test instance")
200 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
201 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
202 if time.Now().After(bootDeadline) {
203 t.Logger.Error("timed out")
206 t.sleepSyncInterval()
208 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
211 // Create() succeeded. Make sure the new instance
212 // appears right away in the Instances() list.
213 lgrC.WithField("Instance", inst.ID()).Info("created instance")
214 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
216 err = t.refreshTestInstance()
217 if err == errTestInstanceNotFound {
218 t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
220 } else if err != nil {
221 t.Logger.WithError(err).Error("error getting list of instances")
227 // checkTags() already logged the errors
231 if !t.waitForBoot(bootDeadline) {
235 if t.ShellCommand != "" {
236 err = t.runShellCommand(t.ShellCommand)
238 t.Logger.WithError(err).Error("shell command failed")
243 if fn := t.PauseBeforeDestroy; fn != nil {
248 return !deferredError
251 // If the test instance has an address, log an "ssh user@host" command
252 // line that the operator can paste into another terminal, and set
253 // t.showedLoginInfo.
255 // If the test instance doesn't have an address yet, do nothing.
256 func (t *tester) showLoginInfo() {
258 host, port := t.executor.TargetHostPort()
262 user := t.testInstance.RemoteUser()
263 t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
264 t.showedLoginInfo = true
267 // Get the latest instance list from the driver. If our test instance
268 // is found, assign it to t.testIntance.
269 func (t *tester) refreshTestInstance() error {
270 insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
274 for _, i := range insts {
275 if t.testInstance == nil {
276 // Filter by InstanceSetID tag value
277 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
281 // Filter by instance ID
282 if i.ID() != t.testInstance.ID() {
286 t.Logger.WithFields(logrus.Fields{
288 "Address": i.Address(),
289 }).Info("found our instance in returned list")
290 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
291 if !t.showedLoginInfo {
296 return errTestInstanceNotFound
299 // Get the list of instances, passing the given tags to the cloud
300 // driver to filter results.
302 // Return only the instances that have our InstanceSetID tag.
303 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
304 var ret []cloud.Instance
305 t.Logger.WithField("FilterTags", tags).Info("getting instance list")
307 insts, err := t.is.Instances(tags)
311 t.Logger.WithFields(logrus.Fields{
312 "Duration": time.Since(t0),
314 }).Info("got instance list")
315 for _, i := range insts {
316 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
323 // Check that t.testInstance has every tag in t.Tags. If not, log an
324 // error and return false.
325 func (t *tester) checkTags() bool {
327 for k, v := range t.Tags {
328 if got := t.testInstance.Tags()[k]; got != v {
330 t.Logger.WithFields(logrus.Fields{
334 }).Error("tag is missing from test instance")
338 t.Logger.Info("all expected tags are present")
343 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
345 func (t *tester) waitForBoot(deadline time.Time) bool {
346 for time.Now().Before(deadline) {
347 err := t.runShellCommand(t.BootProbeCommand)
351 t.sleepProbeInterval()
352 t.refreshTestInstance()
354 t.Logger.Error("timed out")
358 // Create t.executor and/or update its target to t.testInstance's
360 func (t *tester) updateExecutor() {
361 if t.executor == nil {
362 t.executor = sshexecutor.New(t.testInstance)
363 t.executor.SetTargetPort(t.SSHPort)
364 t.executor.SetSigners(t.SSHKey)
366 t.executor.SetTarget(t.testInstance)
370 func (t *tester) runShellCommand(cmd string) error {
372 t.Logger.WithFields(logrus.Fields{
374 }).Info("executing remote command")
376 stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
377 lgr := t.Logger.WithFields(logrus.Fields{
378 "Duration": time.Since(t0),
380 "stdout": string(stdout),
381 "stderr": string(stderr),
384 lgr.WithError(err).Info("remote command failed")
386 lgr.Info("remote command succeeded")
391 // currently, this tries forever until it can return true (success).
392 func (t *tester) destroyTestInstance() bool {
393 if t.testInstance == nil {
397 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
398 lgr.Info("destroying instance")
401 err := t.testInstance.Destroy()
402 lgrDur := lgr.WithField("Duration", time.Since(t0))
404 lgrDur.WithError(err).Error("error destroying instance")
406 lgrDur.Info("destroyed instance")
409 err = t.refreshTestInstance()
410 if err == errTestInstanceNotFound {
411 lgr.Info("instance no longer appears in list")
414 } else if err == nil {
415 lgr.Info("instance still exists after calling Destroy")
416 t.sleepSyncInterval()
419 t.Logger.WithError(err).Error("error getting list of instances")
425 func (t *tester) sleepSyncInterval() {
426 t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
427 time.Sleep(t.SyncInterval)
430 func (t *tester) sleepProbeInterval() {
431 t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
432 time.Sleep(t.ProbeInterval)
435 // Return a random string of n hexadecimal digits (n*4 random bits). n
437 func randomHex(n int) string {
438 buf := make([]byte, n/2)
439 _, err := rand.Read(buf)
443 return fmt.Sprintf("%x", buf)