1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.arvados.org/arvados.git/lib/cloud"
15 "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
16 "git.arvados.org/arvados.git/lib/dispatchcloud/worker"
17 "git.arvados.org/arvados.git/sdk/go/arvados"
18 "github.com/sirupsen/logrus"
19 "golang.org/x/crypto/ssh"
23 errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
30 Logger logrus.FieldLogger
31 Tags cloud.SharedResourceTags
33 SetID cloud.InstanceSetID
35 ProbeInterval time.Duration
36 SyncInterval time.Duration
37 TimeoutBooting time.Duration
39 DriverParameters json.RawMessage
40 InstanceType arvados.InstanceType
44 BootProbeCommand string
45 InstanceInitCommand cloud.InitCommand
47 PauseBeforeDestroy func()
50 testInstance *worker.TagVerifier
52 executor *sshexecutor.Executor
58 // Run the test suite once for each applicable permutation of
59 // DriverParameters. Return true if everything worked.
61 // Currently this means run once for each configured SubnetID.
62 func (t *tester) Run() bool {
63 var dp map[string]interface{}
64 err := json.Unmarshal(t.DriverParameters, &dp)
66 t.Logger.WithError(err).Error("error decoding configured CloudVMs.DriverParameters")
69 subnets, ok := dp["SubnetID"].([]interface{})
70 if !ok || len(subnets) <= 1 {
71 // Easy, only one SubnetID to test.
72 return t.runWithDriverParameters(t.DriverParameters)
75 deferredError := false
76 for i, subnet := range subnets {
77 subnet, ok := subnet.(string)
79 t.Logger.Errorf("CloudVMs.DriverParameters.SubnetID[%d] is invalid -- must be a string", i)
83 dp["SubnetID"] = subnet
84 t.Logger.Infof("running tests using SubnetID[%d] %q", i, subnet)
85 dpjson, err := json.Marshal(dp)
87 t.Logger.WithError(err).Error("error encoding driver parameters")
91 ok = t.runWithDriverParameters(dpjson)
93 t.Logger.Infof("failed tests using SubnetID[%d] %q", i, subnet)
100 // Run the test suite as specified, clean up as needed, and return
101 // true (everything is OK) or false (something went wrong).
102 func (t *tester) runWithDriverParameters(driverParameters json.RawMessage) bool {
103 // This flag gets set when we encounter a non-fatal error, so
104 // we can continue doing more tests but remember to return
105 // false (failure) at the end.
106 deferredError := false
109 t.is, err = t.Driver.InstanceSet(driverParameters, t.SetID, t.Tags, t.Logger, nil)
111 t.Logger.WithError(err).Info("error initializing driver")
116 // Don't send the driver any filters when getting the
117 // initial instance list. This way we can log an
118 // instance count (N=...) that includes all instances
119 // in this service account, even if they don't have
120 // the same InstanceSetID.
121 insts, err := t.getInstances(nil)
123 t.Logger.WithError(err).Info("error getting list of instances")
127 foundExisting := false
128 for _, i := range insts {
129 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
132 lgr := t.Logger.WithFields(logrus.Fields{
134 "InstanceSetID": t.SetID,
137 if t.DestroyExisting {
138 lgr.Info("destroying existing instance with our InstanceSetID")
141 lgr := lgr.WithField("Duration", time.Since(t0))
143 lgr.WithError(err).Error("error destroying existing instance")
145 lgr.Info("Destroy() call succeeded")
148 lgr.Error("found existing instance with our InstanceSetID")
153 } else if t.DestroyExisting {
154 t.sleepSyncInterval()
156 t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
161 t.secret = randomHex(40)
163 tags := cloud.InstanceTags{}
164 for k, v := range t.Tags {
167 tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
168 tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
170 defer t.destroyTestInstance()
172 bootDeadline := time.Now().Add(t.TimeoutBooting)
173 initCommand := worker.TagVerifier{Instance: nil, Secret: t.secret, ReportVerified: nil}.InitCommand() + "\n" + t.InstanceInitCommand
175 t.Logger.WithFields(logrus.Fields{
176 "InstanceType": t.InstanceType.Name,
177 "ProviderInstanceType": t.InstanceType.ProviderType,
178 "ImageID": t.ImageID,
180 "InitCommand": initCommand,
181 }).Info("creating instance")
183 inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
184 lgrC := t.Logger.WithField("Duration", time.Since(t0))
186 // Create() might have failed due to a bug or network
187 // error even though the creation was successful, so
188 // it's safer to wait a bit for an instance to appear.
190 lgrC.WithError(err).Error("error creating test instance")
191 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
192 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
193 if time.Now().After(bootDeadline) {
194 t.Logger.Error("timed out")
197 t.sleepSyncInterval()
199 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
202 // Create() succeeded. Make sure the new instance
203 // appears right away in the Instances() list.
204 lgrC.WithField("Instance", inst.ID()).Info("created instance")
205 t.testInstance = &worker.TagVerifier{Instance: inst, Secret: t.secret, ReportVerified: nil}
207 err = t.refreshTestInstance()
208 if err == errTestInstanceNotFound {
209 t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
211 } else if err != nil {
212 t.Logger.WithError(err).Error("error getting list of instances")
218 // checkTags() already logged the errors
222 if !t.waitForBoot(bootDeadline) {
226 if t.ShellCommand != "" {
227 err = t.runShellCommand(t.ShellCommand)
229 t.Logger.WithError(err).Error("shell command failed")
234 if fn := t.PauseBeforeDestroy; fn != nil {
239 return !deferredError
242 // If the test instance has an address, log an "ssh user@host" command
243 // line that the operator can paste into another terminal, and set
244 // t.showedLoginInfo.
246 // If the test instance doesn't have an address yet, do nothing.
247 func (t *tester) showLoginInfo() {
249 host, port := t.executor.TargetHostPort()
253 user := t.testInstance.RemoteUser()
254 t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
255 t.showedLoginInfo = true
258 // Get the latest instance list from the driver. If our test instance
259 // is found, assign it to t.testIntance.
260 func (t *tester) refreshTestInstance() error {
261 insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
265 for _, i := range insts {
266 if t.testInstance == nil {
267 // Filter by InstanceSetID tag value
268 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
272 // Filter by instance ID
273 if i.ID() != t.testInstance.ID() {
277 t.Logger.WithFields(logrus.Fields{
279 "Address": i.Address(),
280 }).Info("found our instance in returned list")
281 t.testInstance = &worker.TagVerifier{Instance: i, Secret: t.secret, ReportVerified: nil}
282 if !t.showedLoginInfo {
287 return errTestInstanceNotFound
290 // Get the list of instances, passing the given tags to the cloud
291 // driver to filter results.
293 // Return only the instances that have our InstanceSetID tag.
294 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
295 var ret []cloud.Instance
296 t.Logger.WithField("FilterTags", tags).Info("getting instance list")
298 insts, err := t.is.Instances(tags)
302 t.Logger.WithFields(logrus.Fields{
303 "Duration": time.Since(t0),
305 }).Info("got instance list")
306 for _, i := range insts {
307 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
314 // Check that t.testInstance has every tag in t.Tags. If not, log an
315 // error and return false.
316 func (t *tester) checkTags() bool {
318 for k, v := range t.Tags {
319 if got := t.testInstance.Tags()[k]; got != v {
321 t.Logger.WithFields(logrus.Fields{
325 }).Error("tag is missing from test instance")
329 t.Logger.Info("all expected tags are present")
334 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
336 func (t *tester) waitForBoot(deadline time.Time) bool {
337 for time.Now().Before(deadline) {
338 err := t.runShellCommand(t.BootProbeCommand)
342 t.sleepProbeInterval()
343 t.refreshTestInstance()
345 t.Logger.Error("timed out")
349 // Create t.executor and/or update its target to t.testInstance's
351 func (t *tester) updateExecutor() {
352 if t.executor == nil {
353 t.executor = sshexecutor.New(t.testInstance)
354 t.executor.SetTargetPort(t.SSHPort)
355 t.executor.SetSigners(t.SSHKey)
357 t.executor.SetTarget(t.testInstance)
361 func (t *tester) runShellCommand(cmd string) error {
363 t.Logger.WithFields(logrus.Fields{
365 }).Info("executing remote command")
367 stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
368 lgr := t.Logger.WithFields(logrus.Fields{
369 "Duration": time.Since(t0),
371 "stdout": string(stdout),
372 "stderr": string(stderr),
375 lgr.WithError(err).Info("remote command failed")
377 lgr.Info("remote command succeeded")
382 // currently, this tries forever until it can return true (success).
383 func (t *tester) destroyTestInstance() bool {
384 if t.testInstance == nil {
388 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
389 lgr.Info("destroying instance")
392 err := t.testInstance.Destroy()
393 lgrDur := lgr.WithField("Duration", time.Since(t0))
395 lgrDur.WithError(err).Error("error destroying instance")
397 lgrDur.Info("destroyed instance")
400 err = t.refreshTestInstance()
401 if err == errTestInstanceNotFound {
402 lgr.Info("instance no longer appears in list")
405 } else if err == nil {
406 lgr.Info("instance still exists after calling Destroy")
407 t.sleepSyncInterval()
410 t.Logger.WithError(err).Error("error getting list of instances")
416 func (t *tester) sleepSyncInterval() {
417 t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
418 time.Sleep(t.SyncInterval)
421 func (t *tester) sleepProbeInterval() {
422 t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
423 time.Sleep(t.ProbeInterval)
426 // Return a random string of n hexadecimal digits (n*4 random bits). n
428 func randomHex(n int) string {
429 buf := make([]byte, n/2)
430 _, err := rand.Read(buf)
434 return fmt.Sprintf("%x", buf)