1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
14 "git.curoverse.com/arvados.git/lib/cloud"
15 "git.curoverse.com/arvados.git/lib/dispatchcloud/ssh_executor"
16 "git.curoverse.com/arvados.git/lib/dispatchcloud/worker"
17 "git.curoverse.com/arvados.git/sdk/go/arvados"
18 "github.com/sirupsen/logrus"
19 "golang.org/x/crypto/ssh"
23 errTestInstanceNotFound = errors.New("test instance missing from cloud provider's list")
26 // A tester does a sequence of operations to test a cloud driver and
27 // configuration. Run() should be called only once, after assigning
28 // suitable values to public fields.
30 Logger logrus.FieldLogger
31 Tags cloud.SharedResourceTags
33 SetID cloud.InstanceSetID
35 ProbeInterval time.Duration
36 SyncInterval time.Duration
37 TimeoutBooting time.Duration
39 DriverParameters json.RawMessage
40 InstanceType arvados.InstanceType
44 BootProbeCommand string
46 PauseBeforeDestroy func()
49 testInstance *worker.TagVerifier
51 executor *ssh_executor.Executor
57 // Run the test suite as specified, clean up as needed, and return
58 // true (everything is OK) or false (something went wrong).
59 func (t *tester) Run() bool {
60 // This flag gets set when we encounter a non-fatal error, so
61 // we can continue doing more tests but remember to return
62 // false (failure) at the end.
63 deferredError := false
66 t.is, err = t.Driver.InstanceSet(t.DriverParameters, t.SetID, t.Tags, t.Logger)
68 t.Logger.WithError(err).Info("error initializing driver")
72 // Don't send the driver any filters the first time we get the
73 // instance list. This way we can log an instance count
74 // (N=...) that includes all instances in this service
75 // account, even if they don't have the same InstanceSetID.
76 insts, err := t.getInstances(nil)
78 t.Logger.WithError(err).Info("error getting initial list of instances")
83 foundExisting := false
84 for _, i := range insts {
85 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
88 lgr := t.Logger.WithFields(logrus.Fields{
90 "InstanceSetID": t.SetID,
93 if t.DestroyExisting {
94 lgr.Info("destroying existing instance with our InstanceSetID")
97 lgr := lgr.WithField("Duration", time.Since(t0))
99 lgr.WithError(err).Error("error destroying existing instance")
101 lgr.Info("Destroy() call succeeded")
104 lgr.Error("found existing instance with our InstanceSetID")
109 } else if t.DestroyExisting {
110 t.sleepSyncInterval()
112 t.Logger.Error("cannot continue with existing instances -- clean up manually, use -destroy-existing=true, or choose a different -instance-set-id")
117 t.secret = randomHex(40)
119 tags := cloud.InstanceTags{}
120 for k, v := range t.Tags {
123 tags[t.TagKeyPrefix+"InstanceSetID"] = string(t.SetID)
124 tags[t.TagKeyPrefix+"InstanceSecret"] = t.secret
126 defer t.destroyTestInstance()
128 bootDeadline := time.Now().Add(t.TimeoutBooting)
129 initCommand := worker.TagVerifier{nil, t.secret}.InitCommand()
131 t.Logger.WithFields(logrus.Fields{
132 "InstanceType": t.InstanceType.Name,
133 "ProviderInstanceType": t.InstanceType.ProviderType,
134 "ImageID": t.ImageID,
136 "InitCommand": initCommand,
137 }).Info("creating instance")
139 inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
140 lgrC := t.Logger.WithField("Duration", time.Since(t0))
142 // Create() might have failed due to a bug or network
143 // error even though the creation was successful, so
144 // it's safer to wait a bit for an instance to appear.
146 lgrC.WithError(err).Error("error creating test instance")
147 t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
148 for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
149 if time.Now().After(bootDeadline) {
150 t.Logger.Error("timed out")
153 t.sleepSyncInterval()
156 t.Logger.WithField("Instance", t.testInstance.ID()).Info("new instance appeared")
159 // Create() succeeded. Make sure the new instance
160 // appears right away in the Instances() list.
161 lgrC.WithField("Instance", inst.ID()).Info("created instance")
162 t.testInstance = &worker.TagVerifier{inst, t.secret}
164 err = t.refreshTestInstance()
165 if err == errTestInstanceNotFound {
166 t.Logger.WithError(err).Error("cloud/driver Create succeeded, but instance is not in list")
168 } else if err != nil {
169 t.Logger.WithError(err).Error("error getting list of instances")
175 // checkTags() already logged the errors
179 if !t.waitForBoot(bootDeadline) {
183 if t.ShellCommand != "" {
184 err = t.runShellCommand(t.ShellCommand)
186 t.Logger.WithError(err).Error("shell command failed")
191 if fn := t.PauseBeforeDestroy; fn != nil {
196 return !deferredError
199 // If the test instance has an address, log an "ssh user@host" command
200 // line that the operator can paste into another terminal, and set
201 // t.showedLoginInfo.
203 // If the test instance doesn't have an address yet, do nothing.
204 func (t *tester) showLoginInfo() {
206 host, port := t.executor.TargetHostPort()
210 user := t.testInstance.RemoteUser()
211 t.Logger.WithField("Command", fmt.Sprintf("ssh -p%s %s@%s", port, user, host)).Info("showing login information")
212 t.showedLoginInfo = true
215 // Get the latest instance list from the driver. If our test instance
216 // is found, assign it to t.testIntance.
217 func (t *tester) refreshTestInstance() error {
218 insts, err := t.getInstances(cloud.InstanceTags{t.TagKeyPrefix + "InstanceSetID": string(t.SetID)})
222 for _, i := range insts {
223 if t.testInstance == nil {
224 // Filter by InstanceSetID tag value
225 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] != string(t.SetID) {
229 // Filter by instance ID
230 if i.ID() != t.testInstance.ID() {
234 t.Logger.WithFields(logrus.Fields{
236 "Address": i.Address(),
237 }).Info("found our instance in returned list")
238 t.testInstance = &worker.TagVerifier{i, t.secret}
239 if !t.showedLoginInfo {
244 return errTestInstanceNotFound
247 // Get the list of instances, passing the given tags to the cloud
248 // driver to filter results.
250 // Return only the instances that have our InstanceSetID tag.
251 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
252 var ret []cloud.Instance
253 t.Logger.WithField("FilterTags", tags).Info("getting instance list")
255 insts, err := t.is.Instances(tags)
259 t.Logger.WithFields(logrus.Fields{
260 "Duration": time.Since(t0),
262 }).Info("got instance list")
263 for _, i := range insts {
264 if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
271 // Check that t.testInstance has every tag in t.Tags. If not, log an
272 // error and return false.
273 func (t *tester) checkTags() bool {
275 for k, v := range t.Tags {
276 if got := t.testInstance.Tags()[k]; got != v {
278 t.Logger.WithFields(logrus.Fields{
282 }).Error("tag is missing from test instance")
286 t.Logger.Info("all expected tags are present")
291 // Run t.BootProbeCommand on t.testInstance until it succeeds or the
293 func (t *tester) waitForBoot(deadline time.Time) bool {
294 for time.Now().Before(deadline) {
295 err := t.runShellCommand(t.BootProbeCommand)
299 t.sleepProbeInterval()
300 t.refreshTestInstance()
302 t.Logger.Error("timed out")
306 // Create t.executor and/or update its target to t.testInstance's
308 func (t *tester) updateExecutor() {
309 if t.executor == nil {
310 t.executor = ssh_executor.New(t.testInstance)
311 t.executor.SetTargetPort(t.SSHPort)
312 t.executor.SetSigners(t.SSHKey)
314 t.executor.SetTarget(t.testInstance)
318 func (t *tester) runShellCommand(cmd string) error {
320 t.Logger.WithFields(logrus.Fields{
322 }).Info("executing remote command")
324 stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
325 lgr := t.Logger.WithFields(logrus.Fields{
326 "Duration": time.Since(t0),
328 "stdout": string(stdout),
329 "stderr": string(stderr),
332 lgr.WithError(err).Info("remote command failed")
334 lgr.Info("remote command succeeded")
339 // currently, this tries forever until it can return true (success).
340 func (t *tester) destroyTestInstance() bool {
341 if t.testInstance == nil {
345 lgr := t.Logger.WithField("Instance", t.testInstance.ID())
346 lgr.Info("destroying instance")
349 err := t.testInstance.Destroy()
350 lgrDur := lgr.WithField("Duration", time.Since(t0))
352 lgrDur.WithError(err).Error("error destroying instance")
354 lgrDur.Info("destroyed instance")
357 err = t.refreshTestInstance()
358 if err == errTestInstanceNotFound {
359 lgr.Info("instance no longer appears in list")
362 } else if err == nil {
363 lgr.Info("instance still exists after calling Destroy")
364 t.sleepSyncInterval()
367 t.Logger.WithError(err).Error("error getting list of instances")
373 func (t *tester) sleepSyncInterval() {
374 t.Logger.WithField("Duration", t.SyncInterval).Info("waiting SyncInterval")
375 time.Sleep(t.SyncInterval)
378 func (t *tester) sleepProbeInterval() {
379 t.Logger.WithField("Duration", t.ProbeInterval).Info("waiting ProbeInterval")
380 time.Sleep(t.ProbeInterval)
383 // Return a random string of n hexadecimal digits (n*4 random bits). n
385 func randomHex(n int) string {
386 buf := make([]byte, n/2)
387 _, err := rand.Read(buf)
391 return fmt.Sprintf("%x", buf)