Merge branch '15026-cloudtest'
author Tom Clegg <tclegg@veritasgenetics.com>
Mon, 1 Jul 2019 14:56:41 +0000 (10:56 -0400)
committer Tom Clegg <tclegg@veritasgenetics.com>
Mon, 1 Jul 2019 14:56:41 +0000 (10:56 -0400)
refs #15026

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

doc/_config.yml
doc/admin/cloudtest.html.textile.liquid [new file with mode: 0644]
doc/install/install-dispatch-cloud.html.textile.liquid
lib/cloud/cloudtest/tester.go

index 20a2085c11b3403071c83d75ed12a4fc8068d119..36e8c64b86681ab4a44452ccf9f0e1477faf432d 100644 (file)
@@ -180,6 +180,7 @@ navbar:
     - Cloud:
       - admin/storage-classes.html.textile.liquid
       - admin/spot-instances.html.textile.liquid
+      - admin/cloudtest.html.textile.liquid
     - Data Management:
       - admin/collection-versioning.html.textile.liquid
       - admin/collection-managed-properties.html.textile.liquid
diff --git a/doc/admin/cloudtest.html.textile.liquid b/doc/admin/cloudtest.html.textile.liquid
new file mode 100644 (file)
index 0000000..2adce90
--- /dev/null
@@ -0,0 +1,71 @@
+---
+layout: default
+navsection: admin
+title: Testing cloud configuration
+...
+
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+The @arvados-server@ package includes a @cloudtest@ tool that checks compatibility between your Arvados configuration, your cloud driver, your cloud provider's API, your cloud provider's VM instances, and the worker image you use with the *experimental* "cloud dispatcher":../install/install-dispatch-cloud.html.
+
+@arvados-server cloudtest@ performs the following steps:
+# Create a new instance
+# Wait for it to finish booting
+# Run a shell command on the new instance (optional)
+# Pause while you log in to the new instance and do other tests yourself (optional)
+# Shut down the instance
+
+This is an easy way to expose problems like these:
+* Configured cloud credentials don't work
+* Configured instance types don't work
+* Configured driver is not compatible with your cloud API/region
+* Newly created instances are not usable due to a network problem or misconfiguration
+* Newly created instances do not accept the configured SSH private key
+* Selected machine image does not boot properly
+* Selected machine image is incompatible with some instance types
+* Driver has bugs
+
+h2. Typical uses
+
+Before bringing up the @arvados-dispatch-cloud@ service for the first time, we recommend running @cloudtest@ to check your configuration:
+
+<notextile><pre>
+$ <span class="userinput">arvados-server cloudtest -command "crunch-run --list"</span>
+</pre></notextile>
+
+Before updating your configuration to use a new VM image, we recommend running @cloudtest@ with the new image:
+
+<notextile><pre>
+$ <span class="userinput">arvados-server cloudtest -image-id <b>new_image_id</b> -command "crunch-run --list"</span>
+</pre></notextile>
+
+After adding an instance type to your configuration, we recommend running @cloudtest@ with the new instance type:
+
+<notextile><pre>
+$ <span class="userinput">arvados-server cloudtest -instance-type <b>new_instance_type_name</b></span>
+</pre></notextile>
+
+For a full list of options, use the @-help@ flag:
+
+<notextile><pre>
+$ <span class="userinput">arvados-server cloudtest -help</span>
+Usage:
+  -command string
+        Run an interactive shell command on the test instance when it boots
+  -config file
+        Site configuration file (default "/etc/arvados/config.yml")
+  -destroy-existing
+        Destroy any existing instances tagged with our InstanceSetID, instead of erroring out
+  -image-id string
+        Image ID to use when creating the test instance (if empty, use cluster config)
+  -instance-set-id value
+        InstanceSetID tag value to use on the test instance (default "cloudtest-user@hostname.example")
+  -instance-type string
+        Instance type to create (if empty, use cheapest type in config)
+  -pause-before-destroy
+        Prompt and wait before destroying the test instance
+</pre></notextile>
index bc3be8f1d7e88f463d1e954245bec978a3ab967b..d72909077e97f4fca2c2c43cf748f7ede66dd723 100644 (file)
@@ -153,10 +153,37 @@ Minimal configuration example for Azure:
 </code></pre>
 </notextile>
 
-h2. Install the dispatcher
+h2. Test your configuration
 
 First, "add the appropriate package repository for your distribution":{{ site.baseurl }}/install/install-manual-prerequisites.html#repos.
 
+Next, install the @arvados-server@ package.
+
+On Red Hat-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install arvados-server</span>
+</code></pre>
+</notextile>
+
+On Debian-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo apt-get install arvados-server</span>
+</code></pre>
+</notextile>
+
+Run the @cloudtest@ tool to verify that your configuration works. This creates a new cloud VM, confirms that it boots correctly and accepts your configured SSH private key, and shuts it down.
+
+<notextile>
+<pre><code>~$ <span class="userinput">arvados-server cloudtest && echo "OK!"</span>
+</code></pre>
+</notextile>
+
+Refer to the "cloudtest tool documentation":../admin/cloudtest.html for more information.
+
+h2. Install the dispatcher
+
 On Red Hat-based systems:
 
 <notextile>
index dc5a4738aa25597e9ce640fde12bd1c4d54ce1d8..adc57803f1f9432ad1be1e74065a32d56bbd32b2 100644 (file)
@@ -92,7 +92,9 @@ func (t *tester) Run() bool {
                        foundExisting = true
                        if t.DestroyExisting {
                                lgr.Info("destroying existing instance with our InstanceSetID")
+                               t0 := time.Now()
                                err := i.Destroy()
+                               lgr := lgr.WithField("Duration", time.Since(t0))
                                if err != nil {
                                        lgr.WithError(err).Error("error destroying existing instance")
                                } else {
@@ -133,13 +135,15 @@ func (t *tester) Run() bool {
                "Tags":                 tags,
                "InitCommand":          initCommand,
        }).Info("creating instance")
+       t0 := time.Now()
        inst, err := t.is.Create(t.InstanceType, t.ImageID, tags, initCommand, t.SSHKey.PublicKey())
+       lgrC := t.Logger.WithField("Duration", time.Since(t0))
        if err != nil {
                // Create() might have failed due to a bug or network
                // error even though the creation was successful, so
                // it's safer to wait a bit for an instance to appear.
                deferredError = true
-               t.Logger.WithError(err).Error("error creating test instance")
+               lgrC.WithError(err).Error("error creating test instance")
                t.Logger.WithField("Deadline", bootDeadline).Info("waiting for instance to appear anyway, in case the Create response was incorrect")
                for err = t.refreshTestInstance(); err != nil; err = t.refreshTestInstance() {
                        if time.Now().After(bootDeadline) {
@@ -154,7 +158,7 @@ func (t *tester) Run() bool {
        } else {
                // Create() succeeded. Make sure the new instance
                // appears right away in the Instances() list.
-               t.Logger.WithField("Instance", inst.ID()).Info("created instance")
+               lgrC.WithField("Instance", inst.ID()).Info("created instance")
                t.testInstance = &worker.TagVerifier{inst, t.secret}
                t.showLoginInfo()
                err = t.refreshTestInstance()
@@ -247,11 +251,15 @@ func (t *tester) refreshTestInstance() error {
 func (t *tester) getInstances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
        var ret []cloud.Instance
        t.Logger.WithField("FilterTags", tags).Info("getting instance list")
+       t0 := time.Now()
        insts, err := t.is.Instances(tags)
        if err != nil {
                return nil, err
        }
-       t.Logger.WithField("N", len(insts)).Info("got instance list")
+       t.Logger.WithFields(logrus.Fields{
+               "Duration": time.Since(t0),
+               "N":        len(insts),
+       }).Info("got instance list")
        for _, i := range insts {
                if i.Tags()[t.TagKeyPrefix+"InstanceSetID"] == string(t.SetID) {
                        ret = append(ret, i)
@@ -312,11 +320,13 @@ func (t *tester) runShellCommand(cmd string) error {
        t.Logger.WithFields(logrus.Fields{
                "Command": cmd,
        }).Info("executing remote command")
+       t0 := time.Now()
        stdout, stderr, err := t.executor.Execute(nil, cmd, nil)
        lgr := t.Logger.WithFields(logrus.Fields{
-               "Command": cmd,
-               "stdout":  string(stdout),
-               "stderr":  string(stderr),
+               "Duration": time.Since(t0),
+               "Command":  cmd,
+               "stdout":   string(stdout),
+               "stderr":   string(stderr),
        })
        if err != nil {
                lgr.WithError(err).Info("remote command failed")
@@ -332,20 +342,25 @@ func (t *tester) destroyTestInstance() bool {
                return true
        }
        for {
-               t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroying instance")
+               lgr := t.Logger.WithField("Instance", t.testInstance.ID())
+               lgr.Info("destroying instance")
+               t0 := time.Now()
+
                err := t.testInstance.Destroy()
+               lgrDur := lgr.WithField("Duration", time.Since(t0))
                if err != nil {
-                       t.Logger.WithError(err).WithField("Instance", t.testInstance.ID()).Error("error destroying instance")
+                       lgrDur.WithError(err).Error("error destroying instance")
                } else {
-                       t.Logger.WithField("Instance", t.testInstance.ID()).Info("destroyed instance")
+                       lgrDur.Info("destroyed instance")
                }
+
                err = t.refreshTestInstance()
                if err == errTestInstanceNotFound {
-                       t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance no longer appears in list")
+                       lgr.Info("instance no longer appears in list")
                        t.testInstance = nil
                        return true
                } else if err == nil {
-                       t.Logger.WithField("Instance", t.testInstance.ID()).Info("instance still exists after calling Destroy")
+                       lgr.Info("instance still exists after calling Destroy")
                        t.sleepSyncInterval()
                        continue
                } else {