20649: Log boot probe errors other than usual wait-for-boot errors.
authorTom Clegg <tom@curii.com>
Fri, 11 Aug 2023 21:47:06 +0000 (17:47 -0400)
committerTom Clegg <tom@curii.com>
Fri, 11 Aug 2023 21:47:06 +0000 (17:47 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/dispatchcloud/sshexecutor/executor.go
lib/dispatchcloud/sshexecutor/executor_test.go
lib/dispatchcloud/worker/worker.go

index c37169921cf594ac035263ad4c53d4c176c13214..3761c699225079704c26889197913ae3cd3e8eca 100644 (file)
@@ -18,6 +18,8 @@ import (
        "golang.org/x/crypto/ssh"
 )
 
+var ErrNoAddress = errors.New("instance has no address")
+
 // New returns a new Executor, using the given target.
 func New(t cloud.ExecutorTarget) *Executor {
        return &Executor{target: t}
@@ -196,7 +198,7 @@ func (exr *Executor) TargetHostPort() (string, string) {
 func (exr *Executor) setupSSHClient() (*ssh.Client, error) {
        addr := net.JoinHostPort(exr.TargetHostPort())
        if addr == ":" {
-               return nil, errors.New("instance has no address")
+               return nil, ErrNoAddress
        }
        var receivedKey ssh.PublicKey
        client, err := ssh.Dial("tcp", addr, &ssh.ClientConfig{
index b4afeafa82dab3e671f48802646df185d8a64590..95b29fa6aceb3420732dc55aef8a213ce8a756f7 100644 (file)
@@ -6,6 +6,7 @@ package sshexecutor
 
 import (
        "bytes"
+       "errors"
        "fmt"
        "io"
        "io/ioutil"
@@ -146,6 +147,7 @@ func (s *ExecutorSuite) TestExecute(c *check.C) {
                exr.SetTargetPort("0")
                _, _, err = exr.Execute(nil, command, nil)
                c.Check(err, check.ErrorMatches, `.*connection refused.*`)
+               c.Check(errors.As(err, new(*net.OpError)), check.Equals, true)
 
                // Use the test server's listening port.
                exr.SetTargetPort(target.Port())
index 8b4be1a3c77aa8f01ebe7dad4a3c266da3c36c81..e6e817458ebf4606a6dd453561b5224452e53b8f 100644 (file)
@@ -7,17 +7,21 @@ package worker
 import (
        "bytes"
        "encoding/json"
+       "errors"
        "fmt"
        "io"
+       "net"
        "path/filepath"
        "strings"
        "sync"
        "time"
 
        "git.arvados.org/arvados.git/lib/cloud"
+       "git.arvados.org/arvados.git/lib/dispatchcloud/sshexecutor"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/stats"
        "github.com/sirupsen/logrus"
+       "golang.org/x/crypto/ssh"
 )
 
 const (
@@ -301,10 +305,10 @@ func (wkr *worker) probeAndUpdate() {
                dur := probeStart.Sub(wkr.probed)
                if wkr.shutdownIfBroken(dur) {
                        // stderr from failed run-probes will have
-                       // been logged already, but boot-probe
+                       // been logged already, but some boot-probe
                        // failures are normal so they are logged only
-                       // at Debug level. This is our chance to log
-                       // some evidence about why the node never
+                       // at Debug level. This may be our chance to
+                       // log some evidence about why the node never
                        // booted, even in non-debug mode.
                        if !booted {
                                wkr.reportBootOutcome(BootOutcomeFailed)
@@ -474,7 +478,26 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
                "stderr":  string(stderr),
        })
        if err != nil {
-               logger.WithError(err).Debug("boot probe failed")
+               if errors.Is(err, sshexecutor.ErrNoAddress) ||
+                       errors.As(err, new(*net.OpError)) ||
+                       errors.As(err, new(*ssh.ExitError)) {
+                       // These errors are expected while the
+                       // instance is booting, so we only log them at
+                       // debug level.
+                       logger.WithError(err).Debug("boot probe failed")
+               } else {
+                       // Other errors are more likely to indicate a
+                       // configuration problem, and it's more
+                       // sysadmin-friendly to show them right away
+                       // instead of waiting until boot timeout and
+                       // only showing the last error.
+                       //
+                       // Example: "ssh: handshake failed: ssh:
+                       // unable to authenticate, attempted methods
+                       // [none publickey], no supported methods
+                       // remain"
+                       logger.WithError(err).Warn("boot probe failed")
+               }
                return false, stderr
        }
        logger.Info("boot probe succeeded")