18794: Add "arvados-server check" subcommand.
author Tom Clegg <tom@curii.com>
Thu, 7 Apr 2022 20:11:55 +0000 (16:11 -0400)
committer Tom Clegg <tom@curii.com>
Tue, 26 Apr 2022 15:19:20 +0000 (11:19 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

cmd/arvados-server/cmd.go
sdk/go/health/aggregator.go
sdk/go/health/aggregator_test.go

index 26c3f485ea348bb723f04e21afd4a92816fedd47..9e02d45b672cfa811aba39b872d7c3022b96fbc6 100644 (file)
@@ -21,6 +21,7 @@ import (
        "git.arvados.org/arvados.git/lib/install"
        "git.arvados.org/arvados.git/lib/lsf"
        "git.arvados.org/arvados.git/lib/recovercollection"
+       "git.arvados.org/arvados.git/sdk/go/health"
        "git.arvados.org/arvados.git/services/keepproxy"
        "git.arvados.org/arvados.git/services/keepstore"
        "git.arvados.org/arvados.git/services/ws"
@@ -33,6 +34,7 @@ var (
                "--version": cmd.Version,
 
                "boot":               boot.Command,
+               "check":              health.CheckCommand,
                "cloudtest":          cloudtest.Command,
                "config-check":       config.CheckCommand,
                "config-defaults":    config.DumpDefaultsCommand,
index 23d7e8d431b7ed1160f3fcf8796e5b755d1dd078..ba532dddca7ee108e0af371086a02a71c078f0db 100644 (file)
@@ -8,19 +8,27 @@ import (
        "context"
        "crypto/tls"
        "encoding/json"
+       "errors"
+       "flag"
        "fmt"
+       "io"
        "net/http"
        "net/url"
        "sync"
        "time"
 
+       "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/auth"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/ghodss/yaml"
+       "github.com/sirupsen/logrus"
 )
 
 const defaultTimeout = arvados.Duration(2 * time.Second)
 
-// Aggregator implements http.Handler. It handles "GET /_health/all"
+// Aggregator implements service.Handler. It handles "GET /_health/all"
 // by checking the health of all configured services on the cluster
 // and responding 200 if everything is healthy.
 type Aggregator struct {
@@ -241,3 +249,61 @@ func (agg *Aggregator) checkAuth(req *http.Request) bool {
        }
        return false
 }
+
+// errSilent is a sentinel error with an empty message. checkCommand.run
+// returns it when the problem has already been reported to the user;
+// RunCommand then exits non-zero without printing anything further.
+var errSilent = errors.New("")
+
+// CheckCommand is the cmd.Handler behind the "arvados-server check"
+// subcommand. It is backed by the stateless checkCommand type.
+var CheckCommand cmd.Handler = checkCommand{}
+
+// checkCommand carries no state; its RunCommand method provides the
+// cmd.Handler behavior exposed as CheckCommand.
+type checkCommand struct{}
+
+func (ccmd checkCommand) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       logger := ctxlog.New(stderr, "json", "info")
+       ctx := ctxlog.Context(context.Background(), logger)
+       err := ccmd.run(ctx, prog, args, stdin, stdout, stderr)
+       if err != nil {
+               if err != errSilent {
+                       fmt.Fprintln(stdout, err.Error())
+               }
+               return 1
+       }
+       return 0
+}
+
+func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) error {
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       loader := config.NewLoader(stdin, ctxlog.New(stderr, "text", "info"))
+       loader.SetupFlags(flags)
+       versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
+       timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
+       if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
+               // cmd.ParseFlags already reported the error
+               return errSilent
+       } else if *versionFlag {
+               cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
+               return nil
+       }
+       cfg, err := loader.Load()
+       if err != nil {
+               return err
+       }
+       cluster, err := cfg.GetCluster("")
+       if err != nil {
+               return err
+       }
+       logger := ctxlog.New(stderr, cluster.SystemLogs.Format, cluster.SystemLogs.LogLevel).WithFields(logrus.Fields{
+               "ClusterID": cluster.ClusterID,
+       })
+       ctx = ctxlog.Context(ctx, logger)
+       agg := Aggregator{Cluster: cluster, timeout: arvados.Duration(*timeout)}
+       resp := agg.ClusterHealth()
+       buf, err := yaml.Marshal(resp)
+       if err != nil {
+               return err
+       }
+       stdout.Write(buf)
+       if resp.Health != "OK" {
+               return fmt.Errorf("health check failed")
+       }
+       return nil
+}
index 04106caa442cfb52fbc098a516112e11b55643bb..05f0bdd31b18e0808e32c5321881bf77fc56e75c 100644 (file)
@@ -5,14 +5,19 @@
 package health
 
 import (
+       "bytes"
        "encoding/json"
+       "io/ioutil"
        "net/http"
        "net/http/httptest"
        "strings"
        "time"
 
+       "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/arvadostest"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "github.com/ghodss/yaml"
        "gopkg.in/check.v1"
 )
 
@@ -30,9 +35,17 @@ func (s *AggregatorSuite) TestInterface(c *check.C) {
 }
 
 func (s *AggregatorSuite) SetUpTest(c *check.C) {
-       s.handler = &Aggregator{Cluster: &arvados.Cluster{
-               ManagementToken: arvadostest.ManagementToken,
-       }}
+       ldr := config.NewLoader(bytes.NewBufferString(`Clusters: {zzzzz: {}}`), ctxlog.TestLogger(c))
+       ldr.Path = "-"
+       cfg, err := ldr.Load()
+       c.Assert(err, check.IsNil)
+       cluster, err := cfg.GetCluster("")
+       c.Assert(err, check.IsNil)
+       cluster.ManagementToken = arvadostest.ManagementToken
+       cluster.SystemRootToken = arvadostest.SystemRootToken
+       cluster.Collections.BlobSigningKey = arvadostest.BlobSigningKey
+       cluster.Volumes["z"] = arvados.Volume{StorageClasses: map[string]bool{"default": true}}
+       s.handler = &Aggregator{Cluster: cluster}
        s.req = httptest.NewRequest("GET", "/_health/all", nil)
        s.req.Header.Set("Authorization", "Bearer "+arvadostest.ManagementToken)
        s.resp = httptest.NewRecorder()
@@ -122,6 +135,22 @@ func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
        c.Check(rt > 0.005, check.Equals, true)
 }
 
+// TestCheckCommand points all service URLs at a stub server using
+// healthyHandler, writes the suite's cluster config to a temporary
+// YAML file, and runs CheckCommand with -config set to that file. It
+// expects exit code 0, nothing on stderr, and a "health: OK" line in
+// the YAML written to stdout.
+func (s *AggregatorSuite) TestCheckCommand(c *check.C) {
+       srv, listen := s.stubServer(&healthyHandler{})
+       defer srv.Close()
+       s.setAllServiceURLs(listen)
+       confpath := c.MkDir() + "/config.yml"
+       confdata, err := yaml.Marshal(arvados.Config{Clusters: map[string]arvados.Cluster{s.handler.Cluster.ClusterID: *s.handler.Cluster}})
+       c.Assert(err, check.IsNil)
+       // The config includes tokens and the blob signing key, and it
+       // is a plain data file: keep it private and non-executable.
+       err = ioutil.WriteFile(confpath, confdata, 0600)
+       c.Assert(err, check.IsNil)
+       var stdout, stderr bytes.Buffer
+       exitcode := CheckCommand.RunCommand("check", []string{"-config=" + confpath}, &bytes.Buffer{}, &stdout, &stderr)
+       c.Check(exitcode, check.Equals, 0)
+       c.Check(stderr.String(), check.Equals, "")
+       c.Check(stdout.String(), check.Matches, `(?ms).*(\n|^)health: OK\n.*`)
+}
+
 func (s *AggregatorSuite) checkError(c *check.C) {
        c.Check(s.resp.Code, check.Not(check.Equals), http.StatusOK)
        var resp ClusterHealthResponse