19377: Include health check in diagnostics report, if appropriate.
author: Tom Clegg <tom@curii.com>
Tue, 4 Oct 2022 19:04:58 +0000 (15:04 -0400)
committer: Tom Clegg <tom@curii.com>
Tue, 4 Oct 2022 19:04:58 +0000 (15:04 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/diagnostics/cmd.go
sdk/go/health/aggregator.go

index 799abf9da4e278bc7f2f4150e7f284c991c677c4..3a2ebe0c280bf68f6e7e397e65489c70196f91ae 100644 (file)
@@ -16,12 +16,15 @@ import (
        "net"
        "net/http"
        "net/url"
+       "os"
        "strings"
        "time"
 
        "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/config"
        "git.arvados.org/arvados.git/sdk/go/arvados"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "git.arvados.org/arvados.git/sdk/go/health"
        "github.com/sirupsen/logrus"
 )
 
@@ -125,6 +128,30 @@ func (diag *diagnoser) runtests() {
                return
        }
 
+       diag.dotest(5, "running health check (same as `arvados-server check`)", func() error {
+               ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info"))
+               ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError))
+               cfg, err := ldr.Load()
+               if err != nil {
+                       diag.infof("skipping because config could not be loaded: %s", err)
+                       return nil
+               }
+               cluster, err := cfg.GetCluster("")
+               if err != nil {
+                       return err
+               }
+               if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") {
+                       diag.infof("skipping because provided token is not SystemRootToken")
+               }
+               agg := &health.Aggregator{Cluster: cluster}
+               resp := agg.ClusterHealth()
+               for _, e := range resp.Errors {
+                       diag.errorf("health check: %s", e)
+               }
+               diag.infof("health check: reported clock skew %v", resp.ClockSkew)
+               return nil
+       })
+
        var dd arvados.DiscoveryDocument
        ddpath := "discovery/v1/apis/arvados/v1/rest"
        diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error {
index b5301dffe006ec280f379d56ca9330818eb029b6..caf99108a632ac44163a4e669fce9bb00366c078 100644 (file)
@@ -223,7 +223,8 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
        for svcName, sh := range resp.Services {
                switch svcName {
                case arvados.ServiceNameDispatchCloud,
-                       arvados.ServiceNameDispatchLSF:
+                       arvados.ServiceNameDispatchLSF,
+                       arvados.ServiceNameDispatchSLURM:
                        // ok to not run any given dispatcher
                case arvados.ServiceNameHealth,
                        arvados.ServiceNameWorkbench1,