X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/16b445f3d52952d284dbaac603e70a7196d9a3e9..9e988394278b9c0c072c27107b67669875b8fca7:/sdk/go/health/aggregator.go

diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go
index 5e010d88bc..3bf37b1294 100644
--- a/sdk/go/health/aggregator.go
+++ b/sdk/go/health/aggregator.go
@@ -29,10 +29,14 @@ import (
 	"git.arvados.org/arvados.git/sdk/go/auth"
 	"git.arvados.org/arvados.git/sdk/go/ctxlog"
 	"github.com/ghodss/yaml"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 )
 
-const defaultTimeout = arvados.Duration(2 * time.Second)
+const (
+	defaultTimeout = arvados.Duration(2 * time.Second)
+	maxClockSkew   = time.Minute
+)
 
 // Aggregator implements service.Handler. It handles "GET /_health/all"
 // by checking the health of all configured services on the cluster
@@ -46,6 +50,9 @@ type Aggregator struct {
 
 	// If non-nil, Log is called after handling each request.
 	Log func(*http.Request, error)
+
+	// If non-nil, report clock skew on each health-check.
+	MetricClockSkew prometheus.Gauge
 }
 
 func (agg *Aggregator) setup() {
@@ -101,40 +108,47 @@ func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
 
 type ClusterHealthResponse struct {
 	// "OK" if all needed services are OK, otherwise "ERROR".
-	Health string `json:"health"`
+	Health string
 
 	// An entry for each known health check of each known instance
 	// of each needed component: "instance of service S on node N
 	// reports health-check C is OK."
-	Checks map[string]CheckResult `json:"checks"`
+	Checks map[string]CheckResult
 
 	// An entry for each service type: "service S is OK." This
 	// exposes problems that can't be expressed in Checks, like
 	// "service S is needed, but isn't configured to run
 	// anywhere."
-	Services map[arvados.ServiceName]ServiceHealth `json:"services"`
+	Services map[arvados.ServiceName]ServiceHealth
 
-	Errors []string `json:"errors"`
+	// Difference between min/max timestamps in individual
+	// health-check responses.
+	ClockSkew arvados.Duration
+
+	Errors []string
 }
 
 type CheckResult struct {
-	Health         string                 `json:"health"`
-	Error          string                 `json:"error,omitempty"`
+	Health         string
+	Error          string `json:",omitempty"`
 	HTTPStatusCode int    `json:",omitempty"`
-	HTTPStatusText string                 `json:",omitempty"`
-	Response       map[string]interface{} `json:"response"`
-	ResponseTime   json.Number            `json:"responseTime"`
-	Metrics        Metrics                `json:"-"`
+	Response       map[string]interface{} `json:",omitempty"`
+	ResponseTime   json.Number
+	ClockTime      time.Time
+	Server         string // "Server" header in http response
+	Metrics
+	respTime time.Duration
 }
 
 type Metrics struct {
 	ConfigSourceTimestamp time.Time
 	ConfigSourceSHA256    string
+	Version               string
 }
 
 type ServiceHealth struct {
-	Health string `json:"health"` // "OK", "ERROR", or "SKIP"
-	N      int    `json:"n"`
+	Health string // "OK", "ERROR", or "SKIP"
+	N      int
 }
 
 func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
@@ -210,7 +224,8 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 	for svcName, sh := range resp.Services {
 		switch svcName {
 		case arvados.ServiceNameDispatchCloud,
-			arvados.ServiceNameDispatchLSF:
+			arvados.ServiceNameDispatchLSF,
+			arvados.ServiceNameDispatchSLURM:
 			// ok to not run any given dispatcher
 		case arvados.ServiceNameHealth,
 			arvados.ServiceNameWorkbench1,
@@ -225,6 +240,35 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 		}
 	}
 
+	// Check for clock skew between hosts
+	var maxResponseTime time.Duration
+	var clockMin, clockMax time.Time
+	for _, result := range resp.Checks {
+		if result.ClockTime.IsZero() {
+			continue
+		}
+		if clockMin.IsZero() || result.ClockTime.Before(clockMin) {
+			clockMin = result.ClockTime
+		}
+		if result.ClockTime.After(clockMax) {
+			clockMax = result.ClockTime
+		}
+		if result.respTime > maxResponseTime {
+			maxResponseTime = result.respTime
+		}
+	}
+	skew := clockMax.Sub(clockMin)
+	resp.ClockSkew = arvados.Duration(skew)
+	if skew > maxClockSkew+maxResponseTime {
+		msg := fmt.Sprintf("clock skew detected: maximum timestamp spread is %s (exceeds warning threshold of %s)", resp.ClockSkew, arvados.Duration(maxClockSkew))
+		resp.Errors = append(resp.Errors, msg)
+		resp.Health = "ERROR"
+	}
+	if agg.MetricClockSkew != nil {
+		agg.MetricClockSkew.Set(skew.Seconds())
+	}
+
+	// Check for mismatched config files
 	var newest Metrics
 	for _, result := range resp.Checks {
 		if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) {
@@ -245,6 +289,18 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 			resp.Errors = append(resp.Errors, msg)
 			resp.Health = "ERROR"
 		}
 	}
+
+	// Check for services running a different version than we are.
+	for target, result := range resp.Checks {
+		if result.Metrics.Version != "" && !sameVersion(result.Metrics.Version, cmd.Version.String()) {
+			msg := fmt.Sprintf("version mismatch: %s is running %s -- expected %s",
+				strings.TrimSuffix(target, "/_health/ping"),
+				result.Metrics.Version,
+				cmd.Version.String())
+			resp.Errors = append(resp.Errors, msg)
+			resp.Health = "ERROR"
+		}
+	}
 	return resp
 }
 
@@ -256,7 +312,8 @@ func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
 
 func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 	t0 := time.Now()
 	defer func() {
-		result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
+		result.respTime = time.Since(t0)
+		result.ResponseTime = json.Number(fmt.Sprintf("%.6f", result.respTime.Seconds()))
 	}()
 	result.Health = "ERROR"
@@ -288,7 +345,6 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
 		return
 	}
 	result.HTTPStatusCode = resp.StatusCode
-	result.HTTPStatusText = resp.Status
 	err = json.NewDecoder(resp.Body).Decode(&result.Response)
 	if err != nil {
 		result.Error = fmt.Sprintf("cannot decode response: %s", err)
@@ -304,10 +360,15 @@
 		}
 	}
 	result.Health = "OK"
+	result.ClockTime, _ = time.Parse(time.RFC1123, resp.Header.Get("Date"))
+	result.Server = resp.Header.Get("Server")
 	return
 }
 
-var reMetric = regexp.MustCompile(`([a-z_]+){sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`)
+var (
+	reConfigMetric  = regexp.MustCompile(`arvados_config_source_timestamp_seconds{sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`)
+	reVersionMetric = regexp.MustCompile(`arvados_version_running{version="([^"]+)"} 1`)
+)
 
 func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
 	metricsURL, err := pingURL.Parse("/metrics")
@@ -335,13 +396,13 @@
 	scanner := bufio.NewScanner(resp.Body)
 	for scanner.Scan() {
-		m := reMetric.FindSubmatch(scanner.Bytes())
-		if len(m) != 4 || string(m[1]) != "arvados_config_source_timestamp_seconds" {
-			continue
+		if m := reConfigMetric.FindSubmatch(scanner.Bytes()); len(m) == 3 && len(m[1]) > 0 {
+			result.ConfigSourceSHA256 = string(m[1])
+			unixtime, _ := strconv.ParseFloat(string(m[2]), 64)
+			result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6))
+		} else if m = reVersionMetric.FindSubmatch(scanner.Bytes()); len(m) == 2 && len(m[1]) > 0 {
+			result.Version = string(m[1])
 		}
-		result.ConfigSourceSHA256 = string(m[2])
-		unixtime, _ := strconv.ParseFloat(string(m[3]), 64)
-		result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6))
 	}
 	if err = scanner.Err(); err != nil {
 		err = fmt.Errorf("error parsing response from %s: %w", metricsURL.String(), err)
@@ -379,7 +440,7 @@ func (ccmd checkCommand) RunCommand(prog string, args []string, stdin io.Reader,
 	err := ccmd.run(ctx, prog, args, stdin, stdout, stderr)
 	if err != nil {
 		if err != errSilent {
-			fmt.Fprintln(stdout, err.Error())
+			fmt.Fprintln(stderr, err.Error())
 		}
 		return 1
 	}
@@ -393,7 +454,8 @@ func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, st
 	loader.SetupFlags(flags)
 	versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
 	timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
-	outputYAML := flags.Bool("yaml", false, "Output full health report in YAML format (default mode shows errors as plain text, is silent on success)")
+	quiet := flags.Bool("quiet", false, "Silent on success (suppress 'health check OK' message on stderr)")
+	outputYAML := flags.Bool("yaml", false, "Output full health report in YAML format (default mode prints 'health check OK' or plain text errors)")
 	if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
 		// cmd.ParseFlags already reported the error
 		return errSilent
@@ -428,10 +490,36 @@ func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, st
 	}
 	if resp.Health != "OK" {
 		for _, msg := range resp.Errors {
-			fmt.Fprintln(stdout, msg)
+			fmt.Fprintln(stderr, msg)
 		}
 		fmt.Fprintln(stderr, "health check failed")
 		return errSilent
 	}
+	if !*quiet {
+		fmt.Fprintln(stderr, "health check OK")
+	}
 	return nil
 }
+
+var reGoVersion = regexp.MustCompile(` \(go\d+([\d.])*\)$`)
+
+// Return true if either a==b or the only difference is that one has a
+// " (go1.2.3)" suffix and the other does not.
+//
+// This allows us to recognize a non-Go (rails) service as the same
+// version as a Go service.
+func sameVersion(a, b string) bool {
+	if a == b {
+		return true
+	}
+	anogo := reGoVersion.ReplaceAllLiteralString(a, "")
+	bnogo := reGoVersion.ReplaceAllLiteralString(b, "")
+	if (anogo == a) != (bnogo == b) {
+		// only one of a/b has a (go1.2.3) suffix, so compare
+		// without that part
+		return anogo == bnogo
+	}
+	// both or neither has a (go1.2.3) suffix, and we already know
+	// a!=b
+	return false
+}