From 7f9de270aa34467c1b1668be9333ec28d14b10a1 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 19 May 2022 15:19:23 -0400 Subject: [PATCH] 16345: Fail health check on server version mismatch. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- sdk/go/health/aggregator.go | 83 +++++++++++++++++++++++--------- sdk/go/health/aggregator_test.go | 45 ++++++++++++++++- 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go index f473eff353..b5301dffe0 100644 --- a/sdk/go/health/aggregator.go +++ b/sdk/go/health/aggregator.go @@ -108,46 +108,46 @@ func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) { type ClusterHealthResponse struct { // "OK" if all needed services are OK, otherwise "ERROR". - Health string `json:"health"` + Health string // An entry for each known health check of each known instance // of each needed component: "instance of service S on node N // reports health-check C is OK." - Checks map[string]CheckResult `json:"checks"` + Checks map[string]CheckResult // An entry for each service type: "service S is OK." This // exposes problems that can't be expressed in Checks, like // "service S is needed, but isn't configured to run // anywhere." - Services map[arvados.ServiceName]ServiceHealth `json:"services"` + Services map[arvados.ServiceName]ServiceHealth // Difference between min/max timestamps in individual // health-check responses. 
ClockSkew arvados.Duration - Errors []string `json:"errors"` + Errors []string } type CheckResult struct { - Health string `json:"health"` - Error string `json:"error,omitempty"` + Health string + Error string `json:",omitempty"` HTTPStatusCode int `json:",omitempty"` - HTTPStatusText string `json:",omitempty"` - Response map[string]interface{} `json:"response"` - ResponseTime json.Number `json:"responseTime"` - ClockTime time.Time `json:"clockTime"` - Metrics Metrics `json:"-"` - respTime time.Duration + Response map[string]interface{} `json:",omitempty"` + ResponseTime json.Number + ClockTime time.Time + Metrics + respTime time.Duration } type Metrics struct { ConfigSourceTimestamp time.Time ConfigSourceSHA256 string + Version string } type ServiceHealth struct { - Health string `json:"health"` // "OK", "ERROR", or "SKIP" - N int `json:"n"` + Health string // "OK", "ERROR", or "SKIP" + N int } func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { @@ -238,6 +238,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { } } + // Check for clock skew between hosts var maxResponseTime time.Duration var clockMin, clockMax time.Time for _, result := range resp.Checks { @@ -265,6 +266,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { agg.MetricClockSkew.Set(skew.Seconds()) } + // Check for mismatched config files var newest Metrics for _, result := range resp.Checks { if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) { @@ -285,6 +287,18 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { resp.Errors = append(resp.Errors, msg) resp.Health = "ERROR" } + + // Check for services running a different version than we are. 
+ for target, result := range resp.Checks { + if result.Metrics.Version != "" && !sameVersion(result.Metrics.Version, cmd.Version.String()) { + msg := fmt.Sprintf("version mismatch: %s is running %s -- expected %s", + strings.TrimSuffix(target, "/_health/ping"), + result.Metrics.Version, + cmd.Version.String()) + resp.Errors = append(resp.Errors, msg) + resp.Health = "ERROR" + } + } return resp } @@ -329,7 +343,6 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) { return } result.HTTPStatusCode = resp.StatusCode - result.HTTPStatusText = resp.Status err = json.NewDecoder(resp.Body).Decode(&result.Response) if err != nil { result.Error = fmt.Sprintf("cannot decode response: %s", err) @@ -349,7 +362,10 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) { return } -var reMetric = regexp.MustCompile(`([a-z_]+){sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`) +var ( + reConfigMetric = regexp.MustCompile(`arvados_config_source_timestamp_seconds{sha256="([0-9a-f]+)"} (\d[\d\.e\+]+)`) + reVersionMetric = regexp.MustCompile(`arvados_version_running{version="([^"]+)"} 1`) +) func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) { metricsURL, err := pingURL.Parse("/metrics") @@ -377,13 +393,13 @@ func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) { scanner := bufio.NewScanner(resp.Body) for scanner.Scan() { - m := reMetric.FindSubmatch(scanner.Bytes()) - if len(m) != 4 || string(m[1]) != "arvados_config_source_timestamp_seconds" { - continue + if m := reConfigMetric.FindSubmatch(scanner.Bytes()); len(m) == 3 && len(m[1]) > 0 { + result.ConfigSourceSHA256 = string(m[1]) + unixtime, _ := strconv.ParseFloat(string(m[2]), 64) + result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6)) + } else if m = reVersionMetric.FindSubmatch(scanner.Bytes()); len(m) == 2 && len(m[1]) > 0 { + result.Version = string(m[1]) } - result.ConfigSourceSHA256 = string(m[2]) - unixtime, _ := 
strconv.ParseFloat(string(m[3]), 64) - result.ConfigSourceTimestamp = time.UnixMicro(int64(unixtime * 1e6)) } if err = scanner.Err(); err != nil { err = fmt.Errorf("error parsing response from %s: %w", metricsURL.String(), err) @@ -477,3 +493,26 @@ func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, st } return nil } + +var reGoVersion = regexp.MustCompile(` \(go\d+([\d.])*\)$`) + +// sameVersion reports whether a and b denote the same version: either a==b, or +// they differ only in that one has a " (go1.2.3)" suffix and the other does not. +// +// This allows us to recognize a non-Go (Rails) service as the same +// version as a Go service. +func sameVersion(a, b string) bool { + if a == b { + return true + } + anogo := reGoVersion.ReplaceAllLiteralString(a, "") + bnogo := reGoVersion.ReplaceAllLiteralString(b, "") + if (anogo == a) != (bnogo == b) { + // only one of a/b has a (go1.2.3) suffix, so compare + // without that part + return anogo == bnogo + } + // both or neither has a (go1.2.3) suffix, and we already know + // a!=b + return false +} diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go index 481054c4de..daad208e0f 100644 --- a/sdk/go/health/aggregator_test.go +++ b/sdk/go/health/aggregator_test.go @@ -13,9 +13,11 @@ import ( "net/http" "net/http/httptest" "regexp" + "runtime" "strings" "time" + "git.arvados.org/arvados.git/lib/cmd" "git.arvados.org/arvados.git/lib/config" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/arvadostest" @@ -254,6 +256,40 @@ func (s *AggregatorSuite) TestClockSkew(c *check.C) { } } +func (s *AggregatorSuite) TestVersionSkew(c *check.C) { + // srv1: report same version + handler1 := healthyHandler{version: cmd.Version.String()} + srv1, listen1 := s.stubServer(&handler1) + defer srv1.Close() + // srv2: report same version but without " (go1.2.3)" part + handler2 := healthyHandler{version: strings.Fields(cmd.Version.String())[0]} + srv2, listen2 := s.stubServer(&handler2) + defer 
srv2.Close() + // srv3: report different version + handler3 := healthyHandler{version: "1.2.3~4 (" + runtime.Version() + ")"} + srv3, listen3 := s.stubServer(&handler3) + defer srv3.Close() + + s.setAllServiceURLs(listen1) + + // same version but without go1.2.3 part => OK + s.resp = httptest.NewRecorder() + arvadostest.SetServiceURL(&s.handler.Cluster.Services.RailsAPI, + "http://localhost"+listen2+"/") + s.handler.ServeHTTP(s.resp, s.req) + s.checkOK(c) + + // different version => error + s.resp = httptest.NewRecorder() + arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, + "http://localhost"+listen3+"/") + s.handler.ServeHTTP(s.resp, s.req) + resp := s.checkUnhealthy(c) + if c.Check(len(resp.Errors) > 0, check.Equals, true) { + c.Check(resp.Errors[0], check.Matches, `version mismatch: \Qkeep-web+http://localhost`+listen3+`\E is running 1.2.3~4 (.*) -- expected \Q`+cmd.Version.String()+`\E`) + } +} + func (s *AggregatorSuite) TestPingTimeout(c *check.C) { s.handler.timeout = arvados.Duration(100 * time.Millisecond) srv, listen := s.stubServer(&slowHandler{}) @@ -292,7 +328,7 @@ func (s *AggregatorSuite) TestCheckCommand(c *check.C) { exitcode = CheckCommand.RunCommand("check", []string{"-config=" + tmpdir + "/config.yml", "-yaml"}, &bytes.Buffer{}, &stdout, &stderr) c.Check(exitcode, check.Equals, 0) c.Check(stderr.String(), check.Equals, "") - c.Check(stdout.String(), check.Matches, `(?ms).*(\n|^)health: OK\n.*`) + c.Check(stdout.String(), check.Matches, `(?ms).*(\n|^)Health: OK\n.*`) } func (s *AggregatorSuite) checkError(c *check.C) { @@ -354,6 +390,7 @@ func (*unhealthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) } type healthyHandler struct { + version string configHash string configTime time.Time headerDate time.Time @@ -385,9 +422,13 @@ arvados_config_load_timestamp_seconds{sha256="%s"} %g # HELP arvados_config_source_timestamp_seconds Timestamp of config file when it was loaded. 
# TYPE arvados_config_source_timestamp_seconds gauge arvados_config_source_timestamp_seconds{sha256="%s"} %g +# HELP arvados_version_running Indicated version is running. +# TYPE arvados_version_running gauge +arvados_version_running{version="%s"} 1 `, h.configHash, float64(time.Now().UnixNano())/1e9, - h.configHash, float64(t.UnixNano())/1e9) + h.configHash, float64(t.UnixNano())/1e9, + h.version) } else { http.Error(resp, "not found", http.StatusNotFound) } -- 2.30.2