X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/91db2d8fd32bc3f6c2a26ffc37f6591b1e5f380b..b53513423ab948804425424278ac554870864997:/sdk/go/health/aggregator.go diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go index 699318735f..23d7e8d431 100644 --- a/sdk/go/health/aggregator.go +++ b/sdk/go/health/aggregator.go @@ -1,16 +1,21 @@ +// Copyright (C) The Arvados Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + package health import ( "context" + "crypto/tls" "encoding/json" "fmt" - "net" "net/http" + "net/url" "sync" "time" - "git.curoverse.com/arvados.git/sdk/go/arvados" - "git.curoverse.com/arvados.git/sdk/go/auth" + "git.arvados.org/arvados.git/sdk/go/arvados" + "git.arvados.org/arvados.git/sdk/go/auth" ) const defaultTimeout = arvados.Duration(2 * time.Second) @@ -23,24 +28,39 @@ type Aggregator struct { httpClient *http.Client timeout arvados.Duration - Config *arvados.Config + Cluster *arvados.Cluster // If non-nil, Log is called after handling each request. Log func(*http.Request, error) } func (agg *Aggregator) setup() { - agg.httpClient = http.DefaultClient + agg.httpClient = &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: agg.Cluster.TLS.Insecure, + }, + }, + } if agg.timeout == 0 { // this is always the case, except in the test suite agg.timeout = defaultTimeout } } +func (agg *Aggregator) CheckHealth() error { + return nil +} + +func (agg *Aggregator) Done() <-chan struct{} { + return nil +} + func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) { + agg.setupOnce.Do(agg.setup) sendErr := func(statusCode int, err error) { resp.WriteHeader(statusCode) - json.NewEncoder(resp).Encode(map[string]interface{}{"error": err}) + json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()}) if agg.Log != nil { agg.Log(req, err) } @@ -48,148 +68,174 @@ func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) { resp.Header().Set("Content-Type", "application/json") - if agg.Config == nil { - cfg, err := arvados.GetConfig() - if err != nil { - err = fmt.Errorf("arvados.GetConfig(): %s", err) - sendErr(http.StatusInternalServerError, err) - return - } - agg.Config = cfg - } - cluster, err := agg.Config.GetCluster("") - if err != nil { - err = fmt.Errorf("arvados.GetCluster(): %s", err) - sendErr(http.StatusInternalServerError, err) - return - } - if !agg.checkAuth(req, cluster) { + if !agg.checkAuth(req) { sendErr(http.StatusUnauthorized, errUnauthorized) return } - if req.URL.Path != "/_health/all" { + if req.URL.Path == "/_health/all" { + json.NewEncoder(resp).Encode(agg.ClusterHealth()) + } else if req.URL.Path == "/_health/ping" { + resp.Write(healthyBody) + } else { sendErr(http.StatusNotFound, errNotFound) return } - json.NewEncoder(resp).Encode(agg.checkClusterHealth(cluster)) if agg.Log != nil { agg.Log(req, nil) } } -type serviceHealth struct { +type ClusterHealthResponse struct { + // "OK" if all needed services are OK, otherwise "ERROR". Health string `json:"health"` - N int `json:"n"` + + // An entry for each known health check of each known instance + // of each needed component: "instance of service S on node N + // reports health-check C is OK." + Checks map[string]CheckResult `json:"checks"` + + // An entry for each service type: "service S is OK." This + // exposes problems that can't be expressed in Checks, like + // "service S is needed, but isn't configured to run + // anywhere." + Services map[arvados.ServiceName]ServiceHealth `json:"services"` +} + +type CheckResult struct { + Health string `json:"health"` + Error string `json:"error,omitempty"` + HTTPStatusCode int `json:",omitempty"` + HTTPStatusText string `json:",omitempty"` + Response map[string]interface{} `json:"response"` + ResponseTime json.Number `json:"responseTime"` } -type clusterHealthResponse struct { - Health string `json:"health"` - Endpoints map[string]map[string]interface{} `json:"endpoints"` - Services map[string]serviceHealth `json:"services"` +type ServiceHealth struct { + Health string `json:"health"` + N int `json:"n"` } -func (agg *Aggregator) checkClusterHealth(cluster *arvados.Cluster) clusterHealthResponse { - resp := clusterHealthResponse{ - Health: "OK", - Endpoints: make(map[string]map[string]interface{}), - Services: make(map[string]serviceHealth), +func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { + agg.setupOnce.Do(agg.setup) + resp := ClusterHealthResponse{ + Health: "OK", + Checks: make(map[string]CheckResult), + Services: make(map[arvados.ServiceName]ServiceHealth), } mtx := sync.Mutex{} wg := sync.WaitGroup{} - for node, nodeConfig := range cluster.SystemNodes { - for svc, addr := range map[string]string{ - "keepstore": nodeConfig.Keepstore.Listen, - } { - if addr == "" { - continue - } + for svcName, svc := range agg.Cluster.Services.Map() { + // Ensure svc is listed in resp.Services. + mtx.Lock() + if _, ok := resp.Services[svcName]; !ok { + resp.Services[svcName] = ServiceHealth{Health: "ERROR"} + } + mtx.Unlock() + + checkURLs := map[arvados.URL]bool{} + for addr := range svc.InternalURLs { + checkURLs[addr] = true + } + if len(checkURLs) == 0 && svc.ExternalURL.Host != "" { + checkURLs[svc.ExternalURL] = true + } + for addr := range checkURLs { wg.Add(1) - go func() { + go func(svcName arvados.ServiceName, addr arvados.URL) { defer wg.Done() - pingResp := agg.ping(node, addr) + var result CheckResult + pingURL, err := agg.pingURL(addr) + if err != nil { + result = CheckResult{ + Health: "ERROR", + Error: err.Error(), + } + } else { + result = agg.ping(pingURL) + } mtx.Lock() defer mtx.Unlock() - resp.Endpoints[node+"/"+svc+"/_health/ping"] = pingResp - svHealth := resp.Services[svc] - if agg.isOK(pingResp) { - svHealth.N++ + resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result + if result.Health == "OK" { + h := resp.Services[svcName] + h.N++ + h.Health = "OK" + resp.Services[svcName] = h } else { resp.Health = "ERROR" } - resp.Services[svc] = svHealth - }() + }(svcName, addr) } } wg.Wait() - for svc, svHealth := range resp.Services { - if svHealth.N > 0 { - svHealth.Health = "OK" - } else { - svHealth.Health = "ERROR" + // Report ERROR if a needed service didn't fail any checks + // merely because it isn't configured to run anywhere. + for _, sh := range resp.Services { + if sh.Health != "OK" { + resp.Health = "ERROR" + break } - resp.Services[svc] = svHealth } - return resp } -func (agg *Aggregator) isOK(result map[string]interface{}) bool { - h, ok := result["health"].(string) - return ok && h == "OK" +func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) { + base := url.URL(svcURL) + return base.Parse("/_health/ping") } -func (agg *Aggregator) ping(node, addr string) (result map[string]interface{}) { +func (agg *Aggregator) ping(target *url.URL) (result CheckResult) { t0 := time.Now() - result = make(map[string]interface{}) - - var err error defer func() { - result["responseTime"] = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds())) - if err != nil { - result["health"], result["error"] = "ERROR", err - } + result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds())) }() + result.Health = "ERROR" - _, port, err := net.SplitHostPort(addr) - if err != nil { - return - } - req, err := http.NewRequest("GET", "http://"+node+":"+port+"/_health/ping", nil) + req, err := http.NewRequest("GET", target.String(), nil) if err != nil { + result.Error = err.Error() return } + req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken) - ctx, cancel := context.WithCancel(req.Context()) - go func() { - select { - case <-time.After(time.Duration(agg.timeout)): - cancel() - case <-ctx.Done(): - } - }() + // Avoid workbench1's redirect-http-to-https feature + req.Header.Set("X-Forwarded-Proto", "https") + + ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout)) + defer cancel() req = req.WithContext(ctx) resp, err := agg.httpClient.Do(req) if err != nil { + result.Error = err.Error() return } - err = json.NewDecoder(resp.Body).Decode(result) + result.HTTPStatusCode = resp.StatusCode + result.HTTPStatusText = resp.Status + err = json.NewDecoder(resp.Body).Decode(&result.Response) if err != nil { - return - } - if resp.StatusCode != 200 { - err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status) - return + result.Error = fmt.Sprintf("cannot decode response: %s", err) + } else if resp.StatusCode != http.StatusOK { + result.Error = fmt.Sprintf("HTTP %d %s", resp.StatusCode, resp.Status) + } else if h, _ := result.Response["health"].(string); h != "OK" { + if e, ok := result.Response["error"].(string); ok && e != "" { + result.Error = e + return + } else { + result.Error = fmt.Sprintf("health=%q in ping response", h) + return + } } + result.Health = "OK" return } -func (agg *Aggregator) checkAuth(req *http.Request, cluster *arvados.Cluster) bool { - creds := auth.NewCredentialsFromHTTPRequest(req) +func (agg *Aggregator) checkAuth(req *http.Request) bool { + creds := auth.CredentialsFromRequest(req) for _, token := range creds.Tokens { - if token != "" && token == cluster.ManagementToken { + if token != "" && token == agg.Cluster.ManagementToken { return true } }