1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: Apache-2.0
17 "git.curoverse.com/arvados.git/sdk/go/arvados"
18 "git.curoverse.com/arvados.git/sdk/go/auth"
// defaultTimeout is the timeout value that setup assigns to
// Aggregator.timeout: two seconds.
21 const defaultTimeout = arvados.Duration(2 * time.Second)
23 // Aggregator implements http.Handler. It handles "GET /_health/all"
24 // by checking the health of all configured services on the cluster
25 // and responding 200 if everything is healthy.
//
// NOTE(review): ServeHTTP calls agg.setupOnce.Do(agg.setup), so a
// setupOnce field (presumably a sync.Once) must be declared on a
// line that is not shown in this excerpt -- confirm.
26 type Aggregator struct {
// httpClient is the client ping uses to send health-check requests;
// setup sets it to http.DefaultClient.
28 httpClient *http.Client
// timeout is the per-request deadline ping applies with
// context.WithTimeout.
29 timeout arvados.Duration
// Config is the configuration source; ServeHTTP calls
// Config.GetCluster("") on it for every request.
31 Config *arvados.Config
33 // If non-nil, Log is called after handling each request.
34 Log func(*http.Request, error)
// setup initializes the unexported fields of the aggregator. ServeHTTP
// invokes it through agg.setupOnce.Do.
37 func (agg *Aggregator) setup() {
38 agg.httpClient = http.DefaultClient
// NOTE(review): the condition the next comment refers to is on a line
// not shown in this excerpt (presumably a check that agg.timeout has
// not been set already) -- confirm.
40 // this is always the case, except in the test suite
41 agg.timeout = defaultTimeout
// ServeHTTP handles one health request. It runs setup once, sets a
// JSON Content-Type header, loads the cluster with
// agg.Config.GetCluster(""), and then responds with:
//   - 500 and the wrapped error when GetCluster fails,
//   - 401 when checkAuth rejects the request,
//   - 404 when the request path is not "/_health/all",
//   - otherwise the JSON encoding of agg.ClusterHealth(cluster).
//
// NOTE(review): the conditionals and early returns that separate
// these cases (and any call to agg.Log) are on lines not shown in
// this excerpt.
45 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
46 agg.setupOnce.Do(agg.setup)
// sendErr writes statusCode followed by a JSON body of the form
// {"error": "<err.Error()>"}. The Encode error is not checked on the
// line shown.
47 sendErr := func(statusCode int, err error) {
48 resp.WriteHeader(statusCode)
49 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
55 resp.Header().Set("Content-Type", "application/json")
57 cluster, err := agg.Config.GetCluster("")
59 err = fmt.Errorf("arvados.GetCluster(): %s", err)
60 sendErr(http.StatusInternalServerError, err)
63 if !agg.checkAuth(req, cluster) {
64 sendErr(http.StatusUnauthorized, errUnauthorized)
// Only the exact path "/_health/all" is served.
67 if req.URL.Path != "/_health/all" {
68 sendErr(http.StatusNotFound, errNotFound)
71 json.NewEncoder(resp).Encode(agg.ClusterHealth(cluster))
// ClusterHealthResponse is the value returned by ClusterHealth and
// JSON-encoded by ServeHTTP as the body of a "/_health/all" response.
77 type ClusterHealthResponse struct {
78 // "OK" if all needed services are OK, otherwise "ERROR".
79 Health string `json:"health"`
81 // An entry for each known health check of each known instance
82 // of each needed component: "instance of service S on node N
83 // reports health-check C is OK."
// ClusterHealth stores entries under the key svc+"+"+url.
84 Checks map[string]CheckResult `json:"checks"`
86 // An entry for each service type: "service S is OK." This
87 // exposes problems that can't be expressed in Checks, like
88 // "service S is needed, but isn't configured to run
// Keyed by service name (svc in ClusterHealth).
90 Services map[string]ServiceHealth `json:"services"`
// CheckResult is the outcome of one health check, as filled in by
// ping (or by ClusterHealth when no ping URL could be built).
93 type CheckResult struct {
// Health is "OK" or "ERROR"; ping sets "ERROR" whenever it ends with
// a non-nil error.
94 Health string `json:"health"`
// Error is the error message accompanying an "ERROR" result; it is
// omitted from the JSON when empty.
95 Error string `json:"error,omitempty"`
// HTTPStatusCode and HTTPStatusText are copied from the ping
// response's StatusCode and Status. Their tags give no name, so
// encoding/json uses the Go field names as the JSON keys, and
// omitempty drops them when they hold zero values.
96 HTTPStatusCode int `json:",omitempty"`
97 HTTPStatusText string `json:",omitempty"`
// Response is the JSON object decoded from the ping response body.
98 Response map[string]interface{} `json:"response"`
// ResponseTime is the elapsed time of the check in seconds,
// formatted by ping with six decimal places.
99 ResponseTime json.Number `json:"responseTime"`
// ServiceHealth summarizes the state of one service type in
// ClusterHealthResponse.Services. Any fields beyond Health are on
// lines not shown in this excerpt.
102 type ServiceHealth struct {
// Health is initialized to "ERROR" by ClusterHealth when the service
// is first seen; ClusterHealth compares it against "OK" when
// computing the overall result.
103 Health string `json:"health"`
// ClusterHealth checks the services listed for every node in
// cluster.SystemNodes and collects the outcomes into a
// ClusterHealthResponse. For each (node, service, address) it starts
// a goroutine that builds a ping URL with pingURL, pings it, and
// records the CheckResult in resp.Checks under svc+"+"+url.
//
// NOTE(review): several statements of this function (the initial
// Health value, WaitGroup Add/Done/Wait calls, closing braces, and
// any locking) are on lines not shown in this excerpt. The goroutines
// write to the shared resp.Checks and resp.Services maps, so confirm
// that those writes are guarded by a mutex.
107 func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResponse {
108 resp := ClusterHealthResponse{
110 Checks: make(map[string]CheckResult),
111 Services: make(map[string]ServiceHealth),
115 wg := sync.WaitGroup{}
116 for node, nodeConfig := range cluster.SystemNodes {
117 for svc, addr := range nodeConfig.ServicePorts() {
118 // Ensure svc is listed in resp.Services.
// A service seen for the first time starts out as "ERROR".
120 if _, ok := resp.Services[svc]; !ok {
121 resp.Services[svc] = ServiceHealth{Health: "ERROR"}
126 // svc is not expected on this node.
// node, svc and addr are passed as arguments so each goroutine works
// on its own copies of the loop variables.
131 go func(node, svc, addr string) {
133 var result CheckResult
134 url, err := agg.pingURL(node, addr)
// Presumably the error branch: a CheckResult is built directly
// instead of pinging (its fields are not shown here).
136 result = CheckResult{
141 result = agg.ping(url, cluster)
146 resp.Checks[svc+"+"+url] = result
// An "OK" result updates the service's entry in resp.Services (the
// field assignments are not shown); the assignment of "ERROR" to
// resp.Health below presumably belongs to the non-OK branch.
147 if result.Health == "OK" {
148 h := resp.Services[svc]
151 resp.Services[svc] = h
153 resp.Health = "ERROR"
160 // Report ERROR if a needed service didn't fail any checks
161 // merely because it isn't configured to run anywhere.
162 for _, sh := range resp.Services {
163 if sh.Health != "OK" {
164 resp.Health = "ERROR"
// pingURL builds the health-check URL for a service instance:
// "http://" + node + ":" + port + "/_health/ping", where port is the
// port part of addr as parsed by net.SplitHostPort. The host part of
// addr is discarded in favor of node.
//
// The error from net.SplitHostPort is returned alongside the string;
// the string is still assembled when that error is non-nil, so
// callers must check the error before using the URL.
171 func (agg *Aggregator) pingURL(node, addr string) (string, error) {
172 _, port, err := net.SplitHostPort(addr)
173 return "http://" + node + ":" + port + "/_health/ping", err
// ping sends a GET request to url, authenticated with the cluster's
// ManagementToken as a Bearer token and bounded by agg.timeout, and
// reports the outcome as a CheckResult.
//
// The result records the response's status code and status text, the
// JSON object decoded from the body, and the elapsed time in seconds.
// The check counts as failed when the body cannot be decoded, when
// the status is not 200, or when the decoded "health" field is not
// the string "OK"; in those cases Health is "ERROR" and Error holds
// the message.
//
// NOTE(review): several statements (the start-time variable t0, the
// error checks and returns, the cancel call, and the success branch
// that sets Health) are on lines not shown in this excerpt.
176 func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckResult) {
// Elapsed seconds since t0, formatted with six decimal places.
181 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
183 result.Health, result.Error = "ERROR", err.Error()
189 req, err := http.NewRequest("GET", url, nil)
193 req.Header.Set("Authorization", "Bearer "+cluster.ManagementToken)
// Derive a context that expires after agg.timeout and attach it to
// the request.
195 ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
197 req = req.WithContext(ctx)
198 resp, err := agg.httpClient.Do(req)
// NOTE(review): no resp.Body.Close() is visible in this excerpt;
// confirm the body is closed after a successful Do, otherwise the
// connection cannot be reused.
202 result.HTTPStatusCode = resp.StatusCode
203 result.HTTPStatusText = resp.Status
204 err = json.NewDecoder(resp.Body).Decode(&result.Response)
206 err = fmt.Errorf("cannot decode response: %s", err)
207 } else if resp.StatusCode != http.StatusOK {
// NOTE(review): resp.Status already includes the numeric code (for
// example "404 Not Found"), so this message repeats the code.
208 err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
// A missing or non-string "health" value yields h == "" and is
// therefore treated as unhealthy.
209 } else if h, _ := result.Response["health"].(string); h != "OK" {
// A non-empty "error" string in the response is checked for first
// (its handling is on a line not shown); the generic message below is
// presumably the fallback.
210 if e, ok := result.Response["error"].(string); ok && e != "" {
213 err = fmt.Errorf("health=%q in ping response", h)
219 func (agg *Aggregator) checkAuth(req *http.Request, cluster *arvados.Cluster) bool {
220 creds := auth.NewCredentialsFromHTTPRequest(req)
221 for _, token := range creds.Tokens {
222 if token != "" && token == cluster.ManagementToken {