1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: Apache-2.0
17 "git.arvados.org/arvados.git/sdk/go/arvados"
18 "git.arvados.org/arvados.git/sdk/go/auth"
// defaultTimeout is the value setup stores in agg.timeout; ping uses that
// field as the deadline for each health-check request's context.
21 const defaultTimeout = arvados.Duration(2 * time.Second)
23 // Aggregator implements http.Handler. It handles "GET /_health/all"
24 // by checking the health of all configured services on the cluster
25 // and responding 200 if everything is healthy.
//
// ServeHTTP also answers "/_health/ping", and it rejects requests whose
// credentials do not include the cluster's management token.
26 type Aggregator struct {
// httpClient is built by setup and used by ping for outgoing requests.
28 httpClient *http.Client
// timeout bounds each ping request; setup assigns defaultTimeout to it.
29 timeout arvados.Duration
// Cluster supplies the service list, TLS settings, and management
// token read by the methods of this type.
31 Cluster *arvados.Cluster
33 // If non-nil, Log is called after handling each request.
34 Log func(*http.Request, error)
// setup initializes the aggregator's HTTP client and timeout. ServeHTTP
// and ClusterHealth both run it through agg.setupOnce.
37 func (agg *Aggregator) setup() {
38 agg.httpClient = &http.Client{
39 Transport: &http.Transport{
40 TLSClientConfig: &tls.Config{
// Certificate verification is skipped only when the cluster
// configuration sets TLS.Insecure.
41 InsecureSkipVerify: agg.Cluster.TLS.Insecure,
46 // this is always the case, except in the test suite
// NOTE(review): the condition guarding this assignment is not visible in
// this excerpt; presumably it applies only when no timeout was preset —
// confirm.
47 agg.timeout = defaultTimeout
// CheckHealth presumably reports the aggregator's own health as an error
// (nil when healthy). NOTE(review): the body is not visible in this
// excerpt — confirm what it returns.
51 func (agg *Aggregator) CheckHealth() error {
// Done returns a receive-only channel of empty structs.
// NOTE(review): the body is not visible in this excerpt; presumably the
// channel signals that the aggregator has stopped — confirm against the
// interface this method satisfies.
55 func (agg *Aggregator) Done() <-chan struct{} {
// ServeHTTP answers health requests with a JSON content type. After the
// one-time setup it checks the request's credentials, then encodes the
// ClusterHealth report for "/_health/all", writes healthyBody for
// "/_health/ping", and sends a 404 error for other paths.
59 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
60 agg.setupOnce.Do(agg.setup)
// sendErr writes statusCode followed by a JSON object whose single
// "error" key holds err's message.
61 sendErr := func(statusCode int, err error) {
62 resp.WriteHeader(statusCode)
// NOTE(review): the error returned by Encode is discarded here.
63 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
69 resp.Header().Set("Content-Type", "application/json")
// Requests that fail checkAuth are answered with 401 Unauthorized.
71 if !agg.checkAuth(req) {
72 sendErr(http.StatusUnauthorized, errUnauthorized)
75 if req.URL.Path == "/_health/all" {
// NOTE(review): the Encode error is discarded here as well.
76 json.NewEncoder(resp).Encode(agg.ClusterHealth())
77 } else if req.URL.Path == "/_health/ping" {
78 resp.Write(healthyBody)
80 sendErr(http.StatusNotFound, errNotFound)
// ClusterHealthResponse is the report built by ClusterHealth and
// JSON-encoded by ServeHTTP for "/_health/all".
88 type ClusterHealthResponse struct {
89 // "OK" if all needed services are OK, otherwise "ERROR".
90 Health string `json:"health"`
92 // An entry for each known health check of each known instance
93 // of each needed component: "instance of service S on node N
94 // reports health-check C is OK."
//
// ClusterHealth formats each key as "<service name>+<ping URL>".
95 Checks map[string]CheckResult `json:"checks"`
97 // An entry for each service type: "service S is OK." This
98 // exposes problems that can't be expressed in Checks, like
99 // "service S is needed, but isn't configured to run
// anywhere."
101 Services map[arvados.ServiceName]ServiceHealth `json:"services"`
// CheckResult describes the outcome of one health-check request, as
// filled in by ping (or by ClusterHealth when the ping URL cannot be
// built).
104 type CheckResult struct {
// "OK" or "ERROR".
105 Health string `json:"health"`
// Description of the failure; omitted from the JSON when empty.
106 Error string `json:"error,omitempty"`
// Status code and status line of the HTTP response. These tags give no
// name, so encoding/json uses the Go field names as keys, and the
// fields are omitted when zero/empty.
107 HTTPStatusCode int `json:",omitempty"`
108 HTTPStatusText string `json:",omitempty"`
// JSON object decoded from the ping response body.
109 Response map[string]interface{} `json:"response"`
// Elapsed time in seconds, formatted by ping with six decimal places.
110 ResponseTime json.Number `json:"responseTime"`
// ServiceHealth is the per-service entry stored in
// ClusterHealthResponse.Services.
113 type ServiceHealth struct {
// "OK" or "ERROR"; ClusterHealth seeds each listed service with "ERROR".
114 Health string `json:"health"`
// ClusterHealth iterates over agg.Cluster.Services.Map(), pings each
// service address in its own goroutine, and returns the combined report.
//
// For each service the addresses checked are its InternalURLs; when it
// has none, its ExternalURL is checked instead, provided that URL has a
// host. Every ping result is stored in resp.Checks, and a final pass sets
// the overall Health to "ERROR" if any service entry is not "OK".
118 func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
119 agg.setupOnce.Do(agg.setup)
120 resp := ClusterHealthResponse{
122 Checks: make(map[string]CheckResult),
123 Services: make(map[arvados.ServiceName]ServiceHealth),
127 wg := sync.WaitGroup{}
128 for svcName, svc := range agg.Cluster.Services.Map() {
129 // Ensure svc is listed in resp.Services.
// A service seen for the first time starts out as "ERROR".
131 if _, ok := resp.Services[svcName]; !ok {
132 resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
// Collect the distinct addresses to ping for this service.
136 checkURLs := map[arvados.URL]bool{}
137 for addr := range svc.InternalURLs {
138 checkURLs[addr] = true
// Fall back to the external URL only when no internal URL is configured.
140 if len(checkURLs) == 0 && svc.ExternalURL.Host != "" {
141 checkURLs[svc.ExternalURL] = true
143 for addr := range checkURLs {
// One goroutine per address; svcName and addr are passed as arguments.
// NOTE(review): resp.Checks, resp.Services and resp.Health are written
// inside these goroutines. The locking and the wg Add/Done/Wait calls
// are not visible in this excerpt — confirm the writes are serialized.
145 go func(svcName arvados.ServiceName, addr arvados.URL) {
147 var result CheckResult
148 pingURL, err := agg.pingURL(addr)
150 result = CheckResult{
155 result = agg.ping(pingURL)
160 resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
161 if result.Health == "OK" {
162 h := resp.Services[svcName]
165 resp.Services[svcName] = h
167 resp.Health = "ERROR"
174 // Report ERROR if a needed service didn't fail any checks
175 // merely because it isn't configured to run anywhere.
176 for _, sh := range resp.Services {
177 if sh.Health != "OK" {
178 resp.Health = "ERROR"
// pingURL converts svcURL to a url.URL and resolves the absolute path
// "/_health/ping" against it, returning the resulting URL or the parse
// error.
185 func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
186 base := url.URL(svcURL)
187 return base.Parse("/_health/ping")
// ping sends a GET request to target, authenticated with the cluster's
// management token, and reports the outcome in result.
//
// result.Health starts as "ERROR". result.Error is filled in when the
// request cannot be built or sent, when the body cannot be decoded as
// JSON, when the status is not 200, or when the decoded "health" field
// is not "OK".
190 func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
// ResponseTime is the elapsed time since t0, in seconds with six
// decimal places.
193 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
195 result.Health = "ERROR"
197 req, err := http.NewRequest("GET", target.String(), nil)
199 result.Error = err.Error()
202 req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken)
204 // Avoid workbench1's redirect-http-to-https feature
205 req.Header.Set("X-Forwarded-Proto", "https")
// Bound the request by the aggregator's timeout.
207 ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
209 req = req.WithContext(ctx)
210 resp, err := agg.httpClient.Do(req)
212 result.Error = err.Error()
215 result.HTTPStatusCode = resp.StatusCode
216 result.HTTPStatusText = resp.Status
// NOTE(review): no resp.Body.Close() appears in this excerpt — confirm
// the body is closed so the transport can reuse the connection.
217 err = json.NewDecoder(resp.Body).Decode(&result.Response)
219 result.Error = fmt.Sprintf("cannot decode response: %s", err)
220 } else if resp.StatusCode != http.StatusOK {
// NOTE(review): resp.Status already begins with the numeric code
// (e.g. "503 Service Unavailable"), so this message repeats it.
221 result.Error = fmt.Sprintf("HTTP %d %s", resp.StatusCode, resp.Status)
222 } else if h, _ := result.Response["health"].(string); h != "OK" {
// Presumably the service's own non-empty "error" string is preferred
// over the generic message below — the branch body is not visible.
223 if e, ok := result.Response["error"].(string); ok && e != "" {
227 result.Error = fmt.Sprintf("health=%q in ping response", h)
235 func (agg *Aggregator) checkAuth(req *http.Request) bool {
236 creds := auth.CredentialsFromRequest(req)
237 for _, token := range creds.Tokens {
238 if token != "" && token == agg.Cluster.ManagementToken {