Merge branch '18947-keep-balance'
[arvados.git] / sdk / go / health / aggregator.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 package health
6
7 import (
8         "context"
9         "crypto/tls"
10         "encoding/json"
11         "fmt"
12         "net/http"
13         "net/url"
14         "sync"
15         "time"
16
17         "git.arvados.org/arvados.git/sdk/go/arvados"
18         "git.arvados.org/arvados.git/sdk/go/auth"
19 )
20
// defaultTimeout is the per-ping timeout that setup installs when
// the Aggregator's timeout field is still zero.
const defaultTimeout = arvados.Duration(2 * time.Second)
22
// Aggregator implements http.Handler. It handles "/_health/all" by
// checking the health of all configured services on the cluster and
// responding with a JSON-encoded ClusterHealthResponse, and
// "/_health/ping" by responding with its own health.
//
// Note the HTTP status of an "/_health/all" response is not changed
// when a check fails: the overall result is carried in the "health"
// field of the response body ("OK" or "ERROR").
type Aggregator struct {
	// setupOnce ensures setup runs only once, whichever of
	// ServeHTTP or ClusterHealth is called first.
	setupOnce sync.Once
	// httpClient is created by setup and used for all pings.
	httpClient *http.Client
	// timeout limits each individual ping. If zero when setup
	// runs, defaultTimeout is used.
	timeout arvados.Duration

	// Cluster supplies the services to check, the management
	// token, and the TLS settings used for pings.
	Cluster *arvados.Cluster

	// If non-nil, Log is called after handling each request. The
	// error argument is nil if the request was handled
	// successfully, otherwise it is the error reported to the
	// client.
	Log func(*http.Request, error)
}
36
37 func (agg *Aggregator) setup() {
38         agg.httpClient = &http.Client{
39                 Transport: &http.Transport{
40                         TLSClientConfig: &tls.Config{
41                                 InsecureSkipVerify: agg.Cluster.TLS.Insecure,
42                         },
43                 },
44         }
45         if agg.timeout == 0 {
46                 // this is always the case, except in the test suite
47                 agg.timeout = defaultTimeout
48         }
49 }
50
// CheckHealth always returns nil: the Aggregator does not report any
// error condition of its own here.
func (agg *Aggregator) CheckHealth() error {
	return nil
}
54
// Done returns a nil channel. Receiving from a nil channel blocks
// forever, so a caller waiting on Done is never signalled.
//
// NOTE(review): presumably this and CheckHealth exist to satisfy a
// handler interface defined elsewhere -- confirm against callers.
func (agg *Aggregator) Done() <-chan struct{} {
	return nil
}
58
59 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
60         agg.setupOnce.Do(agg.setup)
61         sendErr := func(statusCode int, err error) {
62                 resp.WriteHeader(statusCode)
63                 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
64                 if agg.Log != nil {
65                         agg.Log(req, err)
66                 }
67         }
68
69         resp.Header().Set("Content-Type", "application/json")
70
71         if !agg.checkAuth(req) {
72                 sendErr(http.StatusUnauthorized, errUnauthorized)
73                 return
74         }
75         if req.URL.Path == "/_health/all" {
76                 json.NewEncoder(resp).Encode(agg.ClusterHealth())
77         } else if req.URL.Path == "/_health/ping" {
78                 resp.Write(healthyBody)
79         } else {
80                 sendErr(http.StatusNotFound, errNotFound)
81                 return
82         }
83         if agg.Log != nil {
84                 agg.Log(req, nil)
85         }
86 }
87
// ClusterHealthResponse is the result returned by ClusterHealth and
// sent, JSON-encoded, in response to "/_health/all".
type ClusterHealthResponse struct {
	// "OK" if all needed services are OK, otherwise "ERROR".
	Health string `json:"health"`

	// An entry for each known health check of each known instance
	// of each needed component: "instance of service S on node N
	// reports health-check C is OK."
	//
	// Keys have the form "<service name>+<ping URL>".
	Checks map[string]CheckResult `json:"checks"`

	// An entry for each service type: "service S is OK." This
	// exposes problems that can't be expressed in Checks, like
	// "service S is needed, but isn't configured to run
	// anywhere."
	Services map[arvados.ServiceName]ServiceHealth `json:"services"`
}
103
// CheckResult is the outcome of a single health check (one ping of
// one service address).
type CheckResult struct {
	// "OK" or "ERROR".
	Health string `json:"health"`
	// Description of what went wrong; omitted from JSON if empty.
	Error string `json:"error,omitempty"`
	// Status code and status line of the ping response. Both are
	// left at their zero values (and omitted from JSON) if no
	// HTTP response was received. The JSON keys are the field
	// names, because the tags do not specify a name.
	HTTPStatusCode int    `json:",omitempty"`
	HTTPStatusText string `json:",omitempty"`
	// The JSON object decoded from the ping response body.
	Response map[string]interface{} `json:"response"`
	// Time taken by the check, in seconds, formatted with six
	// decimal places.
	ResponseTime json.Number `json:"responseTime"`
}
112
// ServiceHealth summarizes the health checks of one service type.
type ServiceHealth struct {
	// "OK" if at least one check for this service reported OK,
	// otherwise "ERROR".
	Health string `json:"health"`
	// Number of checks for this service that reported OK.
	N int `json:"n"`
}
117
118 func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
119         agg.setupOnce.Do(agg.setup)
120         resp := ClusterHealthResponse{
121                 Health:   "OK",
122                 Checks:   make(map[string]CheckResult),
123                 Services: make(map[arvados.ServiceName]ServiceHealth),
124         }
125
126         mtx := sync.Mutex{}
127         wg := sync.WaitGroup{}
128         for svcName, svc := range agg.Cluster.Services.Map() {
129                 // Ensure svc is listed in resp.Services.
130                 mtx.Lock()
131                 if _, ok := resp.Services[svcName]; !ok {
132                         resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
133                 }
134                 mtx.Unlock()
135
136                 checkURLs := map[arvados.URL]bool{}
137                 for addr := range svc.InternalURLs {
138                         checkURLs[addr] = true
139                 }
140                 if len(checkURLs) == 0 && svc.ExternalURL.Host != "" {
141                         checkURLs[svc.ExternalURL] = true
142                 }
143                 for addr := range checkURLs {
144                         wg.Add(1)
145                         go func(svcName arvados.ServiceName, addr arvados.URL) {
146                                 defer wg.Done()
147                                 var result CheckResult
148                                 pingURL, err := agg.pingURL(addr)
149                                 if err != nil {
150                                         result = CheckResult{
151                                                 Health: "ERROR",
152                                                 Error:  err.Error(),
153                                         }
154                                 } else {
155                                         result = agg.ping(pingURL)
156                                 }
157
158                                 mtx.Lock()
159                                 defer mtx.Unlock()
160                                 resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
161                                 if result.Health == "OK" {
162                                         h := resp.Services[svcName]
163                                         h.N++
164                                         h.Health = "OK"
165                                         resp.Services[svcName] = h
166                                 } else {
167                                         resp.Health = "ERROR"
168                                 }
169                         }(svcName, addr)
170                 }
171         }
172         wg.Wait()
173
174         // Report ERROR if a needed service didn't fail any checks
175         // merely because it isn't configured to run anywhere.
176         for _, sh := range resp.Services {
177                 if sh.Health != "OK" {
178                         resp.Health = "ERROR"
179                         break
180                 }
181         }
182         return resp
183 }
184
185 func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
186         base := url.URL(svcURL)
187         return base.Parse("/_health/ping")
188 }
189
190 func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
191         t0 := time.Now()
192         defer func() {
193                 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
194         }()
195         result.Health = "ERROR"
196
197         req, err := http.NewRequest("GET", target.String(), nil)
198         if err != nil {
199                 result.Error = err.Error()
200                 return
201         }
202         req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken)
203
204         // Avoid workbench1's redirect-http-to-https feature
205         req.Header.Set("X-Forwarded-Proto", "https")
206
207         ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
208         defer cancel()
209         req = req.WithContext(ctx)
210         resp, err := agg.httpClient.Do(req)
211         if err != nil {
212                 result.Error = err.Error()
213                 return
214         }
215         result.HTTPStatusCode = resp.StatusCode
216         result.HTTPStatusText = resp.Status
217         err = json.NewDecoder(resp.Body).Decode(&result.Response)
218         if err != nil {
219                 result.Error = fmt.Sprintf("cannot decode response: %s", err)
220         } else if resp.StatusCode != http.StatusOK {
221                 result.Error = fmt.Sprintf("HTTP %d %s", resp.StatusCode, resp.Status)
222         } else if h, _ := result.Response["health"].(string); h != "OK" {
223                 if e, ok := result.Response["error"].(string); ok && e != "" {
224                         result.Error = e
225                         return
226                 } else {
227                         result.Error = fmt.Sprintf("health=%q in ping response", h)
228                         return
229                 }
230         }
231         result.Health = "OK"
232         return
233 }
234
235 func (agg *Aggregator) checkAuth(req *http.Request) bool {
236         creds := auth.CredentialsFromRequest(req)
237         for _, token := range creds.Tokens {
238                 if token != "" && token == agg.Cluster.ManagementToken {
239                         return true
240                 }
241         }
242         return false
243 }