2411: Merge branch '2411-another-pass'
[arvados.git] / sdk / go / health / aggregator.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 package health
6
7 import (
8         "context"
9         "encoding/json"
10         "errors"
11         "fmt"
12         "net"
13         "net/http"
14         "sync"
15         "time"
16
17         "git.curoverse.com/arvados.git/sdk/go/arvados"
18         "git.curoverse.com/arvados.git/sdk/go/auth"
19 )
20
// defaultTimeout is the time limit applied to each individual health
// ping when the Aggregator's timeout field has been left at zero
// (see setup).
const defaultTimeout = arvados.Duration(2 * time.Second)
22
// Aggregator implements http.Handler. It handles "GET /_health/all"
// by checking the health of all configured services on the cluster
// and responding with a JSON-encoded ClusterHealthResponse.
//
// NOTE(review): ServeHTTP as written never sets a non-200 status on
// the "/_health/all" success path, even when the aggregate health is
// "ERROR"; clients need to inspect the "health" field of the body.
type Aggregator struct {
	// setupOnce ensures setup runs only once; ServeHTTP invokes
	// it at the start of every request.
	setupOnce sync.Once
	// httpClient sends the health pings. setup assigns
	// http.DefaultClient.
	httpClient *http.Client
	// timeout bounds each individual ping. setup replaces a zero
	// value with defaultTimeout.
	timeout arvados.Duration

	// Config supplies the cluster configuration. ServeHTTP calls
	// Config.GetCluster("") on every request.
	Config *arvados.Config

	// If non-nil, Log is called after handling each request.
	Log func(*http.Request, error)
}
36
37 func (agg *Aggregator) setup() {
38         agg.httpClient = http.DefaultClient
39         if agg.timeout == 0 {
40                 // this is always the case, except in the test suite
41                 agg.timeout = defaultTimeout
42         }
43 }
44
45 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
46         agg.setupOnce.Do(agg.setup)
47         sendErr := func(statusCode int, err error) {
48                 resp.WriteHeader(statusCode)
49                 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
50                 if agg.Log != nil {
51                         agg.Log(req, err)
52                 }
53         }
54
55         resp.Header().Set("Content-Type", "application/json")
56
57         cluster, err := agg.Config.GetCluster("")
58         if err != nil {
59                 err = fmt.Errorf("arvados.GetCluster(): %s", err)
60                 sendErr(http.StatusInternalServerError, err)
61                 return
62         }
63         if !agg.checkAuth(req, cluster) {
64                 sendErr(http.StatusUnauthorized, errUnauthorized)
65                 return
66         }
67         if req.URL.Path != "/_health/all" {
68                 sendErr(http.StatusNotFound, errNotFound)
69                 return
70         }
71         json.NewEncoder(resp).Encode(agg.ClusterHealth(cluster))
72         if agg.Log != nil {
73                 agg.Log(req, nil)
74         }
75 }
76
// ClusterHealthResponse is the report produced by ClusterHealth. It
// is what ServeHTTP encodes as the JSON body for "/_health/all".
type ClusterHealthResponse struct {
	// "OK" if all needed services are OK, otherwise "ERROR".
	Health string `json:"health"`

	// An entry for each known health check of each known instance
	// of each needed component: "instance of service S on node N
	// reports health-check C is OK."
	//
	// ClusterHealth keys each entry as "{service}+{ping URL}".
	Checks map[string]CheckResult `json:"checks"`

	// An entry for each service type: "service S is OK." This
	// exposes problems that can't be expressed in Checks, like
	// "service S is needed, but isn't configured to run
	// anywhere."
	Services map[string]ServiceHealth `json:"services"`
}
92
// CheckResult is the outcome of a single health check, i.e., one
// ping of one service instance.
//
// Field notes:
//
//   - Health is "OK" or "ERROR".
//   - Error is the failure description; it is omitted from the JSON
//     encoding when empty.
//   - HTTPStatusCode and HTTPStatusText are the status code and
//     status line of the ping response. They have no explicit JSON
//     name, so they are encoded under their Go field names, and are
//     omitted when zero/empty (e.g., when no response was received).
//   - Response is the decoded JSON body of the ping response.
//   - ResponseTime is the elapsed time of the ping in seconds,
//     formatted with six decimal places.
type CheckResult struct {
	Health         string                 `json:"health"`
	Error          string                 `json:"error,omitempty"`
	HTTPStatusCode int                    `json:",omitempty"`
	HTTPStatusText string                 `json:",omitempty"`
	Response       map[string]interface{} `json:"response"`
	ResponseTime   json.Number            `json:"responseTime"`
}
101
// ServiceHealth summarizes the health of one service type across the
// whole cluster.
type ServiceHealth struct {
	// "OK" if at least one check of this service succeeded,
	// otherwise "ERROR" (ClusterHealth starts every service at
	// "ERROR").
	Health string `json:"health"`
	// Number of checks of this service that reported "OK".
	N int `json:"n"`
}
106
107 func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResponse {
108         resp := ClusterHealthResponse{
109                 Health:   "OK",
110                 Checks:   make(map[string]CheckResult),
111                 Services: make(map[string]ServiceHealth),
112         }
113
114         mtx := sync.Mutex{}
115         wg := sync.WaitGroup{}
116         for node, nodeConfig := range cluster.SystemNodes {
117                 for svc, addr := range nodeConfig.ServicePorts() {
118                         // Ensure svc is listed in resp.Services.
119                         mtx.Lock()
120                         if _, ok := resp.Services[svc]; !ok {
121                                 resp.Services[svc] = ServiceHealth{Health: "ERROR"}
122                         }
123                         mtx.Unlock()
124
125                         if addr == "" {
126                                 // svc is not expected on this node.
127                                 continue
128                         }
129
130                         wg.Add(1)
131                         go func(node, svc, addr string) {
132                                 defer wg.Done()
133                                 var result CheckResult
134                                 url, err := agg.pingURL(node, addr)
135                                 if err != nil {
136                                         result = CheckResult{
137                                                 Health: "ERROR",
138                                                 Error:  err.Error(),
139                                         }
140                                 } else {
141                                         result = agg.ping(url, cluster)
142                                 }
143
144                                 mtx.Lock()
145                                 defer mtx.Unlock()
146                                 resp.Checks[svc+"+"+url] = result
147                                 if result.Health == "OK" {
148                                         h := resp.Services[svc]
149                                         h.N++
150                                         h.Health = "OK"
151                                         resp.Services[svc] = h
152                                 } else {
153                                         resp.Health = "ERROR"
154                                 }
155                         }(node, svc, addr)
156                 }
157         }
158         wg.Wait()
159
160         // Report ERROR if a needed service didn't fail any checks
161         // merely because it isn't configured to run anywhere.
162         for _, sh := range resp.Services {
163                 if sh.Health != "OK" {
164                         resp.Health = "ERROR"
165                         break
166                 }
167         }
168         return resp
169 }
170
171 func (agg *Aggregator) pingURL(node, addr string) (string, error) {
172         _, port, err := net.SplitHostPort(addr)
173         return "http://" + node + ":" + port + "/_health/ping", err
174 }
175
176 func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckResult) {
177         t0 := time.Now()
178
179         var err error
180         defer func() {
181                 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
182                 if err != nil {
183                         result.Health, result.Error = "ERROR", err.Error()
184                 } else {
185                         result.Health = "OK"
186                 }
187         }()
188
189         req, err := http.NewRequest("GET", url, nil)
190         if err != nil {
191                 return
192         }
193         req.Header.Set("Authorization", "Bearer "+cluster.ManagementToken)
194
195         ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
196         defer cancel()
197         req = req.WithContext(ctx)
198         resp, err := agg.httpClient.Do(req)
199         if err != nil {
200                 return
201         }
202         result.HTTPStatusCode = resp.StatusCode
203         result.HTTPStatusText = resp.Status
204         err = json.NewDecoder(resp.Body).Decode(&result.Response)
205         if err != nil {
206                 err = fmt.Errorf("cannot decode response: %s", err)
207         } else if resp.StatusCode != http.StatusOK {
208                 err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
209         } else if h, _ := result.Response["health"].(string); h != "OK" {
210                 if e, ok := result.Response["error"].(string); ok && e != "" {
211                         err = errors.New(e)
212                 } else {
213                         err = fmt.Errorf("health=%q in ping response", h)
214                 }
215         }
216         return
217 }
218
219 func (agg *Aggregator) checkAuth(req *http.Request, cluster *arvados.Cluster) bool {
220         creds := auth.NewCredentialsFromHTTPRequest(req)
221         for _, token := range creds.Tokens {
222                 if token != "" && token == cluster.ManagementToken {
223                         return true
224                 }
225         }
226         return false
227 }