ba532dddca7ee108e0af371086a02a71c078f0db
[arvados.git] / sdk / go / health / aggregator.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 package health
6
7 import (
8         "context"
9         "crypto/tls"
10         "encoding/json"
11         "errors"
12         "flag"
13         "fmt"
14         "io"
15         "net/http"
16         "net/url"
17         "sync"
18         "time"
19
20         "git.arvados.org/arvados.git/lib/cmd"
21         "git.arvados.org/arvados.git/lib/config"
22         "git.arvados.org/arvados.git/sdk/go/arvados"
23         "git.arvados.org/arvados.git/sdk/go/auth"
24         "git.arvados.org/arvados.git/sdk/go/ctxlog"
25         "github.com/ghodss/yaml"
26         "github.com/sirupsen/logrus"
27 )
28
29 const defaultTimeout = arvados.Duration(2 * time.Second)
30
31 // Aggregator implements service.Handler. It handles "GET /_health/all"
32 // by checking the health of all configured services on the cluster
33 // and responding 200 if everything is healthy.
34 type Aggregator struct {
35         setupOnce  sync.Once
36         httpClient *http.Client
37         timeout    arvados.Duration
38
39         Cluster *arvados.Cluster
40
41         // If non-nil, Log is called after handling each request.
42         Log func(*http.Request, error)
43 }
44
45 func (agg *Aggregator) setup() {
46         agg.httpClient = &http.Client{
47                 Transport: &http.Transport{
48                         TLSClientConfig: &tls.Config{
49                                 InsecureSkipVerify: agg.Cluster.TLS.Insecure,
50                         },
51                 },
52         }
53         if agg.timeout == 0 {
54                 // this is always the case, except in the test suite
55                 agg.timeout = defaultTimeout
56         }
57 }
58
59 func (agg *Aggregator) CheckHealth() error {
60         return nil
61 }
62
63 func (agg *Aggregator) Done() <-chan struct{} {
64         return nil
65 }
66
67 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
68         agg.setupOnce.Do(agg.setup)
69         sendErr := func(statusCode int, err error) {
70                 resp.WriteHeader(statusCode)
71                 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
72                 if agg.Log != nil {
73                         agg.Log(req, err)
74                 }
75         }
76
77         resp.Header().Set("Content-Type", "application/json")
78
79         if !agg.checkAuth(req) {
80                 sendErr(http.StatusUnauthorized, errUnauthorized)
81                 return
82         }
83         if req.URL.Path == "/_health/all" {
84                 json.NewEncoder(resp).Encode(agg.ClusterHealth())
85         } else if req.URL.Path == "/_health/ping" {
86                 resp.Write(healthyBody)
87         } else {
88                 sendErr(http.StatusNotFound, errNotFound)
89                 return
90         }
91         if agg.Log != nil {
92                 agg.Log(req, nil)
93         }
94 }
95
96 type ClusterHealthResponse struct {
97         // "OK" if all needed services are OK, otherwise "ERROR".
98         Health string `json:"health"`
99
100         // An entry for each known health check of each known instance
101         // of each needed component: "instance of service S on node N
102         // reports health-check C is OK."
103         Checks map[string]CheckResult `json:"checks"`
104
105         // An entry for each service type: "service S is OK." This
106         // exposes problems that can't be expressed in Checks, like
107         // "service S is needed, but isn't configured to run
108         // anywhere."
109         Services map[arvados.ServiceName]ServiceHealth `json:"services"`
110 }
111
// CheckResult is the outcome of pinging a single service address.
type CheckResult struct {
	// Health is "OK" or "ERROR".
	Health string `json:"health"`
	// Error describes what went wrong; it is omitted from the
	// JSON encoding when empty.
	Error string `json:"error,omitempty"`
	// HTTPStatusCode and HTTPStatusText are copied from the ping
	// response. They stay zero-valued (and are omitted from the
	// JSON encoding) when no HTTP response was received.
	HTTPStatusCode int    `json:",omitempty"`
	HTTPStatusText string `json:",omitempty"`
	// Response is the decoded JSON body of the ping response.
	Response map[string]interface{} `json:"response"`
	// ResponseTime is the duration of the ping in seconds,
	// formatted with six decimal places.
	ResponseTime json.Number `json:"responseTime"`
}
120
// ServiceHealth summarizes the checks of one service type.
type ServiceHealth struct {
	// Health is "OK" once at least one check of this service has
	// succeeded, otherwise "ERROR".
	Health string `json:"health"`
	// N is the number of successful checks for this service.
	N int `json:"n"`
}
125
126 func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
127         agg.setupOnce.Do(agg.setup)
128         resp := ClusterHealthResponse{
129                 Health:   "OK",
130                 Checks:   make(map[string]CheckResult),
131                 Services: make(map[arvados.ServiceName]ServiceHealth),
132         }
133
134         mtx := sync.Mutex{}
135         wg := sync.WaitGroup{}
136         for svcName, svc := range agg.Cluster.Services.Map() {
137                 // Ensure svc is listed in resp.Services.
138                 mtx.Lock()
139                 if _, ok := resp.Services[svcName]; !ok {
140                         resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
141                 }
142                 mtx.Unlock()
143
144                 checkURLs := map[arvados.URL]bool{}
145                 for addr := range svc.InternalURLs {
146                         checkURLs[addr] = true
147                 }
148                 if len(checkURLs) == 0 && svc.ExternalURL.Host != "" {
149                         checkURLs[svc.ExternalURL] = true
150                 }
151                 for addr := range checkURLs {
152                         wg.Add(1)
153                         go func(svcName arvados.ServiceName, addr arvados.URL) {
154                                 defer wg.Done()
155                                 var result CheckResult
156                                 pingURL, err := agg.pingURL(addr)
157                                 if err != nil {
158                                         result = CheckResult{
159                                                 Health: "ERROR",
160                                                 Error:  err.Error(),
161                                         }
162                                 } else {
163                                         result = agg.ping(pingURL)
164                                 }
165
166                                 mtx.Lock()
167                                 defer mtx.Unlock()
168                                 resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
169                                 if result.Health == "OK" {
170                                         h := resp.Services[svcName]
171                                         h.N++
172                                         h.Health = "OK"
173                                         resp.Services[svcName] = h
174                                 } else {
175                                         resp.Health = "ERROR"
176                                 }
177                         }(svcName, addr)
178                 }
179         }
180         wg.Wait()
181
182         // Report ERROR if a needed service didn't fail any checks
183         // merely because it isn't configured to run anywhere.
184         for _, sh := range resp.Services {
185                 if sh.Health != "OK" {
186                         resp.Health = "ERROR"
187                         break
188                 }
189         }
190         return resp
191 }
192
193 func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
194         base := url.URL(svcURL)
195         return base.Parse("/_health/ping")
196 }
197
198 func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
199         t0 := time.Now()
200         defer func() {
201                 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
202         }()
203         result.Health = "ERROR"
204
205         req, err := http.NewRequest("GET", target.String(), nil)
206         if err != nil {
207                 result.Error = err.Error()
208                 return
209         }
210         req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken)
211
212         // Avoid workbench1's redirect-http-to-https feature
213         req.Header.Set("X-Forwarded-Proto", "https")
214
215         ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
216         defer cancel()
217         req = req.WithContext(ctx)
218         resp, err := agg.httpClient.Do(req)
219         if err != nil {
220                 result.Error = err.Error()
221                 return
222         }
223         result.HTTPStatusCode = resp.StatusCode
224         result.HTTPStatusText = resp.Status
225         err = json.NewDecoder(resp.Body).Decode(&result.Response)
226         if err != nil {
227                 result.Error = fmt.Sprintf("cannot decode response: %s", err)
228         } else if resp.StatusCode != http.StatusOK {
229                 result.Error = fmt.Sprintf("HTTP %d %s", resp.StatusCode, resp.Status)
230         } else if h, _ := result.Response["health"].(string); h != "OK" {
231                 if e, ok := result.Response["error"].(string); ok && e != "" {
232                         result.Error = e
233                         return
234                 } else {
235                         result.Error = fmt.Sprintf("health=%q in ping response", h)
236                         return
237                 }
238         }
239         result.Health = "OK"
240         return
241 }
242
243 func (agg *Aggregator) checkAuth(req *http.Request) bool {
244         creds := auth.CredentialsFromRequest(req)
245         for _, token := range creds.Tokens {
246                 if token != "" && token == agg.Cluster.ManagementToken {
247                         return true
248                 }
249         }
250         return false
251 }
252
253 var errSilent = errors.New("")
254
255 var CheckCommand cmd.Handler = checkCommand{}
256
257 type checkCommand struct{}
258
259 func (ccmd checkCommand) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
260         logger := ctxlog.New(stderr, "json", "info")
261         ctx := ctxlog.Context(context.Background(), logger)
262         err := ccmd.run(ctx, prog, args, stdin, stdout, stderr)
263         if err != nil {
264                 if err != errSilent {
265                         fmt.Fprintln(stdout, err.Error())
266                 }
267                 return 1
268         }
269         return 0
270 }
271
272 func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) error {
273         flags := flag.NewFlagSet("", flag.ContinueOnError)
274         flags.SetOutput(stderr)
275         loader := config.NewLoader(stdin, ctxlog.New(stderr, "text", "info"))
276         loader.SetupFlags(flags)
277         versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
278         timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
279         if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
280                 // cmd.ParseFlags already reported the error
281                 return errSilent
282         } else if *versionFlag {
283                 cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
284                 return nil
285         }
286         cfg, err := loader.Load()
287         if err != nil {
288                 return err
289         }
290         cluster, err := cfg.GetCluster("")
291         if err != nil {
292                 return err
293         }
294         logger := ctxlog.New(stderr, cluster.SystemLogs.Format, cluster.SystemLogs.LogLevel).WithFields(logrus.Fields{
295                 "ClusterID": cluster.ClusterID,
296         })
297         ctx = ctxlog.Context(ctx, logger)
298         agg := Aggregator{Cluster: cluster, timeout: arvados.Duration(*timeout)}
299         resp := agg.ClusterHealth()
300         buf, err := yaml.Marshal(resp)
301         if err != nil {
302                 return err
303         }
304         stdout.Write(buf)
305         if resp.Health != "OK" {
306                 return fmt.Errorf("health check failed")
307         }
308         return nil
309 }