16217: Exit service command if the service handler fails.
[arvados.git] / sdk / go / health / aggregator.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 package health
6
7 import (
8         "context"
9         "encoding/json"
10         "errors"
11         "fmt"
12         "net/http"
13         "net/url"
14         "sync"
15         "time"
16
17         "git.arvados.org/arvados.git/sdk/go/arvados"
18         "git.arvados.org/arvados.git/sdk/go/auth"
19 )
20
// defaultTimeout is the per-ping timeout that setup installs when
// an Aggregator's timeout field has been left at zero.
const defaultTimeout = arvados.Duration(2 * time.Second)
22
// Aggregator implements http.Handler. It handles "/_health/all" by
// checking the health of all configured services on the cluster and
// responding with a JSON-encoded ClusterHealthResponse, whose
// "health" field is "OK" only if everything is healthy. It also
// answers "/_health/ping" for itself. Every request must carry the
// cluster's management token.
type Aggregator struct {
	// setupOnce ensures setup runs only once, on first use.
	setupOnce sync.Once
	// httpClient is used to ping services; setup assigns
	// http.DefaultClient.
	httpClient *http.Client
	// timeout bounds each ping. If zero, setup replaces it with
	// defaultTimeout.
	timeout arvados.Duration

	// Cluster supplies the services to check and the management
	// token used both to authenticate incoming requests and to
	// authorize outgoing pings.
	Cluster *arvados.Cluster

	// If non-nil, Log is called after handling each request. The
	// error argument is nil on success, otherwise it is the error
	// that was reported to the client.
	Log func(*http.Request, error)
}
36
37 func (agg *Aggregator) setup() {
38         agg.httpClient = http.DefaultClient
39         if agg.timeout == 0 {
40                 // this is always the case, except in the test suite
41                 agg.timeout = defaultTimeout
42         }
43 }
44
// CheckHealth always returns nil: the aggregator does not report any
// health problem of its own through this method.
func (agg *Aggregator) CheckHealth() error {
	return nil
}
48
// Done returns a nil channel. A receive from a nil channel never
// completes, so a caller waiting on Done is never told that the
// aggregator has finished.
func (agg *Aggregator) Done() <-chan struct{} {
	return nil
}
52
53 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
54         agg.setupOnce.Do(agg.setup)
55         sendErr := func(statusCode int, err error) {
56                 resp.WriteHeader(statusCode)
57                 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
58                 if agg.Log != nil {
59                         agg.Log(req, err)
60                 }
61         }
62
63         resp.Header().Set("Content-Type", "application/json")
64
65         if !agg.checkAuth(req) {
66                 sendErr(http.StatusUnauthorized, errUnauthorized)
67                 return
68         }
69         if req.URL.Path == "/_health/all" {
70                 json.NewEncoder(resp).Encode(agg.ClusterHealth())
71         } else if req.URL.Path == "/_health/ping" {
72                 resp.Write(healthyBody)
73         } else {
74                 sendErr(http.StatusNotFound, errNotFound)
75                 return
76         }
77         if agg.Log != nil {
78                 agg.Log(req, nil)
79         }
80 }
81
// ClusterHealthResponse is the result of ClusterHealth, and the JSON
// document served at "/_health/all".
type ClusterHealthResponse struct {
	// "OK" if all needed services are OK, otherwise "ERROR".
	Health string `json:"health"`

	// An entry for each known health check of each known instance
	// of each needed component: "instance of service S on node N
	// reports health-check C is OK." Keys have the form
	// "{service name}+{ping URL}".
	Checks map[string]CheckResult `json:"checks"`

	// An entry for each service type: "service S is OK." This
	// exposes problems that can't be expressed in Checks, like
	// "service S is needed, but isn't configured to run
	// anywhere."
	Services map[arvados.ServiceName]ServiceHealth `json:"services"`
}
97
// CheckResult describes the outcome of a single health ping.
type CheckResult struct {
	// "OK" if the ping succeeded, otherwise "ERROR".
	Health string `json:"health"`
	// Description of the failure; empty (and omitted from JSON)
	// when Health is "OK".
	Error string `json:"error,omitempty"`
	// HTTP status code and status line of the ping response. Both
	// stay at their zero values, and are omitted from JSON, if no
	// response was received. With no name in the tag, the JSON keys
	// are the Go field names.
	HTTPStatusCode int    `json:",omitempty"`
	HTTPStatusText string `json:",omitempty"`
	// The ping response body, decoded from JSON.
	Response map[string]interface{} `json:"response"`
	// Elapsed time of the ping in seconds, formatted with six
	// decimal places.
	ResponseTime json.Number `json:"responseTime"`
}
106
// ServiceHealth summarizes the health of one service type across all
// of its instances.
type ServiceHealth struct {
	// "OK" if at least one instance passed its health check,
	// otherwise "ERROR".
	Health string `json:"health"`
	// Number of instances that passed their health check.
	N int `json:"n"`
}
111
112 func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
113         agg.setupOnce.Do(agg.setup)
114         resp := ClusterHealthResponse{
115                 Health:   "OK",
116                 Checks:   make(map[string]CheckResult),
117                 Services: make(map[arvados.ServiceName]ServiceHealth),
118         }
119
120         mtx := sync.Mutex{}
121         wg := sync.WaitGroup{}
122         for svcName, svc := range agg.Cluster.Services.Map() {
123                 // Ensure svc is listed in resp.Services.
124                 mtx.Lock()
125                 if _, ok := resp.Services[svcName]; !ok {
126                         resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
127                 }
128                 mtx.Unlock()
129
130                 for addr := range svc.InternalURLs {
131                         wg.Add(1)
132                         go func(svcName arvados.ServiceName, addr arvados.URL) {
133                                 defer wg.Done()
134                                 var result CheckResult
135                                 pingURL, err := agg.pingURL(addr)
136                                 if err != nil {
137                                         result = CheckResult{
138                                                 Health: "ERROR",
139                                                 Error:  err.Error(),
140                                         }
141                                 } else {
142                                         result = agg.ping(pingURL)
143                                 }
144
145                                 mtx.Lock()
146                                 defer mtx.Unlock()
147                                 resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
148                                 if result.Health == "OK" {
149                                         h := resp.Services[svcName]
150                                         h.N++
151                                         h.Health = "OK"
152                                         resp.Services[svcName] = h
153                                 } else {
154                                         resp.Health = "ERROR"
155                                 }
156                         }(svcName, addr)
157                 }
158         }
159         wg.Wait()
160
161         // Report ERROR if a needed service didn't fail any checks
162         // merely because it isn't configured to run anywhere.
163         for _, sh := range resp.Services {
164                 if sh.Health != "OK" {
165                         resp.Health = "ERROR"
166                         break
167                 }
168         }
169         return resp
170 }
171
172 func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
173         base := url.URL(svcURL)
174         return base.Parse("/_health/ping")
175 }
176
177 func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
178         t0 := time.Now()
179
180         var err error
181         defer func() {
182                 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
183                 if err != nil {
184                         result.Health, result.Error = "ERROR", err.Error()
185                 } else {
186                         result.Health = "OK"
187                 }
188         }()
189
190         req, err := http.NewRequest("GET", target.String(), nil)
191         if err != nil {
192                 return
193         }
194         req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken)
195
196         ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
197         defer cancel()
198         req = req.WithContext(ctx)
199         resp, err := agg.httpClient.Do(req)
200         if err != nil {
201                 return
202         }
203         result.HTTPStatusCode = resp.StatusCode
204         result.HTTPStatusText = resp.Status
205         err = json.NewDecoder(resp.Body).Decode(&result.Response)
206         if err != nil {
207                 err = fmt.Errorf("cannot decode response: %s", err)
208         } else if resp.StatusCode != http.StatusOK {
209                 err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
210         } else if h, _ := result.Response["health"].(string); h != "OK" {
211                 if e, ok := result.Response["error"].(string); ok && e != "" {
212                         err = errors.New(e)
213                 } else {
214                         err = fmt.Errorf("health=%q in ping response", h)
215                 }
216         }
217         return
218 }
219
220 func (agg *Aggregator) checkAuth(req *http.Request) bool {
221         creds := auth.CredentialsFromRequest(req)
222         for _, token := range creds.Tokens {
223                 if token != "" && token == agg.Cluster.ManagementToken {
224                         return true
225                 }
226         }
227         return false
228 }