12260: Merge branch 'master' into 12260-system-health
[arvados.git] / sdk / go / health / aggregator.go
1 package health
2
import (
	"context"
	"crypto/subtle"
	"encoding/json"
	"fmt"
	"net"
	"net/http"
	"sync"
	"time"

	"git.curoverse.com/arvados.git/sdk/go/arvados"
	"git.curoverse.com/arvados.git/sdk/go/auth"
)
15
// defaultTimeout is the per-ping time limit that setup() installs
// when the Aggregator's timeout field is still zero.
const defaultTimeout = arvados.Duration(2 * time.Second)
17
// Aggregator implements http.Handler. It handles "GET /_health/all"
// by checking the health of all configured services on the cluster
// and responding 200 if everything is healthy.
type Aggregator struct {
	// Internal state. setup() assigns httpClient and, if it is
	// still zero, timeout; ServeHTTP runs setup() through
	// setupOnce so that happens only once.
	setupOnce  sync.Once
	httpClient *http.Client
	timeout    arvados.Duration

	// Config supplies the cluster configuration: ServeHTTP calls
	// Config.GetCluster("") while handling each request.
	Config *arvados.Config

	// If non-nil, Log is called after handling each request.
	// The error argument is nil when the health report was
	// sent, and the reported error otherwise.
	Log func(*http.Request, error)
}
31
32 func (agg *Aggregator) setup() {
33         agg.httpClient = http.DefaultClient
34         if agg.timeout == 0 {
35                 // this is always the case, except in the test suite
36                 agg.timeout = defaultTimeout
37         }
38 }
39
40 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
41         agg.setupOnce.Do(agg.setup)
42         sendErr := func(statusCode int, err error) {
43                 resp.WriteHeader(statusCode)
44                 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
45                 if agg.Log != nil {
46                         agg.Log(req, err)
47                 }
48         }
49
50         resp.Header().Set("Content-Type", "application/json")
51
52         cluster, err := agg.Config.GetCluster("")
53         if err != nil {
54                 err = fmt.Errorf("arvados.GetCluster(): %s", err)
55                 sendErr(http.StatusInternalServerError, err)
56                 return
57         }
58         if !agg.checkAuth(req, cluster) {
59                 sendErr(http.StatusUnauthorized, errUnauthorized)
60                 return
61         }
62         if req.URL.Path != "/_health/all" {
63                 sendErr(http.StatusNotFound, errNotFound)
64                 return
65         }
66         json.NewEncoder(resp).Encode(agg.ClusterHealth(cluster))
67         if agg.Log != nil {
68                 agg.Log(req, nil)
69         }
70 }
71
// ClusterHealthResponse is the report returned by ClusterHealth.
// ServeHTTP JSON-encodes it as the body of a "/_health/all"
// response.
type ClusterHealthResponse struct {
	// "OK" if all needed services are OK, otherwise "ERROR".
	Health string `json:"health"`

	// An entry for each known health check of each known instance
	// of each needed component: "instance of service S on node N
	// reports health-check C is OK."
	Checks map[string]CheckResult `json:"checks"`

	// An entry for each service type: "service S is OK." This
	// exposes problems that can't be expressed in Checks, like
	// "service S is needed, but isn't configured to run
	// anywhere."
	Services map[string]ServiceHealth `json:"services"`
}
87
// CheckResult is the outcome of a single health check, as stored in
// ClusterHealthResponse.Checks.
//
// Health is "OK" or "ERROR", and Error holds the error text when
// Health is "ERROR". HTTPStatusCode, HTTPStatusText and Response are
// filled in from the pinged service's HTTP response when one was
// received; the first two carry no explicit JSON name, so they are
// encoded under their Go field names. ResponseTime is the elapsed
// time of the ping in seconds, formatted with six decimal places.
type CheckResult struct {
	Health         string                 `json:"health"`
	Error          string                 `json:"error,omitempty"`
	HTTPStatusCode int                    `json:",omitempty"`
	HTTPStatusText string                 `json:",omitempty"`
	Response       map[string]interface{} `json:"response"`
	ResponseTime   json.Number            `json:"responseTime"`
}
96
// ServiceHealth summarizes one service type in
// ClusterHealthResponse.Services. Health is "OK" once at least one
// check for the service has succeeded and "ERROR" otherwise; N
// counts the checks for the service that reported "OK".
type ServiceHealth struct {
	Health string `json:"health"`
	N      int    `json:"n"`
}
101
102 func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResponse {
103         resp := ClusterHealthResponse{
104                 Health:   "OK",
105                 Checks:   make(map[string]CheckResult),
106                 Services: make(map[string]ServiceHealth),
107         }
108
109         mtx := sync.Mutex{}
110         wg := sync.WaitGroup{}
111         for node, nodeConfig := range cluster.SystemNodes {
112                 for svc, addr := range nodeConfig.ServicePorts() {
113                         // Ensure svc is listed in resp.Services.
114                         mtx.Lock()
115                         if _, ok := resp.Services[svc]; !ok {
116                                 resp.Services[svc] = ServiceHealth{Health: "ERROR"}
117                         }
118                         mtx.Unlock()
119
120                         if addr == "" {
121                                 // svc is not expected on this node.
122                                 continue
123                         }
124
125                         wg.Add(1)
126                         go func(node, svc, addr string) {
127                                 defer wg.Done()
128                                 var result CheckResult
129                                 url, err := agg.pingURL(node, addr)
130                                 if err != nil {
131                                         result = CheckResult{
132                                                 Health: "ERROR",
133                                                 Error:  err.Error(),
134                                         }
135                                 } else {
136                                         result = agg.ping(url, cluster)
137                                 }
138
139                                 mtx.Lock()
140                                 defer mtx.Unlock()
141                                 resp.Checks[svc+"+"+url] = result
142                                 if result.Health == "OK" {
143                                         h := resp.Services[svc]
144                                         h.N++
145                                         h.Health = "OK"
146                                         resp.Services[svc] = h
147                                 } else {
148                                         resp.Health = "ERROR"
149                                 }
150                         }(node, svc, addr)
151                 }
152         }
153         wg.Wait()
154
155         // Report ERROR if a needed service didn't fail any checks
156         // merely because it isn't configured to run anywhere.
157         for _, sh := range resp.Services {
158                 if sh.Health != "OK" {
159                         resp.Health = "ERROR"
160                         break
161                 }
162         }
163         return resp
164 }
165
166 func (agg *Aggregator) pingURL(node, addr string) (string, error) {
167         _, port, err := net.SplitHostPort(addr)
168         return "http://" + node + ":" + port + "/_health/ping", err
169 }
170
171 func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckResult) {
172         t0 := time.Now()
173
174         var err error
175         defer func() {
176                 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
177                 if err != nil {
178                         result.Health, result.Error = "ERROR", err.Error()
179                 } else {
180                         result.Health = "OK"
181                 }
182         }()
183
184         req, err := http.NewRequest("GET", url, nil)
185         if err != nil {
186                 return
187         }
188         req.Header.Set("Authorization", "Bearer "+cluster.ManagementToken)
189
190         ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
191         defer cancel()
192         req = req.WithContext(ctx)
193         resp, err := agg.httpClient.Do(req)
194         if err != nil {
195                 return
196         }
197         result.HTTPStatusCode = resp.StatusCode
198         result.HTTPStatusText = resp.Status
199         err = json.NewDecoder(resp.Body).Decode(&result.Response)
200         if err != nil {
201                 err = fmt.Errorf("cannot decode response: %s", err)
202                 return
203         } else if resp.StatusCode != http.StatusOK {
204                 err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
205                 return
206         }
207         return
208 }
209
210 func (agg *Aggregator) checkAuth(req *http.Request, cluster *arvados.Cluster) bool {
211         creds := auth.NewCredentialsFromHTTPRequest(req)
212         for _, token := range creds.Tokens {
213                 if token != "" && token == cluster.ManagementToken {
214                         return true
215                 }
216         }
217         return false
218 }