12260: Fix remote ping auth. Make server work.
[arvados.git] / sdk / go / health / aggregator.go
1 package health
2
3 import (
4         "context"
5         "encoding/json"
6         "fmt"
7         "net"
8         "net/http"
9         "sync"
10         "time"
11
12         "git.curoverse.com/arvados.git/sdk/go/arvados"
13         "git.curoverse.com/arvados.git/sdk/go/auth"
14 )
15
// defaultTimeout is the per-ping timeout that setup installs when the
// Aggregator's timeout field has been left at zero.
const defaultTimeout = arvados.Duration(2 * time.Second)
17
// Aggregator implements http.Handler. It handles "GET /_health/all"
// by checking the health of all configured services on the cluster
// and responding 200 if everything is healthy.
type Aggregator struct {
	// setupOnce ensures setup runs only once; ServeHTTP triggers
	// it at the start of every request.
	setupOnce sync.Once
	// httpClient is the client used to send health pings. setup
	// assigns http.DefaultClient.
	httpClient *http.Client
	// timeout bounds each individual ping. A zero value is
	// replaced with defaultTimeout by setup.
	timeout arvados.Duration

	// Config is the cluster configuration. If it is nil when a
	// request arrives, ServeHTTP loads it with arvados.GetConfig()
	// and stores the result here.
	Config *arvados.Config

	// If non-nil, Log is called after handling each request.
	Log func(*http.Request, error)
}
31
32 func (agg *Aggregator) setup() {
33         agg.httpClient = http.DefaultClient
34         if agg.timeout == 0 {
35                 // this is always the case, except in the test suite
36                 agg.timeout = defaultTimeout
37         }
38 }
39
40 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
41         agg.setupOnce.Do(agg.setup)
42         sendErr := func(statusCode int, err error) {
43                 resp.WriteHeader(statusCode)
44                 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
45                 if agg.Log != nil {
46                         agg.Log(req, err)
47                 }
48         }
49
50         resp.Header().Set("Content-Type", "application/json")
51
52         if agg.Config == nil {
53                 cfg, err := arvados.GetConfig()
54                 if err != nil {
55                         err = fmt.Errorf("arvados.GetConfig(): %s", err)
56                         sendErr(http.StatusInternalServerError, err)
57                         return
58                 }
59                 agg.Config = cfg
60         }
61         cluster, err := agg.Config.GetCluster("")
62         if err != nil {
63                 err = fmt.Errorf("arvados.GetCluster(): %s", err)
64                 sendErr(http.StatusInternalServerError, err)
65                 return
66         }
67         if !agg.checkAuth(req, cluster) {
68                 sendErr(http.StatusUnauthorized, errUnauthorized)
69                 return
70         }
71         if req.URL.Path != "/_health/all" {
72                 sendErr(http.StatusNotFound, errNotFound)
73                 return
74         }
75         json.NewEncoder(resp).Encode(agg.ClusterHealth(cluster))
76         if agg.Log != nil {
77                 agg.Log(req, nil)
78         }
79 }
80
// ServiceHealth summarizes the ping results for one service type
// across all nodes.
type ServiceHealth struct {
	// Health is "OK" if at least one instance answered its ping
	// successfully (N > 0), otherwise "ERROR".
	Health string `json:"health"`
	// N is the number of instances whose ping succeeded.
	N int `json:"n"`
}
85
// ClusterHealthResponse is the value returned by ClusterHealth and
// encoded as the JSON body of a successful "/_health/all" response.
type ClusterHealthResponse struct {
	// Health is "OK" unless at least one ping failed, in which
	// case it is "ERROR".
	Health string `json:"health"`
	// Checks holds the result of every individual ping, keyed by
	// "<node>/<service>/_health/ping".
	Checks map[string]CheckResponse `json:"checks"`
	// Services holds one summary per service type, keyed by
	// service name.
	Services map[string]ServiceHealth `json:"services"`
}
91
// CheckResponse records the outcome of a single health ping.
type CheckResponse struct {
	// Status is the HTTP status code of the ping response; it
	// stays zero when no response was received.
	Status int `json:"status"`
	// Health is the health string reported by the service, or
	// "ERROR" when the ping failed.
	Health string `json:"health"`
	// Error describes the failure, if any.
	Error string `json:"error,omitempty"`
	// ResponseTime is the elapsed time of the ping in seconds.
	ResponseTime json.Number `json:"responseTime"`
}

// OK reports whether the ping came back with HTTP 200 and the service
// described itself as "OK".
func (r *CheckResponse) OK() bool {
	if r.Status != http.StatusOK {
		return false
	}
	return r.Health == "OK"
}
102
103 func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResponse {
104         resp := ClusterHealthResponse{
105                 Health:   "OK",
106                 Checks:   make(map[string]CheckResponse),
107                 Services: make(map[string]ServiceHealth),
108         }
109
110         mtx := sync.Mutex{}
111         wg := sync.WaitGroup{}
112         for node, nodeConfig := range cluster.SystemNodes {
113                 for svc, addr := range map[string]string{
114                         "keepstore": nodeConfig.Keepstore.Listen,
115                 } {
116                         if addr == "" {
117                                 continue
118                         }
119                         wg.Add(1)
120                         go func(node string) {
121                                 defer wg.Done()
122                                 pingResp := agg.ping(node, addr, cluster)
123
124                                 mtx.Lock()
125                                 defer mtx.Unlock()
126                                 resp.Checks[node+"/"+svc+"/_health/ping"] = pingResp
127                                 svHealth := resp.Services[svc]
128                                 if pingResp.OK() {
129                                         svHealth.N++
130                                 } else {
131                                         resp.Health = "ERROR"
132                                 }
133                                 resp.Services[svc] = svHealth
134                         }(node)
135                 }
136         }
137         wg.Wait()
138
139         for svc, svHealth := range resp.Services {
140                 if svHealth.N > 0 {
141                         svHealth.Health = "OK"
142                 } else {
143                         svHealth.Health = "ERROR"
144                 }
145                 resp.Services[svc] = svHealth
146         }
147
148         return resp
149 }
150
151 func (agg *Aggregator) ping(node, addr string, cluster *arvados.Cluster) (result CheckResponse) {
152         t0 := time.Now()
153
154         var err error
155         defer func() {
156                 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
157                 if err != nil {
158                         result.Health, result.Error = "ERROR", err.Error()
159                 }
160         }()
161
162         _, port, err := net.SplitHostPort(addr)
163         if err != nil {
164                 return
165         }
166         req, err := http.NewRequest("GET", "http://"+node+":"+port+"/_health/ping", nil)
167         if err != nil {
168                 return
169         }
170         req.Header.Set("Authorization", "Bearer "+cluster.ManagementToken)
171
172         ctx, cancel := context.WithCancel(req.Context())
173         go func() {
174                 select {
175                 case <-time.After(time.Duration(agg.timeout)):
176                         cancel()
177                 case <-ctx.Done():
178                 }
179         }()
180         req = req.WithContext(ctx)
181         resp, err := agg.httpClient.Do(req)
182         if err != nil {
183                 return
184         }
185         result.Status = resp.StatusCode
186         err = json.NewDecoder(resp.Body).Decode(&result)
187         if err != nil {
188                 err = fmt.Errorf("cannot decode response: %s", err)
189                 return
190         }
191         if resp.StatusCode != http.StatusOK {
192                 err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
193                 return
194         }
195         return
196 }
197
198 func (agg *Aggregator) checkAuth(req *http.Request, cluster *arvados.Cluster) bool {
199         creds := auth.NewCredentialsFromHTTPRequest(req)
200         for _, token := range creds.Tokens {
201                 if token != "" && token == cluster.ManagementToken {
202                         return true
203                 }
204         }
205         return false
206 }