12 "git.curoverse.com/arvados.git/sdk/go/arvados"
13 "git.curoverse.com/arvados.git/sdk/go/auth"
// defaultTimeout is the two-second value that setup assigns to
// Aggregator.timeout. ping passes that field to time.After while a
// health check request is in flight.
16 const defaultTimeout = arvados.Duration(2 * time.Second)
18 // Aggregator implements http.Handler. It handles "GET /_health/all"
19 // by checking the health of all configured services on the cluster
20 // and responding 200 if everything is healthy.
21 type Aggregator struct {
// httpClient sends the per-service ping requests; setup points it
// at http.DefaultClient.
23 httpClient *http.Client
// timeout is the duration ping passes to time.After for each check;
// setup assigns defaultTimeout to it.
24 timeout arvados.Duration
// Config supplies the cluster definition via GetCluster. When it is
// nil, ServeHTTP calls arvados.GetConfig() to load one.
26 Config *arvados.Config
28 // If non-nil, Log is called after handling each request.
29 Log func(*http.Request, error)
// setup initializes the unexported fields of the Aggregator:
// httpClient becomes http.DefaultClient and timeout receives
// defaultTimeout. ServeHTTP invokes it through agg.setupOnce.Do.
//
// NOTE(review): the inline comment below implies the timeout
// assignment is conditional (likely "only when no timeout has been
// set yet", so tests can supply their own) — confirm.
32 func (agg *Aggregator) setup() {
33 agg.httpClient = http.DefaultClient
35 // this is always the case, except in the test suite
36 agg.timeout = defaultTimeout
// ServeHTTP answers one health request with a JSON body. It runs the
// one-time setup, resolves the cluster from agg.Config, verifies the
// caller via checkAuth, requires the path "/_health/all", and then
// encodes the result of agg.ClusterHealth.
//
// The auth check comes before the path check, so (assuming each
// failing branch returns) an unauthenticated request for an unknown
// path gets 401 rather than 404.
40 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
41 agg.setupOnce.Do(agg.setup)
// sendErr writes statusCode followed by a JSON object of the form
// {"error": "<message>"}. The Encode result is not checked.
42 sendErr := func(statusCode int, err error) {
43 resp.WriteHeader(statusCode)
44 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
// Set before any sendErr call so error replies are also typed as JSON.
50 resp.Header().Set("Content-Type", "application/json")
// NOTE(review): this nil check runs on every request, outside
// setupOnce. If the loaded config is stored back into agg.Config
// here, concurrent first requests could race — confirm.
52 if agg.Config == nil {
53 cfg, err := arvados.GetConfig()
55 err = fmt.Errorf("arvados.GetConfig(): %s", err)
56 sendErr(http.StatusInternalServerError, err)
// An empty cluster ID is passed; presumably this selects the default
// cluster — TODO confirm against arvados.Config.GetCluster.
61 cluster, err := agg.Config.GetCluster("")
// NOTE(review): the message names arvados.GetCluster(), but the call
// above is the GetCluster method on agg.Config.
63 err = fmt.Errorf("arvados.GetCluster(): %s", err)
64 sendErr(http.StatusInternalServerError, err)
67 if !agg.checkAuth(req, cluster) {
68 sendErr(http.StatusUnauthorized, errUnauthorized)
71 if req.URL.Path != "/_health/all" {
72 sendErr(http.StatusNotFound, errNotFound)
// Success path: the Encode result is not checked here either.
75 json.NewEncoder(resp).Encode(agg.ClusterHealth(cluster))
// ServiceHealth is the per-service summary stored in
// ClusterHealthResponse.Services.
81 type ServiceHealth struct {
// Health is set to "OK" or "ERROR" by ClusterHealth.
82 Health string `json:"health"`
// ClusterHealthResponse is the value built by ClusterHealth and
// JSON-encoded by ServeHTTP as the reply body.
86 type ClusterHealthResponse struct {
// Health is the cluster-wide verdict; ClusterHealth sets it to
// "ERROR" when it records a failure.
87 Health string `json:"health"`
// Checks holds one entry per ping, keyed by
// "<node>/<service>/_health/ping".
88 Checks map[string]CheckResponse `json:"checks"`
// Services holds one summary per service name.
89 Services map[string]ServiceHealth `json:"services"`
// CheckResponse is the outcome of a single ping. ping also decodes
// the remote service's JSON reply directly into this struct, so the
// JSON keys below are shared with the remote payload.
92 type CheckResponse struct {
// Status is the HTTP status code of the ping response.
93 Status int `json:"status"`
// Health is "ERROR" when ping records a failure; OK() expects the
// value "OK" for a successful check.
94 Health string `json:"health"`
// Error carries the error text recorded by ping; omitted when empty.
95 Error string `json:"error,omitempty"`
// ResponseTime is the elapsed time in seconds, formatted by ping
// with six decimal places.
96 ResponseTime json.Number `json:"responseTime"`
// OK reports whether the check succeeded: Health must be "OK" and
// Status must be http.StatusOK.
99 func (r *CheckResponse) OK() bool {
100 return r.Health == "OK" && r.Status == http.StatusOK
// ClusterHealth pings every configured service on every node in
// cluster.SystemNodes, starting one goroutine per check, and collects
// the outcome. Each ping result is stored in Checks under the key
// "<node>/<svc>/_health/ping", and a per-service summary is kept in
// Services. The only service visible in the map literal below is
// "keepstore".
103 func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResponse {
104 resp := ClusterHealthResponse{
106 Checks: make(map[string]CheckResponse),
107 Services: make(map[string]ServiceHealth),
// Presumably used to wait for the ping goroutines started below
// before the summary pass runs — TODO confirm.
111 wg := sync.WaitGroup{}
112 for node, nodeConfig := range cluster.SystemNodes {
// Service name -> listen address as configured for this node.
113 for svc, addr := range map[string]string{
114 "keepstore": nodeConfig.Keepstore.Listen,
// NOTE(review): only node is passed as an argument; svc and addr are
// captured from the enclosing loop. Before Go 1.22 those variables
// are shared across iterations, which is harmless with a single map
// entry but would misreport results if more services were added.
120 go func(node string) {
122 pingResp := agg.ping(node, addr, cluster)
// These map updates run inside the goroutine, so they must be
// serialized; unsynchronized concurrent map writes are a fatal
// error in Go.
126 resp.Checks[node+"/"+svc+"/_health/ping"] = pingResp
127 svHealth := resp.Services[svc]
// The whole cluster is flagged as failing here; presumably this is
// reached only when pingResp is not OK — TODO confirm.
131 resp.Health = "ERROR"
// svHealth is a copy of the map value, so it is written back.
133 resp.Services[svc] = svHealth
// Second pass: assign each service its final "OK"/"ERROR" label.
139 for svc, svHealth := range resp.Services {
141 svHealth.Health = "OK"
143 svHealth.Health = "ERROR"
145 resp.Services[svc] = svHealth
// ping issues "GET http://<node>:<port>/_health/ping" with the
// cluster's ManagementToken as a bearer token and reports the outcome
// as a CheckResponse. Only the port of addr is used; the host part is
// discarded and node is used as the host instead.
151 func (agg *Aggregator) ping(node, addr string, cluster *arvados.Cluster) (result CheckResponse) {
// ResponseTime is the elapsed time since t0 in seconds, formatted
// with six decimal places.
156 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
// Failure path: Health becomes "ERROR" and Error carries err's text.
158 result.Health, result.Error = "ERROR", err.Error()
162 _, port, err := net.SplitHostPort(addr)
166 req, err := http.NewRequest("GET", "http://"+node+":"+port+"/_health/ping", nil)
170 req.Header.Set("Authorization", "Bearer "+cluster.ManagementToken)
// The request context is made cancellable and a timer of agg.timeout
// is armed; presumably the timer case cancels the request — TODO confirm.
172 ctx, cancel := context.WithCancel(req.Context())
175 case <-time.After(time.Duration(agg.timeout)):
180 req = req.WithContext(ctx)
181 resp, err := agg.httpClient.Do(req)
// NOTE(review): confirm that resp.Body is closed on every path; an
// unclosed body keeps the underlying connection from being reused.
// Status is set first and the body is then decoded into the same
// struct, so a "status" key in the remote JSON would overwrite it.
185 result.Status = resp.StatusCode
186 err = json.NewDecoder(resp.Body).Decode(&result)
188 err = fmt.Errorf("cannot decode response: %s", err)
191 if resp.StatusCode != http.StatusOK {
// NOTE(review): resp.Status already starts with the numeric code
// (e.g. "503 Service Unavailable"), so the code appears twice in
// this message.
192 err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
198 func (agg *Aggregator) checkAuth(req *http.Request, cluster *arvados.Cluster) bool {
199 creds := auth.NewCredentialsFromHTTPRequest(req)
200 for _, token := range creds.Tokens {
201 if token != "" && token == cluster.ManagementToken {