Merge branch 'master' into 13822-nm-delayed-daemon
[arvados.git] / sdk / go / health / aggregator.go
index e881db8827ba278e9c7409b6448ab57941317707..a6cb8798aa328a468c1db98c3c3e5bf38773f15c 100644 (file)
@@ -1,8 +1,13 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 package health
 
 import (
        "context"
        "encoding/json"
+       "errors"
        "fmt"
        "net"
        "net/http"
@@ -69,82 +74,97 @@ func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
        }
 }
 
-type ServiceHealth struct {
+type ClusterHealthResponse struct {
+       // "OK" if all needed services are OK, otherwise "ERROR".
        Health string `json:"health"`
-       N      int    `json:"n"`
-}
 
-type ClusterHealthResponse struct {
-       Health   string                   `json:"health"`
-       Checks   map[string]CheckResponse `json:"checks"`
-       Services map[string]ServiceHealth `json:"services"`
+       // An entry for each known health check of each known instance
+       // of each needed component: "instance of service S on node N
+       // reports health-check C is OK."
+       Checks map[string]CheckResult `json:"checks"`
+
+       // An entry for each service type: "service S is OK." This
+       // exposes problems that can't be expressed in Checks, like
+       // "service S is needed, but isn't configured to run
+       // anywhere."
+       Services map[arvados.ServiceName]ServiceHealth `json:"services"`
 }
 
-type CheckResponse struct {
-       Status       int         `json:"status"`
-       Health       string      `json:"health"`
-       Error        string      `json:"error,omitempty"`
-       ResponseTime json.Number `json:"responseTime"`
+type CheckResult struct {
+       Health         string                 `json:"health"`
+       Error          string                 `json:"error,omitempty"`
+       HTTPStatusCode int                    `json:",omitempty"`
+       HTTPStatusText string                 `json:",omitempty"`
+       Response       map[string]interface{} `json:"response"`
+       ResponseTime   json.Number            `json:"responseTime"`
 }
 
-func (r *CheckResponse) OK() bool {
-       return r.Health == "OK" && r.Status == http.StatusOK
+type ServiceHealth struct {
+       Health string `json:"health"`
+       N      int    `json:"n"`
 }
 
 func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResponse {
        resp := ClusterHealthResponse{
                Health:   "OK",
-               Checks:   make(map[string]CheckResponse),
-               Services: make(map[string]ServiceHealth),
+               Checks:   make(map[string]CheckResult),
+               Services: make(map[arvados.ServiceName]ServiceHealth),
        }
 
        mtx := sync.Mutex{}
        wg := sync.WaitGroup{}
-       for node, nodeConfig := range cluster.SystemNodes {
-               for svc, addr := range map[string]string{
-                       "keepstore": nodeConfig.Keepstore.Listen,
-               } {
+       for profileName, profile := range cluster.NodeProfiles {
+               for svc, addr := range profile.ServicePorts() {
+                       // Ensure svc is listed in resp.Services.
+                       mtx.Lock()
+                       if _, ok := resp.Services[svc]; !ok {
+                               resp.Services[svc] = ServiceHealth{Health: "ERROR"}
+                       }
+                       mtx.Unlock()
+
                        if addr == "" {
+                               // svc is not expected on this node.
                                continue
                        }
+
                        wg.Add(1)
-                       go func(node string) {
+                       go func(profileName string, svc arvados.ServiceName, addr string) {
                                defer wg.Done()
-                               var pingResp CheckResponse
-                               url, err := agg.pingURL(node, addr)
+                               var result CheckResult
+                               url, err := agg.pingURL(profileName, addr)
                                if err != nil {
-                                       pingResp = CheckResponse{
+                                       result = CheckResult{
                                                Health: "ERROR",
                                                Error:  err.Error(),
                                        }
                                } else {
-                                       pingResp = agg.ping(url, cluster)
+                                       result = agg.ping(url, cluster)
                                }
 
                                mtx.Lock()
                                defer mtx.Unlock()
-                               resp.Checks[svc+"+"+url] = pingResp
-                               svHealth := resp.Services[svc]
-                               if pingResp.OK() {
-                                       svHealth.N++
+                               resp.Checks[fmt.Sprintf("%s+%s", svc, url)] = result
+                               if result.Health == "OK" {
+                                       h := resp.Services[svc]
+                                       h.N++
+                                       h.Health = "OK"
+                                       resp.Services[svc] = h
                                } else {
                                        resp.Health = "ERROR"
                                }
-                               resp.Services[svc] = svHealth
-                       }(node)
+                       }(profileName, svc, addr)
                }
        }
        wg.Wait()
 
-       for svc, svHealth := range resp.Services {
-               if svHealth.N > 0 {
-                       svHealth.Health = "OK"
-               } else {
-                       svHealth.Health = "ERROR"
+       // A needed service that isn't configured to run anywhere has
+       // no failed checks, so report ERROR unless every service is OK.
+       for _, sh := range resp.Services {
+               if sh.Health != "OK" {
+                       resp.Health = "ERROR"
+                       break
                }
-               resp.Services[svc] = svHealth
        }
-
        return resp
 }
 
@@ -153,7 +173,7 @@ func (agg *Aggregator) pingURL(node, addr string) (string, error) {
        return "http://" + node + ":" + port + "/_health/ping", err
 }
 
-func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckResponse) {
+func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckResult) {
        t0 := time.Now()
 
        var err error
@@ -161,6 +181,8 @@ func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckR
                result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
                if err != nil {
                        result.Health, result.Error = "ERROR", err.Error()
+               } else {
+                       result.Health = "OK"
                }
        }()
 
@@ -170,28 +192,26 @@ func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckR
        }
        req.Header.Set("Authorization", "Bearer "+cluster.ManagementToken)
 
-       ctx, cancel := context.WithCancel(req.Context())
-       go func() {
-               select {
-               case <-time.After(time.Duration(agg.timeout)):
-                       cancel()
-               case <-ctx.Done():
-               }
-       }()
+       ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
+       defer cancel()
        req = req.WithContext(ctx)
        resp, err := agg.httpClient.Do(req)
        if err != nil {
                return
        }
-       result.Status = resp.StatusCode
-       err = json.NewDecoder(resp.Body).Decode(&result)
+       result.HTTPStatusCode = resp.StatusCode
+       result.HTTPStatusText = resp.Status
+       err = json.NewDecoder(resp.Body).Decode(&result.Response)
        if err != nil {
                err = fmt.Errorf("cannot decode response: %s", err)
-               return
-       }
-       if resp.StatusCode != http.StatusOK {
+       } else if resp.StatusCode != http.StatusOK {
                err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
-               return
+       } else if h, _ := result.Response["health"].(string); h != "OK" {
+               if e, ok := result.Response["error"].(string); ok && e != "" {
+                       err = errors.New(e)
+               } else {
+                       err = fmt.Errorf("health=%q in ping response", h)
+               }
        }
        return
 }