15003: Remove NodeProfiles section from cluster config.
[arvados.git] / sdk / go / health / aggregator.go
index 334584b6225f98c59bbaa8912550c9566739b109..d8c0a4abfbafead971e25fb7faa330484d8dedee 100644 (file)
@@ -1,11 +1,16 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 package health
 
 import (
        "context"
        "encoding/json"
+       "errors"
        "fmt"
-       "net"
        "net/http"
+       "net/url"
        "sync"
        "time"
 
@@ -82,7 +87,7 @@ type ClusterHealthResponse struct {
        // exposes problems that can't be expressed in Checks, like
        // "service S is needed, but isn't configured to run
        // anywhere."
-       Services map[string]ServiceHealth `json:"services"`
+       Services map[arvados.ServiceName]ServiceHealth `json:"services"`
 }
 
 type CheckResult struct {
@@ -103,51 +108,46 @@ func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResp
        resp := ClusterHealthResponse{
                Health:   "OK",
                Checks:   make(map[string]CheckResult),
-               Services: make(map[string]ServiceHealth),
+               Services: make(map[arvados.ServiceName]ServiceHealth),
        }
 
        mtx := sync.Mutex{}
        wg := sync.WaitGroup{}
-       for node, nodeConfig := range cluster.SystemNodes {
-               for svc, addr := range nodeConfig.ServicePorts() {
-                       // Ensure svc is listed in resp.Services.
-                       mtx.Lock()
-                       if _, ok := resp.Services[svc]; !ok {
-                               resp.Services[svc] = ServiceHealth{Health: "ERROR"}
-                       }
-                       mtx.Unlock()
-
-                       if addr == "" {
-                               // svc is not expected on this node.
-                               continue
-                       }
+       for svcName, svc := range cluster.Services.Map() {
+               // Ensure svc is listed in resp.Services.
+               mtx.Lock()
+               if _, ok := resp.Services[svcName]; !ok {
+                       resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
+               }
+               mtx.Unlock()
 
+               for addr := range svc.InternalURLs {
                        wg.Add(1)
-                       go func(node, svc, addr string) {
+                       go func(svcName arvados.ServiceName, addr arvados.URL) {
                                defer wg.Done()
                                var result CheckResult
-                               url, err := agg.pingURL(node, addr)
+                               pingURL, err := agg.pingURL(addr)
                                if err != nil {
                                        result = CheckResult{
                                                Health: "ERROR",
                                                Error:  err.Error(),
                                        }
                                } else {
-                                       result = agg.ping(url, cluster)
+                                       result = agg.ping(pingURL, cluster)
                                }
 
                                mtx.Lock()
                                defer mtx.Unlock()
-                               resp.Checks[svc+"+"+url] = result
+                               resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
                                if result.Health == "OK" {
-                                       h := resp.Services[svc]
+                                       h := resp.Services[svcName]
                                        h.N++
                                        h.Health = "OK"
-                                       resp.Services[svc] = h
+                                       resp.Services[svcName] = h
                                } else {
                                        resp.Health = "ERROR"
                                }
-                       }(node, svc, addr)
+                       }(svcName, addr)
                }
        }
        wg.Wait()
@@ -163,12 +163,12 @@ func (agg *Aggregator) ClusterHealth(cluster *arvados.Cluster) ClusterHealthResp
        return resp
 }
 
-func (agg *Aggregator) pingURL(node, addr string) (string, error) {
-       _, port, err := net.SplitHostPort(addr)
-       return "http://" + node + ":" + port + "/_health/ping", err
+func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
+       base := url.URL(svcURL) // convert to net/url's URL type so its Parse method can be called
+       return base.Parse("/_health/ping") // resolved against base: scheme and host come from svcURL, path becomes /_health/ping
 }
 
-func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckResult) {
+func (agg *Aggregator) ping(target *url.URL, cluster *arvados.Cluster) (result CheckResult) {
        t0 := time.Now()
 
        var err error
@@ -181,7 +181,7 @@ func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckR
                }
        }()
 
-       req, err := http.NewRequest("GET", url, nil)
+       req, err := http.NewRequest("GET", target.String(), nil)
        if err != nil {
                return
        }
@@ -199,16 +199,20 @@ func (agg *Aggregator) ping(url string, cluster *arvados.Cluster) (result CheckR
        err = json.NewDecoder(resp.Body).Decode(&result.Response)
        if err != nil {
                err = fmt.Errorf("cannot decode response: %s", err)
-               return
        } else if resp.StatusCode != http.StatusOK {
                err = fmt.Errorf("HTTP %d %s", resp.StatusCode, resp.Status)
-               return
+       } else if h, _ := result.Response["health"].(string); h != "OK" {
+               if e, ok := result.Response["error"].(string); ok && e != "" {
+                       err = errors.New(e)
+               } else {
+                       err = fmt.Errorf("health=%q in ping response", h)
+               }
        }
        return
 }
 
 func (agg *Aggregator) checkAuth(req *http.Request, cluster *arvados.Cluster) bool {
-       creds := auth.NewCredentialsFromHTTPRequest(req)
+       creds := auth.CredentialsFromRequest(req)
        for _, token := range creds.Tokens {
                if token != "" && token == cluster.ManagementToken {
                        return true