18794: Avoid failing health check on incomplete-but-working config.
authorTom Clegg <tom@curii.com>
Wed, 13 Apr 2022 19:11:56 +0000 (15:11 -0400)
committerTom Clegg <tom@curii.com>
Tue, 26 Apr 2022 15:19:20 +0000 (11:19 -0400)
It is typical to have some InternalURLs that are only reachable by
from the same node (not necessarily where health-check runs) or are
missing entirely because only Nginx needs to know where they are.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

sdk/go/health/aggregator.go
sdk/go/health/aggregator_test.go

index 249df5e23b5e74904d061a3bb63fbad2f2e2b2f6..07fc72e08fe958bb55729804e8e7961c176a71e7 100644 (file)
@@ -6,6 +6,7 @@ package health
 
 import (
        "bufio"
+       "bytes"
        "context"
        "crypto/tls"
        "encoding/json"
@@ -13,6 +14,7 @@ import (
        "flag"
        "fmt"
        "io"
+       "net"
        "net/http"
        "net/url"
        "regexp"
@@ -131,7 +133,7 @@ type Metrics struct {
 }
 
 type ServiceHealth struct {
-       Health string `json:"health"`
+       Health string `json:"health"` // "OK", "ERROR", or "SKIP"
        N      int    `json:"n"`
 }
 
@@ -149,7 +151,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
                // Ensure svc is listed in resp.Services.
                mtx.Lock()
                if _, ok := resp.Services[svcName]; !ok {
-                       resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
+                       resp.Services[svcName] = ServiceHealth{Health: "NONE"}
                }
                mtx.Unlock()
 
@@ -173,11 +175,13 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
                                        }
                                } else {
                                        result = agg.ping(pingURL)
-                                       m, err := agg.metrics(pingURL)
-                                       if err != nil {
-                                               result.Error = "metrics: " + err.Error()
+                                       if result.Health != "SKIP" {
+                                               m, err := agg.metrics(pingURL)
+                                               if err != nil && result.Error == "" {
+                                                       result.Error = "metrics: " + err.Error()
+                                               }
+                                               result.Metrics = m
                                        }
-                                       result.Metrics = m
                                }
 
                                mtx.Lock()
@@ -188,7 +192,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
                                        h.N++
                                        h.Health = "OK"
                                        resp.Services[svcName] = h
-                               } else {
+                               } else if result.Health != "SKIP" {
                                        resp.Health = "ERROR"
                                }
                        }(svcName, addr)
@@ -198,10 +202,20 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
 
        // Report ERROR if a needed service didn't fail any checks
        // merely because it isn't configured to run anywhere.
-       for _, sh := range resp.Services {
-               if sh.Health != "OK" {
-                       resp.Health = "ERROR"
-                       break
+       for svcName, sh := range resp.Services {
+               switch svcName {
+               case arvados.ServiceNameDispatchCloud,
+                       arvados.ServiceNameDispatchLSF:
+                       // ok to not run any given dispatcher
+               case arvados.ServiceNameHealth,
+                       arvados.ServiceNameWorkbench1,
+                       arvados.ServiceNameWorkbench2:
+                       // typically doesn't have InternalURLs in config
+               default:
+                       if sh.Health != "OK" && sh.Health != "SKIP" {
+                               resp.Health = "ERROR"
+                               continue
+                       }
                }
        }
 
@@ -253,6 +267,16 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
        req.Header.Set("X-Forwarded-Proto", "https")
 
        resp, err := agg.httpClient.Do(req)
+       if urlerr, ok := err.(*url.Error); ok {
+               if neterr, ok := urlerr.Err.(*net.OpError); ok && isLocalHost(target.Hostname()) {
+                       result = CheckResult{
+                               Health: "SKIP",
+                               Error:  neterr.Error(),
+                       }
+                       err = nil
+                       return
+               }
+       }
        if err != nil {
                result.Error = err.Error()
                return
@@ -320,6 +344,13 @@ func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) {
        return
 }
 
+// Test whether host is an easily recognizable loopback address:
+// 0.0.0.0, 127.x.x.x, ::1, or localhost.
+func isLocalHost(host string) bool {
+       ip := net.ParseIP(host)
+       return ip.IsLoopback() || bytes.Equal(ip.To4(), []byte{0, 0, 0, 0}) || strings.EqualFold(host, "localhost")
+}
+
 func (agg *Aggregator) checkAuth(req *http.Request) bool {
        creds := auth.CredentialsFromRequest(req)
        for _, token := range creds.Tokens {
index 5d76c19f29bac94def16b00c7441f5874d4140c4..050b2c0fcc2c12ffdf8594cb2637d8609213a58e 100644 (file)
@@ -123,6 +123,44 @@ func (s *AggregatorSuite) TestHealthyAndUnhealthy(c *check.C) {
        c.Logf("%#v", ep)
 }
 
+// If an InternalURL host is 0.0.0.0, localhost, 127/8, or ::1 and
+// nothing is listening there, don't fail the health check -- instead,
+// assume the relevant component just isn't installed/enabled on this
+// node, but does work when contacted through ExternalURL.
+func (s *AggregatorSuite) TestUnreachableLoopbackPort(c *check.C) {
+       srvH, listenH := s.stubServer(&healthyHandler{})
+       defer srvH.Close()
+       s.setAllServiceURLs(listenH)
+       arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepproxy, "http://localhost:9/")
+       arvadostest.SetServiceURL(&s.handler.Cluster.Services.Workbench1, "http://0.0.0.0:9/")
+       arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepbalance, "http://127.0.0.127:9/")
+       arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://[::1]:9/")
+       s.handler.ServeHTTP(s.resp, s.req)
+       s.checkOK(c)
+
+       // If a non-loopback address is unreachable, that's still a
+       // fail.
+       s.resp = httptest.NewRecorder()
+       arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://172.31.255.254:9/")
+       s.handler.ServeHTTP(s.resp, s.req)
+       s.checkUnhealthy(c)
+}
+
+func (s *AggregatorSuite) TestIsLocalHost(c *check.C) {
+       c.Check(isLocalHost("Localhost"), check.Equals, true)
+       c.Check(isLocalHost("localhost"), check.Equals, true)
+       c.Check(isLocalHost("127.0.0.1"), check.Equals, true)
+       c.Check(isLocalHost("127.0.0.127"), check.Equals, true)
+       c.Check(isLocalHost("127.1.2.7"), check.Equals, true)
+       c.Check(isLocalHost("0.0.0.0"), check.Equals, true)
+       c.Check(isLocalHost("::1"), check.Equals, true)
+       c.Check(isLocalHost("1.2.3.4"), check.Equals, false)
+       c.Check(isLocalHost("1::1"), check.Equals, false)
+       c.Check(isLocalHost("example.com"), check.Equals, false)
+       c.Check(isLocalHost("127.0.0"), check.Equals, false)
+       c.Check(isLocalHost(""), check.Equals, false)
+}
+
 func (s *AggregatorSuite) TestConfigMismatch(c *check.C) {
        // time1/hash1: current config
        time1 := time.Now().Add(time.Second - time.Minute - time.Hour)