From e9ce920388dbee730f68cf5edd387452935fba3e Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 13 Apr 2022 15:11:56 -0400 Subject: [PATCH] 18794: Avoid failing health check on incomplete-but-working config. It is typical to have some InternalURLs that are only reachable by from the same node (not necessarily where health-check runs) or are missing entirely because only Nginx needs to know where they are. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- sdk/go/health/aggregator.go | 53 +++++++++++++++++++++++++------- sdk/go/health/aggregator_test.go | 38 +++++++++++++++++++++++ 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/sdk/go/health/aggregator.go b/sdk/go/health/aggregator.go index 249df5e23b..07fc72e08f 100644 --- a/sdk/go/health/aggregator.go +++ b/sdk/go/health/aggregator.go @@ -6,6 +6,7 @@ package health import ( "bufio" + "bytes" "context" "crypto/tls" "encoding/json" @@ -13,6 +14,7 @@ import ( "flag" "fmt" "io" + "net" "net/http" "net/url" "regexp" @@ -131,7 +133,7 @@ type Metrics struct { } type ServiceHealth struct { - Health string `json:"health"` + Health string `json:"health"` // "OK", "ERROR", or "SKIP" N int `json:"n"` } @@ -149,7 +151,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { // Ensure svc is listed in resp.Services. mtx.Lock() if _, ok := resp.Services[svcName]; !ok { - resp.Services[svcName] = ServiceHealth{Health: "ERROR"} + resp.Services[svcName] = ServiceHealth{Health: "NONE"} } mtx.Unlock() @@ -173,11 +175,13 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { } } else { result = agg.ping(pingURL) - m, err := agg.metrics(pingURL) - if err != nil { - result.Error = "metrics: " + err.Error() + if result.Health != "SKIP" { + m, err := agg.metrics(pingURL) + if err != nil && result.Error == "" { + result.Error = "metrics: " + err.Error() + } + result.Metrics = m } - result.Metrics = m } mtx.Lock() @@ -188,7 +192,7 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { h.N++ h.Health = "OK" resp.Services[svcName] = h - } else { + } else if result.Health != "SKIP" { resp.Health = "ERROR" } }(svcName, addr) @@ -198,10 +202,20 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse { // Report ERROR if a needed service didn't fail any checks // merely because it isn't configured to run anywhere. - for _, sh := range resp.Services { - if sh.Health != "OK" { - resp.Health = "ERROR" - break + for svcName, sh := range resp.Services { + switch svcName { + case arvados.ServiceNameDispatchCloud, + arvados.ServiceNameDispatchLSF: + // ok to not run any given dispatcher + case arvados.ServiceNameHealth, + arvados.ServiceNameWorkbench1, + arvados.ServiceNameWorkbench2: + // typically doesn't have InternalURLs in config + default: + if sh.Health != "OK" && sh.Health != "SKIP" { + resp.Health = "ERROR" + continue + } } } @@ -253,6 +267,16 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) { req.Header.Set("X-Forwarded-Proto", "https") resp, err := agg.httpClient.Do(req) + if urlerr, ok := err.(*url.Error); ok { + if neterr, ok := urlerr.Err.(*net.OpError); ok && isLocalHost(target.Hostname()) { + result = CheckResult{ + Health: "SKIP", + Error: neterr.Error(), + } + err = nil + return + } + } if err != nil { result.Error = err.Error() return @@ -320,6 +344,13 @@ func (agg *Aggregator) metrics(pingURL *url.URL) (result Metrics, err error) { return } +// Test whether host is an easily recognizable loopback address: +// 0.0.0.0, 127.x.x.x, ::1, or localhost. +func isLocalHost(host string) bool { + ip := net.ParseIP(host) + return ip.IsLoopback() || bytes.Equal(ip.To4(), []byte{0, 0, 0, 0}) || strings.EqualFold(host, "localhost") +} + func (agg *Aggregator) checkAuth(req *http.Request) bool { creds := auth.CredentialsFromRequest(req) for _, token := range creds.Tokens { diff --git a/sdk/go/health/aggregator_test.go b/sdk/go/health/aggregator_test.go index 5d76c19f29..050b2c0fcc 100644 --- a/sdk/go/health/aggregator_test.go +++ b/sdk/go/health/aggregator_test.go @@ -123,6 +123,44 @@ func (s *AggregatorSuite) TestHealthyAndUnhealthy(c *check.C) { c.Logf("%#v", ep) } +// If an InternalURL host is 0.0.0.0, localhost, 127/8, or ::1 and +// nothing is listening there, don't fail the health check -- instead, +// assume the relevant component just isn't installed/enabled on this +// node, but does work when contacted through ExternalURL. +func (s *AggregatorSuite) TestUnreachableLoopbackPort(c *check.C) { + srvH, listenH := s.stubServer(&healthyHandler{}) + defer srvH.Close() + s.setAllServiceURLs(listenH) + arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepproxy, "http://localhost:9/") + arvadostest.SetServiceURL(&s.handler.Cluster.Services.Workbench1, "http://0.0.0.0:9/") + arvadostest.SetServiceURL(&s.handler.Cluster.Services.Keepbalance, "http://127.0.0.127:9/") + arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://[::1]:9/") + s.handler.ServeHTTP(s.resp, s.req) + s.checkOK(c) + + // If a non-loopback address is unreachable, that's still a + // fail. + s.resp = httptest.NewRecorder() + arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV, "http://172.31.255.254:9/") + s.handler.ServeHTTP(s.resp, s.req) + s.checkUnhealthy(c) +} + +func (s *AggregatorSuite) TestIsLocalHost(c *check.C) { + c.Check(isLocalHost("Localhost"), check.Equals, true) + c.Check(isLocalHost("localhost"), check.Equals, true) + c.Check(isLocalHost("127.0.0.1"), check.Equals, true) + c.Check(isLocalHost("127.0.0.127"), check.Equals, true) + c.Check(isLocalHost("127.1.2.7"), check.Equals, true) + c.Check(isLocalHost("0.0.0.0"), check.Equals, true) + c.Check(isLocalHost("::1"), check.Equals, true) + c.Check(isLocalHost("1.2.3.4"), check.Equals, false) + c.Check(isLocalHost("1::1"), check.Equals, false) + c.Check(isLocalHost("example.com"), check.Equals, false) + c.Check(isLocalHost("127.0.0"), check.Equals, false) + c.Check(isLocalHost(""), check.Equals, false) +} + func (s *AggregatorSuite) TestConfigMismatch(c *check.C) { // time1/hash1: current config time1 := time.Now().Add(time.Second - time.Minute - time.Hour) -- 2.30.2