Merge branch '19070-update-workflow-deps' refs #19070
authorPeter Amstutz <peter.amstutz@curii.com>
Fri, 13 May 2022 15:08:08 +0000 (11:08 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Fri, 13 May 2022 15:08:08 +0000 (11:08 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

lib/config/load_test.go
sdk/go/arvados/config.go
sdk/go/arvados/config_test.go
sdk/go/health/aggregator.go
sdk/go/health/aggregator_test.go
services/health/main.go

index abf321705662f19e8f1bdb006cee71f7d87da659..4ae9a513c8c0d7874cfcfaaa0aeb47143ade075a 100644 (file)
@@ -316,7 +316,11 @@ Clusters:
   ManagementToken: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
   SystemRootToken: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
   Collections:
-   BlobSigningKey: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, &logbuf).Load()
+   BlobSigningKey: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+  InstanceTypes:
+   abc:
+    IncludedScratch: 123456
+`, &logbuf).Load()
        c.Assert(err, check.IsNil)
        yaml, err := yaml.Marshal(cfg)
        c.Assert(err, check.IsNil)
index f0adcda5f1c0ba2710ea0bd62b643453f46acfca..6a90c30ce4932926840d969de30216fa8495b390 100644 (file)
@@ -424,11 +424,11 @@ type CUDAFeatures struct {
 }
 
 type InstanceType struct {
-       Name            string
+       Name            string `json:"-"`
        ProviderType    string
        VCPUs           int
        RAM             ByteSize
-       Scratch         ByteSize
+       Scratch         ByteSize `json:"-"`
        IncludedScratch ByteSize
        AddedScratch    ByteSize
        Price           float64
@@ -528,49 +528,23 @@ type InstanceTypeMap map[string]InstanceType
 
 var errDuplicateInstanceTypeName = errors.New("duplicate instance type name")
 
-// UnmarshalJSON handles old config files that provide an array of
-// instance types instead of a hash.
+// UnmarshalJSON does special handling of InstanceTypes:
+// * populate computed fields (Name and Scratch)
+// * error out if InstancesTypes are populated as an array, which was
+//   deprecated in Arvados 1.2.0
 func (it *InstanceTypeMap) UnmarshalJSON(data []byte) error {
        fixup := func(t InstanceType) (InstanceType, error) {
                if t.ProviderType == "" {
                        t.ProviderType = t.Name
                }
-               if t.Scratch == 0 {
-                       t.Scratch = t.IncludedScratch + t.AddedScratch
-               } else if t.AddedScratch == 0 {
-                       t.AddedScratch = t.Scratch - t.IncludedScratch
-               } else if t.IncludedScratch == 0 {
-                       t.IncludedScratch = t.Scratch - t.AddedScratch
-               }
-
-               if t.Scratch != (t.IncludedScratch + t.AddedScratch) {
-                       return t, fmt.Errorf("InstanceType %q: Scratch != (IncludedScratch + AddedScratch)", t.Name)
-               }
+               // If t.Scratch is set in the configuration file, it will be ignored and overwritten.
+               // It will also generate a "deprecated or unknown config entry" warning.
+               t.Scratch = t.IncludedScratch + t.AddedScratch
                return t, nil
        }
 
        if len(data) > 0 && data[0] == '[' {
-               var arr []InstanceType
-               err := json.Unmarshal(data, &arr)
-               if err != nil {
-                       return err
-               }
-               if len(arr) == 0 {
-                       *it = nil
-                       return nil
-               }
-               *it = make(map[string]InstanceType, len(arr))
-               for _, t := range arr {
-                       if _, ok := (*it)[t.Name]; ok {
-                               return errDuplicateInstanceTypeName
-                       }
-                       t, err := fixup(t)
-                       if err != nil {
-                               return err
-                       }
-                       (*it)[t.Name] = t
-               }
-               return nil
+               return fmt.Errorf("InstanceTypes must be specified as a map, not an array, see https://doc.arvados.org/admin/config.html")
        }
        var hash map[string]InstanceType
        err := json.Unmarshal(data, &hash)
index 8c77e292875f23b536edcf8dbdef2a1d4f46d51d..58f4b961bbcc7c9b8945ed2b56ff69b502e1f99c 100644 (file)
@@ -15,7 +15,7 @@ var _ = check.Suite(&ConfigSuite{})
 
 type ConfigSuite struct{}
 
-func (s *ConfigSuite) TestInstanceTypesAsArray(c *check.C) {
+func (s *ConfigSuite) TestStringSetAsArray(c *check.C) {
        var cluster Cluster
        yaml.Unmarshal([]byte(`
 API:
@@ -25,13 +25,6 @@ API:
        c.Check(ok, check.Equals, true)
 }
 
-func (s *ConfigSuite) TestStringSetAsArray(c *check.C) {
-       var cluster Cluster
-       yaml.Unmarshal([]byte("InstanceTypes:\n- Name: foo\n"), &cluster)
-       c.Check(len(cluster.InstanceTypes), check.Equals, 1)
-       c.Check(cluster.InstanceTypes["foo"].Name, check.Equals, "foo")
-}
-
 func (s *ConfigSuite) TestInstanceTypesAsHash(c *check.C) {
        var cluster Cluster
        yaml.Unmarshal([]byte("InstanceTypes:\n  foo:\n    ProviderType: bar\n"), &cluster)
@@ -42,18 +35,16 @@ func (s *ConfigSuite) TestInstanceTypesAsHash(c *check.C) {
 
 func (s *ConfigSuite) TestInstanceTypeSize(c *check.C) {
        var it InstanceType
-       err := yaml.Unmarshal([]byte("Name: foo\nScratch: 4GB\nRAM: 4GiB\n"), &it)
+       err := yaml.Unmarshal([]byte("Name: foo\nIncludedScratch: 4GB\nRAM: 4GiB\n"), &it)
        c.Check(err, check.IsNil)
-       c.Check(int64(it.Scratch), check.Equals, int64(4000000000))
+       c.Check(int64(it.IncludedScratch), check.Equals, int64(4000000000))
        c.Check(int64(it.RAM), check.Equals, int64(4294967296))
 }
 
 func (s *ConfigSuite) TestInstanceTypeFixup(c *check.C) {
        for _, confdata := range []string{
                // Current format: map of entries
-               `{foo4: {IncludedScratch: 4GB}, foo8: {ProviderType: foo_8, Scratch: 8GB}}`,
-               // Legacy format: array of entries with key in "Name" field
-               `[{Name: foo4, IncludedScratch: 4GB}, {Name: foo8, ProviderType: foo_8, Scratch: 8GB}]`,
+               `{foo4: {IncludedScratch: 4GB}, foo8: {ProviderType: foo_8, AddedScratch: 8GB}}`,
        } {
                c.Log(confdata)
                var itm InstanceTypeMap
index 5e010d88bc1bd32159ebfaf928392b948357cfd4..f473eff3537bba9186d41363d132fd980d94c382 100644 (file)
@@ -29,10 +29,14 @@ import (
        "git.arvados.org/arvados.git/sdk/go/auth"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "github.com/ghodss/yaml"
+       "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
 )
 
-const defaultTimeout = arvados.Duration(2 * time.Second)
+const (
+       defaultTimeout = arvados.Duration(2 * time.Second)
+       maxClockSkew   = time.Minute
+)
 
 // Aggregator implements service.Handler. It handles "GET /_health/all"
 // by checking the health of all configured services on the cluster
@@ -46,6 +50,9 @@ type Aggregator struct {
 
        // If non-nil, Log is called after handling each request.
        Log func(*http.Request, error)
+
+       // If non-nil, report clock skew on each health-check.
+       MetricClockSkew prometheus.Gauge
 }
 
 func (agg *Aggregator) setup() {
@@ -114,6 +121,10 @@ type ClusterHealthResponse struct {
        // anywhere."
        Services map[arvados.ServiceName]ServiceHealth `json:"services"`
 
+       // Difference between min/max timestamps in individual
+       // health-check responses.
+       ClockSkew arvados.Duration
+
        Errors []string `json:"errors"`
 }
 
@@ -124,7 +135,9 @@ type CheckResult struct {
        HTTPStatusText string                 `json:",omitempty"`
        Response       map[string]interface{} `json:"response"`
        ResponseTime   json.Number            `json:"responseTime"`
+       ClockTime      time.Time              `json:"clockTime"`
        Metrics        Metrics                `json:"-"`
+       respTime       time.Duration
 }
 
 type Metrics struct {
@@ -225,6 +238,33 @@ func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
                }
        }
 
+       var maxResponseTime time.Duration
+       var clockMin, clockMax time.Time
+       for _, result := range resp.Checks {
+               if result.ClockTime.IsZero() {
+                       continue
+               }
+               if clockMin.IsZero() || result.ClockTime.Before(clockMin) {
+                       clockMin = result.ClockTime
+               }
+               if result.ClockTime.After(clockMax) {
+                       clockMax = result.ClockTime
+               }
+               if result.respTime > maxResponseTime {
+                       maxResponseTime = result.respTime
+               }
+       }
+       skew := clockMax.Sub(clockMin)
+       resp.ClockSkew = arvados.Duration(skew)
+       if skew > maxClockSkew+maxResponseTime {
+               msg := fmt.Sprintf("clock skew detected: maximum timestamp spread is %s (exceeds warning threshold of %s)", resp.ClockSkew, arvados.Duration(maxClockSkew))
+               resp.Errors = append(resp.Errors, msg)
+               resp.Health = "ERROR"
+       }
+       if agg.MetricClockSkew != nil {
+               agg.MetricClockSkew.Set(skew.Seconds())
+       }
+
        var newest Metrics
        for _, result := range resp.Checks {
                if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) {
@@ -256,7 +296,8 @@ func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
 func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
        t0 := time.Now()
        defer func() {
-               result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
+               result.respTime = time.Since(t0)
+               result.ResponseTime = json.Number(fmt.Sprintf("%.6f", result.respTime.Seconds()))
        }()
        result.Health = "ERROR"
 
@@ -304,6 +345,7 @@ func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
                }
        }
        result.Health = "OK"
+       result.ClockTime, _ = time.Parse(time.RFC1123, resp.Header.Get("Date"))
        return
 }
 
index f8f7ff9f1b7e189b83423d017be2a48fd763cf28..5f60cf67f347f2e3ad86d97f9c709aa86833bc7e 100644 (file)
@@ -220,6 +220,40 @@ func (s *AggregatorSuite) TestConfigMismatch(c *check.C) {
        s.checkOK(c)
 }
 
+func (s *AggregatorSuite) TestClockSkew(c *check.C) {
+       // srv1: report real wall clock time
+       handler1 := healthyHandler{}
+       srv1, listen1 := s.stubServer(&handler1)
+       defer srv1.Close()
+       // srv2: report near-future time
+       handler2 := healthyHandler{headerDate: time.Now().Add(3 * time.Second)}
+       srv2, listen2 := s.stubServer(&handler2)
+       defer srv2.Close()
+       // srv3: report far-future time
+       handler3 := healthyHandler{headerDate: time.Now().Add(3*time.Minute + 3*time.Second)}
+       srv3, listen3 := s.stubServer(&handler3)
+       defer srv3.Close()
+
+       s.setAllServiceURLs(listen1)
+
+       // near-future time => OK
+       s.resp = httptest.NewRecorder()
+       arvadostest.SetServiceURL(&s.handler.Cluster.Services.DispatchCloud,
+               "http://localhost"+listen2+"/")
+       s.handler.ServeHTTP(s.resp, s.req)
+       s.checkOK(c)
+
+       // far-future time => error
+       s.resp = httptest.NewRecorder()
+       arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV,
+               "http://localhost"+listen3+"/")
+       s.handler.ServeHTTP(s.resp, s.req)
+       resp := s.checkUnhealthy(c)
+       if c.Check(len(resp.Errors) > 0, check.Equals, true) {
+               c.Check(resp.Errors[0], check.Matches, `clock skew detected: maximum timestamp spread is 3m.* \(exceeds warning threshold of 1m\)`)
+       }
+}
+
 func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
        s.handler.timeout = arvados.Duration(100 * time.Millisecond)
        srv, listen := s.stubServer(&slowHandler{})
@@ -321,9 +355,13 @@ func (*unhealthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request)
 type healthyHandler struct {
        configHash string
        configTime time.Time
+       headerDate time.Time
 }
 
 func (h *healthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+       if !h.headerDate.IsZero() {
+               resp.Header().Set("Date", h.headerDate.Format(time.RFC1123))
+       }
        authOK := req.Header.Get("Authorization") == "Bearer "+arvadostest.ManagementToken
        if req.URL.Path == "/_health/ping" {
                if !authOK {
index bc57d36d04b1158cd8c6ade4191a63eed5fad354..92bd377c800041583ded6c6b161e88332639f69b 100644 (file)
@@ -20,8 +20,18 @@ var (
        command cmd.Handler = service.Command(arvados.ServiceNameHealth, newHandler)
 )
 
-func newHandler(ctx context.Context, cluster *arvados.Cluster, _ string, _ *prometheus.Registry) service.Handler {
-       return &health.Aggregator{Cluster: cluster}
+func newHandler(ctx context.Context, cluster *arvados.Cluster, _ string, reg *prometheus.Registry) service.Handler {
+       mClockSkew := prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "health",
+               Name:      "clock_skew_seconds",
+               Help:      "Clock skew observed in most recent health check",
+       })
+       reg.MustRegister(mClockSkew)
+       return &health.Aggregator{
+               Cluster:         cluster,
+               MetricClockSkew: mClockSkew,
+       }
 }
 
 func main() {