ManagementToken: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
SystemRootToken: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Collections:
- BlobSigningKey: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, &logbuf).Load()
+ BlobSigningKey: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+ InstanceTypes:
+ abc:
+ IncludedScratch: 123456
+`, &logbuf).Load()
c.Assert(err, check.IsNil)
yaml, err := yaml.Marshal(cfg)
c.Assert(err, check.IsNil)
}
type InstanceType struct {
- Name string
+ Name string `json:"-"`
ProviderType string
VCPUs int
RAM ByteSize
- Scratch ByteSize
+ Scratch ByteSize `json:"-"`
IncludedScratch ByteSize
AddedScratch ByteSize
Price float64
var errDuplicateInstanceTypeName = errors.New("duplicate instance type name")
-// UnmarshalJSON handles old config files that provide an array of
-// instance types instead of a hash.
+// UnmarshalJSON does special handling of InstanceTypes:
+// * populate computed fields (Name and Scratch)
+// * error out if InstanceTypes is populated as an array, which was
+// deprecated in Arvados 1.2.0
func (it *InstanceTypeMap) UnmarshalJSON(data []byte) error {
fixup := func(t InstanceType) (InstanceType, error) {
if t.ProviderType == "" {
t.ProviderType = t.Name
}
- if t.Scratch == 0 {
- t.Scratch = t.IncludedScratch + t.AddedScratch
- } else if t.AddedScratch == 0 {
- t.AddedScratch = t.Scratch - t.IncludedScratch
- } else if t.IncludedScratch == 0 {
- t.IncludedScratch = t.Scratch - t.AddedScratch
- }
-
- if t.Scratch != (t.IncludedScratch + t.AddedScratch) {
- return t, fmt.Errorf("InstanceType %q: Scratch != (IncludedScratch + AddedScratch)", t.Name)
- }
+ // If t.Scratch is set in the configuration file, it will be ignored and overwritten.
+ // It will also generate a "deprecated or unknown config entry" warning.
+ t.Scratch = t.IncludedScratch + t.AddedScratch
return t, nil
}
if len(data) > 0 && data[0] == '[' {
- var arr []InstanceType
- err := json.Unmarshal(data, &arr)
- if err != nil {
- return err
- }
- if len(arr) == 0 {
- *it = nil
- return nil
- }
- *it = make(map[string]InstanceType, len(arr))
- for _, t := range arr {
- if _, ok := (*it)[t.Name]; ok {
- return errDuplicateInstanceTypeName
- }
- t, err := fixup(t)
- if err != nil {
- return err
- }
- (*it)[t.Name] = t
- }
- return nil
+ return fmt.Errorf("InstanceTypes must be specified as a map, not an array, see https://doc.arvados.org/admin/config.html")
}
var hash map[string]InstanceType
err := json.Unmarshal(data, &hash)
type ConfigSuite struct{}
-func (s *ConfigSuite) TestInstanceTypesAsArray(c *check.C) {
+func (s *ConfigSuite) TestStringSetAsArray(c *check.C) {
var cluster Cluster
yaml.Unmarshal([]byte(`
API:
c.Check(ok, check.Equals, true)
}
-func (s *ConfigSuite) TestStringSetAsArray(c *check.C) {
- var cluster Cluster
- yaml.Unmarshal([]byte("InstanceTypes:\n- Name: foo\n"), &cluster)
- c.Check(len(cluster.InstanceTypes), check.Equals, 1)
- c.Check(cluster.InstanceTypes["foo"].Name, check.Equals, "foo")
-}
-
func (s *ConfigSuite) TestInstanceTypesAsHash(c *check.C) {
var cluster Cluster
yaml.Unmarshal([]byte("InstanceTypes:\n foo:\n ProviderType: bar\n"), &cluster)
func (s *ConfigSuite) TestInstanceTypeSize(c *check.C) {
var it InstanceType
- err := yaml.Unmarshal([]byte("Name: foo\nScratch: 4GB\nRAM: 4GiB\n"), &it)
+ err := yaml.Unmarshal([]byte("Name: foo\nIncludedScratch: 4GB\nRAM: 4GiB\n"), &it)
c.Check(err, check.IsNil)
- c.Check(int64(it.Scratch), check.Equals, int64(4000000000))
+ c.Check(int64(it.IncludedScratch), check.Equals, int64(4000000000))
c.Check(int64(it.RAM), check.Equals, int64(4294967296))
}
func (s *ConfigSuite) TestInstanceTypeFixup(c *check.C) {
for _, confdata := range []string{
// Current format: map of entries
- `{foo4: {IncludedScratch: 4GB}, foo8: {ProviderType: foo_8, Scratch: 8GB}}`,
- // Legacy format: array of entries with key in "Name" field
- `[{Name: foo4, IncludedScratch: 4GB}, {Name: foo8, ProviderType: foo_8, Scratch: 8GB}]`,
+ `{foo4: {IncludedScratch: 4GB}, foo8: {ProviderType: foo_8, AddedScratch: 8GB}}`,
} {
c.Log(confdata)
var itm InstanceTypeMap
"git.arvados.org/arvados.git/sdk/go/auth"
"git.arvados.org/arvados.git/sdk/go/ctxlog"
"github.com/ghodss/yaml"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)
-const defaultTimeout = arvados.Duration(2 * time.Second)
+const (
+ defaultTimeout = arvados.Duration(2 * time.Second)
+ maxClockSkew = time.Minute
+)
// Aggregator implements service.Handler. It handles "GET /_health/all"
// by checking the health of all configured services on the cluster
// If non-nil, Log is called after handling each request.
Log func(*http.Request, error)
+
+ // If non-nil, report clock skew on each health-check.
+ MetricClockSkew prometheus.Gauge
}
func (agg *Aggregator) setup() {
// anywhere."
Services map[arvados.ServiceName]ServiceHealth `json:"services"`
+ // Difference between min/max timestamps in individual
+ // health-check responses.
+ ClockSkew arvados.Duration
+
Errors []string `json:"errors"`
}
HTTPStatusText string `json:",omitempty"`
Response map[string]interface{} `json:"response"`
ResponseTime json.Number `json:"responseTime"`
+ ClockTime time.Time `json:"clockTime"`
Metrics Metrics `json:"-"`
+ respTime time.Duration
}
type Metrics struct {
}
}
+ var maxResponseTime time.Duration
+ var clockMin, clockMax time.Time
+ for _, result := range resp.Checks {
+ if result.ClockTime.IsZero() {
+ continue
+ }
+ if clockMin.IsZero() || result.ClockTime.Before(clockMin) {
+ clockMin = result.ClockTime
+ }
+ if result.ClockTime.After(clockMax) {
+ clockMax = result.ClockTime
+ }
+ if result.respTime > maxResponseTime {
+ maxResponseTime = result.respTime
+ }
+ }
+ skew := clockMax.Sub(clockMin)
+ resp.ClockSkew = arvados.Duration(skew)
+ if skew > maxClockSkew+maxResponseTime {
+ msg := fmt.Sprintf("clock skew detected: maximum timestamp spread is %s (exceeds warning threshold of %s)", resp.ClockSkew, arvados.Duration(maxClockSkew))
+ resp.Errors = append(resp.Errors, msg)
+ resp.Health = "ERROR"
+ }
+ if agg.MetricClockSkew != nil {
+ agg.MetricClockSkew.Set(skew.Seconds())
+ }
+
var newest Metrics
for _, result := range resp.Checks {
if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) {
func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
t0 := time.Now()
defer func() {
- result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
+ result.respTime = time.Since(t0)
+ result.ResponseTime = json.Number(fmt.Sprintf("%.6f", result.respTime.Seconds()))
}()
result.Health = "ERROR"
}
}
result.Health = "OK"
+ result.ClockTime, _ = time.Parse(time.RFC1123, resp.Header.Get("Date"))
return
}
s.checkOK(c)
}
+func (s *AggregatorSuite) TestClockSkew(c *check.C) {
+ // srv1: report real wall clock time
+ handler1 := healthyHandler{}
+ srv1, listen1 := s.stubServer(&handler1)
+ defer srv1.Close()
+ // srv2: report near-future time
+ handler2 := healthyHandler{headerDate: time.Now().Add(3 * time.Second)}
+ srv2, listen2 := s.stubServer(&handler2)
+ defer srv2.Close()
+ // srv3: report far-future time
+ handler3 := healthyHandler{headerDate: time.Now().Add(3*time.Minute + 3*time.Second)}
+ srv3, listen3 := s.stubServer(&handler3)
+ defer srv3.Close()
+
+ s.setAllServiceURLs(listen1)
+
+ // near-future time => OK
+ s.resp = httptest.NewRecorder()
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.DispatchCloud,
+ "http://localhost"+listen2+"/")
+ s.handler.ServeHTTP(s.resp, s.req)
+ s.checkOK(c)
+
+ // far-future time => error
+ s.resp = httptest.NewRecorder()
+ arvadostest.SetServiceURL(&s.handler.Cluster.Services.WebDAV,
+ "http://localhost"+listen3+"/")
+ s.handler.ServeHTTP(s.resp, s.req)
+ resp := s.checkUnhealthy(c)
+ if c.Check(len(resp.Errors) > 0, check.Equals, true) {
+ c.Check(resp.Errors[0], check.Matches, `clock skew detected: maximum timestamp spread is 3m.* \(exceeds warning threshold of 1m\)`)
+ }
+}
+
func (s *AggregatorSuite) TestPingTimeout(c *check.C) {
s.handler.timeout = arvados.Duration(100 * time.Millisecond)
srv, listen := s.stubServer(&slowHandler{})
type healthyHandler struct {
configHash string
configTime time.Time
+ headerDate time.Time
}
func (h *healthyHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+ if !h.headerDate.IsZero() {
+ resp.Header().Set("Date", h.headerDate.Format(time.RFC1123))
+ }
authOK := req.Header.Get("Authorization") == "Bearer "+arvadostest.ManagementToken
if req.URL.Path == "/_health/ping" {
if !authOK {
command cmd.Handler = service.Command(arvados.ServiceNameHealth, newHandler)
)
-func newHandler(ctx context.Context, cluster *arvados.Cluster, _ string, _ *prometheus.Registry) service.Handler {
- return &health.Aggregator{Cluster: cluster}
+func newHandler(ctx context.Context, cluster *arvados.Cluster, _ string, reg *prometheus.Registry) service.Handler {
+ mClockSkew := prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "health",
+ Name: "clock_skew_seconds",
+ Help: "Clock skew observed in most recent health check",
+ })
+ reg.MustRegister(mClockSkew)
+ return &health.Aggregator{
+ Cluster: cluster,
+ MetricClockSkew: mClockSkew,
+ }
}
func main() {