+ for svcName, sh := range resp.Services {
+ switch svcName {
+ case arvados.ServiceNameDispatchCloud,
+ arvados.ServiceNameDispatchLSF:
+ // ok to not run any given dispatcher
+ case arvados.ServiceNameHealth,
+ arvados.ServiceNameWorkbench1,
+ arvados.ServiceNameWorkbench2:
+ // typically doesn't have InternalURLs in config
+ default:
+ if sh.Health != "OK" && sh.Health != "SKIP" {
+ resp.Health = "ERROR"
+ resp.Errors = append(resp.Errors, fmt.Sprintf("%s: %s: no InternalURLs configured", svcName, sh.Health))
+ continue
+ }
+ }
+ }
+
+ // Check for clock skew between hosts
+ var maxResponseTime time.Duration
+ var clockMin, clockMax time.Time
+ for _, result := range resp.Checks {
+ if result.ClockTime.IsZero() {
+ continue
+ }
+ if clockMin.IsZero() || result.ClockTime.Before(clockMin) {
+ clockMin = result.ClockTime
+ }
+ if result.ClockTime.After(clockMax) {
+ clockMax = result.ClockTime
+ }
+ if result.respTime > maxResponseTime {
+ maxResponseTime = result.respTime
+ }
+ }
+ skew := clockMax.Sub(clockMin)
+ resp.ClockSkew = arvados.Duration(skew)
+ if skew > maxClockSkew+maxResponseTime {
+ msg := fmt.Sprintf("clock skew detected: maximum timestamp spread is %s (exceeds warning threshold of %s)", resp.ClockSkew, arvados.Duration(maxClockSkew))
+ resp.Errors = append(resp.Errors, msg)
+ resp.Health = "ERROR"
+ }
+ if agg.MetricClockSkew != nil {
+ agg.MetricClockSkew.Set(skew.Seconds())
+ }
+
+ // Check for mismatched config files
+ var newest Metrics
+ for _, result := range resp.Checks {
+ if result.Metrics.ConfigSourceTimestamp.After(newest.ConfigSourceTimestamp) {
+ newest = result.Metrics
+ }
+ }
+ var mismatches []string
+ for target, result := range resp.Checks {
+ if hash := result.Metrics.ConfigSourceSHA256; hash != "" && hash != newest.ConfigSourceSHA256 {
+ mismatches = append(mismatches, target)
+ }
+ }
+ for _, target := range mismatches {
+ msg := fmt.Sprintf("outdated config: %s: config file (sha256 %s) does not match latest version with timestamp %s",
+ strings.TrimSuffix(target, "/_health/ping"),
+ resp.Checks[target].Metrics.ConfigSourceSHA256,
+ newest.ConfigSourceTimestamp.Format(time.RFC3339))
+ resp.Errors = append(resp.Errors, msg)
+ resp.Health = "ERROR"
+ }
+
+ // Check for services running a different version than we are.
+ for target, result := range resp.Checks {
+ if result.Metrics.Version != "" && !sameVersion(result.Metrics.Version, cmd.Version.String()) {
+ msg := fmt.Sprintf("version mismatch: %s is running %s -- expected %s",
+ strings.TrimSuffix(target, "/_health/ping"),
+ result.Metrics.Version,
+ cmd.Version.String())
+ resp.Errors = append(resp.Errors, msg)