Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>
// Ensure svc is listed in resp.Services.
mtx.Lock()
if _, ok := resp.Services[svcName]; !ok {
// Ensure svc is listed in resp.Services.
mtx.Lock()
if _, ok := resp.Services[svcName]; !ok {
- resp.Services[svcName] = ServiceHealth{Health: "NONE"}
+ resp.Services[svcName] = ServiceHealth{Health: "MISSING"}
mtx.Lock()
defer mtx.Unlock()
resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
mtx.Lock()
defer mtx.Unlock()
resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
- if result.Health == "OK" {
+ if result.Health == "OK" || result.Health == "SKIP" {
h := resp.Services[svcName]
h.N++
h := resp.Services[svcName]
h.N++
+ if result.Health == "OK" || h.N == 1 {
+ // "" => "SKIP" or "OK"
+ // "SKIP" => "OK"
+ h.Health = result.Health
+ }
resp.Services[svcName] = h
resp.Services[svcName] = h
- } else if result.Health != "SKIP" {
+ resp.Errors = append(resp.Errors, fmt.Sprintf("%s: %s: %s", svcName, result.Health, result.Error))
default:
if sh.Health != "OK" && sh.Health != "SKIP" {
resp.Health = "ERROR"
default:
if sh.Health != "OK" && sh.Health != "SKIP" {
resp.Health = "ERROR"
+ resp.Errors = append(resp.Errors, fmt.Sprintf("%s: %s: no InternalURLs configured", svcName, sh.Health))
loader.SetupFlags(flags)
versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
loader.SetupFlags(flags)
versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
+ outputYAML := flags.Bool("yaml", false, "Output full health report in YAML format (default mode shows errors as plain text, is silent on success)")
if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
// cmd.ParseFlags already reported the error
return errSilent
if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
// cmd.ParseFlags already reported the error
return errSilent
ctx = ctxlog.Context(ctx, logger)
agg := Aggregator{Cluster: cluster, timeout: arvados.Duration(*timeout)}
resp := agg.ClusterHealth()
ctx = ctxlog.Context(ctx, logger)
agg := Aggregator{Cluster: cluster, timeout: arvados.Duration(*timeout)}
resp := agg.ClusterHealth()
- buf, err := yaml.Marshal(resp)
- if err != nil {
- return err
+ if *outputYAML {
+ y, err := yaml.Marshal(resp)
+ if err != nil {
+ return err
+ }
+ stdout.Write(y)
+ if resp.Health != "OK" {
+ return errSilent
+ }
+ return nil
- return fmt.Errorf("health check failed")
+ for _, msg := range resp.Errors {
+ fmt.Fprintln(stdout, msg)
+ }
+ fmt.Fprintln(stderr, "health check failed")
+ return errSilent