c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`)
c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="success"} [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="success"} [0-9e+.]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="fail"} [0-9]*`)
+ c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="fail"} [0-9e+.]*`)
}
func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
mTimeToReadyForContainer prometheus.Summary
mTimeFromShutdownToGone prometheus.Summary
mTimeFromQueueToCrunchRun prometheus.Summary
+ mRunProbeDuration *prometheus.SummaryVec
}
type createCall struct {
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
})
reg.MustRegister(wp.mTimeFromQueueToCrunchRun)
+ wp.mRunProbeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "instances_run_probe_duration_seconds",
+ Help: "Number of seconds per runProbe call.",
+ Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
+ }, []string{"outcome"})
+ for _, v := range []string{"success", "fail"} {
+ wp.mRunProbeDuration.WithLabelValues(v).Observe(0)
+ }
+ reg.MustRegister(wp.mRunProbeDuration)
}
func (wp *Pool) runMetrics() {
}
// ProbeAndUpdate conducts appropriate boot/running probes (if any)
-// for the worker's curent state. If a previous probe is still
+// for the worker's current state. If a previous probe is still
// running, it does nothing.
//
// It should be called in a new goroutine.
if u := wkr.instance.RemoteUser(); u != "root" {
cmd = "sudo " + cmd
}
+ before := time.Now()
stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
if err != nil {
wkr.logger.WithFields(logrus.Fields{
"stdout": string(stdout),
"stderr": string(stderr),
}).WithError(err).Warn("probe failed")
+ wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds())
return
}
+ wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds())
ok = true
staleRunLock := false