From 6036c55e1239281746152e85dfabbc9ed3cb6864 Mon Sep 17 00:00:00 2001 From: Ward Vandewege Date: Fri, 31 Jul 2020 17:37:12 -0400 Subject: [PATCH] 16636: add boot outcome metrics. Arvados-DCO-1.1-Signed-off-by: Ward Vandewege --- lib/dispatchcloud/worker/pool.go | 12 ++++++ lib/dispatchcloud/worker/worker.go | 64 ++++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 16 deletions(-) diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go index 79af5a0cb3..bcf35e2854 100644 --- a/lib/dispatchcloud/worker/pool.go +++ b/lib/dispatchcloud/worker/pool.go @@ -176,6 +176,7 @@ type Pool struct { mInstancesPrice *prometheus.GaugeVec mVCPUs *prometheus.GaugeVec mMemory *prometheus.GaugeVec + mBootOutcomes *prometheus.CounterVec mDisappearances *prometheus.CounterVec } @@ -593,6 +594,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) { Help: "Total memory on all cloud VMs.", }, []string{"category"}) reg.MustRegister(wp.mMemory) + wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "arvados", + Subsystem: "dispatchcloud", + Name: "boot_outcomes", + Help: "Boot outcomes by type.", + }, []string{"state"}) + for k := range validBootOutcomes { + wp.mBootOutcomes.WithLabelValues(string(k)).Add(0) + } + reg.MustRegister(wp.mBootOutcomes) wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "arvados", Subsystem: "dispatchcloud", @@ -867,6 +878,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) { "WorkerState": wkr.state, }) logger.Info("instance disappeared in cloud") + wkr.reportBootOutcome(BootOutcomeDisappeared) if wp.mDisappearances != nil { wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc() } diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go index 357ac20a03..6878bb0655 100644 --- a/lib/dispatchcloud/worker/worker.go +++ b/lib/dispatchcloud/worker/worker.go @@ -43,6 +43,33 @@ var stateString = map[State]string{ StateShutdown: "shutdown", } +// BootOutcome is the result of a worker boot. It is used as a label in a metric. +type BootOutcome string + +const ( + BootOutcomeFailed BootOutcome = "failure" + BootOutcomeSucceeded BootOutcome = "success" + BootOutcomeIdleShutdown BootOutcome = "idle shutdown" + BootOutcomeDisappeared BootOutcome = "disappeared" +) + +var validBootOutcomes = map[BootOutcome]bool{ + BootOutcomeFailed: true, + BootOutcomeSucceeded: true, + BootOutcomeIdleShutdown: true, + BootOutcomeDisappeared: true, +} + +func (wkr *worker) reportBootOutcome(outcome BootOutcome) { + if wkr.bootOutcomeReported { + return + } + if wkr.wp.mBootOutcomes != nil { + wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc() + } + wkr.bootOutcomeReported = true +} + // String implements fmt.Stringer. func (s State) String() string { return stateString[s] @@ -74,22 +101,23 @@ type worker struct { executor Executor wp *Pool - mtx sync.Locker // must be wp's Locker. - state State - idleBehavior IdleBehavior - instance cloud.Instance - instType arvados.InstanceType - vcpus int64 - memory int64 - appeared time.Time - probed time.Time - updated time.Time - busy time.Time - destroyed time.Time - lastUUID string - running map[string]*remoteRunner // remember to update state idle<->running when this changes - starting map[string]*remoteRunner // remember to update state idle<->running when this changes - probing chan struct{} + mtx sync.Locker // must be wp's Locker. + state State + idleBehavior IdleBehavior + instance cloud.Instance + instType arvados.InstanceType + vcpus int64 + memory int64 + appeared time.Time + probed time.Time + updated time.Time + busy time.Time + destroyed time.Time + lastUUID string + running map[string]*remoteRunner // remember to update state idle<->running when this changes + starting map[string]*remoteRunner // remember to update state idle<->running when this changes + probing chan struct{} + bootOutcomeReported bool } func (wkr *worker) onUnkillable(uuid string) { @@ -224,6 +252,7 @@ func (wkr *worker) probeAndUpdate() { defer wkr.mtx.Unlock() if reportedBroken && wkr.idleBehavior == IdleBehaviorRun { logger.Info("probe reported broken instance") + wkr.reportBootOutcome(BootOutcomeFailed) wkr.setIdleBehavior(IdleBehaviorDrain) } if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) { @@ -247,6 +276,7 @@ func (wkr *worker) probeAndUpdate() { // some evidence about why the node never // booted, even in non-debug mode. if !booted { + wkr.reportBootOutcome(BootOutcomeFailed) logger.WithFields(logrus.Fields{ "Duration": dur, "stderr": string(stderr), @@ -311,6 +341,7 @@ func (wkr *worker) probeAndUpdate() { } wkr.updated = updateTime if booted && (initialState == StateUnknown || initialState == StateBooting) { + wkr.reportBootOutcome(BootOutcomeSucceeded) logger.WithFields(logrus.Fields{ "RunningContainers": len(wkr.running), "State": wkr.state, @@ -468,6 +499,7 @@ func (wkr *worker) shutdownIfIdle() bool { "IdleDuration": stats.Duration(time.Since(wkr.busy)), "IdleBehavior": wkr.idleBehavior, }).Info("shutdown worker") + wkr.reportBootOutcome(BootOutcomeIdleShutdown) wkr.shutdown() return true } -- 2.39.5