16636: add boot outcome metrics.
author Ward Vandewege <ward@curii.com>
Fri, 31 Jul 2020 21:37:12 +0000 (17:37 -0400)
committer Ward Vandewege <ward@curii.com>
Fri, 31 Jul 2020 21:37:12 +0000 (17:37 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/worker.go

index 79af5a0cb3c01853c9800f0d52e5ba7448cd5145..bcf35e285486d817000ee07ef825b185a8489b84 100644 (file)
@@ -176,6 +176,7 @@ type Pool struct {
        mInstancesPrice    *prometheus.GaugeVec
        mVCPUs             *prometheus.GaugeVec
        mMemory            *prometheus.GaugeVec
+       mBootOutcomes      *prometheus.CounterVec
        mDisappearances    *prometheus.CounterVec
 }
 
@@ -593,6 +594,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Help:      "Total memory on all cloud VMs.",
        }, []string{"category"})
        reg.MustRegister(wp.mMemory)
+       wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "boot_outcomes",
+               Help:      "Boot outcomes by type.",
+       }, []string{"state"})
+       for k := range validBootOutcomes {
+               wp.mBootOutcomes.WithLabelValues(string(k)).Add(0)
+       }
+       reg.MustRegister(wp.mBootOutcomes)
        wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
                Namespace: "arvados",
                Subsystem: "dispatchcloud",
@@ -867,6 +878,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                        "WorkerState": wkr.state,
                })
                logger.Info("instance disappeared in cloud")
+               wkr.reportBootOutcome(BootOutcomeDisappeared)
                if wp.mDisappearances != nil {
                        wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                }
index 357ac20a038d56ca7a4778c6df38f05f2e2dae08..6878bb0655ea1e3bc1401396f0c9cbfe4ad9bba0 100644 (file)
@@ -43,6 +43,33 @@ var stateString = map[State]string{
        StateShutdown: "shutdown",
 }
 
+// BootOutcome is the result of a worker boot. It is used as a label in a metric.
+type BootOutcome string
+
+const (
+       BootOutcomeFailed       BootOutcome = "failure"
+       BootOutcomeSucceeded    BootOutcome = "success"
+       BootOutcomeIdleShutdown BootOutcome = "idle shutdown"
+       BootOutcomeDisappeared  BootOutcome = "disappeared"
+)
+
+var validBootOutcomes = map[BootOutcome]bool{
+       BootOutcomeFailed:       true,
+       BootOutcomeSucceeded:    true,
+       BootOutcomeIdleShutdown: true,
+       BootOutcomeDisappeared:  true,
+}
+
+func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
+       if wkr.bootOutcomeReported {
+               return
+       }
+       if wkr.wp.mBootOutcomes != nil {
+               wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc()
+       }
+       wkr.bootOutcomeReported = true
+}
+
 // String implements fmt.Stringer.
 func (s State) String() string {
        return stateString[s]
@@ -74,22 +101,23 @@ type worker struct {
        executor Executor
        wp       *Pool
 
-       mtx          sync.Locker // must be wp's Locker.
-       state        State
-       idleBehavior IdleBehavior
-       instance     cloud.Instance
-       instType     arvados.InstanceType
-       vcpus        int64
-       memory       int64
-       appeared     time.Time
-       probed       time.Time
-       updated      time.Time
-       busy         time.Time
-       destroyed    time.Time
-       lastUUID     string
-       running      map[string]*remoteRunner // remember to update state idle<->running when this changes
-       starting     map[string]*remoteRunner // remember to update state idle<->running when this changes
-       probing      chan struct{}
+       mtx                 sync.Locker // must be wp's Locker.
+       state               State
+       idleBehavior        IdleBehavior
+       instance            cloud.Instance
+       instType            arvados.InstanceType
+       vcpus               int64
+       memory              int64
+       appeared            time.Time
+       probed              time.Time
+       updated             time.Time
+       busy                time.Time
+       destroyed           time.Time
+       lastUUID            string
+       running             map[string]*remoteRunner // remember to update state idle<->running when this changes
+       starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
+       probing             chan struct{}
+       bootOutcomeReported bool
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -224,6 +252,7 @@ func (wkr *worker) probeAndUpdate() {
        defer wkr.mtx.Unlock()
        if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
                logger.Info("probe reported broken instance")
+               wkr.reportBootOutcome(BootOutcomeFailed)
                wkr.setIdleBehavior(IdleBehaviorDrain)
        }
        if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
@@ -247,6 +276,7 @@ func (wkr *worker) probeAndUpdate() {
                        // some evidence about why the node never
                        // booted, even in non-debug mode.
                        if !booted {
+                               wkr.reportBootOutcome(BootOutcomeFailed)
                                logger.WithFields(logrus.Fields{
                                        "Duration": dur,
                                        "stderr":   string(stderr),
@@ -311,6 +341,7 @@ func (wkr *worker) probeAndUpdate() {
        }
        wkr.updated = updateTime
        if booted && (initialState == StateUnknown || initialState == StateBooting) {
+               wkr.reportBootOutcome(BootOutcomeSucceeded)
                logger.WithFields(logrus.Fields{
                        "RunningContainers": len(wkr.running),
                        "State":             wkr.state,
@@ -468,6 +499,7 @@ func (wkr *worker) shutdownIfIdle() bool {
                "IdleDuration": stats.Duration(time.Since(wkr.busy)),
                "IdleBehavior": wkr.idleBehavior,
        }).Info("shutdown worker")
+       wkr.reportBootOutcome(BootOutcomeIdleShutdown)
        wkr.shutdown()
        return true
 }