16636: add boot outcome metrics.
[arvados.git] / lib / dispatchcloud / worker / worker.go
index 357ac20a038d56ca7a4778c6df38f05f2e2dae08..6878bb0655ea1e3bc1401396f0c9cbfe4ad9bba0 100644 (file)
@@ -43,6 +43,33 @@ var stateString = map[State]string{
        StateShutdown: "shutdown",
 }
 
+// BootOutcome is the result of a worker boot. It is used as a label in a metric.
+type BootOutcome string
+
+const (
+       BootOutcomeFailed       BootOutcome = "failure"
+       BootOutcomeSucceeded    BootOutcome = "success"
+       BootOutcomeIdleShutdown BootOutcome = "idle shutdown"
+       BootOutcomeDisappeared  BootOutcome = "disappeared"
+)
+
+var validBootOutcomes = map[BootOutcome]bool{
+       BootOutcomeFailed:       true,
+       BootOutcomeSucceeded:    true,
+       BootOutcomeIdleShutdown: true,
+       BootOutcomeDisappeared:  true,
+}
+
+func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
+       if wkr.bootOutcomeReported {
+               return
+       }
+       if wkr.wp.mBootOutcomes != nil {
+               wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc()
+       }
+       wkr.bootOutcomeReported = true
+}
+
 // String implements fmt.Stringer.
 func (s State) String() string {
        return stateString[s]
@@ -74,22 +101,23 @@ type worker struct {
        executor Executor
        wp       *Pool
 
-       mtx          sync.Locker // must be wp's Locker.
-       state        State
-       idleBehavior IdleBehavior
-       instance     cloud.Instance
-       instType     arvados.InstanceType
-       vcpus        int64
-       memory       int64
-       appeared     time.Time
-       probed       time.Time
-       updated      time.Time
-       busy         time.Time
-       destroyed    time.Time
-       lastUUID     string
-       running      map[string]*remoteRunner // remember to update state idle<->running when this changes
-       starting     map[string]*remoteRunner // remember to update state idle<->running when this changes
-       probing      chan struct{}
+       mtx                 sync.Locker // must be wp's Locker.
+       state               State
+       idleBehavior        IdleBehavior
+       instance            cloud.Instance
+       instType            arvados.InstanceType
+       vcpus               int64
+       memory              int64
+       appeared            time.Time
+       probed              time.Time
+       updated             time.Time
+       busy                time.Time
+       destroyed           time.Time
+       lastUUID            string
+       running             map[string]*remoteRunner // remember to update state idle<->running when this changes
+       starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
+       probing             chan struct{}
+       bootOutcomeReported bool
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -224,6 +252,7 @@ func (wkr *worker) probeAndUpdate() {
        defer wkr.mtx.Unlock()
        if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
                logger.Info("probe reported broken instance")
+               wkr.reportBootOutcome(BootOutcomeFailed)
                wkr.setIdleBehavior(IdleBehaviorDrain)
        }
        if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
@@ -247,6 +276,7 @@ func (wkr *worker) probeAndUpdate() {
                        // some evidence about why the node never
                        // booted, even in non-debug mode.
                        if !booted {
+                               wkr.reportBootOutcome(BootOutcomeFailed)
                                logger.WithFields(logrus.Fields{
                                        "Duration": dur,
                                        "stderr":   string(stderr),
@@ -311,6 +341,7 @@ func (wkr *worker) probeAndUpdate() {
        }
        wkr.updated = updateTime
        if booted && (initialState == StateUnknown || initialState == StateBooting) {
+               wkr.reportBootOutcome(BootOutcomeSucceeded)
                logger.WithFields(logrus.Fields{
                        "RunningContainers": len(wkr.running),
                        "State":             wkr.state,
@@ -468,6 +499,7 @@ func (wkr *worker) shutdownIfIdle() bool {
                "IdleDuration": stats.Duration(time.Since(wkr.busy)),
                "IdleBehavior": wkr.idleBehavior,
        }).Info("shutdown worker")
+       wkr.reportBootOutcome(BootOutcomeIdleShutdown)
        wkr.shutdown()
        return true
 }