X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/6036c55e1239281746152e85dfabbc9ed3cb6864..407fc461d20ece8b11b7b56f29a3caff3083ff8d:/lib/dispatchcloud/worker/worker.go diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go index 6878bb0655..95794d0b36 100644 --- a/lib/dispatchcloud/worker/worker.go +++ b/lib/dispatchcloud/worker/worker.go @@ -43,33 +43,6 @@ var stateString = map[State]string{ StateShutdown: "shutdown", } -// BootOutcome is the result of a worker boot. It is used as a label in a metric. -type BootOutcome string - -const ( - BootOutcomeFailed BootOutcome = "failure" - BootOutcomeSucceeded BootOutcome = "success" - BootOutcomeIdleShutdown BootOutcome = "idle shutdown" - BootOutcomeDisappeared BootOutcome = "disappeared" -) - -var validBootOutcomes = map[BootOutcome]bool{ - BootOutcomeFailed: true, - BootOutcomeSucceeded: true, - BootOutcomeIdleShutdown: true, - BootOutcomeDisappeared: true, -} - -func (wkr *worker) reportBootOutcome(outcome BootOutcome) { - if wkr.bootOutcomeReported { - return - } - if wkr.wp.mBootOutcomes != nil { - wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc() - } - wkr.bootOutcomeReported = true -} - // String implements fmt.Stringer. func (s State) String() string { return stateString[s] @@ -81,6 +54,23 @@ func (s State) MarshalText() ([]byte, error) { return []byte(stateString[s]), nil } +// BootOutcome is the result of a worker boot. It is used as a label in a metric. +type BootOutcome string + +const ( + BootOutcomeFailed BootOutcome = "failure" + BootOutcomeSucceeded BootOutcome = "success" + BootOutcomeAborted BootOutcome = "aborted" + BootOutcomeDisappeared BootOutcome = "disappeared" +) + +var validBootOutcomes = map[BootOutcome]bool{ + BootOutcomeFailed: true, + BootOutcomeSucceeded: true, + BootOutcomeAborted: true, + BootOutcomeDisappeared: true, +} + // IdleBehavior indicates the behavior desired when a node becomes idle. 
type IdleBehavior string @@ -113,11 +103,13 @@ type worker struct { updated time.Time busy time.Time destroyed time.Time + firstSSHConnection time.Time lastUUID string running map[string]*remoteRunner // remember to update state idle<->running when this changes starting map[string]*remoteRunner // remember to update state idle<->running when this changes probing chan struct{} bootOutcomeReported bool + timeToReadyReported bool } func (wkr *worker) onUnkillable(uuid string) { @@ -139,6 +131,28 @@ func (wkr *worker) onKilled(uuid string) { go wkr.wp.notify() } +// caller must have lock. +func (wkr *worker) reportBootOutcome(outcome BootOutcome) { + if wkr.bootOutcomeReported { + return + } + if wkr.wp.mBootOutcomes != nil { + wkr.wp.mBootOutcomes.WithLabelValues(string(outcome)).Inc() + } + wkr.bootOutcomeReported = true +} + +// reportTimeBetweenFirstSSHAndReadyForContainer observes, at most once per worker (guarded by timeToReadyReported), the seconds elapsed since firstSSHConnection; the nil check guards the same metric that is observed. Caller must have lock. +func (wkr *worker) reportTimeBetweenFirstSSHAndReadyForContainer() { + if wkr.timeToReadyReported { + return + } + if wkr.wp.mTimeToReadyForContainer != nil { + wkr.wp.mTimeToReadyForContainer.Observe(time.Since(wkr.firstSSHConnection).Seconds()) + } + wkr.timeToReadyReported = true +} + // caller must have lock. func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) { wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior") @@ -162,6 +176,9 @@ func (wkr *worker) startContainer(ctr arvados.Container) { } go func() { rr.Start() + if wkr.wp.mTimeFromQueueToCrunchRun != nil { + wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds()) + } wkr.mtx.Lock() defer wkr.mtx.Unlock() now := time.Now() @@ -312,6 +329,9 @@ func (wkr *worker) probeAndUpdate() { // Update state if this was the first successful boot-probe. if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) { + if wkr.state == StateBooting { + wkr.reportTimeBetweenFirstSSHAndReadyForContainer() + } // Note: this will change again below if // len(wkr.starting)+len(wkr.running) > 0. 
wkr.state = StateIdle @@ -499,7 +519,7 @@ func (wkr *worker) shutdownIfIdle() bool { "IdleDuration": stats.Duration(time.Since(wkr.busy)), "IdleBehavior": wkr.idleBehavior, }).Info("shutdown worker") - wkr.reportBootOutcome(BootOutcomeIdleShutdown) + wkr.reportBootOutcome(BootOutcomeAborted) wkr.shutdown() return true }