mInstancesPrice *prometheus.GaugeVec
mVCPUs *prometheus.GaugeVec
mMemory *prometheus.GaugeVec
+ mBootOutcomes *prometheus.CounterVec
mDisappearances *prometheus.CounterVec
}
// time (Idle) or the earliest create time (Booting)
for _, wkr := range wp.workers {
if wkr.idleBehavior != IdleBehaviorHold && wkr.state == tryState && wkr.instType == it {
- logger.WithField("Instance", wkr.instance).Info("shutting down")
+ logger.WithField("Instance", wkr.instance.ID()).Info("shutting down")
+ wkr.reportBootOutcome(BootOutcomeAborted)
wkr.shutdown()
return true
}
defer wp.mtx.Unlock()
var wkr *worker
for _, w := range wp.workers {
- if w.instType == it && w.state == StateIdle {
+ if w.instType == it && w.state == StateIdle && w.idleBehavior == IdleBehaviorRun {
if wkr == nil || w.busy.After(wkr.busy) {
wkr = w
}
Subsystem: "dispatchcloud",
Name: "instances_total",
Help: "Number of cloud VMs.",
- }, []string{"category"})
+ }, []string{"category", "instance_type"})
reg.MustRegister(wp.mInstances)
wp.mInstancesPrice = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "arvados",
Help: "Total memory on all cloud VMs.",
}, []string{"category"})
reg.MustRegister(wp.mMemory)
+ wp.mBootOutcomes = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "boot_outcomes",
+ Help: "Boot outcomes by type.",
+ }, []string{"state"})
+ for k := range validBootOutcomes {
+ wp.mBootOutcomes.WithLabelValues(string(k)).Add(0)
+ }
+ reg.MustRegister(wp.mBootOutcomes)
wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "arvados",
Subsystem: "dispatchcloud",
Name: "instances_disappeared",
Help: "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
- }, []string{"state"})
+ }, []string{"outcome"})
for _, v := range stateString {
wp.mDisappearances.WithLabelValues(v).Add(0)
}
wp.mtx.RLock()
defer wp.mtx.RUnlock()
- instances := map[string]int64{}
+ type entKey struct {
+ cat string
+ instType string
+ }
+ instances := map[entKey]int64{}
price := map[string]float64{}
cpu := map[string]int64{}
mem := map[string]int64{}
default:
cat = "idle"
}
- instances[cat]++
+ instances[entKey{cat, wkr.instType.Name}]++
price[cat] += wkr.instType.Price
cpu[cat] += int64(wkr.instType.VCPUs)
mem[cat] += int64(wkr.instType.RAM)
running += int64(len(wkr.running) + len(wkr.starting))
}
for _, cat := range []string{"inuse", "hold", "booting", "unknown", "idle"} {
- wp.mInstances.WithLabelValues(cat).Set(float64(instances[cat]))
wp.mInstancesPrice.WithLabelValues(cat).Set(price[cat])
wp.mVCPUs.WithLabelValues(cat).Set(float64(cpu[cat]))
wp.mMemory.WithLabelValues(cat).Set(float64(mem[cat]))
+ // make sure to reset gauges for non-existing category/nodetype combinations
+ for _, it := range wp.instanceTypes {
+ if _, ok := instances[entKey{cat, it.Name}]; !ok {
+ wp.mInstances.WithLabelValues(cat, it.Name).Set(float64(0))
+ }
+ }
+ }
+ for k, v := range instances {
+ wp.mInstances.WithLabelValues(k.cat, k.instType).Set(float64(v))
}
wp.mContainersRunning.Set(float64(running))
}
return errors.New("instance not found")
}
wkr.logger.WithField("Reason", reason).Info("shutting down")
+ wkr.reportBootOutcome(BootOutcomeAborted)
wkr.shutdown()
return nil
}
itTag := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceType]
it, ok := wp.instanceTypes[itTag]
if !ok {
- wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
+ wp.logger.WithField("Instance", inst.ID()).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
continue
}
if wkr, isNew := wp.updateWorker(inst, it); isNew {
notify = true
} else if wkr.state == StateShutdown && time.Since(wkr.destroyed) > wp.timeoutShutdown {
- wp.logger.WithField("Instance", inst).Info("worker still listed after shutdown; retrying")
+ wp.logger.WithField("Instance", inst.ID()).Info("worker still listed after shutdown; retrying")
wkr.shutdown()
}
}
"WorkerState": wkr.state,
})
logger.Info("instance disappeared in cloud")
+ wkr.reportBootOutcome(BootOutcomeDisappeared)
if wp.mDisappearances != nil {
wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
}