instanceSet: &throttledInstanceSet{InstanceSet: instanceSet},
newExecutor: newExecutor,
bootProbeCommand: cluster.Containers.CloudVMs.BootProbeCommand,
+ runnerSource: cluster.Containers.CloudVMs.DeployRunnerBinary,
imageID: cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
instanceTypes: cluster.InstanceTypes,
maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
// time (Idle) or the earliest create time (Booting)
for _, wkr := range wp.workers {
if wkr.idleBehavior != IdleBehaviorHold && wkr.state == tryState && wkr.instType == it {
- logger.WithField("Instance", wkr.instance).Info("shutting down")
+ logger.WithField("Instance", wkr.instance.ID()).Info("shutting down")
wkr.shutdown()
return true
}
Subsystem: "dispatchcloud",
Name: "instances_total",
Help: "Number of cloud VMs.",
- }, []string{"category"})
+ }, []string{"category", "instance_type"})
reg.MustRegister(wp.mInstances)
wp.mInstancesPrice = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "arvados",
wp.mtx.RLock()
defer wp.mtx.RUnlock()
- instances := map[string]int64{}
+ type entKey struct {
+ cat string
+ instType string
+ }
+ instances := map[entKey]int64{}
price := map[string]float64{}
cpu := map[string]int64{}
mem := map[string]int64{}
default:
cat = "idle"
}
- instances[cat]++
+ instances[entKey{cat, wkr.instType.Name}]++
price[cat] += wkr.instType.Price
cpu[cat] += int64(wkr.instType.VCPUs)
mem[cat] += int64(wkr.instType.RAM)
running += int64(len(wkr.running) + len(wkr.starting))
}
for _, cat := range []string{"inuse", "hold", "booting", "unknown", "idle"} {
- wp.mInstances.WithLabelValues(cat).Set(float64(instances[cat]))
wp.mInstancesPrice.WithLabelValues(cat).Set(price[cat])
wp.mVCPUs.WithLabelValues(cat).Set(float64(cpu[cat]))
wp.mMemory.WithLabelValues(cat).Set(float64(mem[cat]))
+ // Zero the gauge for every category/instance-type combination that has no workers right now; otherwise it would keep reporting its previous value.
+ for _, it := range wp.instanceTypes {
+ if _, ok := instances[entKey{cat, it.Name}]; !ok {
+ wp.mInstances.WithLabelValues(cat, it.Name).Set(float64(0))
+ }
+ }
+ }
+ for k, v := range instances {
+ wp.mInstances.WithLabelValues(k.cat, k.instType).Set(float64(v))
}
wp.mContainersRunning.Set(float64(running))
}
}
wp.runnerData = buf
wp.runnerMD5 = md5.Sum(buf)
- wp.runnerCmd = fmt.Sprintf("/var/run/arvados/crunch-run~%x", wp.runnerMD5)
+ wp.runnerCmd = fmt.Sprintf("/var/lib/arvados/crunch-run~%x", wp.runnerMD5)
return nil
}
itTag := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceType]
it, ok := wp.instanceTypes[itTag]
if !ok {
- wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
+ wp.logger.WithField("Instance", inst.ID()).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
continue
}
if wkr, isNew := wp.updateWorker(inst, it); isNew {
notify = true
} else if wkr.state == StateShutdown && time.Since(wkr.destroyed) > wp.timeoutShutdown {
- wp.logger.WithField("Instance", inst).Info("worker still listed after shutdown; retrying")
+ wp.logger.WithField("Instance", inst.ID()).Info("worker still listed after shutdown; retrying")
wkr.shutdown()
}
}