dontstart := map[arvados.InstanceType]bool{}
var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
var containerAllocatedWorkerBootingCount int
+ var longestWaitTimeCandidate, previousLongestWaitTimeCandidate float64
tryrun:
for i, ctr := range sorted {
if _, running := running[ctr.UUID]; running || ctr.Priority < 1 {
continue
}
+ previousLongestWaitTimeCandidate = longestWaitTimeCandidate
+ since := time.Since(ctr.CreatedAt).Seconds()
+ if since > longestWaitTimeCandidate {
+ longestWaitTimeCandidate = since
+ }
switch ctr.State {
case arvados.ContainerStateQueued:
if unalloc[it] < 1 && sch.pool.AtQuota() {
logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
} else if sch.pool.StartContainer(it, ctr) {
// Success.
+ longestWaitTimeCandidate = previousLongestWaitTimeCandidate
} else {
containerAllocatedWorkerBootingCount += 1
dontstart[it] = true
sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+ sch.mLongestWaitTimeSinceQueue.Set(longestWaitTimeCandidate)
if len(overquota) > 0 {
// Unlock any containers that are unmappable while
c.Check(pool.Running(), check.HasLen, 0)
}
-func (*SchedulerSuite) TestContainersAllocatedNotStartedMetric(c *check.C) {
+func (*SchedulerSuite) TestContainersMetrics(c *check.C) {
ctx := ctxlog.Context(context.Background(), ctxlog.TestLogger(c))
queue := test.Queue{
ChooseType: chooseType,
Containers: []arvados.Container{
{
- UUID: test.ContainerUUID(1),
- Priority: 1,
- State: arvados.ContainerStateLocked,
+ UUID: test.ContainerUUID(1),
+ Priority: 1,
+ State: arvados.ContainerStateLocked,
+ CreatedAt: time.Now().Add(-10 * time.Second),
RuntimeConstraints: arvados.RuntimeConstraints{
VCPUs: 1,
RAM: 1 << 30,
c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 1)
c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 0)
+ c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
// Create a pool without workers. The queued container will not be started, and the
// 'over quota' metric will be 1 because no workers are available and canCreate defaults
c.Check(int(testutil.ToFloat64(sch.mContainersAllocatedNotStarted)), check.Equals, 0)
c.Check(int(testutil.ToFloat64(sch.mContainersNotAllocatedOverQuota)), check.Equals, 1)
+ c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 10)
+
+ // Reset the queue, and create a pool with an idle worker. The queued
+ // container will be started immediately and mLongestWaitTimeSinceQueue
+ // should be zero.
+ queue = test.Queue{
+ ChooseType: chooseType,
+ Containers: []arvados.Container{
+ {
+ UUID: test.ContainerUUID(1),
+ Priority: 1,
+ State: arvados.ContainerStateLocked,
+ CreatedAt: time.Now().Add(-10 * time.Second),
+ RuntimeConstraints: arvados.RuntimeConstraints{
+ VCPUs: 1,
+ RAM: 1 << 30,
+ },
+ },
+ },
+ }
+ queue.Update()
+
+ pool = stubPool{
+ idle: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+ unalloc: map[arvados.InstanceType]int{test.InstanceType(1): 1},
+ running: map[string]time.Time{},
+ }
+ sch = New(ctx, &queue, &pool, nil, time.Millisecond, time.Millisecond)
+ sch.runQueue()
+
+ c.Check(int(testutil.ToFloat64(sch.mLongestWaitTimeSinceQueue)), check.Equals, 0)
}
mContainersAllocatedNotStarted prometheus.Gauge
mContainersNotAllocatedOverQuota prometheus.Gauge
+ mLongestWaitTimeSinceQueue prometheus.Gauge
}
// New returns a new unstarted Scheduler.
Help: "Number of containers not allocated to a worker because the system has hit a quota.",
})
reg.MustRegister(sch.mContainersNotAllocatedOverQuota)
+ sch.mLongestWaitTimeSinceQueue = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: "arvados",
+ Subsystem: "dispatchcloud",
+ Name: "containers_longest_wait_time_seconds",
+ Help: "Current longest wait time of any container since queuing, and before the start of crunch-run.",
+ })
+ reg.MustRegister(sch.mLongestWaitTimeSinceQueue)
}
// Start starts the scheduler.