sorted = append(sorted, ent)
}
sort.Slice(sorted, func(i, j int) bool {
- return sorted[i].Container.Priority > sorted[j].Container.Priority
+ if pi, pj := sorted[i].Container.Priority, sorted[j].Container.Priority; pi != pj {
+ return pi > pj
+ } else {
+ // When containers have identical priority,
+ // start them in the order we first noticed
+ // them. This avoids extra lock/unlock cycles
+ // when we unlock the containers that don't
+ // fit in the available pool.
+ return sorted[i].FirstSeenAt.Before(sorted[j].FirstSeenAt)
+ }
})
running := sch.pool.Running()
dontstart := map[arvados.InstanceType]bool{}
var overquota []container.QueueEnt // entries that are unmappable because of worker pool quota
+ var containerAllocatedWorkerBootingCount int
tryrun:
for i, ctr := range sorted {
overquota = sorted[i:]
break tryrun
}
+ if sch.pool.KillContainer(ctr.UUID, "about to lock") {
+ logger.Info("not locking: crunch-run process from previous attempt has not exited")
+ continue
+ }
go sch.lockContainer(logger, ctr.UUID)
unalloc[it]--
case arvados.ContainerStateLocked:
if unalloc[it] > 0 {
unalloc[it]--
} else if sch.pool.AtQuota() {
- logger.Debug("not starting: AtQuota and no unalloc workers")
+ // Don't let lower-priority containers
+ // starve this one by keeping idle
+ // workers alive on different
+ // instance types.
+ logger.Trace("overquota")
overquota = sorted[i:]
break tryrun
- } else {
+ } else if sch.pool.Create(it) {
+ // Success. (Note pool.Create works
+ // asynchronously and does its own
+ // logging about the eventual outcome,
+ // so we don't need to.)
logger.Info("creating new instance")
- if !sch.pool.Create(it) {
- // (Note pool.Create works
- // asynchronously and logs its
- // own failures, so we don't
- // need to log this as a
- // failure.)
-
- sch.queue.Unlock(ctr.UUID)
- // Don't let lower-priority
- // containers starve this one
- // by using keeping idle
- // workers alive on different
- // instance types. TODO:
- // avoid getting starved here
- // if instances of a specific
- // type always fail.
- overquota = sorted[i:]
- break tryrun
- }
+ } else {
+ // Failed despite not being at quota,
+ // e.g., cloud ops throttled. TODO:
+ // avoid getting starved here if
+ // instances of a specific type always
+ // fail.
+ logger.Trace("pool declined to create new instance")
+ continue
}
if dontstart[it] {
// a higher-priority container on the
// same instance type. Don't let this
// one sneak in ahead of it.
+ } else if sch.pool.KillContainer(ctr.UUID, "about to start") {
+ logger.Info("not restarting yet: crunch-run process from previous attempt has not exited")
} else if sch.pool.StartContainer(it, ctr) {
// Success.
} else {
+ containerAllocatedWorkerBootingCount += 1
dontstart[it] = true
}
}
}
+ sch.mContainersAllocatedNotStarted.Set(float64(containerAllocatedWorkerBootingCount))
+ sch.mContainersNotAllocatedOverQuota.Set(float64(len(overquota)))
+
if len(overquota) > 0 {
// Unlock any containers that are unmappable while
// we're at quota.