// instances have been shutdown.
quotaErrorTTL = time.Minute
+ // Time after a capacity error to try again
+ capacityErrorTTL = time.Minute
+
// Time between "X failed because rate limiting" messages
logRateLimitErrorInterval = time.Second * 10
)
atQuotaUntilFewerInstances int
atQuotaUntil time.Time
atQuotaErr cloud.QuotaError
+ atCapacityUntil map[string]time.Time
stop chan bool
mtx sync.RWMutex
setupOnce sync.Once
// Boot probe is certain to fail.
return false
}
- wp.mtx.Lock()
- defer wp.mtx.Unlock()
- if time.Now().Before(wp.atQuotaUntil) ||
- wp.atQuotaUntilFewerInstances > 0 ||
- wp.instanceSet.throttleCreate.Error() != nil ||
- (wp.maxInstances > 0 && wp.maxInstances <= len(wp.workers)+len(wp.creating)) {
+ if wp.AtCapacity(it) || wp.AtQuota() || wp.instanceSet.throttleCreate.Error() != nil {
return false
}
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
// The maxConcurrentInstanceCreateOps knob throttles the number of node create
// requests in flight. It was added to work around a limitation in Azure's
// managed disks, which support no more than 20 concurrent node creation
logger.WithField("atQuotaUntilFewerInstances", n).Info("quota error -- waiting for next instance shutdown")
}
}
+ if err, ok := err.(cloud.CapacityError); ok && err.IsCapacityError() {
+ capKey := it.ProviderType
+ if !err.IsInstanceTypeSpecific() {
+ // set capacity flag for all
+ // instance types
+ capKey = ""
+ }
+ if wp.atCapacityUntil == nil {
+ wp.atCapacityUntil = map[string]time.Time{}
+ }
+ wp.atCapacityUntil[capKey] = time.Now().Add(capacityErrorTTL)
+ time.AfterFunc(capacityErrorTTL, wp.notify)
+ }
logger.WithError(err).Error("create failed")
wp.instanceSet.throttleCreate.CheckRateLimitError(err, wp.logger, "create instance", wp.notify)
return
return true
}
+// AtCapacity returns true if Create() is currently expected to fail
+// for the given instance type.
+func (wp *Pool) AtCapacity(it arvados.InstanceType) bool {
+ wp.mtx.Lock()
+ defer wp.mtx.Unlock()
+ if t, ok := wp.atCapacityUntil[it.ProviderType]; ok && time.Now().Before(t) {
+ // at capacity for this instance type
+ return true
+ }
+ if t, ok := wp.atCapacityUntil[""]; ok && time.Now().Before(t) {
+ // at capacity for all instance types
+ return true
+ }
+ return false
+}
+
// AtQuota returns true if Create is not expected to work at the
// moment (e.g., cloud provider has reported quota errors, or we are
// already at our own configured quota).
}
if wp.atQuotaUntilFewerInstances > len(wp.workers)+len(wp.creating) {
+ // After syncing, there are fewer instances (including
+ // pending creates) than there were last time we saw a
+ // quota error. This might mean it's now possible to
+ // create new instances. Reset our "at quota" state.
wp.atQuotaUntilFewerInstances = 0
}