16739: implement review feedback.
authorWard Vandewege <ward@curii.com>
Tue, 1 Sep 2020 13:43:57 +0000 (09:43 -0400)
committerWard Vandewege <ward@curii.com>
Tue, 1 Sep 2020 13:43:57 +0000 (09:43 -0400)
Arvados-DCO-1.1-Signed-off-by: Ward Vandewege <ward@curii.com>

lib/config/config.default.yml
lib/config/generated_config.go
lib/dispatchcloud/worker/pool.go
lib/dispatchcloud/worker/pool_test.go
lib/dispatchcloud/worker/throttle.go
sdk/go/arvados/config.go

index 010459b0ee24519af1acee109fb57d7666fad850..57baf534277625aa2566baeb89b2dff70682f0a0 100644 (file)
@@ -945,11 +945,11 @@ Clusters:
         # unlimited).
         MaxCloudOpsPerSecond: 0
 
-        # Maximum concurrent node creation operations (0 = unlimited).  (azure)
-        # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
-        # timeouts, cf
-        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
-        MaxConcurrentNodeCreateOps: 0
+        # Maximum concurrent node creation operations (0 = unlimited). This is
+        # recommended by Azure in certain scenarios (see
+        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image)
+        # and can be used with other cloud providers too, if desired.
+        MaxConcurrentInstanceCreateOps: 0
 
         # Interval between cloud provider syncs/updates ("list all
         # instances").
index 866012e09977792b85f4b308cf0a796416f1f1ac..3df644cf6ea56ef686f12abb1b51c599c3825fe8 100644 (file)
@@ -951,11 +951,11 @@ Clusters:
         # unlimited).
         MaxCloudOpsPerSecond: 0
 
-        # Maximum concurrent node creation operations (0 = unlimited).  (azure)
-        # managed disks: set MaxConcurrentNodeCreateOps to 20 to avoid
-        # timeouts, cf
-        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image
-        MaxConcurrentNodeCreateOps: 0
+        # Maximum concurrent node creation operations (0 = unlimited). This is
+        # recommended by Azure in certain scenarios (see
+        # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image)
+        # and can be used with other cloud providers too, if desired.
+        MaxConcurrentInstanceCreateOps: 0
 
         # Interval between cloud provider syncs/updates ("list all
         # instances").
index ec6a049e640e7e1d619251e21af07f418aa28713..435b6e43ae4a3e150f680883aa7e124daf6e4230 100644 (file)
@@ -96,28 +96,28 @@ func duration(conf arvados.Duration, def time.Duration) time.Duration {
 // cluster configuration.
 func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSetID cloud.InstanceSetID, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
        wp := &Pool{
-               logger:                     logger,
-               arvClient:                  arvClient,
-               instanceSetID:              instanceSetID,
-               instanceSet:                &throttledInstanceSet{InstanceSet: instanceSet},
-               newExecutor:                newExecutor,
-               bootProbeCommand:           cluster.Containers.CloudVMs.BootProbeCommand,
-               runnerSource:               cluster.Containers.CloudVMs.DeployRunnerBinary,
-               imageID:                    cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
-               instanceTypes:              cluster.InstanceTypes,
-               maxProbesPerSecond:         cluster.Containers.CloudVMs.MaxProbesPerSecond,
-               maxConcurrentNodeCreateOps: cluster.Containers.CloudVMs.MaxConcurrentNodeCreateOps,
-               probeInterval:              duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
-               syncInterval:               duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
-               timeoutIdle:                duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
-               timeoutBooting:             duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
-               timeoutProbe:               duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
-               timeoutShutdown:            duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
-               timeoutTERM:                duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
-               timeoutSignal:              duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
-               installPublicKey:           installPublicKey,
-               tagKeyPrefix:               cluster.Containers.CloudVMs.TagKeyPrefix,
-               stop:                       make(chan bool),
+               logger:                         logger,
+               arvClient:                      arvClient,
+               instanceSetID:                  instanceSetID,
+               instanceSet:                    &throttledInstanceSet{InstanceSet: instanceSet},
+               newExecutor:                    newExecutor,
+               bootProbeCommand:               cluster.Containers.CloudVMs.BootProbeCommand,
+               runnerSource:                   cluster.Containers.CloudVMs.DeployRunnerBinary,
+               imageID:                        cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
+               instanceTypes:                  cluster.InstanceTypes,
+               maxProbesPerSecond:             cluster.Containers.CloudVMs.MaxProbesPerSecond,
+               maxConcurrentInstanceCreateOps: cluster.Containers.CloudVMs.MaxConcurrentInstanceCreateOps,
+               probeInterval:                  duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
+               syncInterval:                   duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
+               timeoutIdle:                    duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
+               timeoutBooting:                 duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
+               timeoutProbe:                   duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+               timeoutShutdown:                duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
+               timeoutTERM:                    duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
+               timeoutSignal:                  duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+               installPublicKey:               installPublicKey,
+               tagKeyPrefix:                   cluster.Containers.CloudVMs.TagKeyPrefix,
+               stop:                           make(chan bool),
        }
        wp.registerMetrics(reg)
        go func() {
@@ -133,27 +133,27 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 // zero Pool should not be used. Call NewPool to create a new Pool.
 type Pool struct {
        // configuration
-       logger                     logrus.FieldLogger
-       arvClient                  *arvados.Client
-       instanceSetID              cloud.InstanceSetID
-       instanceSet                *throttledInstanceSet
-       newExecutor                func(cloud.Instance) Executor
-       bootProbeCommand           string
-       runnerSource               string
-       imageID                    cloud.ImageID
-       instanceTypes              map[string]arvados.InstanceType
-       syncInterval               time.Duration
-       probeInterval              time.Duration
-       maxProbesPerSecond         int
-       maxConcurrentNodeCreateOps int
-       timeoutIdle                time.Duration
-       timeoutBooting             time.Duration
-       timeoutProbe               time.Duration
-       timeoutShutdown            time.Duration
-       timeoutTERM                time.Duration
-       timeoutSignal              time.Duration
-       installPublicKey           ssh.PublicKey
-       tagKeyPrefix               string
+       logger                         logrus.FieldLogger
+       arvClient                      *arvados.Client
+       instanceSetID                  cloud.InstanceSetID
+       instanceSet                    *throttledInstanceSet
+       newExecutor                    func(cloud.Instance) Executor
+       bootProbeCommand               string
+       runnerSource                   string
+       imageID                        cloud.ImageID
+       instanceTypes                  map[string]arvados.InstanceType
+       syncInterval                   time.Duration
+       probeInterval                  time.Duration
+       maxProbesPerSecond             int
+       maxConcurrentInstanceCreateOps int
+       timeoutIdle                    time.Duration
+       timeoutBooting                 time.Duration
+       timeoutProbe                   time.Duration
+       timeoutShutdown                time.Duration
+       timeoutTERM                    time.Duration
+       timeoutSignal                  time.Duration
+       installPublicKey               ssh.PublicKey
+       tagKeyPrefix                   string
 
        // private state
        subscribers  map[<-chan struct{}]chan<- struct{}
@@ -280,13 +280,6 @@ func (wp *Pool) Unallocated() map[arvados.InstanceType]int {
        return unalloc
 }
 
-type RateLimitError struct{ Retry time.Time }
-
-func (e RateLimitError) Error() string {
-       return fmt.Sprintf("node creation request failed, hit maxConcurrentNodeCreateOps, wait until %s", e.Retry)
-}
-func (e RateLimitError) EarliestRetry() time.Time { return e.Retry }
-
 // Create a new instance with the given type, and add it to the worker
 // pool. The worker is added immediately; instance creation runs in
 // the background.
@@ -307,15 +300,16 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
        if time.Now().Before(wp.atQuotaUntil) || wp.instanceSet.throttleCreate.Error() != nil {
                return false
        }
-       // The maxConcurrentNodeCreateOps knob throttles the number of node create
+       // The maxConcurrentInstanceCreateOps knob throttles the number of node create
        // requests in flight. It was added to work around a limitation in Azure's
        // managed disks, which support no more than 20 concurrent node creation
        // requests from a single disk image (cf.
        // https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image).
        // The code assumes that node creation, from Azure's perspective, means the
        // period until the instance appears in the "get all instances" list.
-       if wp.maxConcurrentNodeCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentNodeCreateOps {
-               wp.instanceSet.throttleCreate.CheckRateLimitError(RateLimitError{Retry: time.Now().Add(5 * time.Second)}, wp.logger, "create instance", wp.notify)
+       if wp.maxConcurrentInstanceCreateOps > 0 && len(wp.creating) >= wp.maxConcurrentInstanceCreateOps {
+               logger.Info("reached MaxConcurrentInstanceCreateOps")
+               wp.instanceSet.throttleCreate.ErrorUntil(errors.New("reached MaxConcurrentInstanceCreateOps"), time.Now().Add(5*time.Second), wp.notify)
                return false
        }
        now := time.Now()
index 6b540eee72a85fe0385982537f110f77a07590e1..7b116c3eefb09cdba27043598786f711a9c7cf53 100644 (file)
@@ -207,9 +207,9 @@ func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
 
        type1 := test.InstanceType(1)
        pool := &Pool{
-               logger:                     logger,
-               instanceSet:                &throttledInstanceSet{InstanceSet: instanceSet},
-               maxConcurrentNodeCreateOps: 1,
+               logger:                         logger,
+               instanceSet:                    &throttledInstanceSet{InstanceSet: instanceSet},
+               maxConcurrentInstanceCreateOps: 1,
                instanceTypes: arvados.InstanceTypeMap{
                        type1.Name: type1,
                },
@@ -224,13 +224,15 @@ func (suite *PoolSuite) TestNodeCreateThrottle(c *check.C) {
        c.Check(pool.Unallocated()[type1], check.Equals, 1)
        c.Check(res, check.Equals, false)
 
-       pool.maxConcurrentNodeCreateOps = 2
+       pool.instanceSet.throttleCreate.ResetError()
+       pool.maxConcurrentInstanceCreateOps = 2
 
        res = pool.Create(type1)
        c.Check(pool.Unallocated()[type1], check.Equals, 2)
        c.Check(res, check.Equals, true)
 
-       pool.maxConcurrentNodeCreateOps = 0
+       pool.instanceSet.throttleCreate.ResetError()
+       pool.maxConcurrentInstanceCreateOps = 0
 
        res = pool.Create(type1)
        c.Check(pool.Unallocated()[type1], check.Equals, 3)
index defbf91ff38390d22062aa03d1f3f34149428e27..5374afb44fccb0408e26039a8c53b1bfa1f8481d 100644 (file)
@@ -61,6 +61,12 @@ func (thr *throttle) Error() error {
        return thr.err
 }
 
+func (thr *throttle) ResetError() {
+       thr.mtx.Lock()
+       defer thr.mtx.Unlock()
+       thr.err = nil
+}
+
 type throttledInstanceSet struct {
        cloud.InstanceSet
        throttleCreate    throttle
index 1e190a947671bf7c6afc95c67625c359d6786ba3..829d4097c8496daea349d3fe290367ff2e8c9eda 100644 (file)
@@ -446,24 +446,24 @@ type ContainersConfig struct {
 type CloudVMsConfig struct {
        Enable bool
 
-       BootProbeCommand           string
-       DeployRunnerBinary         string
-       ImageID                    string
-       MaxCloudOpsPerSecond       int
-       MaxProbesPerSecond         int
-       MaxConcurrentNodeCreateOps int
-       PollInterval               Duration
-       ProbeInterval              Duration
-       SSHPort                    string
-       SyncInterval               Duration
-       TimeoutBooting             Duration
-       TimeoutIdle                Duration
-       TimeoutProbe               Duration
-       TimeoutShutdown            Duration
-       TimeoutSignal              Duration
-       TimeoutTERM                Duration
-       ResourceTags               map[string]string
-       TagKeyPrefix               string
+       BootProbeCommand               string
+       DeployRunnerBinary             string
+       ImageID                        string
+       MaxCloudOpsPerSecond           int
+       MaxProbesPerSecond             int
+       MaxConcurrentInstanceCreateOps int
+       PollInterval                   Duration
+       ProbeInterval                  Duration
+       SSHPort                        string
+       SyncInterval                   Duration
+       TimeoutBooting                 Duration
+       TimeoutIdle                    Duration
+       TimeoutProbe                   Duration
+       TimeoutShutdown                Duration
+       TimeoutSignal                  Duration
+       TimeoutTERM                    Duration
+       ResourceTags                   map[string]string
+       TagKeyPrefix                   string
 
        Driver           string
        DriverParameters json.RawMessage