Merge branch '17530-arvados-client-fastfail-take2'
authorNico Cesar <nico@nicocesar.com>
Fri, 16 Apr 2021 21:00:42 +0000 (17:00 -0400)
committerNico Cesar <nico@nicocesar.com>
Fri, 16 Apr 2021 21:00:42 +0000 (17:00 -0400)
closes #17530

Arvados-DCO-1.1-Signed-off-by: Nico Cesar <nico@curii.com>

lib/cloud/ec2/ec2.go
lib/cloud/ec2/ec2_test.go
lib/dispatchcloud/driver.go
sdk/python/arvados/api.py
sdk/python/tests/test_api.py

index 1e0de74024f52851ebe4eb08c0414617d0bdc7db..071c95006c9b305b1f47737bbb6eab588961785c 100644 (file)
@@ -14,6 +14,8 @@ import (
        "fmt"
        "math/big"
        "sync"
+       "sync/atomic"
+       "time"
 
        "git.arvados.org/arvados.git/lib/cloud"
        "git.arvados.org/arvados.git/sdk/go/arvados"
@@ -21,6 +23,7 @@ import (
        "github.com/aws/aws-sdk-go/aws/credentials"
        "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
        "github.com/aws/aws-sdk-go/aws/ec2metadata"
+       "github.com/aws/aws-sdk-go/aws/request"
        "github.com/aws/aws-sdk-go/aws/session"
        "github.com/aws/aws-sdk-go/service/ec2"
        "github.com/sirupsen/logrus"
@@ -30,6 +33,11 @@ import (
 // Driver is the ec2 implementation of the cloud.Driver interface.
 var Driver = cloud.DriverFunc(newEC2InstanceSet)
 
+const (
+       throttleDelayMin = time.Second
+       throttleDelayMax = time.Minute
+)
+
 type ec2InstanceSetConfig struct {
        AccessKeyID      string
        SecretAccessKey  string
@@ -50,12 +58,14 @@ type ec2Interface interface {
 }
 
 type ec2InstanceSet struct {
-       ec2config     ec2InstanceSetConfig
-       instanceSetID cloud.InstanceSetID
-       logger        logrus.FieldLogger
-       client        ec2Interface
-       keysMtx       sync.Mutex
-       keys          map[string]string
+       ec2config              ec2InstanceSetConfig
+       instanceSetID          cloud.InstanceSetID
+       logger                 logrus.FieldLogger
+       client                 ec2Interface
+       keysMtx                sync.Mutex
+       keys                   map[string]string
+       throttleDelayCreate    atomic.Value
+       throttleDelayInstances atomic.Value
 }
 
 func newEC2InstanceSet(config json.RawMessage, instanceSetID cloud.InstanceSetID, _ cloud.SharedResourceTags, logger logrus.FieldLogger) (prv cloud.InstanceSet, err error) {
@@ -220,7 +230,7 @@ func (instanceSet *ec2InstanceSet) Create(
        }
 
        rsv, err := instanceSet.client.RunInstances(&rii)
-
+       err = wrapError(err, &instanceSet.throttleDelayCreate)
        if err != nil {
                return nil, err
        }
@@ -242,6 +252,7 @@ func (instanceSet *ec2InstanceSet) Instances(tags cloud.InstanceTags) (instances
        dii := &ec2.DescribeInstancesInput{Filters: filters}
        for {
                dio, err := instanceSet.client.DescribeInstances(dii)
+               err = wrapError(err, &instanceSet.throttleDelayInstances)
                if err != nil {
                        return nil, err
                }
@@ -328,3 +339,33 @@ func (inst *ec2Instance) RemoteUser() string {
 func (inst *ec2Instance) VerifyHostKey(ssh.PublicKey, *ssh.Client) error {
        return cloud.ErrNotImplemented
 }
+
+type rateLimitError struct {
+       error
+       earliestRetry time.Time
+}
+
+func (err rateLimitError) EarliestRetry() time.Time {
+       return err.earliestRetry
+}
+
+func wrapError(err error, throttleValue *atomic.Value) error {
+       if request.IsErrorThrottle(err) {
+               // Back off exponentially until an upstream call
+               // either succeeds or returns a non-throttle error.
+               d, _ := throttleValue.Load().(time.Duration)
+               d = d*3/2 + time.Second
+               if d < throttleDelayMin {
+                       d = throttleDelayMin
+               } else if d > throttleDelayMax {
+                       d = throttleDelayMax
+               }
+               throttleValue.Store(d)
+               return rateLimitError{error: err, earliestRetry: time.Now().Add(d)}
+       } else if err != nil {
+               throttleValue.Store(time.Duration(0))
+               return err
+       }
+       throttleValue.Store(time.Duration(0))
+       return nil
+}
index 6aa6e857ff59b278aa3a54292ab461527502d84b..e7319a0cb66d7fe2e0132c9092f88ab5d5714a28 100644 (file)
@@ -245,3 +245,5 @@ func (*EC2InstanceSetSuite) TestDestroyInstances(c *check.C) {
                c.Check(i.Destroy(), check.IsNil)
        }
 }
+
+var TestRateLimitErrorInterface cloud.RateLimitError = rateLimitError{}
index fe498d0484b0d41a0ceb0428aafc68a879033cc6..5fcc0903f5d1b43d3b58e83d8e6721832985c4ed 100644 (file)
@@ -55,6 +55,15 @@ type rateLimitedInstanceSet struct {
        ticker *time.Ticker
 }
 
+func (is rateLimitedInstanceSet) Instances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
+       <-is.ticker.C
+       insts, err := is.InstanceSet.Instances(tags)
+       for i, inst := range insts {
+               insts[i] = &rateLimitedInstance{inst, is.ticker}
+       }
+       return insts, err
+}
+
 func (is rateLimitedInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, init cloud.InitCommand, pk ssh.PublicKey) (cloud.Instance, error) {
        <-is.ticker.C
        inst, err := is.InstanceSet.Create(it, image, tags, init, pk)
@@ -71,6 +80,11 @@ func (inst *rateLimitedInstance) Destroy() error {
        return inst.Instance.Destroy()
 }
 
+func (inst *rateLimitedInstance) SetTags(tags cloud.InstanceTags) error {
+       <-inst.ticker.C
+       return inst.Instance.SetTags(tags)
+}
+
 // Adds the specified defaultTags to every Create() call.
 type defaultTaggingInstanceSet struct {
        cloud.InstanceSet
index 315fc74a713f42fbee7b7b030c36576ed5426bc0..4fe3999f2c3098391f387f51fe96cc628bf1a790 100644 (file)
@@ -157,7 +157,7 @@ def http_cache(data_type):
     return cache.SafeHTTPCache(path, max_age=60*60*24*2)
 
 def api(version=None, cache=True, host=None, token=None, insecure=False,
-        request_id=None, timeout=5*60, **kwargs):
+        request_id=None, timeout=10, **kwargs):
     """Return an apiclient Resources object for an Arvados instance.
 
     :version:
index 8d3142ab6aa49980babae66e255d2f183224109e..60183e06a352259530534bedf56da1bbba5c3443 100644 (file)
@@ -101,6 +101,12 @@ class ArvadosApiTest(run_test_server.TestCaseWithServers):
             text = "X" * maxsize
             arvados.api('v1').collections().create(body={"manifest_text": text}).execute()
 
+    # Checks for bug #17171
+    def test_default_request_timeout(self):
+        api = arvados.api('v1')
+        self.assertEqual(api._http.timeout, 10,
+            "Default timeout value should be 10")
+
     def test_ordered_json_model(self):
         mock_responses = {
             'arvados.humans.get': (