14399: Retry Azure ListBlobs call after 503 errors.
author Tom Clegg <tclegg@veritasgenetics.com>
Thu, 25 Apr 2019 17:54:03 +0000 (13:54 -0400)
committer Tom Clegg <tclegg@veritasgenetics.com>
Fri, 26 Apr 2019 17:29:40 +0000 (13:29 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

services/keepstore/azure_blob_volume.go
services/keepstore/azure_blob_volume_test.go

index 6b5b233c2a6701912ce06b1356fdb864778d0cf8..3c17b3bd0641e2bee23007d775b1740e2c7a14d4 100644 (file)
@@ -26,7 +26,11 @@ import (
        "github.com/prometheus/client_golang/prometheus"
 )
 
-const azureDefaultRequestTimeout = arvados.Duration(10 * time.Minute)
+const (
+       azureDefaultRequestTimeout       = arvados.Duration(10 * time.Minute)
+       azureDefaultListBlobsMaxAttempts = 12
+       azureDefaultListBlobsRetryDelay  = arvados.Duration(10 * time.Second)
+)
 
 var (
        azureMaxGetBytes           int
@@ -108,6 +112,8 @@ type AzureBlobVolume struct {
        ReadOnly              bool
        RequestTimeout        arvados.Duration
        StorageClasses        []string
+       ListBlobsRetryDelay   arvados.Duration
+       ListBlobsMaxAttempts  int
 
        azClient  storage.Client
        container *azureContainer
@@ -149,6 +155,12 @@ func (v *AzureBlobVolume) Type() string {
 
 // Start implements Volume.
 func (v *AzureBlobVolume) Start(vm *volumeMetricsVecs) error {
+       if v.ListBlobsRetryDelay == 0 {
+               v.ListBlobsRetryDelay = azureDefaultListBlobsRetryDelay
+       }
+       if v.ListBlobsMaxAttempts == 0 {
+               v.ListBlobsMaxAttempts = azureDefaultListBlobsMaxAttempts
+       }
        if v.ContainerName == "" {
                return errors.New("no container name given")
        }
@@ -486,8 +498,8 @@ func (v *AzureBlobVolume) IndexTo(prefix string, writer io.Writer) error {
                Prefix:  prefix,
                Include: &storage.IncludeBlobDataset{Metadata: true},
        }
-       for {
-               resp, err := v.container.ListBlobs(params)
+       for page := 1; ; page++ {
+               resp, err := v.listBlobs(page, params)
                if err != nil {
                        return err
                }
@@ -517,6 +529,22 @@ func (v *AzureBlobVolume) IndexTo(prefix string, writer io.Writer) error {
        }
 }
 
+// listBlobs calls v.container.ListBlobs, retrying after a delay of
+// v.ListBlobsRetryDelay while the translated error is VolumeBusyError.
+// It makes at least one and at most v.ListBlobsMaxAttempts calls and
+// never sleeps after the last one. page is used only in log messages.
+func (v *AzureBlobVolume) listBlobs(page int, params storage.ListBlobsParameters) (resp storage.BlobListResponse, err error) {
+       for i := 1; ; i++ {
+               resp, err = v.container.ListBlobs(params)
+               err = v.translateError(err)
+               if err != VolumeBusyError || i >= v.ListBlobsMaxAttempts {
+                       return resp, err
+               }
+               log.Printf("ListBlobs: will retry page %d in %s after error: %s", page, v.ListBlobsRetryDelay, err)
+               time.Sleep(time.Duration(v.ListBlobsRetryDelay))
+       }
+}
+
 // Trash a Keep block.
 func (v *AzureBlobVolume) Trash(loc string) error {
        if v.ReadOnly {
@@ -674,8 +702,8 @@ func (v *AzureBlobVolume) EmptyTrash() {
        }
 
        params := storage.ListBlobsParameters{Include: &storage.IncludeBlobDataset{Metadata: true}}
-       for {
-               resp, err := v.container.ListBlobs(params)
+       for page := 1; ; page++ {
+               resp, err := v.listBlobs(page, params)
                if err != nil {
                        log.Printf("EmptyTrash: ListBlobs: %v", err)
                        break
index cfad7577c59d850d25e9f2281a4ad374a60295af..8d02def1445c3f0d7f6ed5806c4c226b75e41644 100644 (file)
@@ -27,6 +27,7 @@ import (
        "testing"
        "time"
 
+       "git.curoverse.com/arvados.git/sdk/go/arvados"
        "github.com/Azure/azure-sdk-for-go/storage"
        "github.com/ghodss/yaml"
        "github.com/prometheus/client_golang/prometheus"
@@ -65,8 +66,9 @@ type azBlob struct {
 
 type azStubHandler struct {
        sync.Mutex
-       blobs map[string]*azBlob
-       race  chan chan struct{}
+       blobs      map[string]*azBlob
+       race       chan chan struct{}
+       didlist503 bool
 }
 
 func newAzStubHandler() *azStubHandler {
@@ -281,6 +283,11 @@ func (h *azStubHandler) ServeHTTP(rw http.ResponseWriter, r *http.Request) {
                rw.WriteHeader(http.StatusAccepted)
        case r.Method == "GET" && r.Form.Get("comp") == "list" && r.Form.Get("restype") == "container":
                // "List Blobs" API
+               if !h.didlist503 {
+                       h.didlist503 = true
+                       rw.WriteHeader(http.StatusServiceUnavailable)
+                       return
+               }
                prefix := container + "|" + r.Form.Get("prefix")
                marker := r.Form.Get("marker")
 
@@ -388,14 +395,17 @@ func NewTestableAzureBlobVolume(t TB, readonly bool, replication int) *TestableA
                        t.Fatal(err)
                }
        }
+       azClient.Sender = &singleSender{}
 
        bs := azClient.GetBlobService()
        v := &AzureBlobVolume{
-               ContainerName:    container,
-               ReadOnly:         readonly,
-               AzureReplication: replication,
-               azClient:         azClient,
-               container:        &azureContainer{ctr: bs.GetContainerReference(container)},
+               ContainerName:        container,
+               ReadOnly:             readonly,
+               AzureReplication:     replication,
+               ListBlobsMaxAttempts: 2,
+               ListBlobsRetryDelay:  arvados.Duration(time.Millisecond),
+               azClient:             azClient,
+               container:            &azureContainer{ctr: bs.GetContainerReference(container)},
        }
 
        return &TestableAzureBlobVolume{