Merge branch '5353-node-sizes' closes #5353
[arvados.git] / sdk / go / keepclient / support.go
index 4cc644749a391689814e582d7dd5b8624dc9f608..90ac3bcb17dc12913abe6b5a130c0c44d1efb6fa 100644 (file)
@@ -8,13 +8,19 @@ import (
        "io"
        "io/ioutil"
        "log"
+       "math/rand"
        "net"
        "net/http"
        "strings"
        "time"
 )
 
-type keepServices struct {
+// Function used to emit debug messages. The easiest way to enable
+// keepclient debug messages in your application is to assign
+// log.Printf to DebugPrintf.
+var DebugPrintf = func(string, ...interface{}) {}
+
+type keepService struct {
        Uuid     string `json:"uuid"`
        Hostname string `json:"service_host"`
        Port     int    `json:"service_port"`
@@ -76,71 +82,8 @@ func (this *KeepClient) setClientSettingsDisk() {
        }
 }
 
-// DiscoverKeepServers gets list of available keep services from api server
-func (this *KeepClient) DiscoverKeepServers() error {
-       type svcList struct {
-               Items []keepServices `json:"items"`
-       }
-       var m svcList
-
-       // Get keep services from api server
-       err := this.Arvados.Call("GET", "keep_services", "", "accessible", nil, &m)
-
-       // If there is error getting keep services, get list of keep disks
-       if err != nil {
-               if err := this.Arvados.List("keep_disks", nil, &m); err != nil {
-                       return err
-               }
-       }
-
-       listed := make(map[string]bool)
-       localRoots := make(map[string]string)
-       gatewayRoots := make(map[string]string)
-       writableLocalRoots := make(map[string]string)
-
-       this.replicasPerService = 1 // set to 1 until writable non-disk services are found
-
-       for _, service := range m.Items {
-               scheme := "http"
-               if service.SSL {
-                       scheme = "https"
-               }
-               url := fmt.Sprintf("%s://%s:%d", scheme, service.Hostname, service.Port)
-
-               // Skip duplicates
-               if listed[url] {
-                       continue
-               }
-               listed[url] = true
-
-               localRoots[service.Uuid] = url
-               if service.SvcType == "proxy" {
-                       this.Using_proxy = true
-               }
-
-               if service.ReadOnly == false {
-                       writableLocalRoots[service.Uuid] = url
-                       if service.SvcType != "disk" {
-                               this.replicasPerService = 0
-                       }
-               }
-
-               // Gateway services are only used when specified by
-               // UUID, so there's nothing to gain by filtering them
-               // by service type. Including all accessible services
-               // (gateway and otherwise) merely accommodates more
-               // service configurations.
-               gatewayRoots[service.Uuid] = url
-       }
-
-       if this.Using_proxy {
-               this.setClientSettingsProxy()
-       } else {
-               this.setClientSettingsDisk()
-       }
-
-       this.SetServiceRoots(localRoots, writableLocalRoots, gatewayRoots)
-       return nil
+type svcList struct {
+       Items []keepService `json:"items"`
 }
 
 type uploadStatus struct {
@@ -151,14 +94,14 @@ type uploadStatus struct {
        response        string
 }
 
-func (this KeepClient) uploadToKeepServer(host string, hash string, body io.ReadCloser,
-       upload_status chan<- uploadStatus, expectedLength int64, requestId string) {
+func (this *KeepClient) uploadToKeepServer(host string, hash string, body io.ReadCloser,
+       upload_status chan<- uploadStatus, expectedLength int64, requestID int32) {
 
        var req *http.Request
        var err error
        var url = fmt.Sprintf("%s/%s", host, hash)
        if req, err = http.NewRequest("PUT", url, nil); err != nil {
-               log.Printf("[%v] Error creating request PUT %v error: %v", requestId, url, err.Error())
+               log.Printf("[%08x] Error creating request PUT %v error: %v", requestID, url, err.Error())
                upload_status <- uploadStatus{err, url, 0, 0, ""}
                body.Close()
                return
@@ -183,7 +126,7 @@ func (this KeepClient) uploadToKeepServer(host string, hash string, body io.Read
 
        var resp *http.Response
        if resp, err = this.Client.Do(req); err != nil {
-               log.Printf("[%v] Upload failed %v error: %v", requestId, url, err.Error())
+               log.Printf("[%08x] Upload failed %v error: %v", requestID, url, err.Error())
                upload_status <- uploadStatus{err, url, 0, 0, ""}
                return
        }
@@ -196,28 +139,28 @@ func (this KeepClient) uploadToKeepServer(host string, hash string, body io.Read
        defer resp.Body.Close()
        defer io.Copy(ioutil.Discard, resp.Body)
 
-       respbody, err2 := ioutil.ReadAll(&io.LimitedReader{resp.Body, 4096})
+       respbody, err2 := ioutil.ReadAll(&io.LimitedReader{R: resp.Body, N: 4096})
        response := strings.TrimSpace(string(respbody))
        if err2 != nil && err2 != io.EOF {
-               log.Printf("[%v] Upload %v error: %v response: %v", requestId, url, err2.Error(), response)
+               log.Printf("[%08x] Upload %v error: %v response: %v", requestID, url, err2.Error(), response)
                upload_status <- uploadStatus{err2, url, resp.StatusCode, rep, response}
        } else if resp.StatusCode == http.StatusOK {
-               log.Printf("[%v] Upload %v success", requestId, url)
+               log.Printf("[%08x] Upload %v success", requestID, url)
                upload_status <- uploadStatus{nil, url, resp.StatusCode, rep, response}
        } else {
-               log.Printf("[%v] Upload %v error: %v response: %v", requestId, url, resp.StatusCode, response)
+               log.Printf("[%08x] Upload %v error: %v response: %v", requestID, url, resp.StatusCode, response)
                upload_status <- uploadStatus{errors.New(resp.Status), url, resp.StatusCode, rep, response}
        }
 }
 
-func (this KeepClient) putReplicas(
+func (this *KeepClient) putReplicas(
        hash string,
        tr *streamer.AsyncStream,
        expectedLength int64) (locator string, replicas int, err error) {
 
-       // Take the hash of locator and timestamp in order to identify this
-       // specific transaction in log statements.
-       requestId := fmt.Sprintf("%x", md5.Sum([]byte(locator+time.Now().String())))[0:8]
+       // Generate an arbitrary ID to identify this specific
+       // transaction in debug logs.
+       requestID := rand.Int31()
 
        // Calculate the ordering for uploading to servers
        sv := NewRootSorter(this.WritableLocalRoots(), hash).GetSortedRoots()
@@ -230,45 +173,74 @@ func (this KeepClient) putReplicas(
 
        // Used to communicate status from the upload goroutines
        upload_status := make(chan uploadStatus)
-       defer close(upload_status)
+       defer func() {
+               // Wait for any abandoned uploads (e.g., we started
+               // two uploads and the first replied with replicas=2)
+               // to finish before closing the status channel.
+               go func() {
+                       for active > 0 {
+                               <-upload_status
+                       }
+                       close(upload_status)
+               }()
+       }()
 
        // Desired number of replicas
        remaining_replicas := this.Want_replicas
 
-       for remaining_replicas > 0 {
-               replicasPerThread := this.replicasPerService
-               if replicasPerThread < 1 {
-                       // unlimited or unknown
-                       replicasPerThread = remaining_replicas
-               }
+       replicasPerThread := this.replicasPerService
+       if replicasPerThread < 1 {
+               // unlimited or unknown
+               replicasPerThread = remaining_replicas
+       }
 
-               for active*replicasPerThread < remaining_replicas {
-                       // Start some upload requests
-                       if next_server < len(sv) {
-                               log.Printf("[%v] Begin upload %s to %s", requestId, hash, sv[next_server])
-                               go this.uploadToKeepServer(sv[next_server], hash, tr.MakeStreamReader(), upload_status, expectedLength, requestId)
-                               next_server += 1
-                               active += 1
-                       } else {
-                               if active == 0 {
-                                       return locator, (this.Want_replicas - remaining_replicas), InsufficientReplicasError
+       retriesRemaining := 1 + this.Retries
+       var retryServers []string
+
+       for retriesRemaining > 0 {
+               retriesRemaining -= 1
+               next_server = 0
+               retryServers = []string{}
+               for remaining_replicas > 0 {
+                       for active*replicasPerThread < remaining_replicas {
+                               // Start some upload requests
+                               if next_server < len(sv) {
+                                       log.Printf("[%08x] Begin upload %s to %s", requestID, hash, sv[next_server])
+                                       go this.uploadToKeepServer(sv[next_server], hash, tr.MakeStreamReader(), upload_status, expectedLength, requestID)
+                                       next_server += 1
+                                       active += 1
                                } else {
-                                       break
+                                       if active == 0 && retriesRemaining == 0 {
+                                               return locator, (this.Want_replicas - remaining_replicas), InsufficientReplicasError
+                                       } else {
+                                               break
+                                       }
+                               }
+                       }
+                       log.Printf("[%08x] Replicas remaining to write: %v active uploads: %v",
+                               requestID, remaining_replicas, active)
+
+                       // Now wait for something to happen.
+                       if active > 0 {
+                               status := <-upload_status
+                               active -= 1
+
+                               if status.statusCode == 200 {
+                                       // good news!
+                                       remaining_replicas -= status.replicas_stored
+                                       locator = status.response
+                               } else if status.statusCode == 0 || status.statusCode == 408 || status.statusCode == 429 ||
+                                       (status.statusCode >= 500 && status.statusCode != 503) {
+                                       // Timeout, too many requests, or other server side failure
+                                       // Do not retry when status code is 503, which means the keep server is full
+                                       retryServers = append(retryServers, status.url[0:strings.LastIndex(status.url, "/")])
                                }
+                       } else {
+                               break
                        }
                }
-               log.Printf("[%v] Replicas remaining to write: %v active uploads: %v",
-                       requestId, remaining_replicas, active)
 
-               // Now wait for something to happen.
-               status := <-upload_status
-               active -= 1
-
-               if status.statusCode == 200 {
-                       // good news!
-                       remaining_replicas -= status.replicas_stored
-                       locator = status.response
-               }
+               sv = retryServers
        }
 
        return locator, this.Want_replicas, nil