X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/f4ca9ad94a6bb006d1f3c7ba207837f1736d1247..e56ae6aad06c37d5512537047871d7363dd97620:/sdk/go/keepclient/support.go diff --git a/sdk/go/keepclient/support.go b/sdk/go/keepclient/support.go index b12f512507..33ba8720bc 100644 --- a/sdk/go/keepclient/support.go +++ b/sdk/go/keepclient/support.go @@ -4,14 +4,18 @@ import ( "crypto/md5" "errors" "fmt" - "git.curoverse.com/arvados.git/sdk/go/streamer" "io" "io/ioutil" + "log" "math/rand" "net" "net/http" + "os" + "regexp" "strings" "time" + + "git.curoverse.com/arvados.git/sdk/go/streamer" ) // Function used to emit debug messages. The easiest way to enable @@ -19,6 +23,13 @@ import ( // log.Printf to DebugPrintf. var DebugPrintf = func(string, ...interface{}) {} +func init() { + var matchTrue = regexp.MustCompile("^(?i:1|yes|true)$") + if matchTrue.MatchString(os.Getenv("ARVADOS_DEBUG")) { + DebugPrintf = log.Printf + } +} + type keepService struct { Uuid string `json:"uuid"` Hostname string `json:"service_host"` @@ -35,49 +46,45 @@ func Md5String(s string) string { // Set timeouts applicable when connecting to non-disk services // (assumed to be over the Internet). -func (this *KeepClient) setClientSettingsNonDisk() { - if this.Client.Timeout == 0 { - // Maximum time to wait for a complete response - this.Client.Timeout = 300 * time.Second - - // TCP and TLS connection settings - this.Client.Transport = &http.Transport{ - Dial: (&net.Dialer{ - // The maximum time to wait to set up - // the initial TCP connection. - Timeout: 30 * time.Second, - - // The TCP keep alive heartbeat - // interval. - KeepAlive: 120 * time.Second, - }).Dial, - - TLSHandshakeTimeout: 10 * time.Second, - } +func (*KeepClient) setClientSettingsNonDisk(client *http.Client) { + // Maximum time to wait for a complete response + client.Timeout = 300 * time.Second + + // TCP and TLS connection settings + client.Transport = &http.Transport{ + Dial: (&net.Dialer{ + // The maximum time to wait to set up + // the initial TCP connection. + Timeout: 30 * time.Second, + + // The TCP keep alive heartbeat + // interval. + KeepAlive: 120 * time.Second, + }).Dial, + + TLSHandshakeTimeout: 10 * time.Second, } } // Set timeouts applicable when connecting to keepstore services directly // (assumed to be on the local network). -func (this *KeepClient) setClientSettingsDisk() { - if this.Client.Timeout == 0 { - // Maximum time to wait for a complete response - this.Client.Timeout = 20 * time.Second - - // TCP and TLS connection timeouts - this.Client.Transport = &http.Transport{ - Dial: (&net.Dialer{ - // The maximum time to wait to set up - // the initial TCP connection. - Timeout: 2 * time.Second, - - // The TCP keep alive heartbeat - // interval. - KeepAlive: 180 * time.Second, - }).Dial, - - TLSHandshakeTimeout: 4 * time.Second, - } +func (*KeepClient) setClientSettingsDisk(client *http.Client) { + // Maximum time to wait for a complete response + client.Timeout = 20 * time.Second + + // TCP and TLS connection timeouts + client.Transport = &http.Transport{ + Dial: (&net.Dialer{ + // The maximum time to wait to set up + // the initial TCP connection. + Timeout: 2 * time.Second, + + // The TCP keep alive heartbeat + // interval. + KeepAlive: 180 * time.Second, + }).Dial, + + TLSHandshakeTimeout: 4 * time.Second, } } @@ -147,6 +154,9 @@ func (this *KeepClient) uploadToKeepServer(host string, hash string, body io.Rea DebugPrintf("DEBUG: [%08x] Upload %v success", requestID, url) upload_status <- uploadStatus{nil, url, resp.StatusCode, rep, response} } else { + if resp.StatusCode >= 300 && response == "" { + response = resp.Status + } DebugPrintf("DEBUG: [%08x] Upload %v error: %v response: %v", requestID, url, resp.StatusCode, response) upload_status <- uploadStatus{errors.New(resp.Status), url, resp.StatusCode, rep, response} } @@ -184,24 +194,26 @@ func (this *KeepClient) putReplicas( }() }() - // Desired number of replicas - remaining_replicas := this.Want_replicas + replicasDone := 0 + replicasTodo := this.Want_replicas replicasPerThread := this.replicasPerService if replicasPerThread < 1 { // unlimited or unknown - replicasPerThread = remaining_replicas + replicasPerThread = replicasTodo } retriesRemaining := 1 + this.Retries var retryServers []string + lastError := make(map[string]string) + for retriesRemaining > 0 { retriesRemaining -= 1 next_server = 0 retryServers = []string{} - for remaining_replicas > 0 { - for active*replicasPerThread < remaining_replicas { + for replicasTodo > 0 { + for active*replicasPerThread < replicasTodo { // Start some upload requests if next_server < len(sv) { DebugPrintf("DEBUG: [%08x] Begin upload %s to %s", requestID, hash, sv[next_server]) @@ -210,14 +222,19 @@ func (this *KeepClient) putReplicas( active += 1 } else { if active == 0 && retriesRemaining == 0 { - return locator, (this.Want_replicas - remaining_replicas), InsufficientReplicasError + msg := "Could not write sufficient replicas: " + for _, resp := range lastError { + msg += resp + "; " + } + msg = msg[:len(msg)-2] + return locator, replicasDone, InsufficientReplicasError(errors.New(msg)) } else { break } } } DebugPrintf("DEBUG: [%08x] Replicas remaining to write: %v active uploads: %v", - requestID, remaining_replicas, active) + requestID, replicasTodo, active) // Now wait for something to happen. if active > 0 { @@ -226,9 +243,19 @@ func (this *KeepClient) putReplicas( if status.statusCode == 200 { // good news! - remaining_replicas -= status.replicas_stored + replicasDone += status.replicas_stored + replicasTodo -= status.replicas_stored locator = status.response - } else if status.statusCode == 0 || status.statusCode == 408 || status.statusCode == 429 || + delete(lastError, status.url) + } else { + msg := fmt.Sprintf("[%d] %s", status.statusCode, status.response) + if len(msg) > 100 { + msg = msg[:100] + } + lastError[status.url] = msg + } + + if status.statusCode == 0 || status.statusCode == 408 || status.statusCode == 429 || (status.statusCode >= 500 && status.statusCode != 503) { // Timeout, too many requests, or other server side failure // Do not retry when status code is 503, which means the keep server is full @@ -242,5 +269,5 @@ func (this *KeepClient) putReplicas( sv = retryServers } - return locator, this.Want_replicas, nil + return locator, replicasDone, nil }