7710: improve keepclient to use non-disk timeouts when any non-disk typed keepservice...
[arvados.git] / sdk / go / keepclient / support.go
1 package keepclient
2
3 import (
4         "crypto/md5"
5         "errors"
6         "fmt"
7         "git.curoverse.com/arvados.git/sdk/go/streamer"
8         "io"
9         "io/ioutil"
10         "log"
11         "math/rand"
12         "net"
13         "net/http"
14         "strings"
15         "time"
16 )
17
// DebugPrintf is the hook used to emit debug messages. It takes a
// Printf-style format string followed by its arguments and, by
// default, discards them. The easiest way to enable keepclient debug
// messages in your application is to assign log.Printf to DebugPrintf.
var DebugPrintf = func(format string, args ...interface{}) {}
22
// keepService describes a single keep service record. Each field is
// mapped to the JSON key named in its struct tag.
// NOTE(review): presumably these records come from the API server's
// keep service listing -- confirm against the code that decodes them.
type keepService struct {
	Uuid     string `json:"uuid"`             // JSON key "uuid"
	Hostname string `json:"service_host"`     // JSON key "service_host"
	Port     int    `json:"service_port"`     // JSON key "service_port"
	SSL      bool   `json:"service_ssl_flag"` // JSON key "service_ssl_flag"
	SvcType  string `json:"service_type"`     // JSON key "service_type"
	ReadOnly bool   `json:"read_only"`        // JSON key "read_only"
}
31
// Md5String returns the MD5 digest of the bytes of s, formatted as a
// lowercase hexadecimal string.
func Md5String(s string) string {
	digest := md5.Sum([]byte(s))
	return fmt.Sprintf("%x", digest)
}
36
37 // Set timeouts applicable when connecting to non-disk services
38 // (assumed to be over the Internet).
39 func (this *KeepClient) setClientSettingsNonDisk() {
40         if this.Client.Timeout == 0 {
41                 // Maximum time to wait for a complete response
42                 this.Client.Timeout = 300 * time.Second
43
44                 // TCP and TLS connection settings
45                 this.Client.Transport = &http.Transport{
46                         Dial: (&net.Dialer{
47                                 // The maximum time to wait to set up
48                                 // the initial TCP connection.
49                                 Timeout: 30 * time.Second,
50
51                                 // The TCP keep alive heartbeat
52                                 // interval.
53                                 KeepAlive: 120 * time.Second,
54                         }).Dial,
55
56                         TLSHandshakeTimeout: 10 * time.Second,
57                 }
58         }
59 }
60
61 // Set timeouts applicable when connecting to keepstore services directly
62 // (assumed to be on the local network).
63 func (this *KeepClient) setClientSettingsDisk() {
64         if this.Client.Timeout == 0 {
65                 // Maximum time to wait for a complete response
66                 this.Client.Timeout = 20 * time.Second
67
68                 // TCP and TLS connection timeouts
69                 this.Client.Transport = &http.Transport{
70                         Dial: (&net.Dialer{
71                                 // The maximum time to wait to set up
72                                 // the initial TCP connection.
73                                 Timeout: 2 * time.Second,
74
75                                 // The TCP keep alive heartbeat
76                                 // interval.
77                                 KeepAlive: 180 * time.Second,
78                         }).Dial,
79
80                         TLSHandshakeTimeout: 4 * time.Second,
81                 }
82         }
83 }
84
// svcList is a list of keepService records carried under the "items"
// JSON key.
type svcList struct {
	Items []keepService `json:"items"`
}
88
// uploadStatus is the outcome of one uploadToKeepServer call, as sent
// on its status channel.
//
// uploadToKeepServer builds these with positional literals, so the
// field order below must not change.
type uploadStatus struct {
	// err is nil for a 200 response; otherwise the request, read or
	// HTTP status error.
	err error
	// url is the full request URL ("<host>/<hash>").
	url string
	// statusCode is the HTTP status code, or 0 when no response was
	// received.
	statusCode int
	// replicas_stored is parsed from the X_Keep_Replicas_Stored
	// response header (1 when the header is absent), or 0 when no
	// response was received.
	replicas_stored int
	// response is the whitespace-trimmed response body, read through
	// a 4096-byte limit.
	response string
}
96
97 func (this *KeepClient) uploadToKeepServer(host string, hash string, body io.ReadCloser,
98         upload_status chan<- uploadStatus, expectedLength int64, requestID int32) {
99
100         var req *http.Request
101         var err error
102         var url = fmt.Sprintf("%s/%s", host, hash)
103         if req, err = http.NewRequest("PUT", url, nil); err != nil {
104                 log.Printf("[%08x] Error creating request PUT %v error: %v", requestID, url, err.Error())
105                 upload_status <- uploadStatus{err, url, 0, 0, ""}
106                 body.Close()
107                 return
108         }
109
110         req.ContentLength = expectedLength
111         if expectedLength > 0 {
112                 // http.Client.Do will close the body ReadCloser when it is
113                 // done with it.
114                 req.Body = body
115         } else {
116                 // "For client requests, a value of 0 means unknown if Body is
117                 // not nil."  In this case we do want the body to be empty, so
118                 // don't set req.Body.  However, we still need to close the
119                 // body ReadCloser.
120                 body.Close()
121         }
122
123         req.Header.Add("Authorization", fmt.Sprintf("OAuth2 %s", this.Arvados.ApiToken))
124         req.Header.Add("Content-Type", "application/octet-stream")
125         req.Header.Add(X_Keep_Desired_Replicas, fmt.Sprint(this.Want_replicas))
126
127         var resp *http.Response
128         if resp, err = this.Client.Do(req); err != nil {
129                 log.Printf("[%08x] Upload failed %v error: %v", requestID, url, err.Error())
130                 upload_status <- uploadStatus{err, url, 0, 0, ""}
131                 return
132         }
133
134         rep := 1
135         if xr := resp.Header.Get(X_Keep_Replicas_Stored); xr != "" {
136                 fmt.Sscanf(xr, "%d", &rep)
137         }
138
139         defer resp.Body.Close()
140         defer io.Copy(ioutil.Discard, resp.Body)
141
142         respbody, err2 := ioutil.ReadAll(&io.LimitedReader{R: resp.Body, N: 4096})
143         response := strings.TrimSpace(string(respbody))
144         if err2 != nil && err2 != io.EOF {
145                 log.Printf("[%08x] Upload %v error: %v response: %v", requestID, url, err2.Error(), response)
146                 upload_status <- uploadStatus{err2, url, resp.StatusCode, rep, response}
147         } else if resp.StatusCode == http.StatusOK {
148                 log.Printf("[%08x] Upload %v success", requestID, url)
149                 upload_status <- uploadStatus{nil, url, resp.StatusCode, rep, response}
150         } else {
151                 log.Printf("[%08x] Upload %v error: %v response: %v", requestID, url, resp.StatusCode, response)
152                 upload_status <- uploadStatus{errors.New(resp.Status), url, resp.StatusCode, rep, response}
153         }
154 }
155
156 func (this *KeepClient) putReplicas(
157         hash string,
158         tr *streamer.AsyncStream,
159         expectedLength int64) (locator string, replicas int, err error) {
160
161         // Generate an arbitrary ID to identify this specific
162         // transaction in debug logs.
163         requestID := rand.Int31()
164
165         // Calculate the ordering for uploading to servers
166         sv := NewRootSorter(this.WritableLocalRoots(), hash).GetSortedRoots()
167
168         // The next server to try contacting
169         next_server := 0
170
171         // The number of active writers
172         active := 0
173
174         // Used to communicate status from the upload goroutines
175         upload_status := make(chan uploadStatus)
176         defer func() {
177                 // Wait for any abandoned uploads (e.g., we started
178                 // two uploads and the first replied with replicas=2)
179                 // to finish before closing the status channel.
180                 go func() {
181                         for active > 0 {
182                                 <-upload_status
183                         }
184                         close(upload_status)
185                 }()
186         }()
187
188         // Desired number of replicas
189         remaining_replicas := this.Want_replicas
190
191         replicasPerThread := this.replicasPerService
192         if replicasPerThread < 1 {
193                 // unlimited or unknown
194                 replicasPerThread = remaining_replicas
195         }
196
197         retriesRemaining := 1 + this.Retries
198         var retryServers []string
199
200         for retriesRemaining > 0 {
201                 retriesRemaining -= 1
202                 next_server = 0
203                 retryServers = []string{}
204                 for remaining_replicas > 0 {
205                         for active*replicasPerThread < remaining_replicas {
206                                 // Start some upload requests
207                                 if next_server < len(sv) {
208                                         log.Printf("[%08x] Begin upload %s to %s", requestID, hash, sv[next_server])
209                                         go this.uploadToKeepServer(sv[next_server], hash, tr.MakeStreamReader(), upload_status, expectedLength, requestID)
210                                         next_server += 1
211                                         active += 1
212                                 } else {
213                                         if active == 0 && retriesRemaining == 0 {
214                                                 return locator, (this.Want_replicas - remaining_replicas), InsufficientReplicasError
215                                         } else {
216                                                 break
217                                         }
218                                 }
219                         }
220                         log.Printf("[%08x] Replicas remaining to write: %v active uploads: %v",
221                                 requestID, remaining_replicas, active)
222
223                         // Now wait for something to happen.
224                         if active > 0 {
225                                 status := <-upload_status
226                                 active -= 1
227
228                                 if status.statusCode == 200 {
229                                         // good news!
230                                         remaining_replicas -= status.replicas_stored
231                                         locator = status.response
232                                 } else if status.statusCode == 0 || status.statusCode == 408 || status.statusCode == 429 ||
233                                         (status.statusCode >= 500 && status.statusCode != 503) {
234                                         // Timeout, too many requests, or other server side failure
235                                         // Do not retry when status code is 503, which means the keep server is full
236                                         retryServers = append(retryServers, status.url[0:strings.LastIndex(status.url, "/")])
237                                 }
238                         } else {
239                                 break
240                         }
241                 }
242
243                 sv = retryServers
244         }
245
246         return locator, this.Want_replicas, nil
247 }