14399: Don't insert error text in the middle of an index response.
[arvados.git] / services / keepstore / handlers.go
index e079b96784a16b985ed6ce47f99655e39a571ce9..9a4d02df850fab836cdafaa4e21abb070b492782 100644 (file)
@@ -20,11 +20,11 @@ import (
        "sync"
        "time"
 
-       "github.com/gorilla/mux"
-
        "git.curoverse.com/arvados.git/sdk/go/arvados"
        "git.curoverse.com/arvados.git/sdk/go/health"
        "git.curoverse.com/arvados.git/sdk/go/httpserver"
+       "github.com/gorilla/mux"
+       "github.com/prometheus/client_golang/prometheus"
 )
 
 type router struct {
@@ -32,14 +32,16 @@ type router struct {
        limiter     httpserver.RequestCounter
        cluster     *arvados.Cluster
        remoteProxy remoteProxy
+       metrics     *nodeMetrics
 }
 
 // MakeRESTRouter returns a new router that forwards all Keep requests
 // to the appropriate handlers.
-func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
+func MakeRESTRouter(cluster *arvados.Cluster, reg *prometheus.Registry) http.Handler {
        rtr := &router{
                Router:  mux.NewRouter(),
                cluster: cluster,
+               metrics: &nodeMetrics{reg: reg},
        }
 
        rtr.HandleFunc(
@@ -86,8 +88,12 @@ func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
        rtr.NotFoundHandler = http.HandlerFunc(BadRequestHandler)
 
        rtr.limiter = httpserver.NewRequestLimiter(theConfig.MaxRequests, rtr)
+       rtr.metrics.setupBufferPoolMetrics(bufs)
+       rtr.metrics.setupWorkQueueMetrics(pullq, "pull")
+       rtr.metrics.setupWorkQueueMetrics(trashq, "trash")
+       rtr.metrics.setupRequestMetrics(rtr.limiter)
 
-       instrumented := httpserver.Instrument(nil, nil,
+       instrumented := httpserver.Instrument(rtr.metrics.reg, nil,
                httpserver.AddRequestIDs(httpserver.LogRequests(nil, rtr.limiter)))
        return instrumented.ServeAPI(theConfig.ManagementToken, instrumented)
 }
@@ -271,13 +277,19 @@ func (rtr *router) IndexHandler(resp http.ResponseWriter, req *http.Request) {
 
        for _, v := range vols {
                if err := v.IndexTo(prefix, resp); err != nil {
-                       // The only errors returned by IndexTo are
-                       // write errors returned by resp.Write(),
-                       // which probably means the client has
-                       // disconnected and this error will never be
-                       // reported to the client -- but it will
-                       // appear in our own error log.
-                       http.Error(resp, err.Error(), http.StatusInternalServerError)
+                       // We can't send an error message to the
+                       // client because we might have already sent
+                       // headers and index content. All we can do is
+                       // log the error in our own logs, and (in
+                       // cases where headers haven't been sent yet)
+                       // set a 500 status.
+                       //
+                       // If headers have already been sent, the
+                       // client must notice the lack of trailing
+                       // newline as an indication that the response
+                       // is incomplete.
+                       log.Printf("index error from volume %s: %s", v, err)
+                       http.Error(resp, "", http.StatusInternalServerError)
                        return
                }
        }
@@ -669,6 +681,11 @@ func GetBlock(ctx context.Context, hash string, buf []byte, resp http.ResponseWr
                        if !os.IsNotExist(err) {
                                log.Printf("%s: Get(%s): %s", vol, hash, err)
                        }
+                       // If some volume returns a transient error, return it to the caller
+                       // instead of "Not found" so it can retry.
+                       if err == VolumeBusyError {
+                               errorToCaller = err.(*KeepError)
+                       }
                        continue
                }
                // Check the file checksum.