Merge remote-tracking branch 'origin/master' into 14645-fuse-operations-reporting
[arvados.git] / services / keepstore / handlers.go
index e4f025d6b1d71cd0e490da6f5c6525b97273874a..51dd73a513c1d4c729a6743aaabe0cefa1202c4b 100644 (file)
@@ -32,17 +32,16 @@ type router struct {
        limiter     httpserver.RequestCounter
        cluster     *arvados.Cluster
        remoteProxy remoteProxy
-       registry    *prometheus.Registry
-       metrics     nodeMetrics
+       metrics     *nodeMetrics
 }
 
 // MakeRESTRouter returns a new router that forwards all Keep requests
 // to the appropriate handlers.
-func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
+func MakeRESTRouter(cluster *arvados.Cluster, reg *prometheus.Registry) http.Handler {
        rtr := &router{
-               Router:   mux.NewRouter(),
-               cluster:  cluster,
-               registry: prometheus.NewRegistry(),
+               Router:  mux.NewRouter(),
+               cluster: cluster,
+               metrics: &nodeMetrics{reg: reg},
        }
 
        rtr.HandleFunc(
@@ -89,13 +88,12 @@ func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
        rtr.NotFoundHandler = http.HandlerFunc(BadRequestHandler)
 
        rtr.limiter = httpserver.NewRequestLimiter(theConfig.MaxRequests, rtr)
-       rtr.metrics = nodeMetrics{
-               reg: rtr.registry,
-               rc:  rtr.limiter,
-       }
-       rtr.metrics.setup()
+       rtr.metrics.setupBufferPoolMetrics(bufs)
+       rtr.metrics.setupWorkQueueMetrics(pullq, "pull")
+       rtr.metrics.setupWorkQueueMetrics(trashq, "trash")
+       rtr.metrics.setupRequestMetrics(rtr.limiter)
 
-       instrumented := httpserver.Instrument(rtr.registry, nil,
+       instrumented := httpserver.Instrument(rtr.metrics.reg, nil,
                httpserver.AddRequestIDs(httpserver.LogRequests(nil, rtr.limiter)))
        return instrumented.ServeAPI(theConfig.ManagementToken, instrumented)
 }
@@ -677,6 +675,11 @@ func GetBlock(ctx context.Context, hash string, buf []byte, resp http.ResponseWr
                        if !os.IsNotExist(err) {
                                log.Printf("%s: Get(%s): %s", vol, hash, err)
                        }
+                       // If some volume returns a transient error, return it to the caller
+                       // instead of "Not found" so it can retry.
+                       if err == VolumeBusyError {
+                               errorToCaller = err.(*KeepError)
+                       }
                        continue
                }
                // Check the file checksum.