13198: Add cache stats to metrics.
authorTom Clegg <tclegg@veritasgenetics.com>
Mon, 30 Jul 2018 16:03:07 +0000 (12:03 -0400)
committerTom Clegg <tclegg@veritasgenetics.com>
Mon, 30 Jul 2018 19:54:04 +0000 (15:54 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

build/run-tests.sh
sdk/go/httpserver/metrics.go
services/keep-web/cache.go
services/keep-web/server.go
services/keep-web/server_test.go
services/keepstore/handlers.go

index 636c0306ca94a7948b1e4a63302ae467ca7aea37..e669e326c512b8d29694c05106ad26d05507a4fa 100755 (executable)
@@ -732,7 +732,7 @@ do_test_once() {
 do_install() {
   skipit=false
 
-  if [[ -z "${only_install}" || "${only_install}" == "${1}" ]]; then
+  if [[ -z "${only_install}" || "${only_install}" == "${1}" || "${only_install}" == "${2}" ]]; then
       retry do_install_once ${@}
   else
       skipit=true
index 77525a80f12ce269132cc932cd83e343b73d788b..b52068e9571d8518a5eb5afee1267e9470821617 100644 (file)
@@ -99,10 +99,17 @@ func (m *metrics) ServeAPI(next http.Handler) http.Handler {
 // For the metrics to be accurate, the caller must ensure every
 // request passed to the Handler also passes through
 // LogRequests(logger, ...), and vice versa.
-func Instrument(logger *logrus.Logger, next http.Handler) Handler {
+//
+// If registry is nil, a new registry is created.
+//
+// If logger is nil, logrus.StandardLogger() is used.
+func Instrument(registry *prometheus.Registry, logger *logrus.Logger, next http.Handler) Handler {
        if logger == nil {
                logger = logrus.StandardLogger()
        }
+       if registry == nil {
+               registry = prometheus.NewRegistry()
+       }
        reqDuration := prometheus.NewSummaryVec(prometheus.SummaryOpts{
                Name: "request_duration_seconds",
                Help: "Summary of request duration.",
@@ -111,7 +118,6 @@ func Instrument(logger *logrus.Logger, next http.Handler) Handler {
                Name: "time_to_status_seconds",
                Help: "Summary of request TTFB.",
        }, []string{"code", "method"})
-       registry := prometheus.NewRegistry()
        registry.MustRegister(timeToStatus)
        registry.MustRegister(reqDuration)
        m := &metrics{
index 59e8de3bc9f884dec899e22072c3afe684aceb1a..b2bab78216eaf921a4710a5037d9af2231df31c5 100644 (file)
@@ -12,8 +12,11 @@ import (
        "git.curoverse.com/arvados.git/sdk/go/arvados"
        "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
        "github.com/hashicorp/golang-lru"
+       "github.com/prometheus/client_golang/prometheus"
 )
 
+const metricsUpdateInterval = time.Second / 10
+
 type cache struct {
        TTL                  arvados.Duration
        UUIDTTL              arvados.Duration
@@ -22,13 +25,16 @@ type cache struct {
        MaxPermissionEntries int
        MaxUUIDEntries       int
 
+       registry    *prometheus.Registry
        stats       cacheStats
+       metrics     cacheMetrics
        pdhs        *lru.TwoQueueCache
        collections *lru.TwoQueueCache
        permissions *lru.TwoQueueCache
        setupOnce   sync.Once
 }
 
+// cacheStats is EOL - add new metrics to cacheMetrics instead
 type cacheStats struct {
        Requests          uint64 `json:"Cache.Requests"`
        CollectionBytes   uint64 `json:"Cache.CollectionBytes"`
@@ -39,6 +45,68 @@ type cacheStats struct {
        APICalls          uint64 `json:"Cache.APICalls"`
 }
 
+type cacheMetrics struct {
+       requests          prometheus.Counter
+       collectionBytes   prometheus.Gauge
+       collectionEntries prometheus.Gauge
+       collectionHits    prometheus.Counter
+       pdhHits           prometheus.Counter
+       permissionHits    prometheus.Counter
+       apiCalls          prometheus.Counter
+}
+
+func (m *cacheMetrics) setup(reg *prometheus.Registry) {
+       m.requests = prometheus.NewCounter(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "keepweb_collectioncache",
+               Name:      "requests",
+               Help:      "Number of targetID-to-manifest lookups handled.",
+       })
+       reg.MustRegister(m.requests)
+       m.collectionHits = prometheus.NewCounter(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "keepweb_collectioncache",
+               Name:      "hits",
+               Help:      "Number of pdh-to-manifest cache hits.",
+       })
+       reg.MustRegister(m.collectionHits)
+       m.pdhHits = prometheus.NewCounter(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "keepweb_collectioncache",
+               Name:      "pdh_hits",
+               Help:      "Number of uuid-to-pdh cache hits.",
+       })
+       reg.MustRegister(m.pdhHits)
+       m.permissionHits = prometheus.NewCounter(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "keepweb_collectioncache",
+               Name:      "permission_hits",
+               Help:      "Number of targetID-to-permission cache hits.",
+       })
+       reg.MustRegister(m.permissionHits)
+       m.apiCalls = prometheus.NewCounter(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "keepweb_collectioncache",
+               Name:      "api_calls",
+               Help:      "Number of outgoing API calls made by cache.",
+       })
+       reg.MustRegister(m.apiCalls)
+       m.collectionBytes = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "keepweb_collectioncache",
+               Name:      "cached_manifest_bytes",
+               Help:      "Total size of all manifests in cache.",
+       })
+       reg.MustRegister(m.collectionBytes)
+       m.collectionEntries = prometheus.NewGauge(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "keepweb_collectioncache",
+               Name:      "cached_manifests",
+               Help:      "Number of manifests in cache.",
+       })
+       reg.MustRegister(m.collectionEntries)
+}
+
 type cachedPDH struct {
        expire time.Time
        pdh    string
@@ -67,6 +135,22 @@ func (c *cache) setup() {
        if err != nil {
                panic(err)
        }
+
+       reg := c.registry
+       if reg == nil {
+               reg = prometheus.NewRegistry()
+       }
+       c.metrics.setup(reg)
+       go func() {
+               for range time.Tick(metricsUpdateInterval) {
+                       c.updateGauges()
+               }
+       }()
+}
+
+func (c *cache) updateGauges() {
+       c.metrics.collectionBytes.Set(float64(c.collectionBytes()))
+       c.metrics.collectionEntries.Set(float64(c.collections.Len()))
 }
 
 var selectPDH = map[string]interface{}{
@@ -113,6 +197,7 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
        c.setupOnce.Do(c.setup)
 
        atomic.AddUint64(&c.stats.Requests, 1)
+       c.metrics.requests.Inc()
 
        permOK := false
        permKey := arv.ApiToken + "\000" + targetID
@@ -124,6 +209,7 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
                } else {
                        permOK = true
                        atomic.AddUint64(&c.stats.PermissionHits, 1)
+                       c.metrics.permissionHits.Inc()
                }
        }
 
@@ -137,6 +223,7 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
                } else {
                        pdh = ent.pdh
                        atomic.AddUint64(&c.stats.PDHHits, 1)
+                       c.metrics.pdhHits.Inc()
                }
        }
 
@@ -153,6 +240,7 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
                // _and_ the current token has permission, we can
                // use our cached manifest.
                atomic.AddUint64(&c.stats.APICalls, 1)
+               c.metrics.apiCalls.Inc()
                var current arvados.Collection
                err := arv.Get("collections", targetID, selectPDH, &current)
                if err != nil {
@@ -181,6 +269,7 @@ func (c *cache) Get(arv *arvadosclient.ArvadosClient, targetID string, forceRelo
 
        // Collection manifest is not cached.
        atomic.AddUint64(&c.stats.APICalls, 1)
+       c.metrics.apiCalls.Inc()
        err := arv.Get("collections", targetID, nil, &collection)
        if err != nil {
                return nil, err
@@ -261,16 +350,16 @@ func (c *cache) collectionBytes() uint64 {
 }
 
 func (c *cache) lookupCollection(key string) *arvados.Collection {
-       if ent, cached := c.collections.Get(key); !cached {
+       e, cached := c.collections.Get(key)
+       if !cached {
+               return nil
+       }
+       ent := e.(*cachedCollection)
+       if ent.expire.Before(time.Now()) {
+               c.collections.Remove(key)
                return nil
-       } else {
-               ent := ent.(*cachedCollection)
-               if ent.expire.Before(time.Now()) {
-                       c.collections.Remove(key)
-                       return nil
-               } else {
-                       atomic.AddUint64(&c.stats.CollectionHits, 1)
-                       return ent.collection
-               }
        }
+       atomic.AddUint64(&c.stats.CollectionHits, 1)
+       c.metrics.collectionHits.Inc()
+       return ent.collection
 }
index 58ec348c882b88e6c92116dca947bd24e62c5877..68ff8a7b013c2d685299eae2dc7c7da1d84f5606 100644 (file)
@@ -8,6 +8,7 @@ import (
        "net/http"
 
        "git.curoverse.com/arvados.git/sdk/go/httpserver"
+       "github.com/prometheus/client_golang/prometheus"
 )
 
 type server struct {
@@ -17,7 +18,9 @@ type server struct {
 
 func (srv *server) Start() error {
        h := &handler{Config: srv.Config}
-       mh := httpserver.Instrument(nil, httpserver.AddRequestIDs(httpserver.LogRequests(nil, h)))
+       reg := prometheus.NewRegistry()
+       h.Config.Cache.registry = reg
+       mh := httpserver.Instrument(reg, nil, httpserver.AddRequestIDs(httpserver.LogRequests(nil, h)))
        h.MetricsAPI = mh.ServeAPI(http.NotFoundHandler())
        srv.Handler = mh
        srv.Addr = srv.Config.Listen
index 6688cc2ee743ec53bf4f2ce15fdfcb4621f09253..7e738cb9f3467a63c5da91cbac253429f0dc5cad 100644 (file)
@@ -306,14 +306,19 @@ func (s *IntegrationSuite) TestMetrics(c *check.C) {
        resp, err := http.DefaultClient.Do(req)
        c.Assert(err, check.IsNil)
        c.Check(resp.StatusCode, check.Equals, http.StatusOK)
-       req, _ = http.NewRequest("GET", origin+"/foo", nil)
-       req.Host = arvadostest.FooCollection + ".example.com"
-       req.Header.Set("Authorization", "Bearer "+arvadostest.ActiveToken)
-       resp, err = http.DefaultClient.Do(req)
-       c.Assert(err, check.IsNil)
-       c.Check(resp.StatusCode, check.Equals, http.StatusOK)
-       buf, _ := ioutil.ReadAll(resp.Body)
-       c.Check(buf, check.DeepEquals, []byte("foo"))
+       for i := 0; i < 2; i++ {
+               req, _ = http.NewRequest("GET", origin+"/foo", nil)
+               req.Host = arvadostest.FooCollection + ".example.com"
+               req.Header.Set("Authorization", "Bearer "+arvadostest.ActiveToken)
+               resp, err = http.DefaultClient.Do(req)
+               c.Assert(err, check.IsNil)
+               c.Check(resp.StatusCode, check.Equals, http.StatusOK)
+               buf, _ := ioutil.ReadAll(resp.Body)
+               c.Check(buf, check.DeepEquals, []byte("foo"))
+               resp.Body.Close()
+       }
+
+       s.testServer.Config.Cache.updateGauges()
 
        req, _ = http.NewRequest("GET", origin+"/metrics.json", nil)
        resp, err = http.DefaultClient.Do(req)
@@ -327,6 +332,12 @@ func (s *IntegrationSuite) TestMetrics(c *check.C) {
                        Value    float64
                }
        }
+       type counter struct {
+               Value int64
+       }
+       type gauge struct {
+               Value float64
+       }
        var ents []struct {
                Name   string
                Help   string
@@ -336,24 +347,38 @@ func (s *IntegrationSuite) TestMetrics(c *check.C) {
                                Name  string
                                Value string
                        }
+                       Counter counter
+                       Gauge   gauge
                        Summary summary
                }
        }
        json.NewDecoder(resp.Body).Decode(&ents)
-       flat := map[string]summary{}
+       summaries := map[string]summary{}
+       gauges := map[string]gauge{}
+       counters := map[string]counter{}
        for _, e := range ents {
                for _, m := range e.Metric {
                        labels := map[string]string{}
                        for _, lbl := range m.Label {
                                labels[lbl.Name] = lbl.Value
                        }
-                       flat[e.Name+"/"+labels["method"]+"/"+labels["code"]] = m.Summary
+                       summaries[e.Name+"/"+labels["method"]+"/"+labels["code"]] = m.Summary
+                       counters[e.Name+"/"+labels["method"]+"/"+labels["code"]] = m.Counter
+                       gauges[e.Name+"/"+labels["method"]+"/"+labels["code"]] = m.Gauge
                }
        }
-       c.Check(flat["request_duration_seconds/get/200"].SampleSum, check.Not(check.Equals), 0)
-       c.Check(flat["request_duration_seconds/get/200"].SampleCount, check.Equals, "2")
-       c.Check(flat["request_duration_seconds/get/404"].SampleCount, check.Equals, "1")
-       c.Check(flat["time_to_status_seconds/get/404"].SampleCount, check.Equals, "1")
+       c.Check(summaries["request_duration_seconds/get/200"].SampleSum, check.Not(check.Equals), 0)
+       c.Check(summaries["request_duration_seconds/get/200"].SampleCount, check.Equals, "3")
+       c.Check(summaries["request_duration_seconds/get/404"].SampleCount, check.Equals, "1")
+       c.Check(summaries["time_to_status_seconds/get/404"].SampleCount, check.Equals, "1")
+       c.Check(counters["arvados_keepweb_collectioncache_requests//"].Value, check.Equals, int64(2))
+       c.Check(counters["arvados_keepweb_collectioncache_api_calls//"].Value, check.Equals, int64(1))
+       c.Check(counters["arvados_keepweb_collectioncache_hits//"].Value, check.Equals, int64(1))
+       c.Check(counters["arvados_keepweb_collectioncache_pdh_hits//"].Value, check.Equals, int64(1))
+       c.Check(counters["arvados_keepweb_collectioncache_permission_hits//"].Value, check.Equals, int64(1))
+       c.Check(gauges["arvados_keepweb_collectioncache_cached_manifests//"].Value, check.Equals, float64(1))
+       // FooCollection's cached manifest size is 45 ("1f4b0....+45") plus one 51-byte blob signature
+       c.Check(gauges["arvados_keepweb_collectioncache_cached_manifest_bytes//"].Value, check.Equals, float64(45+51))
 
        // If the Host header indicates a collection, /metrics.json
        // refers to a file in the collection -- the metrics handler
index d19be61e9ade77958b376450169f5b2abdafc66b..c31ab9c2e38fde497f451e95c2a99437735e4455 100644 (file)
@@ -88,7 +88,7 @@ func MakeRESTRouter() http.Handler {
 
        rtr.limiter = httpserver.NewRequestLimiter(theConfig.MaxRequests, rtr)
 
-       stack := httpserver.Instrument(nil,
+       stack := httpserver.Instrument(nil, nil,
                httpserver.AddRequestIDs(httpserver.LogRequests(nil, rtr.limiter)))
        return stack.ServeAPI(stack)
 }