13937: Export stats as prometheus metrics. (WIP)
author Lucas Di Pentima <ldipentima@veritasgenetics.com>
Mon, 11 Feb 2019 15:24:15 +0000 (12:24 -0300)
committer Lucas Di Pentima <ldipentima@veritasgenetics.com>
Mon, 11 Feb 2019 15:24:15 +0000 (12:24 -0300)
Includes common metrics and driver-specific metrics (unix backend only, for now).

Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima@veritasgenetics.com>

services/keepstore/handlers.go
services/keepstore/metrics.go [new file with mode: 0644]
services/keepstore/mounts_test.go
services/keepstore/stats_ticker.go
services/keepstore/volume.go
services/keepstore/volume_unix.go

index e079b96784a16b985ed6ce47f99655e39a571ce9..e4f025d6b1d71cd0e490da6f5c6525b97273874a 100644 (file)
@@ -20,11 +20,11 @@ import (
        "sync"
        "time"
 
-       "github.com/gorilla/mux"
-
        "git.curoverse.com/arvados.git/sdk/go/arvados"
        "git.curoverse.com/arvados.git/sdk/go/health"
        "git.curoverse.com/arvados.git/sdk/go/httpserver"
+       "github.com/gorilla/mux"
+       "github.com/prometheus/client_golang/prometheus"
 )
 
 type router struct {
@@ -32,14 +32,17 @@ type router struct {
        limiter     httpserver.RequestCounter
        cluster     *arvados.Cluster
        remoteProxy remoteProxy
+       registry    *prometheus.Registry
+       metrics     nodeMetrics
 }
 
 // MakeRESTRouter returns a new router that forwards all Keep requests
 // to the appropriate handlers.
 func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
        rtr := &router{
-               Router:  mux.NewRouter(),
-               cluster: cluster,
+               Router:   mux.NewRouter(),
+               cluster:  cluster,
+               registry: prometheus.NewRegistry(),
        }
 
        rtr.HandleFunc(
@@ -86,8 +89,13 @@ func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
        rtr.NotFoundHandler = http.HandlerFunc(BadRequestHandler)
 
        rtr.limiter = httpserver.NewRequestLimiter(theConfig.MaxRequests, rtr)
+       rtr.metrics = nodeMetrics{
+               reg: rtr.registry,
+               rc:  rtr.limiter,
+       }
+       rtr.metrics.setup()
 
-       instrumented := httpserver.Instrument(nil, nil,
+       instrumented := httpserver.Instrument(rtr.registry, nil,
                httpserver.AddRequestIDs(httpserver.LogRequests(nil, rtr.limiter)))
        return instrumented.ServeAPI(theConfig.ManagementToken, instrumented)
 }
diff --git a/services/keepstore/metrics.go b/services/keepstore/metrics.go
new file mode 100644 (file)
index 0000000..f0815ae
--- /dev/null
@@ -0,0 +1,214 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package main
+
+import (
+       "fmt"
+
+       "git.curoverse.com/arvados.git/sdk/go/httpserver"
+       "github.com/prometheus/client_golang/prometheus"
+)
+
+// nodeMetrics holds what setup needs to publish keepstore's metrics:
+// reg is the Prometheus registry the gauges are registered with, and
+// rc is the request counter read by the requests_current and
+// requests_max gauges.
+type nodeMetrics struct {
+       reg *prometheus.Registry
+       rc  httpserver.RequestCounter
+}
+
+// setup registers keepstore's gauges with m.reg: first the node-wide
+// ones (buffer pool, pull/trash queues, request counter), then one set
+// per readable volume, labeled with that volume's name, mount point
+// and device number.
+//
+// Node-wide gauges are registered with MustRegister, which panics if
+// registration fails. Per-volume gauges are registered with Register
+// and any error is ignored, so one volume's registration failure does
+// not prevent the remaining gauges from being set up.
+func (m *nodeMetrics) setup() {
+       // gaugeSpec describes one gauge: its metric name (without the
+       // arvados_keepstore_ prefix), its help text, and the function
+       // that reads the current value at scrape time.
+       type gaugeSpec struct {
+               name string
+               help string
+               read func() float64
+       }
+       newGauge := func(g gaugeSpec, labels prometheus.Labels) prometheus.GaugeFunc {
+               return prometheus.NewGaugeFunc(
+                       prometheus.GaugeOpts{
+                               Namespace:   "arvados",
+                               Subsystem:   "keepstore",
+                               Name:        g.name,
+                               Help:        g.help,
+                               ConstLabels: labels,
+                       },
+                       g.read,
+               )
+       }
+
+       nodeGauges := []gaugeSpec{
+               {"bufferpool_bytes_allocated", "Number of bytes allocated to buffers", func() float64 { return float64(bufs.Alloc()) }},
+               {"bufferpool_buffers_max", "Maximum number of buffers allowed", func() float64 { return float64(bufs.Cap()) }},
+               {"bufferpool_buffers_in_use", "Number of buffers in use", func() float64 { return float64(bufs.Len()) }},
+               {"pull_queue_in_progress", "Number of pull requests in progress", func() float64 { return float64(getWorkQueueStatus(pullq).InProgress) }},
+               {"pull_queue_queued", "Number of queued pull requests", func() float64 { return float64(getWorkQueueStatus(pullq).Queued) }},
+               {"trash_queue_in_progress", "Number of trash requests in progress", func() float64 { return float64(getWorkQueueStatus(trashq).InProgress) }},
+               {"trash_queue_queued", "Number of queued trash requests", func() float64 { return float64(getWorkQueueStatus(trashq).Queued) }},
+               {"requests_current", "Number of requests in progress", func() float64 { return float64(m.rc.Current()) }},
+               {"requests_max", "Maximum number of concurrent requests", func() float64 { return float64(m.rc.Max()) }},
+       }
+       for _, g := range nodeGauges {
+               m.reg.MustRegister(newGauge(g, nil))
+       }
+
+       // Register individual volume's metrics
+       for _, vol := range KeepVM.AllReadable() {
+               // Give the closures below their own copy of the volume.
+               // Before Go 1.22 the range variable is shared by all
+               // iterations, so without this copy every per-volume
+               // gauge would end up reading the last volume while
+               // carrying another volume's labels.
+               vol := vol
+
+               // NOTE(review): the result of Status() is used without
+               // any validity check, here and in the gauges below. If
+               // a driver can return nil from Status(), this panics --
+               // confirm against the Volume implementations.
+               status := vol.Status()
+               labels := prometheus.Labels{
+                       "label":         vol.String(),
+                       "mount_point":   status.MountPoint,
+                       "device_number": fmt.Sprintf("%d", status.DeviceNum),
+               }
+               if vol, ok := vol.(InternalMetricser); ok {
+                       // Per-driver internal metrics
+                       vol.SetupInternalMetrics(m.reg, labels)
+               }
+
+               volGauges := []gaugeSpec{
+                       {"volume_bytes_free", "Number of free bytes on the volume", func() float64 { return float64(vol.Status().BytesFree) }},
+                       {"volume_bytes_used", "Number of used bytes on the volume", func() float64 { return float64(vol.Status().BytesUsed) }},
+                       {"volume_io_errors", "Number of I/O errors", func() float64 { return float64(KeepVM.VolumeStats(vol).Errors) }},
+                       {"volume_io_ops", "Number of I/O operations", func() float64 { return float64(KeepVM.VolumeStats(vol).Ops) }},
+                       {"volume_io_compare_ops", "Number of I/O compare operations", func() float64 { return float64(KeepVM.VolumeStats(vol).CompareOps) }},
+                       {"volume_io_get_ops", "Number of I/O get operations", func() float64 { return float64(KeepVM.VolumeStats(vol).GetOps) }},
+                       {"volume_io_put_ops", "Number of I/O put operations", func() float64 { return float64(KeepVM.VolumeStats(vol).PutOps) }},
+                       {"volume_io_touch_ops", "Number of I/O touch operations", func() float64 { return float64(KeepVM.VolumeStats(vol).TouchOps) }},
+                       {"volume_io_input_bytes", "Number of input bytes", func() float64 { return float64(KeepVM.VolumeStats(vol).InBytes) }},
+                       {"volume_io_output_bytes", "Number of output bytes", func() float64 { return float64(KeepVM.VolumeStats(vol).OutBytes) }},
+               }
+               for _, g := range volGauges {
+                       // Best effort: a registration error is ignored
+                       // on purpose rather than aborting startup.
+                       m.reg.Register(newGauge(g, labels))
+               }
+       }
+}
index 31b1a684fe6a077ebbbfebf7bb846f6f508a00b5..588bb4299c531ff72d6633b95464790c7e9bb0b6 100644 (file)
@@ -131,7 +131,9 @@ func (s *MountsSuite) TestMetrics(c *check.C) {
        }
        json.NewDecoder(resp.Body).Decode(&j)
        found := make(map[string]bool)
+       names := map[string]bool{}
        for _, g := range j {
+               names[g.Name] = true
                for _, m := range g.Metric {
                        if len(m.Label) == 2 && m.Label[0].Name == "code" && m.Label[0].Value == "200" && m.Label[1].Name == "method" && m.Label[1].Value == "put" {
                                c.Check(m.Summary.SampleCount, check.Equals, "2")
@@ -143,6 +145,24 @@ func (s *MountsSuite) TestMetrics(c *check.C) {
        }
        c.Check(found["request_duration_seconds"], check.Equals, true)
        c.Check(found["time_to_status_seconds"], check.Equals, true)
+
+       metricsNames := []string{
+               "arvados_keepstore_bufferpool_buffers_in_use",
+               "arvados_keepstore_bufferpool_buffers_max",
+               "arvados_keepstore_bufferpool_bytes_allocated",
+               "arvados_keepstore_pull_queue_in_progress",
+               "arvados_keepstore_pull_queue_queued",
+               "arvados_keepstore_requests_current",
+               "arvados_keepstore_requests_max",
+               "arvados_keepstore_trash_queue_in_progress",
+               "arvados_keepstore_trash_queue_queued",
+               "request_duration_seconds",
+               "time_to_status_seconds",
+       }
+       for _, m := range metricsNames {
+               _, ok := names[m]
+               c.Check(ok, check.Equals, true)
+       }
 }
 
 func (s *MountsSuite) call(method, path, tok string, body []byte) *httptest.ResponseRecorder {
index 377a53675783b890fa7863dd98ea50681074697b..36fbcf98af216183afc0169b7643c5a1afcfafb2 100644 (file)
@@ -5,8 +5,11 @@
 package main
 
 import (
+       "fmt"
        "sync"
        "sync/atomic"
+
+       "github.com/prometheus/client_golang/prometheus"
 )
 
 type statsTicker struct {
@@ -18,6 +21,28 @@ type statsTicker struct {
        lock       sync.Mutex
 }
 
+func (s *statsTicker) setupPrometheus(drv string, reg *prometheus.Registry, lbl prometheus.Labels) {
+       metrics := map[string][]interface{}{
+               "errors":    []interface{}{string("errors"), s.Errors},
+               "in_bytes":  []interface{}{string("input bytes"), s.InBytes},
+               "out_bytes": []interface{}{string("output bytes"), s.OutBytes},
+       }
+       for mName, data := range metrics {
+               mHelp := data[0].(string)
+               mVal := data[1].(uint64)
+               reg.Register(prometheus.NewGaugeFunc(
+                       prometheus.GaugeOpts{
+                               Namespace:   "arvados",
+                               Subsystem:   "keepstore",
+                               Name:        fmt.Sprintf("%s_%s", drv, mName),
+                               Help:        fmt.Sprintf("Number of %s backend %s", drv, mHelp),
+                               ConstLabels: lbl,
+                       },
+                       func() float64 { return float64(mVal) },
+               ))
+       }
+}
+
 // Tick increments each of the given counters by 1 using
 // atomic.AddUint64.
 func (s *statsTicker) Tick(counters ...*uint64) {
index 6bce05bec033fbda6c759b6b4266bcbff0f3e051..9638046391db29ff60b075327f0f9cb274345c91 100644 (file)
@@ -14,6 +14,7 @@ import (
        "time"
 
        "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "github.com/prometheus/client_golang/prometheus"
 )
 
 type BlockWriter interface {
@@ -415,3 +416,9 @@ type ioStats struct {
 type InternalStatser interface {
        InternalStats() interface{}
 }
+
+// InternalMetricser is implemented by volume drivers that register
+// driver-specific metrics of their own. The caller supplies the
+// registry to register with and the labels to attach.
+type InternalMetricser interface {
+       SetupInternalMetrics(reg *prometheus.Registry, lbl prometheus.Labels)
+}
index 23d675359244942097072d88e1bd98daf9d46c6c..a80bb7bf4af606248cde60deb989d574d703f389 100644 (file)
@@ -21,6 +21,8 @@ import (
        "sync/atomic"
        "syscall"
        "time"
+
+       "github.com/prometheus/client_golang/prometheus"
 )
 
 type unixVolumeAdder struct {
@@ -789,6 +791,42 @@ func (v *UnixVolume) EmptyTrash() {
        log.Printf("EmptyTrash stats for %v: Deleted %v bytes in %v blocks. Remaining in trash: %v bytes in %v blocks.", v.String(), bytesDeleted, blocksDeleted, bytesInTrash-bytesDeleted, blocksInTrash-blocksDeleted)
 }
 
+// SetupInternalMetrics registers this volume's driver stats with the
+// given Prometheus registry by delegating to the stats object's
+// setupPrometheus, passing reg and lbl through unchanged.
+// Implements InternalMetricser interface.
+func (v *UnixVolume) SetupInternalMetrics(reg *prometheus.Registry, lbl prometheus.Labels) {
+       v.os.stats.setupPrometheus(reg, lbl)
+}
+
+func (s *unixStats) setupPrometheus(reg *prometheus.Registry, lbl prometheus.Labels) {
+       // Common backend metrics
+       s.statsTicker.setupPrometheus("unix", reg, lbl)
+       // Driver-specific backend metrics
+       metrics := map[string][]interface{}{
+               "open_ops":    []interface{}{string("open operations"), s.OpenOps},
+               "stat_ops":    []interface{}{string("stat operations"), s.StatOps},
+               "flock_ops":   []interface{}{string("flock operations"), s.FlockOps},
+               "utimes_ops":  []interface{}{string("utimes operations"), s.UtimesOps},
+               "create_ops":  []interface{}{string("create operations"), s.CreateOps},
+               "rename_ops":  []interface{}{string("rename operations"), s.RenameOps},
+               "unlink_ops":  []interface{}{string("unlink operations"), s.UnlinkOps},
+               "readdir_ops": []interface{}{string("readdir operations"), s.ReaddirOps},
+       }
+       for mName, data := range metrics {
+               mHelp := data[0].(string)
+               mVal := data[1].(uint64)
+               reg.Register(prometheus.NewGaugeFunc(
+                       prometheus.GaugeOpts{
+                               Namespace:   "arvados",
+                               Subsystem:   "keepstore",
+                               Name:        fmt.Sprintf("unix_%s", mName),
+                               Help:        fmt.Sprintf("Number of unix backend %s", mHelp),
+                               ConstLabels: lbl,
+                       },
+                       func() float64 { return float64(mVal) },
+               ))
+       }
+}
+
 type unixStats struct {
        statsTicker
        OpenOps    uint64