From f0553505e32ee00999d1d680da14260a9a0f6b99 Mon Sep 17 00:00:00 2001
From: Lucas Di Pentima
Date: Mon, 11 Feb 2019 12:24:15 -0300
Subject: [PATCH] 13937: Export stats as prometheus metrics. (WIP)

Includes common metrics and driver-specific (unix backend only, for now)

Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima
---
 services/keepstore/handlers.go     |  18 ++-
 services/keepstore/metrics.go      | 175 +++++++++++++++++++++++++++++
 services/keepstore/mounts_test.go  |  19 ++++
 services/keepstore/stats_ticker.go |  37 ++++++
 services/keepstore/volume.go       |   7 ++
 services/keepstore/volume_unix.go  |  26 +++++
 6 files changed, 277 insertions(+), 5 deletions(-)
 create mode 100644 services/keepstore/metrics.go

diff --git a/services/keepstore/handlers.go b/services/keepstore/handlers.go
index e079b96784..e4f025d6b1 100644
--- a/services/keepstore/handlers.go
+++ b/services/keepstore/handlers.go
@@ -20,11 +20,11 @@ import (
 	"sync"
 	"time"
 
-	"github.com/gorilla/mux"
-
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
 	"git.curoverse.com/arvados.git/sdk/go/health"
 	"git.curoverse.com/arvados.git/sdk/go/httpserver"
+	"github.com/gorilla/mux"
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 type router struct {
@@ -32,14 +32,17 @@ type router struct {
 	limiter     httpserver.RequestCounter
 	cluster     *arvados.Cluster
 	remoteProxy remoteProxy
+	registry    *prometheus.Registry
+	metrics     nodeMetrics
 }
 
 // MakeRESTRouter returns a new router that forwards all Keep requests
 // to the appropriate handlers.
 func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
 	rtr := &router{
-		Router:  mux.NewRouter(),
-		cluster: cluster,
+		Router:   mux.NewRouter(),
+		cluster:  cluster,
+		registry: prometheus.NewRegistry(),
 	}
 
 	rtr.HandleFunc(
@@ -86,8 +89,13 @@ func MakeRESTRouter(cluster *arvados.Cluster) http.Handler {
 	rtr.NotFoundHandler = http.HandlerFunc(BadRequestHandler)
 
 	rtr.limiter = httpserver.NewRequestLimiter(theConfig.MaxRequests, rtr)
+	rtr.metrics = nodeMetrics{
+		reg: rtr.registry,
+		rc:  rtr.limiter,
+	}
+	rtr.metrics.setup()
 
-	instrumented := httpserver.Instrument(nil, nil,
+	instrumented := httpserver.Instrument(rtr.registry, nil,
 		httpserver.AddRequestIDs(httpserver.LogRequests(nil, rtr.limiter)))
 	return instrumented.ServeAPI(theConfig.ManagementToken, instrumented)
 }
diff --git a/services/keepstore/metrics.go b/services/keepstore/metrics.go
new file mode 100644
index 0000000000..f0815ae4ab
--- /dev/null
+++ b/services/keepstore/metrics.go
@@ -0,0 +1,175 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package main
+
+import (
+	"fmt"
+
+	"git.curoverse.com/arvados.git/sdk/go/httpserver"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// nodeMetrics publishes keepstore's node-wide and per-volume stats
+// on a Prometheus registry.
+type nodeMetrics struct {
+	reg *prometheus.Registry
+	rc  httpserver.RequestCounter
+}
+
+// gaugeSpec describes one gauge: its name (without the
+// arvados_keepstore_ prefix), its help text, and the function that
+// is called to sample its value.
+type gaugeSpec struct {
+	name  string
+	help  string
+	value func() float64
+}
+
+// newGaugeFunc builds a GaugeFunc in the arvados/keepstore namespace
+// from spec, attaching labels (which may be nil) as constant labels.
+func newGaugeFunc(spec gaugeSpec, labels prometheus.Labels) prometheus.GaugeFunc {
+	return prometheus.NewGaugeFunc(
+		prometheus.GaugeOpts{
+			Namespace:   "arvados",
+			Subsystem:   "keepstore",
+			Name:        spec.name,
+			Help:        spec.help,
+			ConstLabels: labels,
+		},
+		spec.value,
+	)
+}
+
+// setup registers the node-wide gauges (buffer pool, pull/trash
+// queues, request concurrency) and then a set of gauges for each
+// volume returned by KeepVM.AllReadable().
+func (m *nodeMetrics) setup() {
+	nodeGauges := []gaugeSpec{
+		{
+			name:  "bufferpool_bytes_allocated",
+			help:  "Number of bytes allocated to buffers",
+			value: func() float64 { return float64(bufs.Alloc()) },
+		},
+		{
+			name:  "bufferpool_buffers_max",
+			help:  "Maximum number of buffers allowed",
+			value: func() float64 { return float64(bufs.Cap()) },
+		},
+		{
+			name:  "bufferpool_buffers_in_use",
+			help:  "Number of buffers in use",
+			value: func() float64 { return float64(bufs.Len()) },
+		},
+		{
+			name:  "pull_queue_in_progress",
+			help:  "Number of pull requests in progress",
+			value: func() float64 { return float64(getWorkQueueStatus(pullq).InProgress) },
+		},
+		{
+			name:  "pull_queue_queued",
+			help:  "Number of queued pull requests",
+			value: func() float64 { return float64(getWorkQueueStatus(pullq).Queued) },
+		},
+		{
+			name:  "trash_queue_in_progress",
+			help:  "Number of trash requests in progress",
+			value: func() float64 { return float64(getWorkQueueStatus(trashq).InProgress) },
+		},
+		{
+			name:  "trash_queue_queued",
+			help:  "Number of queued trash requests",
+			value: func() float64 { return float64(getWorkQueueStatus(trashq).Queued) },
+		},
+		{
+			name:  "requests_current",
+			help:  "Number of requests in progress",
+			value: func() float64 { return float64(m.rc.Current()) },
+		},
+		{
+			name:  "requests_max",
+			help:  "Maximum number of concurrent requests",
+			value: func() float64 { return float64(m.rc.Max()) },
+		},
+	}
+	for _, spec := range nodeGauges {
+		m.reg.MustRegister(newGaugeFunc(spec, nil))
+	}
+
+	// Register each volume's metrics.
+	for _, vol := range KeepVM.AllReadable() {
+		// Copy the loop variable: the closures below run at scrape
+		// time, after the loop has finished, and would otherwise
+		// all observe the last volume (pre-Go 1.22 semantics).
+		vol := vol
+		status := vol.Status()
+		labels := prometheus.Labels{
+			"label":         vol.String(),
+			"mount_point":   status.MountPoint,
+			"device_number": fmt.Sprintf("%d", status.DeviceNum),
+		}
+		if im, ok := vol.(InternalMetricser); ok {
+			// Per-driver internal metrics
+			im.SetupInternalMetrics(m.reg, labels)
+		}
+		volumeGauges := []gaugeSpec{
+			{
+				name:  "volume_bytes_free",
+				help:  "Number of free bytes on the volume",
+				value: func() float64 { return float64(vol.Status().BytesFree) },
+			},
+			{
+				name:  "volume_bytes_used",
+				help:  "Number of used bytes on the volume",
+				value: func() float64 { return float64(vol.Status().BytesUsed) },
+			},
+			{
+				name:  "volume_io_errors",
+				help:  "Number of I/O errors",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).Errors) },
+			},
+			{
+				name:  "volume_io_ops",
+				help:  "Number of I/O operations",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).Ops) },
+			},
+			{
+				name:  "volume_io_compare_ops",
+				help:  "Number of I/O compare operations",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).CompareOps) },
+			},
+			{
+				name:  "volume_io_get_ops",
+				help:  "Number of I/O get operations",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).GetOps) },
+			},
+			{
+				name:  "volume_io_put_ops",
+				help:  "Number of I/O put operations",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).PutOps) },
+			},
+			{
+				name:  "volume_io_touch_ops",
+				help:  "Number of I/O touch operations",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).TouchOps) },
+			},
+			{
+				name:  "volume_io_input_bytes",
+				help:  "Number of input bytes",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).InBytes) },
+			},
+			{
+				name:  "volume_io_output_bytes",
+				help:  "Number of output bytes",
+				value: func() float64 { return float64(KeepVM.VolumeStats(vol).OutBytes) },
+			},
+		}
+		for _, spec := range volumeGauges {
+			// Register, not MustRegister: a collision (e.g. two
+			// volumes that produce identical labels) is ignored
+			// rather than panicking during startup.
+			m.reg.Register(newGaugeFunc(spec, labels))
+		}
+	}
+}
diff --git a/services/keepstore/mounts_test.go b/services/keepstore/mounts_test.go
index 31b1a684fe..588bb4299c 100644
--- a/services/keepstore/mounts_test.go
+++ b/services/keepstore/mounts_test.go
@@ -131,7 +131,9 @@ func (s *MountsSuite) TestMetrics(c *check.C) {
 	}
 	json.NewDecoder(resp.Body).Decode(&j)
 	found := make(map[string]bool)
+	names := map[string]bool{}
 	for _, g := range j {
+		names[g.Name] = true
 		for _, m := range g.Metric {
 			if len(m.Label) == 2 && m.Label[0].Name == "code" && m.Label[0].Value == "200" && m.Label[1].Name == "method" && m.Label[1].Value == "put" {
 				c.Check(m.Summary.SampleCount, check.Equals, "2")
@@ -143,6 +145,23 @@ func (s *MountsSuite) TestMetrics(c *check.C) {
 	}
 	c.Check(found["request_duration_seconds"], check.Equals, true)
 	c.Check(found["time_to_status_seconds"], check.Equals, true)
+
+	metricsNames := []string{
+		"arvados_keepstore_bufferpool_buffers_in_use",
+		"arvados_keepstore_bufferpool_buffers_max",
+		"arvados_keepstore_bufferpool_bytes_allocated",
+		"arvados_keepstore_pull_queue_in_progress",
+		"arvados_keepstore_pull_queue_queued",
+		"arvados_keepstore_requests_current",
+		"arvados_keepstore_requests_max",
+		"arvados_keepstore_trash_queue_in_progress",
+		"arvados_keepstore_trash_queue_queued",
+		"request_duration_seconds",
+		"time_to_status_seconds",
+	}
+	for _, m := range metricsNames {
+		c.Check(names[m], check.Equals, true, check.Commentf("metric %q is missing", m))
+	}
 }
 
 func (s *MountsSuite) call(method, path, tok string, body []byte) *httptest.ResponseRecorder {
diff --git a/services/keepstore/stats_ticker.go b/services/keepstore/stats_ticker.go
index 377a536757..36fbcf98af 100644
--- a/services/keepstore/stats_ticker.go
+++ b/services/keepstore/stats_ticker.go
@@ -5,8 +5,11 @@
 package main
 
 import (
+	"fmt"
 	"sync"
 	"sync/atomic"
+
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 type statsTicker struct {
@@ -18,6 +21,40 @@ type statsTicker struct {
 	lock       sync.Mutex
 }
 
+// counterSpec names one uint64 counter that is exported as a metric.
+// ptr points at the live counter, which is updated with sync/atomic.
+type counterSpec struct {
+	name string
+	help string
+	ptr  *uint64
+}
+
+// registerCounters registers each of the given counters on reg as
+// arvados_keepstore_DRV_NAME, with lbl as constant labels.
+func registerCounters(drv string, reg *prometheus.Registry, lbl prometheus.Labels, counters []counterSpec) {
+	for _, ctr := range counters {
+		// Sample the live counter on every scrape. Capturing its
+		// value here instead would freeze the metric at whatever
+		// the counter held when the volume was set up.
+		ptr := ctr.ptr
+		reg.Register(newGaugeFunc(gaugeSpec{
+			name:  fmt.Sprintf("%s_%s", drv, ctr.name),
+			help:  fmt.Sprintf("Number of %s backend %s", drv, ctr.help),
+			value: func() float64 { return float64(atomic.LoadUint64(ptr)) },
+		}, lbl))
+	}
+}
+
+// setupPrometheus registers the counters shared by all backends
+// (errors, input bytes, output bytes) for the driver named drv.
+func (s *statsTicker) setupPrometheus(drv string, reg *prometheus.Registry, lbl prometheus.Labels) {
+	registerCounters(drv, reg, lbl, []counterSpec{
+		{"errors", "errors", &s.Errors},
+		{"in_bytes", "input bytes", &s.InBytes},
+		{"out_bytes", "output bytes", &s.OutBytes},
+	})
+}
+
 // Tick increments each of the given counters by 1 using
 // atomic.AddUint64.
 func (s *statsTicker) Tick(counters ...*uint64) {
diff --git a/services/keepstore/volume.go b/services/keepstore/volume.go
index 6bce05bec0..9638046391 100644
--- a/services/keepstore/volume.go
+++ b/services/keepstore/volume.go
@@ -14,6 +14,7 @@ import (
 	"time"
 
 	"git.curoverse.com/arvados.git/sdk/go/arvados"
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 type BlockWriter interface {
@@ -415,3 +416,9 @@ type ioStats struct {
 type InternalStatser interface {
 	InternalStats() interface{}
 }
+
+// InternalMetricser provides an interface for volume drivers to register their
+// own specific metrics.
+type InternalMetricser interface {
+	SetupInternalMetrics(*prometheus.Registry, prometheus.Labels)
+}
diff --git a/services/keepstore/volume_unix.go b/services/keepstore/volume_unix.go
index 23d6753592..a80bb7bf4a 100644
--- a/services/keepstore/volume_unix.go
+++ b/services/keepstore/volume_unix.go
@@ -21,6 +21,8 @@ import (
 	"sync/atomic"
 	"syscall"
 	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 type unixVolumeAdder struct {
@@ -789,6 +791,30 @@ func (v *UnixVolume) EmptyTrash() {
 	log.Printf("EmptyTrash stats for %v: Deleted %v bytes in %v blocks. Remaining in trash: %v bytes in %v blocks.", v.String(), bytesDeleted, blocksDeleted, bytesInTrash-bytesDeleted, blocksInTrash-blocksDeleted)
 }
 
+// SetupInternalMetrics registers driver stats to Prometheus.
+// Implements InternalMetricser interface.
+func (v *UnixVolume) SetupInternalMetrics(reg *prometheus.Registry, lbl prometheus.Labels) {
+	v.os.stats.setupPrometheus(reg, lbl)
+}
+
+// setupPrometheus registers the common backend counters and the
+// unix-specific operation counters on reg, with lbl as constant labels.
+func (s *unixStats) setupPrometheus(reg *prometheus.Registry, lbl prometheus.Labels) {
+	// Common backend metrics
+	s.statsTicker.setupPrometheus("unix", reg, lbl)
+	// Driver-specific backend metrics
+	registerCounters("unix", reg, lbl, []counterSpec{
+		{"open_ops", "open operations", &s.OpenOps},
+		{"stat_ops", "stat operations", &s.StatOps},
+		{"flock_ops", "flock operations", &s.FlockOps},
+		{"utimes_ops", "utimes operations", &s.UtimesOps},
+		{"create_ops", "create operations", &s.CreateOps},
+		{"rename_ops", "rename operations", &s.RenameOps},
+		{"unlink_ops", "unlink operations", &s.UnlinkOps},
+		{"readdir_ops", "readdir operations", &s.ReaddirOps},
+	})
+}
+
 type unixStats struct {
 	statsTicker
 	OpenOps    uint64
-- 
2.30.2