X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/d6a21ab8a0c5a45bdcd7c0fbaa93f7096e56e831..0d753912c2fa8f53f1d5c2e6251b7a25caeb3499:/services/keep-balance/metrics.go diff --git a/services/keep-balance/metrics.go b/services/keep-balance/metrics.go index 0564d46cda..02cee3955f 100644 --- a/services/keep-balance/metrics.go +++ b/services/keep-balance/metrics.go @@ -2,11 +2,12 @@ // // SPDX-License-Identifier: AGPL-3.0 -package main +package keepbalance import ( "fmt" "net/http" + "strconv" "sync" "github.com/prometheus/client_golang/prometheus" @@ -17,18 +18,20 @@ type observer interface{ Observe(float64) } type setter interface{ Set(float64) } type metrics struct { - reg *prometheus.Registry - statsGauges map[string]setter - observers map[string]observer - setupOnce sync.Once - mtx sync.Mutex + reg *prometheus.Registry + statsGauges map[string]setter + statsGaugeVecs map[string]*prometheus.GaugeVec + observers map[string]observer + setupOnce sync.Once + mtx sync.Mutex } -func newMetrics() *metrics { +func newMetrics(registry *prometheus.Registry) *metrics { return &metrics{ - reg: prometheus.NewRegistry(), - statsGauges: map[string]setter{}, - observers: map[string]observer{}, + reg: registry, + statsGauges: map[string]setter{}, + statsGaugeVecs: map[string]*prometheus.GaugeVec{}, + observers: map[string]observer{}, } } @@ -58,12 +61,29 @@ func (m *metrics) UpdateStats(s balancerStats) { Help string } s2g := map[string]gauge{ - "total": {s.current, "current backend storage usage"}, - "garbage": {s.garbage, "garbage (unreferenced, old)"}, - "transient": {s.unref, "transient (unreferenced, new)"}, - "overreplicated": {s.overrep, "overreplicated"}, - "underreplicated": {s.underrep, "underreplicated"}, - "lost": {s.lost, "lost"}, + "total": {s.current, "current backend storage usage"}, + "garbage": {s.garbage, "garbage (unreferenced, old)"}, + "transient": {s.unref, "transient (unreferenced, new)"}, + "overreplicated": {s.overrep, "overreplicated"}, + "underreplicated": {s.underrep, "underreplicated"}, + "unachievable": {s.unachievable, "unachievable"}, + "balanced": {s.justright, "optimally balanced"}, + "desired": {s.desired, "desired"}, + "lost": {s.lost, "lost"}, + "dedup_byte_ratio": {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"}, + "dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"}, + "collection_bytes": {s.collectionBytes, "total apparent size of all collections"}, + "referenced_bytes": {s.collectionBlockBytes, "total size of unique referenced blocks"}, + "reference_count": {s.collectionBlockRefs, "block references in all collections"}, + "referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"}, + + "pull_entries_sent_count": {s.pulls, "total entries sent in pull lists"}, + "pull_entries_deferred_count": {s.pullsDeferred, "total entries deferred (not sent) in pull lists"}, + "trash_entries_sent_count": {s.trashes, "total entries sent in trash lists"}, + "trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"}, + + "replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"}, + "usage": {s.classStats, "stored in indicated storage class"}, } m.setupOnce.Do(func() { // Register gauge(s) for each balancerStats field. @@ -83,8 +103,31 @@ func (m *metrics) UpdateStats(s balancerStats) { for _, sub := range []string{"blocks", "bytes", "replicas"} { addGauge(name+"_"+sub, sub+" of "+gauge.Help) } - case int, int64: + case int, int64, float64: addGauge(name, gauge.Help) + case []int: + // replHistogram + gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "arvados", + Name: name, + Subsystem: "keep", + Help: gauge.Help, + }, []string{"replicas"}) + m.reg.MustRegister(gv) + m.statsGaugeVecs[name] = gv + case map[string]replicationStats: + // classStats + for _, sub := range []string{"blocks", "bytes", "replicas"} { + name := name + "_" + sub + gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "arvados", + Name: name, + Subsystem: "keep", + Help: gauge.Help, + }, []string{"storage_class", "status"}) + m.reg.MustRegister(gv) + m.statsGaugeVecs[name] = gv + } default: panic(fmt.Sprintf("bad gauge type %T", gauge.Value)) } @@ -101,6 +144,40 @@ func (m *metrics) UpdateStats(s balancerStats) { m.statsGauges[name].Set(float64(val)) case int64: m.statsGauges[name].Set(float64(val)) + case float64: + m.statsGauges[name].Set(float64(val)) + case []int: + // replHistogram + for r, n := range val { + m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n)) + } + // Record zero for higher-than-max-replication + // metrics, so we don't incorrectly continue + // to report stale metrics. + // + // For example, if we previously reported n=1 + // for repl=6, but have since restarted + // keep-balance and the most replicated block + // now has repl=5, then the repl=6 gauge will + // still say n=1 until we clear it explicitly + // here. + for r := len(val); r < len(val)+4 || r < len(val)*2; r++ { + m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0) + } + case map[string]replicationStats: + // classStats + for class, cs := range val { + for label, val := range map[string]blocksNBytes{ + "needed": cs.needed, + "unneeded": cs.unneeded, + "pulling": cs.pulling, + "unachievable": cs.unachievable, + } { + m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks)) + m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes)) + m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas)) + } + } default: panic(fmt.Sprintf("bad gauge type %T", gauge.Value)) }