21036: Report remaining balancerStats as prometheus metrics.
[arvados.git] / services / keep-balance / metrics.go
index 0564d46cdae76c471129c4bfbf087ed2d06748db..02cee3955f70e372924c15a3bb2ed8345db6bebf 100644 (file)
@@ -2,11 +2,12 @@
 //
 // SPDX-License-Identifier: AGPL-3.0
 
-package main
+package keepbalance
 
 import (
        "fmt"
        "net/http"
+       "strconv"
        "sync"
 
        "github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
 type setter interface{ Set(float64) }
 
 type metrics struct {
-       reg         *prometheus.Registry
-       statsGauges map[string]setter
-       observers   map[string]observer
-       setupOnce   sync.Once
-       mtx         sync.Mutex
+       reg            *prometheus.Registry
+       statsGauges    map[string]setter
+       statsGaugeVecs map[string]*prometheus.GaugeVec
+       observers      map[string]observer
+       setupOnce      sync.Once
+       mtx            sync.Mutex
 }
 
-func newMetrics() *metrics {
+func newMetrics(registry *prometheus.Registry) *metrics {
        return &metrics{
-               reg:         prometheus.NewRegistry(),
-               statsGauges: map[string]setter{},
-               observers:   map[string]observer{},
+               reg:            registry,
+               statsGauges:    map[string]setter{},
+               statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+               observers:      map[string]observer{},
        }
 }
 
@@ -58,12 +61,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
                Help  string
        }
        s2g := map[string]gauge{
-               "total":           {s.current, "current backend storage usage"},
-               "garbage":         {s.garbage, "garbage (unreferenced, old)"},
-               "transient":       {s.unref, "transient (unreferenced, new)"},
-               "overreplicated":  {s.overrep, "overreplicated"},
-               "underreplicated": {s.underrep, "underreplicated"},
-               "lost":            {s.lost, "lost"},
+               "total":             {s.current, "current backend storage usage"},
+               "garbage":           {s.garbage, "garbage (unreferenced, old)"},
+               "transient":         {s.unref, "transient (unreferenced, new)"},
+               "overreplicated":    {s.overrep, "overreplicated"},
+               "underreplicated":   {s.underrep, "underreplicated"},
+               "unachievable":      {s.unachievable, "unachievable"},
+               "balanced":          {s.justright, "optimally balanced"},
+               "desired":           {s.desired, "desired"},
+               "lost":              {s.lost, "lost"},
+               "dedup_byte_ratio":  {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
+               "dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+               "collection_bytes":  {s.collectionBytes, "total apparent size of all collections"},
+               "referenced_bytes":  {s.collectionBlockBytes, "total size of unique referenced blocks"},
+               "reference_count":   {s.collectionBlockRefs, "block references in all collections"},
+               "referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+               "pull_entries_sent_count":      {s.pulls, "total entries sent in pull lists"},
+               "pull_entries_deferred_count":  {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+               "trash_entries_sent_count":     {s.trashes, "total entries sent in trash lists"},
+               "trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+               "replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+               "usage":                  {s.classStats, "stored in indicated storage class"},
        }
        m.setupOnce.Do(func() {
                // Register gauge(s) for each balancerStats field.
@@ -83,8 +103,31 @@ func (m *metrics) UpdateStats(s balancerStats) {
                                for _, sub := range []string{"blocks", "bytes", "replicas"} {
                                        addGauge(name+"_"+sub, sub+" of "+gauge.Help)
                                }
-                       case int, int64:
+                       case int, int64, float64:
                                addGauge(name, gauge.Help)
+                       case []int:
+                               // replHistogram
+                               gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+                                       Namespace: "arvados",
+                                       Name:      name,
+                                       Subsystem: "keep",
+                                       Help:      gauge.Help,
+                               }, []string{"replicas"})
+                               m.reg.MustRegister(gv)
+                               m.statsGaugeVecs[name] = gv
+                       case map[string]replicationStats:
+                               // classStats
+                               for _, sub := range []string{"blocks", "bytes", "replicas"} {
+                                       name := name + "_" + sub
+                                       gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+                                               Namespace: "arvados",
+                                               Name:      name,
+                                               Subsystem: "keep",
+                                               Help:      gauge.Help,
+                                       }, []string{"storage_class", "status"})
+                                       m.reg.MustRegister(gv)
+                                       m.statsGaugeVecs[name] = gv
+                               }
                        default:
                                panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
                        }
@@ -101,6 +144,40 @@ func (m *metrics) UpdateStats(s balancerStats) {
                        m.statsGauges[name].Set(float64(val))
                case int64:
                        m.statsGauges[name].Set(float64(val))
+               case float64:
+                       m.statsGauges[name].Set(float64(val))
+               case []int:
+                       // replHistogram
+                       for r, n := range val {
+                               m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+                       }
+                       // Record zero for higher-than-max-replication
+                       // metrics, so we don't incorrectly continue
+                       // to report stale metrics.
+                       //
+                       // For example, if we previously reported n=1
+                       // for repl=6, but have since restarted
+                       // keep-balance and the most replicated block
+                       // now has repl=5, then the repl=6 gauge will
+                       // still say n=1 until we clear it explicitly
+                       // here.
+                       for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+                               m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+                       }
+               case map[string]replicationStats:
+                       // classStats
+                       for class, cs := range val {
+                               for label, val := range map[string]blocksNBytes{
+                                       "needed":       cs.needed,
+                                       "unneeded":     cs.unneeded,
+                                       "pulling":      cs.pulling,
+                                       "unachievable": cs.unachievable,
+                               } {
+                                       m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+                                       m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+                                       m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+                               }
+                       }
                default:
                        panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
                }