21036: Report remaining balancerStats as prometheus metrics.
author Tom Clegg <tom@curii.com>
Thu, 4 Jan 2024 21:12:18 +0000 (16:12 -0500)
committer Tom Clegg <tom@curii.com>
Thu, 4 Jan 2024 21:12:18 +0000 (16:12 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

services/keep-balance/balance_run_test.go
services/keep-balance/metrics.go

index f66194e2a2cd39852d8de4cf45c8aa0a4cc89962..7f6deb1e54ae2668728a0e504b0fc0c0d0715fae 100644 (file)
@@ -556,6 +556,10 @@ func (s *runSuite) TestDryRun(c *check.C) {
        c.Check(bal.stats.trashesDeferred, check.Not(check.Equals), 0)
        c.Check(bal.stats.underrep.replicas, check.Not(check.Equals), 0)
        c.Check(bal.stats.overrep.replicas, check.Not(check.Equals), 0)
+
+       metrics := arvadostest.GatherMetricsAsString(srv.Metrics.reg)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_trash_entries_deferred_count [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_pull_entries_deferred_count [1-9].*`)
 }
 
 func (s *runSuite) TestCommit(c *check.C) {
@@ -593,6 +597,19 @@ func (s *runSuite) TestCommit(c *check.C) {
        c.Check(metrics, check.Matches, `(?ms).*\narvados_keepbalance_changeset_compute_seconds_count 1\n.*`)
        c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_byte_ratio [1-9].*`)
        c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_dedup_block_ratio [1-9].*`)
+
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="0"} [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="1"} [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_replicated_block_count{replicas="9"} 0\n.*`)
+
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_replicas{status="needed",storage_class="default"} [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_blocks{status="needed",storage_class="default"} [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="needed",storage_class="default"} [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unneeded",storage_class="default"} [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="unachievable",storage_class="default"} [1-9].*`)
+       c.Check(metrics, check.Matches, `(?ms).*\narvados_keep_usage_bytes{status="pulling",storage_class="default"} [1-9].*`)
+
+       c.Logf("%s", metrics)
 }
 
 func (s *runSuite) TestChunkPrefix(c *check.C) {
index 4683b67b9860052d97d8fa77e92141ae29bdcef1..02cee3955f70e372924c15a3bb2ed8345db6bebf 100644 (file)
@@ -7,6 +7,7 @@ package keepbalance
 import (
        "fmt"
        "net/http"
+       "strconv"
        "sync"
 
        "github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
 type setter interface{ Set(float64) }
 
 type metrics struct {
-       reg         *prometheus.Registry
-       statsGauges map[string]setter
-       observers   map[string]observer
-       setupOnce   sync.Once
-       mtx         sync.Mutex
+       reg            *prometheus.Registry
+       statsGauges    map[string]setter
+       statsGaugeVecs map[string]*prometheus.GaugeVec
+       observers      map[string]observer
+       setupOnce      sync.Once
+       mtx            sync.Mutex
 }
 
 func newMetrics(registry *prometheus.Registry) *metrics {
        return &metrics{
-               reg:         registry,
-               statsGauges: map[string]setter{},
-               observers:   map[string]observer{},
+               reg:            registry,
+               statsGauges:    map[string]setter{},
+               statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+               observers:      map[string]observer{},
        }
 }
 
@@ -63,9 +66,24 @@ func (m *metrics) UpdateStats(s balancerStats) {
                "transient":         {s.unref, "transient (unreferenced, new)"},
                "overreplicated":    {s.overrep, "overreplicated"},
                "underreplicated":   {s.underrep, "underreplicated"},
+               "unachievable":      {s.unachievable, "unachievable"},
+               "balanced":          {s.justright, "optimally balanced"},
+               "desired":           {s.desired, "desired"},
                "lost":              {s.lost, "lost"},
                "dedup_byte_ratio":  {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
                "dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+               "collection_bytes":  {s.collectionBytes, "total apparent size of all collections"},
+               "referenced_bytes":  {s.collectionBlockBytes, "total size of unique referenced blocks"},
+               "reference_count":   {s.collectionBlockRefs, "block references in all collections"},
+               "referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+               "pull_entries_sent_count":      {s.pulls, "total entries sent in pull lists"},
+               "pull_entries_deferred_count":  {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+               "trash_entries_sent_count":     {s.trashes, "total entries sent in trash lists"},
+               "trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+               "replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+               "usage":                  {s.classStats, "stored in indicated storage class"},
        }
        m.setupOnce.Do(func() {
                // Register gauge(s) for each balancerStats field.
@@ -87,6 +105,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
                                }
                        case int, int64, float64:
                                addGauge(name, gauge.Help)
+                       case []int:
+                               // replHistogram
+                               gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+                                       Namespace: "arvados",
+                                       Name:      name,
+                                       Subsystem: "keep",
+                                       Help:      gauge.Help,
+                               }, []string{"replicas"})
+                               m.reg.MustRegister(gv)
+                               m.statsGaugeVecs[name] = gv
+                       case map[string]replicationStats:
+                               // classStats
+                               for _, sub := range []string{"blocks", "bytes", "replicas"} {
+                                       name := name + "_" + sub
+                                       gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+                                               Namespace: "arvados",
+                                               Name:      name,
+                                               Subsystem: "keep",
+                                               Help:      gauge.Help,
+                                       }, []string{"storage_class", "status"})
+                                       m.reg.MustRegister(gv)
+                                       m.statsGaugeVecs[name] = gv
+                               }
                        default:
                                panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
                        }
@@ -105,6 +146,38 @@ func (m *metrics) UpdateStats(s balancerStats) {
                        m.statsGauges[name].Set(float64(val))
                case float64:
                        m.statsGauges[name].Set(float64(val))
+               case []int:
+                       // replHistogram
+                       for r, n := range val {
+                               m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+                       }
+                       // Record zero for higher-than-max-replication
+                       // metrics, so we don't incorrectly continue
+                       // to report stale metrics.
+                       //
+                       // For example, if we previously reported n=1
+                       // for repl=6, but have since restarted
+                       // keep-balance and the most replicated block
+                       // now has repl=5, then the repl=6 gauge will
+                       // still say n=1 until we clear it explicitly
+                       // here.
+                       for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+                               m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+                       }
+               case map[string]replicationStats:
+                       // classStats
+                       for class, cs := range val {
+                               for label, val := range map[string]blocksNBytes{
+                                       "needed":       cs.needed,
+                                       "unneeded":     cs.unneeded,
+                                       "pulling":      cs.pulling,
+                                       "unachievable": cs.unachievable,
+                               } {
+                                       m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+                                       m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+                                       m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+                               }
+                       }
                default:
                        panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
                }