21036: Report remaining balancerStats as prometheus metrics.
[arvados.git] / services / keep-balance / metrics.go
index 4683b67b9860052d97d8fa77e92141ae29bdcef1..02cee3955f70e372924c15a3bb2ed8345db6bebf 100644 (file)
@@ -7,6 +7,7 @@ package keepbalance
 import (
        "fmt"
        "net/http"
+       "strconv"
        "sync"
 
        "github.com/prometheus/client_golang/prometheus"
@@ -17,18 +18,20 @@ type observer interface{ Observe(float64) }
 type setter interface{ Set(float64) }
 
 type metrics struct {
-       reg         *prometheus.Registry
-       statsGauges map[string]setter
-       observers   map[string]observer
-       setupOnce   sync.Once
-       mtx         sync.Mutex
+       reg            *prometheus.Registry
+       statsGauges    map[string]setter
+       statsGaugeVecs map[string]*prometheus.GaugeVec
+       observers      map[string]observer
+       setupOnce      sync.Once
+       mtx            sync.Mutex
 }
 
 func newMetrics(registry *prometheus.Registry) *metrics {
        return &metrics{
-               reg:         registry,
-               statsGauges: map[string]setter{},
-               observers:   map[string]observer{},
+               reg:            registry,
+               statsGauges:    map[string]setter{},
+               statsGaugeVecs: map[string]*prometheus.GaugeVec{},
+               observers:      map[string]observer{},
        }
 }
 
@@ -63,9 +66,24 @@ func (m *metrics) UpdateStats(s balancerStats) {
                "transient":         {s.unref, "transient (unreferenced, new)"},
                "overreplicated":    {s.overrep, "overreplicated"},
                "underreplicated":   {s.underrep, "underreplicated"},
+               "unachievable":      {s.unachievable, "unachievable"},
+               "balanced":          {s.justright, "optimally balanced"},
+               "desired":           {s.desired, "desired"},
                "lost":              {s.lost, "lost"},
                "dedup_byte_ratio":  {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
                "dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
+               "collection_bytes":  {s.collectionBytes, "total apparent size of all collections"},
+               "referenced_bytes":  {s.collectionBlockBytes, "total size of unique referenced blocks"},
+               "reference_count":   {s.collectionBlockRefs, "block references in all collections"},
+               "referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
+
+               "pull_entries_sent_count":      {s.pulls, "total entries sent in pull lists"},
+               "pull_entries_deferred_count":  {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
+               "trash_entries_sent_count":     {s.trashes, "total entries sent in trash lists"},
+               "trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
+
+               "replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
+               "usage":                  {s.classStats, "stored in indicated storage class"},
        }
        m.setupOnce.Do(func() {
                // Register gauge(s) for each balancerStats field.
@@ -87,6 +105,29 @@ func (m *metrics) UpdateStats(s balancerStats) {
                                }
                        case int, int64, float64:
                                addGauge(name, gauge.Help)
+                       case []int:
+                               // replHistogram
+                               gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+                                       Namespace: "arvados",
+                                       Name:      name,
+                                       Subsystem: "keep",
+                                       Help:      gauge.Help,
+                               }, []string{"replicas"})
+                               m.reg.MustRegister(gv)
+                               m.statsGaugeVecs[name] = gv
+                       case map[string]replicationStats:
+                               // classStats
+                               for _, sub := range []string{"blocks", "bytes", "replicas"} {
+                                       name := name + "_" + sub
+                                       gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+                                               Namespace: "arvados",
+                                               Name:      name,
+                                               Subsystem: "keep",
+                                               Help:      gauge.Help,
+                                       }, []string{"storage_class", "status"})
+                                       m.reg.MustRegister(gv)
+                                       m.statsGaugeVecs[name] = gv
+                               }
                        default:
                                panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
                        }
@@ -105,6 +146,38 @@ func (m *metrics) UpdateStats(s balancerStats) {
                        m.statsGauges[name].Set(float64(val))
                case float64:
                        m.statsGauges[name].Set(float64(val))
+               case []int:
+                       // replHistogram
+                       for r, n := range val {
+                               m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
+                       }
+                       // Record zero for higher-than-max-replication
+                       // metrics, so we don't incorrectly continue
+                       // to report stale metrics.
+                       //
+                       // For example, if we previously reported n=1
+                       // for repl=6, but have since restarted
+                       // keep-balance and the most replicated block
+                       // now has repl=5, then the repl=6 gauge will
+                       // still say n=1 until we clear it explicitly
+                       // here.
+                       for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
+                               m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
+                       }
+               case map[string]replicationStats:
+                       // classStats
+                       for class, cs := range val {
+                               for label, val := range map[string]blocksNBytes{
+                                       "needed":       cs.needed,
+                                       "unneeded":     cs.unneeded,
+                                       "pulling":      cs.pulling,
+                                       "unachievable": cs.unachievable,
+                               } {
+                                       m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
+                                       m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
+                                       m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
+                               }
+                       }
                default:
                        panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
                }