21762: added window timeout to test spec
[arvados.git] / services / keep-balance / metrics.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package keepbalance
6
7 import (
8         "fmt"
9         "net/http"
10         "strconv"
11         "sync"
12
13         "github.com/prometheus/client_golang/prometheus"
14         "github.com/prometheus/client_golang/prometheus/promhttp"
15 )
16
// observer is the subset of a prometheus metric's interface needed to
// record a single sample (e.g., a duration).
type observer interface {
	Observe(float64)
}

// setter is the subset of a prometheus metric's interface needed to
// overwrite the metric's current value.
type setter interface {
	Set(float64)
}
19
20 type metrics struct {
21         reg            *prometheus.Registry
22         statsGauges    map[string]setter
23         statsGaugeVecs map[string]*prometheus.GaugeVec
24         observers      map[string]observer
25         setupOnce      sync.Once
26         mtx            sync.Mutex
27 }
28
29 func newMetrics(registry *prometheus.Registry) *metrics {
30         return &metrics{
31                 reg:            registry,
32                 statsGauges:    map[string]setter{},
33                 statsGaugeVecs: map[string]*prometheus.GaugeVec{},
34                 observers:      map[string]observer{},
35         }
36 }
37
38 func (m *metrics) DurationObserver(name, help string) observer {
39         m.mtx.Lock()
40         defer m.mtx.Unlock()
41         if obs, ok := m.observers[name]; ok {
42                 return obs
43         }
44         summary := prometheus.NewSummary(prometheus.SummaryOpts{
45                 Namespace: "arvados",
46                 Name:      name,
47                 Subsystem: "keepbalance",
48                 Help:      help,
49         })
50         m.reg.MustRegister(summary)
51         m.observers[name] = summary
52         return summary
53 }
54
55 // UpdateStats updates prometheus metrics using the given
56 // balancerStats. It creates and registers the needed gauges on its
57 // first invocation.
58 func (m *metrics) UpdateStats(s balancerStats) {
59         type gauge struct {
60                 Value interface{}
61                 Help  string
62         }
63         s2g := map[string]gauge{
64                 "total":             {s.current, "current backend storage usage"},
65                 "garbage":           {s.garbage, "garbage (unreferenced, old)"},
66                 "transient":         {s.unref, "transient (unreferenced, new)"},
67                 "overreplicated":    {s.overrep, "overreplicated"},
68                 "underreplicated":   {s.underrep, "underreplicated"},
69                 "unachievable":      {s.unachievable, "unachievable"},
70                 "balanced":          {s.justright, "optimally balanced"},
71                 "desired":           {s.desired, "desired"},
72                 "lost":              {s.lost, "lost"},
73                 "dedup_byte_ratio":  {s.dedupByteRatio(), "deduplication ratio, bytes referenced / bytes stored"},
74                 "dedup_block_ratio": {s.dedupBlockRatio(), "deduplication ratio, blocks referenced / blocks stored"},
75                 "collection_bytes":  {s.collectionBytes, "total apparent size of all collections"},
76                 "referenced_bytes":  {s.collectionBlockBytes, "total size of unique referenced blocks"},
77                 "reference_count":   {s.collectionBlockRefs, "block references in all collections"},
78                 "referenced_blocks": {s.collectionBlocks, "blocks referenced by any collection"},
79
80                 "pull_entries_sent_count":      {s.pulls, "total entries sent in pull lists"},
81                 "pull_entries_deferred_count":  {s.pullsDeferred, "total entries deferred (not sent) in pull lists"},
82                 "trash_entries_sent_count":     {s.trashes, "total entries sent in trash lists"},
83                 "trash_entries_deferred_count": {s.trashesDeferred, "total entries deferred (not sent) in trash lists"},
84
85                 "replicated_block_count": {s.replHistogram, "blocks with indicated number of replicas at last count"},
86                 "usage":                  {s.classStats, "stored in indicated storage class"},
87         }
88         m.setupOnce.Do(func() {
89                 // Register gauge(s) for each balancerStats field.
90                 addGauge := func(name, help string) {
91                         g := prometheus.NewGauge(prometheus.GaugeOpts{
92                                 Namespace: "arvados",
93                                 Name:      name,
94                                 Subsystem: "keep",
95                                 Help:      help,
96                         })
97                         m.reg.MustRegister(g)
98                         m.statsGauges[name] = g
99                 }
100                 for name, gauge := range s2g {
101                         switch gauge.Value.(type) {
102                         case blocksNBytes:
103                                 for _, sub := range []string{"blocks", "bytes", "replicas"} {
104                                         addGauge(name+"_"+sub, sub+" of "+gauge.Help)
105                                 }
106                         case int, int64, float64:
107                                 addGauge(name, gauge.Help)
108                         case []int:
109                                 // replHistogram
110                                 gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
111                                         Namespace: "arvados",
112                                         Name:      name,
113                                         Subsystem: "keep",
114                                         Help:      gauge.Help,
115                                 }, []string{"replicas"})
116                                 m.reg.MustRegister(gv)
117                                 m.statsGaugeVecs[name] = gv
118                         case map[string]replicationStats:
119                                 // classStats
120                                 for _, sub := range []string{"blocks", "bytes", "replicas"} {
121                                         name := name + "_" + sub
122                                         gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
123                                                 Namespace: "arvados",
124                                                 Name:      name,
125                                                 Subsystem: "keep",
126                                                 Help:      gauge.Help,
127                                         }, []string{"storage_class", "status"})
128                                         m.reg.MustRegister(gv)
129                                         m.statsGaugeVecs[name] = gv
130                                 }
131                         default:
132                                 panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
133                         }
134                 }
135         })
136         // Set gauges to values from s.
137         for name, gauge := range s2g {
138                 switch val := gauge.Value.(type) {
139                 case blocksNBytes:
140                         m.statsGauges[name+"_blocks"].Set(float64(val.blocks))
141                         m.statsGauges[name+"_bytes"].Set(float64(val.bytes))
142                         m.statsGauges[name+"_replicas"].Set(float64(val.replicas))
143                 case int:
144                         m.statsGauges[name].Set(float64(val))
145                 case int64:
146                         m.statsGauges[name].Set(float64(val))
147                 case float64:
148                         m.statsGauges[name].Set(float64(val))
149                 case []int:
150                         // replHistogram
151                         for r, n := range val {
152                                 m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(float64(n))
153                         }
154                         // Record zero for higher-than-max-replication
155                         // metrics, so we don't incorrectly continue
156                         // to report stale metrics.
157                         //
158                         // For example, if we previously reported n=1
159                         // for repl=6, but have since restarted
160                         // keep-balance and the most replicated block
161                         // now has repl=5, then the repl=6 gauge will
162                         // still say n=1 until we clear it explicitly
163                         // here.
164                         for r := len(val); r < len(val)+4 || r < len(val)*2; r++ {
165                                 m.statsGaugeVecs[name].WithLabelValues(strconv.Itoa(r)).Set(0)
166                         }
167                 case map[string]replicationStats:
168                         // classStats
169                         for class, cs := range val {
170                                 for label, val := range map[string]blocksNBytes{
171                                         "needed":       cs.needed,
172                                         "unneeded":     cs.unneeded,
173                                         "pulling":      cs.pulling,
174                                         "unachievable": cs.unachievable,
175                                 } {
176                                         m.statsGaugeVecs[name+"_blocks"].WithLabelValues(class, label).Set(float64(val.blocks))
177                                         m.statsGaugeVecs[name+"_bytes"].WithLabelValues(class, label).Set(float64(val.bytes))
178                                         m.statsGaugeVecs[name+"_replicas"].WithLabelValues(class, label).Set(float64(val.replicas))
179                                 }
180                         }
181                 default:
182                         panic(fmt.Sprintf("bad gauge type %T", gauge.Value))
183                 }
184         }
185 }
186
187 func (m *metrics) Handler(log promhttp.Logger) http.Handler {
188         return promhttp.HandlerFor(m.reg, promhttp.HandlerOpts{
189                 ErrorLog: log,
190         })
191 }