Merge branch '20688-wb1-to-wb2-redirects' refs #20688
[arvados.git] / lib / crunchstat / crunchstat_test.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package crunchstat
6
7 import (
8         "bytes"
9         "fmt"
10         "io/fs"
11         "os"
12         "regexp"
13         "runtime"
14         "strconv"
15         "testing"
16         "time"
17
18         "github.com/sirupsen/logrus"
19         . "gopkg.in/check.v1"
20 )
21
22 const logMsgPrefix = `(?m)(.*\n)*.* msg="`
23
24 func Test(t *testing.T) {
25         TestingT(t)
26 }
27
28 var _ = Suite(&suite{})
29
30 type testdatasource struct {
31         fspath string
32         pid    int
33 }
34
35 func (s testdatasource) Pid() int {
36         return s.pid
37 }
38 func (s testdatasource) FS() fs.FS {
39         return os.DirFS(s.fspath)
40 }
41
42 // To generate a test case for a new OS target, build
43 // cmd/arvados-server and run
44 //
45 //      arvados-server crunchstat -dump ./testdata/example1234 sleep 2
46 var testdata = map[string]testdatasource{
47         "debian11":   {fspath: "testdata/debian11", pid: 4153022},
48         "debian12":   {fspath: "testdata/debian12", pid: 1115883},
49         "ubuntu1804": {fspath: "testdata/ubuntu1804", pid: 2523},
50         "ubuntu2004": {fspath: "testdata/ubuntu2004", pid: 1360},
51         "ubuntu2204": {fspath: "testdata/ubuntu2204", pid: 1967},
52 }
53
54 type suite struct {
55         logbuf                bytes.Buffer
56         logger                *logrus.Logger
57         debian12MemoryCurrent int64
58 }
59
60 func (s *suite) SetUpSuite(c *C) {
61         s.logger = logrus.New()
62         s.logger.Out = &s.logbuf
63
64         buf, err := os.ReadFile("testdata/debian12/sys/fs/cgroup/user.slice/user-1000.slice/session-4.scope/memory.current")
65         c.Assert(err, IsNil)
66         _, err = fmt.Sscanf(string(buf), "%d", &s.debian12MemoryCurrent)
67         c.Assert(err, IsNil)
68 }
69
70 func (s *suite) SetUpTest(c *C) {
71         s.logbuf.Reset()
72 }
73
74 // Report stats for the current (go test) process's cgroup, using the
75 // test host's real procfs/sysfs.
76 func (s *suite) TestReportCurrent(c *C) {
77         r := Reporter{
78                 Pid:        os.Getpid,
79                 Logger:     s.logger,
80                 PollPeriod: time.Second,
81         }
82         r.Start()
83         defer r.Stop()
84         checkPatterns := []string{
85                 `(?ms).*rss.*`,
86                 `(?ms).*net:.*`,
87                 `(?ms).*blkio:.*`,
88                 `(?ms).* [\d.]+ user [\d.]+ sys ` + fmt.Sprintf("%d", runtime.NumCPU()) + ` cpus -- .*`,
89         }
90         for deadline := time.Now().Add(4 * time.Second); !c.Failed(); time.Sleep(time.Millisecond) {
91                 done := true
92                 for _, pattern := range checkPatterns {
93                         if m := regexp.MustCompile(pattern).FindSubmatch(s.logbuf.Bytes()); len(m) == 0 {
94                                 done = false
95                                 if time.Now().After(deadline) {
96                                         c.Errorf("timed out waiting for %s", pattern)
97                                 }
98                         }
99                 }
100                 if done {
101                         break
102                 }
103         }
104         c.Logf("%s", s.logbuf.String())
105 }
106
107 // Report stats for a the current (go test) process.
108 func (s *suite) TestReportPIDs(c *C) {
109         r := Reporter{
110                 Pid:        func() int { return 1 },
111                 Logger:     s.logger,
112                 PollPeriod: time.Second,
113         }
114         r.Start()
115         defer r.Stop()
116         r.ReportPID("init", 1)
117         r.ReportPID("test_process", os.Getpid())
118         r.ReportPID("nonexistent", 12345) // should be silently ignored/omitted
119         for deadline := time.Now().Add(10 * time.Second); ; time.Sleep(time.Millisecond) {
120                 if time.Now().After(deadline) {
121                         c.Error("timed out")
122                         break
123                 }
124                 if m := regexp.MustCompile(`(?ms).*procmem \d+ init (\d+) test_process.*`).FindSubmatch(s.logbuf.Bytes()); len(m) > 0 {
125                         size, err := strconv.ParseInt(string(m[1]), 10, 64)
126                         c.Check(err, IsNil)
127                         // Expect >1 MiB and <100 MiB -- otherwise we
128                         // are probably misinterpreting /proc/N/stat
129                         // or multiplying by the wrong page size.
130                         c.Check(size > 1000000, Equals, true)
131                         c.Check(size < 100000000, Equals, true)
132                         break
133                 }
134         }
135         c.Logf("%s", s.logbuf.String())
136 }
137
138 func (s *suite) TestAllTestdata(c *C) {
139         for platform, datasource := range testdata {
140                 s.logbuf.Reset()
141                 c.Logf("=== %s", platform)
142                 rep := Reporter{
143                         Pid:             datasource.Pid,
144                         FS:              datasource.FS(),
145                         Logger:          s.logger,
146                         PollPeriod:      time.Second,
147                         ThresholdLogger: s.logger,
148                         Debug:           true,
149                 }
150                 rep.Start()
151                 rep.Stop()
152                 logs := s.logbuf.String()
153                 c.Logf("%s", logs)
154                 c.Check(logs, Matches, `(?ms).* \d\d+ rss\\n.*`)
155                 c.Check(logs, Matches, `(?ms).*blkio:\d+:\d+ \d+ write \d+ read\\n.*`)
156                 c.Check(logs, Matches, `(?ms).*net:\S+ \d+ tx \d+ rx\\n.*`)
157                 c.Check(logs, Matches, `(?ms).* [\d.]+ user [\d.]+ sys [2-9]\d* cpus.*`)
158         }
159 }
160
161 func (s *suite) testRSSThresholds(c *C, rssPercentages []int64, alertCount int) {
162         c.Assert(alertCount <= len(rssPercentages), Equals, true)
163         rep := Reporter{
164                 Pid:    testdata["debian12"].Pid,
165                 FS:     testdata["debian12"].FS(),
166                 Logger: s.logger,
167                 MemThresholds: map[string][]Threshold{
168                         "rss": NewThresholdsFromPercentages(s.debian12MemoryCurrent*3/2, rssPercentages),
169                 },
170                 PollPeriod:      time.Second * 10,
171                 ThresholdLogger: s.logger,
172         }
173         rep.Start()
174         rep.Stop()
175         logs := s.logbuf.String()
176         c.Logf("%s", logs)
177
178         for index, expectPercentage := range rssPercentages[:alertCount] {
179                 var logCheck Checker
180                 if index < alertCount {
181                         logCheck = Matches
182                 } else {
183                         logCheck = Not(Matches)
184                 }
185                 pattern := fmt.Sprintf(`%sContainer using over %d%% of memory \(rss %d/%d bytes\)"`,
186                         logMsgPrefix, expectPercentage, s.debian12MemoryCurrent, s.debian12MemoryCurrent*3/2)
187                 c.Check(logs, logCheck, pattern)
188         }
189 }
190
191 func (s *suite) TestZeroRSSThresholds(c *C) {
192         s.testRSSThresholds(c, []int64{}, 0)
193 }
194
195 func (s *suite) TestOneRSSThresholdPassed(c *C) {
196         s.testRSSThresholds(c, []int64{55}, 1)
197 }
198
199 func (s *suite) TestOneRSSThresholdNotPassed(c *C) {
200         s.testRSSThresholds(c, []int64{85}, 0)
201 }
202
203 func (s *suite) TestMultipleRSSThresholdsNonePassed(c *C) {
204         s.testRSSThresholds(c, []int64{95, 97, 99}, 0)
205 }
206
207 func (s *suite) TestMultipleRSSThresholdsSomePassed(c *C) {
208         s.testRSSThresholds(c, []int64{45, 60, 75, 90}, 2)
209 }
210
211 func (s *suite) TestMultipleRSSThresholdsAllPassed(c *C) {
212         s.testRSSThresholds(c, []int64{1, 2, 3}, 3)
213 }
214
215 func (s *suite) TestLogMaxima(c *C) {
216         rep := Reporter{
217                 Pid:        testdata["debian12"].Pid,
218                 FS:         testdata["debian12"].FS(),
219                 Logger:     s.logger,
220                 PollPeriod: time.Second * 10,
221                 TempDir:    "/",
222         }
223         rep.Start()
224         rep.Stop()
225         rep.LogMaxima(s.logger, map[string]int64{"rss": s.debian12MemoryCurrent * 3 / 2})
226         logs := s.logbuf.String()
227         c.Logf("%s", logs)
228
229         expectRSS := fmt.Sprintf(`Maximum container memory rss usage was %d%%, %d/%d bytes`,
230                 66, s.debian12MemoryCurrent, s.debian12MemoryCurrent*3/2)
231         for _, expected := range []string{
232                 `Maximum disk usage was \d+%, \d+/\d+ bytes`,
233                 `Maximum container memory swap usage was \d\d+ bytes`,
234                 `Maximum container memory pgmajfault usage was \d\d+ faults`,
235                 expectRSS,
236         } {
237                 pattern := logMsgPrefix + expected + `"`
238                 c.Check(logs, Matches, pattern)
239         }
240 }
241
242 func (s *suite) TestLogProcessMemMax(c *C) {
243         rep := Reporter{
244                 Pid:        os.Getpid,
245                 Logger:     s.logger,
246                 PollPeriod: time.Second * 10,
247         }
248         rep.ReportPID("test-run", os.Getpid())
249         rep.Start()
250         rep.Stop()
251         rep.LogProcessMemMax(s.logger)
252         logs := s.logbuf.String()
253         c.Logf("%s", logs)
254
255         pattern := logMsgPrefix + `Maximum test-run memory rss usage was \d+ bytes"`
256         c.Check(logs, Matches, pattern)
257 }