1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
5 // Package crunchstat reports resource usage (CPU, memory, disk,
6 // network) for a cgroup.
// memoryStats lists the names of the memory statistics that are
// included in reports. crunchstat collects all memory statistics, but
// only reports these.
var memoryStats = [...]string{"cache", "swap", "pgmajfault", "rss"}
// logPrinter is the minimal logging interface used for statistics and
// threshold messages: anything that has a Printf method accepting a
// format string and arguments.
type logPrinter interface {
	Printf(fmt string, args ...interface{})
}
35 // A Reporter gathers statistics for a cgroup and writes them to a
37 type Reporter struct {
38 // Func that returns the pid of a process inside the desired
39 // cgroup. Reporter will call Pid periodically until it
40 // returns a positive number, then start reporting stats for
41 // the cgroup that process belongs to.
43 // Pid is used when cgroups v2 is available. For cgroups v1,
47 // Interval between samples. Must be positive.
48 PollPeriod time.Duration
50 // Temporary directory, will be monitored for available, used
54 // Where to write statistics. Must not be nil.
57 // When stats cross thresholds configured in the fields below,
58 // they are reported to this logger.
59 ThresholdLogger logPrinter
61 // MemThresholds maps memory stat names to slices of thresholds.
62 // When the corresponding stat exceeds a threshold, that will be logged.
63 MemThresholds map[string][]Threshold
65 // Filesystem to read /proc entries and cgroup stats from.
66 // Non-nil for testing, nil for real root filesystem.
69 // Enable debug messages.
72 // available cgroup hierarchies
75 cpusetCpus string // v1,v2 (via /proc/$PID/cpuset)
76 cpuacctStat string // v1 (via /proc/$PID/cgroup => cpuacct)
78 ioServiceBytes string // v1 (via /proc/$PID/cgroup => blkio)
80 memoryStat string // v1 and v2 (but v2 is missing some entries)
81 memoryCurrent string // v2
82 memorySwapCurrent string // v2
83 netDev string // /proc/$PID/net/dev
87 lastNetSample map[string]ioSample
88 lastDiskIOSample map[string]ioSample
89 lastCPUSample cpuSample
90 lastDiskSpaceSample diskSpaceSample
91 lastMemSample memSample
92 maxDiskSpaceSample diskSpaceSample
93 maxMemSample map[memoryKey]int64
95 // process returned by Pid(), whose cgroup stats we are
99 // individual processes whose memory size we are reporting
100 reportPIDs map[string]int
101 reportPIDsMu sync.Mutex
103 done chan struct{} // closed when we should stop reporting
104 ready chan struct{} // have pid and stat files
105 flushed chan struct{} // closed when we have made our last report
// Threshold is a level of a memory statistic that is reported once
// the statistic reaches or exceeds it.
type Threshold struct {
	// percentage is the share of total that this threshold
	// represents.
	percentage int64
	// threshold is the absolute value at which the threshold is
	// considered crossed.
	threshold int64
	// total is the value corresponding to 100%.
	total int64
}

// NewThresholdFromPercentage returns a Threshold that is crossed at
// the given percentage of total.
//
// The absolute threshold is total*percentage/100 using truncating
// integer division. It is computed from the quotient and remainder of
// total/100 rather than from the product total*percentage, so the
// result is the same as the naive formula whenever that formula does
// not overflow, and stays correct for very large totals (such as an
// "unlimited" cgroup memory limit close to the int64 maximum) where
// the product would wrap around.
func NewThresholdFromPercentage(total int64, percentage int64) Threshold {
	return Threshold{
		percentage: percentage,
		threshold:  total/100*percentage + total%100*percentage/100,
		total:      total,
	}
}

// NewThresholdsFromPercentages returns one Threshold per entry of
// percentages, in the same order, each computed against total as in
// NewThresholdFromPercentage. It returns nil if percentages is empty.
func NewThresholdsFromPercentages(total int64, percentages []int64) (thresholds []Threshold) {
	if len(percentages) == 0 {
		return nil
	}
	thresholds = make([]Threshold, 0, len(percentages))
	for _, percentage := range percentages {
		thresholds = append(thresholds, NewThresholdFromPercentage(total, percentage))
	}
	return thresholds
}
129 // memoryKey is a key into Reporter.maxMemSample.
130 // Initialize it with just statName to get the host/cgroup maximum.
131 // Initialize it with all fields to get that process' maximum.
132 type memoryKey struct {
138 // Start starts monitoring in a new goroutine, and returns
141 // The monitoring goroutine waits for a non-empty CIDFile to appear
142 // (unless CID is non-empty). Then it waits for the accounting files
143 // to appear for the monitored container. Then it collects and reports
144 // statistics until Stop is called.
146 // Callers should not call Start more than once.
148 // Callers should not modify public data fields after calling Start.
149 func (r *Reporter) Start() {
150 r.done = make(chan struct{})
151 r.ready = make(chan struct{})
152 r.flushed = make(chan struct{})
159 // ReportPID starts reporting stats for a specified process.
160 func (r *Reporter) ReportPID(name string, pid int) {
161 r.reportPIDsMu.Lock()
162 defer r.reportPIDsMu.Unlock()
163 if r.reportPIDs == nil {
164 r.reportPIDs = map[string]int{name: pid}
166 r.reportPIDs[name] = pid
170 // Stop reporting. Do not call more than once, or before calling
173 // Nothing will be logged after Stop returns unless you call a Log* method.
174 func (r *Reporter) Stop() {
179 var v1keys = map[string]bool{
186 // Find cgroup hierarchies in /proc/mounts, e.g.,
189 // "blkio": "/sys/fs/cgroup/blkio",
190 // "unified": "/sys/fs/cgroup/unified",
192 func (r *Reporter) cgroupMounts() map[string]string {
193 procmounts, err := fs.ReadFile(r.FS, "proc/mounts")
195 r.Logger.Printf("error reading /proc/mounts: %s", err)
198 mounts := map[string]string{}
199 for _, line := range bytes.Split(procmounts, []byte{'\n'}) {
200 fields := bytes.SplitN(line, []byte{' '}, 6)
201 if len(fields) != 6 {
204 switch string(fields[2]) {
206 // cgroup /sys/fs/cgroup/unified cgroup2 rw,nosuid,nodev,noexec,relatime 0 0
207 mounts["unified"] = string(fields[1])
209 // cgroup /sys/fs/cgroup/blkio cgroup rw,nosuid,nodev,noexec,relatime,blkio 0 0
210 options := bytes.Split(fields[3], []byte{','})
211 for _, option := range options {
212 option := string(option)
214 mounts[option] = string(fields[1])
223 // generate map of cgroup controller => path for r.pid.
225 // the "unified" controller represents cgroups v2.
226 func (r *Reporter) cgroupPaths(mounts map[string]string) map[string]string {
227 if len(mounts) == 0 {
230 procdir := fmt.Sprintf("proc/%d", r.pid)
231 buf, err := fs.ReadFile(r.FS, procdir+"/cgroup")
233 r.Logger.Printf("error reading cgroup file: %s", err)
236 paths := map[string]string{}
237 for _, line := range bytes.Split(buf, []byte{'\n'}) {
238 // The entry for cgroup v2 is always in the format
240 // https://docs.kernel.org/admin-guide/cgroup-v2.html
241 if bytes.HasPrefix(line, []byte("0::/")) && mounts["unified"] != "" {
242 paths["unified"] = mounts["unified"] + string(line[3:])
245 // cgroups v1 entries look like
246 // "6:cpu,cpuacct:/user.slice"
247 fields := bytes.SplitN(line, []byte{':'}, 3)
248 if len(fields) != 3 {
251 for _, key := range bytes.Split(fields[1], []byte{','}) {
253 if mounts[key] != "" {
254 paths[key] = mounts[key] + string(fields[2])
258 // In unified mode, /proc/$PID/cgroup doesn't have a cpuset
259 // entry, but we still need it -- there's no cpuset.cpus file
260 // in the cgroup2 subtree indicated by the 0::$PATH entry. We
261 // have to get the right path from /proc/$PID/cpuset.
262 if _, found := paths["cpuset"]; !found && mounts["unified"] != "" {
263 buf, _ := fs.ReadFile(r.FS, procdir+"/cpuset")
264 cpusetPath := string(bytes.TrimRight(buf, "\n"))
265 paths["cpuset"] = mounts["unified"] + cpusetPath
270 func (r *Reporter) findStatFiles() {
271 mounts := r.cgroupMounts()
272 paths := r.cgroupPaths(mounts)
273 done := map[*string]bool{}
274 for _, try := range []struct {
279 {&r.statFiles.cpuMax, "unified", "cpu.max"},
280 {&r.statFiles.cpusetCpus, "cpuset", "cpuset.cpus.effective"},
281 {&r.statFiles.cpusetCpus, "cpuset", "cpuset.cpus"},
282 {&r.statFiles.cpuacctStat, "cpuacct", "cpuacct.stat"},
283 {&r.statFiles.cpuStat, "unified", "cpu.stat"},
284 // blkio.throttle.io_service_bytes must precede
285 // blkio.io_service_bytes -- on ubuntu1804, the latter
286 // is present but reports 0
287 {&r.statFiles.ioServiceBytes, "blkio", "blkio.throttle.io_service_bytes"},
288 {&r.statFiles.ioServiceBytes, "blkio", "blkio.io_service_bytes"},
289 {&r.statFiles.ioStat, "unified", "io.stat"},
290 {&r.statFiles.memoryStat, "unified", "memory.stat"},
291 {&r.statFiles.memoryStat, "memory", "memory.stat"},
292 {&r.statFiles.memoryCurrent, "unified", "memory.current"},
293 {&r.statFiles.memorySwapCurrent, "unified", "memory.swap.current"},
295 startpath, ok := paths[try.pathkey]
296 if !ok || done[try.statFile] {
299 // /proc/$PID/cgroup says cgroup path is
300 // /exa/mple/exa/mple, however, sometimes the file we
301 // need is not under that path, it's only available in
302 // a parent cgroup's dir. So we start at
303 // /sys/fs/cgroup/unified/exa/mple/exa/mple/ and walk
304 // up to /sys/fs/cgroup/unified/ until we find the
307 // This might mean our reported stats include more
308 // cgroups in the cgroup tree, but it's the best we
310 for path := startpath; path != "" && path != "/" && (path == startpath || strings.HasPrefix(path, mounts[try.pathkey])); path, _ = filepath.Split(strings.TrimRight(path, "/")) {
311 target := strings.TrimLeft(filepath.Join(path, try.file), "/")
312 buf, err := fs.ReadFile(r.FS, target)
313 if err != nil || len(buf) == 0 || bytes.Equal(buf, []byte{'\n'}) {
315 if os.IsNotExist(err) {
319 r.Logger.Printf("skip /%s: %s", target, err)
323 *try.statFile = target
324 done[try.statFile] = true
325 r.Logger.Printf("notice: reading stats from /%s", target)
330 netdev := fmt.Sprintf("proc/%d/net/dev", r.pid)
331 if buf, err := fs.ReadFile(r.FS, netdev); err == nil && len(buf) > 0 {
332 r.statFiles.netDev = netdev
333 r.Logger.Printf("using /%s", netdev)
337 func (r *Reporter) reportMemoryMax(logger logPrinter, source, statName string, value, limit int64) {
346 percentage := 100 * value / limit
347 logger.Printf("Maximum %s memory %s usage was %d%%, %d/%d %s",
348 source, statName, percentage, value, limit, units)
350 logger.Printf("Maximum %s memory %s usage was %d %s",
351 source, statName, value, units)
355 func (r *Reporter) LogMaxima(logger logPrinter, memLimits map[string]int64) {
356 if r.lastCPUSample.hasData {
357 logger.Printf("Total CPU usage was %f user and %f sys on %.2f CPUs",
358 r.lastCPUSample.user, r.lastCPUSample.sys, r.lastCPUSample.cpus)
360 for disk, sample := range r.lastDiskIOSample {
361 logger.Printf("Total disk I/O on %s was %d bytes written and %d bytes read",
362 disk, sample.txBytes, sample.rxBytes)
364 if r.maxDiskSpaceSample.total > 0 {
365 percentage := 100 * r.maxDiskSpaceSample.used / r.maxDiskSpaceSample.total
366 logger.Printf("Maximum disk usage was %d%%, %d/%d bytes",
367 percentage, r.maxDiskSpaceSample.used, r.maxDiskSpaceSample.total)
369 for _, statName := range memoryStats {
370 value, ok := r.maxMemSample[memoryKey{statName: "total_" + statName}]
372 value, ok = r.maxMemSample[memoryKey{statName: statName}]
375 r.reportMemoryMax(logger, "container", statName, value, memLimits[statName])
378 for ifname, sample := range r.lastNetSample {
379 logger.Printf("Total network I/O on %s was %d bytes written and %d bytes read",
380 ifname, sample.txBytes, sample.rxBytes)
384 func (r *Reporter) LogProcessMemMax(logger logPrinter) {
385 for memKey, value := range r.maxMemSample {
386 if memKey.processName == "" {
389 r.reportMemoryMax(logger, memKey.processName, memKey.statName, value, 0)
393 func (r *Reporter) readAllOrWarn(in io.Reader) ([]byte, error) {
394 content, err := ioutil.ReadAll(in)
396 r.Logger.Printf("warning: %v", err)
401 type ioSample struct {
407 func (r *Reporter) doBlkIOStats() {
408 var sampleTime = time.Now()
409 newSamples := make(map[string]ioSample)
411 if r.statFiles.ioStat != "" {
412 statfile, err := fs.ReadFile(r.FS, r.statFiles.ioStat)
416 for _, line := range bytes.Split(statfile, []byte{'\n'}) {
417 // 254:16 rbytes=72163328 wbytes=117370880 rios=3811 wios=3906 dbytes=0 dios=0
418 words := bytes.Split(line, []byte{' '})
422 thisSample := ioSample{sampleTime, -1, -1}
423 for _, kv := range words[1:] {
424 if bytes.HasPrefix(kv, []byte("rbytes=")) {
425 fmt.Sscanf(string(kv[7:]), "%d", &thisSample.rxBytes)
426 } else if bytes.HasPrefix(kv, []byte("wbytes=")) {
427 fmt.Sscanf(string(kv[7:]), "%d", &thisSample.txBytes)
430 if thisSample.rxBytes >= 0 && thisSample.txBytes >= 0 {
431 newSamples[string(words[0])] = thisSample
434 } else if r.statFiles.ioServiceBytes != "" {
435 statfile, err := fs.ReadFile(r.FS, r.statFiles.ioServiceBytes)
439 for _, line := range bytes.Split(statfile, []byte{'\n'}) {
440 var device, op string
442 if _, err := fmt.Sscanf(string(line), "%s %s %d", &device, &op, &val); err != nil {
445 var thisSample ioSample
447 if thisSample, ok = newSamples[device]; !ok {
448 thisSample = ioSample{sampleTime, -1, -1}
452 thisSample.rxBytes = val
454 thisSample.txBytes = val
456 newSamples[device] = thisSample
460 for dev, sample := range newSamples {
461 if sample.txBytes < 0 || sample.rxBytes < 0 {
465 if prev, ok := r.lastDiskIOSample[dev]; ok {
466 delta = fmt.Sprintf(" -- interval %.4f seconds %d write %d read",
467 sample.sampleTime.Sub(prev.sampleTime).Seconds(),
468 sample.txBytes-prev.txBytes,
469 sample.rxBytes-prev.rxBytes)
471 r.Logger.Printf("blkio:%s %d write %d read%s\n", dev, sample.txBytes, sample.rxBytes, delta)
472 r.lastDiskIOSample[dev] = sample
476 type memSample struct {
478 memStat map[string]int64
481 func (r *Reporter) getMemSample() {
482 thisSample := memSample{time.Now(), make(map[string]int64)}
484 // memory.stat contains "pgmajfault" in cgroups v1 and v2. It
485 // also contains "rss", "swap", and "cache" in cgroups v1.
486 c, err := r.FS.Open(r.statFiles.memoryStat)
491 b := bufio.NewScanner(c)
495 if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err != nil {
498 thisSample.memStat[stat] = val
501 // In cgroups v2, we need to read "memory.current" and
502 // "memory.swap.current" as well.
503 for stat, fnm := range map[string]string{
504 // memory.current includes cache. We don't get
505 // separate rss/cache values, so we call
506 // memory usage "rss" for compatibility, and
508 "rss": r.statFiles.memoryCurrent,
509 "swap": r.statFiles.memorySwapCurrent,
514 buf, err := fs.ReadFile(r.FS, fnm)
519 _, err = fmt.Sscanf(string(buf), "%d", &val)
523 thisSample.memStat[stat] = val
525 for stat, val := range thisSample.memStat {
526 maxKey := memoryKey{statName: stat}
527 if val > r.maxMemSample[maxKey] {
528 r.maxMemSample[maxKey] = val
531 r.lastMemSample = thisSample
533 if r.ThresholdLogger != nil {
534 for statName, thresholds := range r.MemThresholds {
535 statValue, ok := thisSample.memStat["total_"+statName]
537 statValue, ok = thisSample.memStat[statName]
543 var statThreshold Threshold
544 for index, statThreshold = range thresholds {
545 if statValue < statThreshold.threshold {
547 } else if statThreshold.percentage > 0 {
548 r.ThresholdLogger.Printf("Container using over %d%% of memory (%s %d/%d bytes)",
549 statThreshold.percentage, statName, statValue, statThreshold.total)
551 r.ThresholdLogger.Printf("Container using over %d of memory (%s %s bytes)",
552 statThreshold.threshold, statName, statValue)
555 r.MemThresholds[statName] = thresholds[index:]
560 func (r *Reporter) reportMemSample() {
561 var outstat bytes.Buffer
562 for _, key := range memoryStats {
563 // Use "total_X" stats (entire hierarchy) if enabled,
564 // otherwise just the single cgroup -- see
565 // https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
566 if val, ok := r.lastMemSample.memStat["total_"+key]; ok {
567 fmt.Fprintf(&outstat, " %d %s", val, key)
568 } else if val, ok := r.lastMemSample.memStat[key]; ok {
569 fmt.Fprintf(&outstat, " %d %s", val, key)
572 r.Logger.Printf("mem%s\n", outstat.String())
575 func (r *Reporter) doProcmemStats() {
576 if r.kernelPageSize == 0 {
577 // assign "don't try again" value in case we give up
578 // and return without assigning the real value
579 r.kernelPageSize = -1
580 buf, err := fs.ReadFile(r.FS, "proc/self/smaps")
582 r.Logger.Printf("error reading /proc/self/smaps: %s", err)
585 m := regexp.MustCompile(`\nKernelPageSize:\s*(\d+) kB\n`).FindSubmatch(buf)
587 r.Logger.Printf("error parsing /proc/self/smaps: KernelPageSize not found")
590 size, err := strconv.ParseInt(string(m[1]), 10, 64)
592 r.Logger.Printf("error parsing /proc/self/smaps: KernelPageSize %q: %s", m[1], err)
595 r.kernelPageSize = size * 1024
596 } else if r.kernelPageSize < 0 {
597 // already failed to determine page size, don't keep
602 r.reportPIDsMu.Lock()
603 defer r.reportPIDsMu.Unlock()
604 procnames := make([]string, 0, len(r.reportPIDs))
605 for name := range r.reportPIDs {
606 procnames = append(procnames, name)
608 sort.Strings(procnames)
610 for _, procname := range procnames {
611 pid := r.reportPIDs[procname]
612 buf, err := fs.ReadFile(r.FS, fmt.Sprintf("proc/%d/stat", pid))
616 // If the executable name contains a ')' char,
617 // /proc/$pid/stat will look like '1234 (exec name)) S
618 // 123 ...' -- the last ')' is the end of the 2nd
620 paren := bytes.LastIndexByte(buf, ')')
624 fields := bytes.SplitN(buf[paren:], []byte{' '}, 24)
625 if len(fields) < 24 {
628 // rss is the 24th field in .../stat, and fields[0]
629 // here is the last char ')' of the 2nd field, so
631 rss, err := strconv.ParseInt(string(fields[22]), 10, 64)
635 value := rss * r.kernelPageSize
636 procmem += fmt.Sprintf(" %d %s", value, procname)
637 maxKey := memoryKey{pid, procname, "rss"}
638 if value > r.maxMemSample[maxKey] {
639 r.maxMemSample[maxKey] = value
643 r.Logger.Printf("procmem%s\n", procmem)
647 func (r *Reporter) doNetworkStats() {
648 if r.statFiles.netDev == "" {
651 sampleTime := time.Now()
652 stats, err := r.FS.Open(r.statFiles.netDev)
657 scanner := bufio.NewScanner(stats)
661 words := strings.Fields(scanner.Text())
662 if len(words) != 17 {
663 // Skip lines with wrong format
666 ifName = strings.TrimRight(words[0], ":")
667 if ifName == "lo" || ifName == "" {
668 // Skip loopback interface and lines with wrong format
671 if tx, err = strconv.ParseInt(words[9], 10, 64); err != nil {
674 if rx, err = strconv.ParseInt(words[1], 10, 64); err != nil {
677 nextSample := ioSample{}
678 nextSample.sampleTime = sampleTime
679 nextSample.txBytes = tx
680 nextSample.rxBytes = rx
682 if prev, ok := r.lastNetSample[ifName]; ok {
683 interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
684 delta = fmt.Sprintf(" -- interval %.4f seconds %d tx %d rx",
689 r.Logger.Printf("net:%s %d tx %d rx%s\n", ifName, tx, rx, delta)
690 r.lastNetSample[ifName] = nextSample
694 type diskSpaceSample struct {
702 func (r *Reporter) doDiskSpaceStats() {
703 s := syscall.Statfs_t{}
704 err := syscall.Statfs(r.TempDir, &s)
708 bs := uint64(s.Bsize)
709 nextSample := diskSpaceSample{
711 sampleTime: time.Now(),
712 total: s.Blocks * bs,
713 used: (s.Blocks - s.Bfree) * bs,
714 available: s.Bavail * bs,
716 if nextSample.used > r.maxDiskSpaceSample.used {
717 r.maxDiskSpaceSample = nextSample
721 if r.lastDiskSpaceSample.hasData {
722 prev := r.lastDiskSpaceSample
723 interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
724 delta = fmt.Sprintf(" -- interval %.4f seconds %d used",
726 int64(nextSample.used-prev.used))
728 r.Logger.Printf("statfs %d available %d used %d total%s\n",
729 nextSample.available, nextSample.used, nextSample.total, delta)
730 r.lastDiskSpaceSample = nextSample
733 type cpuSample struct {
734 hasData bool // to distinguish the zero value from real data
741 // Return the number of virtual CPUs available in the container. This
742 // can be based on a scheduling ratio (which is not necessarily a
743 // whole number) or a restricted set of accessible CPUs.
745 // Return the number of host processors based on /proc/cpuinfo if
746 // cgroupfs doesn't reveal anything.
748 // Return 0 if even that doesn't work.
749 func (r *Reporter) getCPUCount() float64 {
750 if buf, err := fs.ReadFile(r.FS, r.statFiles.cpuMax); err == nil {
751 // cpu.max looks like "150000 100000" if CPU usage is
752 // restricted to 150% (docker run --cpus=1.5), or "max
754 var max, period int64
755 if _, err := fmt.Sscanf(string(buf), "%d %d", &max, &period); err == nil {
756 return float64(max) / float64(period)
759 if buf, err := fs.ReadFile(r.FS, r.statFiles.cpusetCpus); err == nil {
760 // cpuset.cpus looks like "0,4-7\n" if only CPUs
761 // 0,4,5,6,7 are available.
763 for _, v := range bytes.Split(buf, []byte{','}) {
765 n, _ := fmt.Sscanf(string(v), "%d-%d", &min, &max)
767 cpus += (max - min) + 1
774 if buf, err := fs.ReadFile(r.FS, "proc/cpuinfo"); err == nil {
775 // cpuinfo has a line like "processor\t: 0\n" for each
778 for _, line := range bytes.Split(buf, []byte{'\n'}) {
779 if bytes.HasPrefix(line, []byte("processor\t:")) {
788 func (r *Reporter) doCPUStats() {
789 var nextSample cpuSample
790 if r.statFiles.cpuStat != "" {
792 f, err := r.FS.Open(r.statFiles.cpuStat)
797 nextSample = cpuSample{
799 sampleTime: time.Now(),
800 cpus: r.getCPUCount(),
805 n, err := fmt.Fscanf(f, "%s %d\n", &stat, &val)
806 if err != nil || n != 2 {
809 if stat == "user_usec" {
810 nextSample.user = float64(val) / 1000000
811 } else if stat == "system_usec" {
812 nextSample.sys = float64(val) / 1000000
815 } else if r.statFiles.cpuacctStat != "" {
817 b, err := fs.ReadFile(r.FS, r.statFiles.cpuacctStat)
822 var userTicks, sysTicks int64
823 fmt.Sscanf(string(b), "user %d\nsystem %d", &userTicks, &sysTicks)
824 userHz := float64(100)
825 nextSample = cpuSample{
827 sampleTime: time.Now(),
828 user: float64(userTicks) / userHz,
829 sys: float64(sysTicks) / userHz,
830 cpus: r.getCPUCount(),
835 if r.lastCPUSample.hasData {
836 delta = fmt.Sprintf(" -- interval %.4f seconds %.4f user %.4f sys",
837 nextSample.sampleTime.Sub(r.lastCPUSample.sampleTime).Seconds(),
838 nextSample.user-r.lastCPUSample.user,
839 nextSample.sys-r.lastCPUSample.sys)
841 r.Logger.Printf("cpu %.4f user %.4f sys %.2f cpus%s\n",
842 nextSample.user, nextSample.sys, nextSample.cpus, delta)
843 r.lastCPUSample = nextSample
846 func (r *Reporter) doAllStats() {
855 // Report stats periodically until we learn (via r.done) that someone
857 func (r *Reporter) run() {
858 defer close(r.flushed)
860 r.maxMemSample = make(map[memoryKey]int64)
868 r.lastNetSample = make(map[string]ioSample)
869 r.lastDiskIOSample = make(map[string]ioSample)
871 if len(r.TempDir) == 0 {
872 // Temporary dir not provided, try to get it from the environment.
873 r.TempDir = os.Getenv("TMPDIR")
875 if len(r.TempDir) > 0 {
876 r.Logger.Printf("notice: monitoring temp dir %s\n", r.TempDir)
882 if r.PollPeriod < 1 {
883 r.PollPeriod = time.Second * 10
886 memTicker := time.NewTicker(time.Second)
887 mainTicker := time.NewTicker(r.PollPeriod)
900 // Wait for Pid() to return a real pid. Return true if this succeeds
901 // before Stop is called.
902 func (r *Reporter) waitForPid() bool {
903 ticker := time.NewTicker(100 * time.Millisecond)
905 warningTimer := time.After(r.PollPeriod)
914 r.Logger.Printf("warning: Pid() did not return a process ID after %v (config error?) -- still waiting...", r.PollPeriod)
916 r.Logger.Printf("warning: Pid() never returned a process ID")
923 func (r *Reporter) dumpSourceFiles(destdir string) error {
926 return errors.New("reporter was never ready")
930 fmt.Sprintf("proc/%d/cgroup", r.pid),
931 fmt.Sprintf("proc/%d/cpuset", r.pid),
936 r.statFiles.cpusetCpus,
937 r.statFiles.cpuacctStat,
939 r.statFiles.ioServiceBytes,
941 r.statFiles.memoryStat,
942 r.statFiles.memoryCurrent,
943 r.statFiles.memorySwapCurrent,
946 for _, path := range todo {
950 err := r.createParentsAndCopyFile(destdir, path)
955 r.reportPIDsMu.Lock()
956 r.reportPIDsMu.Unlock()
957 for _, pid := range r.reportPIDs {
958 path := fmt.Sprintf("proc/%d/stat", pid)
959 err := r.createParentsAndCopyFile(destdir, path)
964 if proc, err := os.FindProcess(r.pid); err != nil || proc.Signal(syscall.Signal(0)) != nil {
965 return fmt.Errorf("process %d no longer exists, snapshot is probably broken", r.pid)
970 func (r *Reporter) createParentsAndCopyFile(destdir, path string) error {
971 buf, err := fs.ReadFile(r.FS, path)
972 if os.IsNotExist(err) {
974 } else if err != nil {
977 if parent, _ := filepath.Split(path); parent != "" {
978 err = os.MkdirAll(destdir+"/"+parent, 0777)
980 return fmt.Errorf("mkdir %s: %s", destdir+"/"+parent, err)
983 destfile := destdir + "/" + path
984 r.Logger.Printf("copy %s to %s -- size %d", path, destfile, len(buf))
985 return os.WriteFile(destfile, buf, 0777)