1 // Package crunchstat reports resource usage (CPU, memory, disk,
2 // network) for a cgroup.
19 // This magically allows us to look up userHz via _SC_CLK_TCK:
23 #include <sys/types.h>
29 // A Reporter gathers statistics for a cgroup and writes them to a
31 type Reporter struct {
32 // CID of the container to monitor. If empty, read the CID
33 // from CIDFile (first waiting until a non-empty file appears
34 // at CIDFile). If CIDFile is also empty, report host
38 // Path to a file we can read CID from.
41 // Where cgroup accounting files live on this system, e.g.,
45 // Parent cgroup, e.g., "docker".
48 // Interval between samples. Must be positive.
49 PollPeriod time.Duration
51 // Where to write statistics. Must not be nil.
54 reportedStatFile map[string]string
55 lastNetSample map[string]ioSample
56 lastDiskSample map[string]ioSample
57 lastCPUSample cpuSample
62 // Start starts monitoring in a new goroutine, and returns
65 // The monitoring goroutine waits for a non-empty CIDFile to appear
66 // (unless CID is non-empty). Then it waits for the accounting files
67 // to appear for the monitored container. Then it collects and reports
68 // statistics until Stop is called.
70 // Callers should not call Start more than once.
72 // Callers should not modify public data fields after calling Start.
73 func (r *Reporter) Start() {
74 r.done = make(chan struct{})
78 // Stop reporting. Do not call more than once, or before calling
81 // Nothing will be logged after Stop returns.
82 func (r *Reporter) Stop() {
86 func (r *Reporter) readAllOrWarn(in io.Reader) ([]byte, error) {
87 content, err := ioutil.ReadAll(in)
94 // Open the cgroup stats file in /sys/fs corresponding to the target
95 // cgroup, and return an io.ReadCloser. If no stats file is available,
98 // Log the file that was opened, if it isn't the same file opened on
99 // the last openStatFile for this stat.
101 // Log "not available" if no file is found and either this stat has
102 // been available in the past, or verbose==true.
104 // TODO: Instead of trying all options, choose a process in the
105 // container, and read /proc/PID/cgroup to determine the appropriate
106 // cgroup root for the given statgroup. (This will avoid falling back
107 // to host-level stats during container setup and teardown.)
108 func (r *Reporter) openStatFile(statgroup, stat string, verbose bool) (io.ReadCloser, error) {
111 // Collect container's stats
113 fmt.Sprintf("%s/%s/%s/%s/%s", r.CgroupRoot, statgroup, r.CgroupParent, r.CID, stat),
114 fmt.Sprintf("%s/%s/%s/%s", r.CgroupRoot, r.CgroupParent, r.CID, stat),
117 // Collect this host's stats
119 fmt.Sprintf("%s/%s/%s", r.CgroupRoot, statgroup, stat),
120 fmt.Sprintf("%s/%s", r.CgroupRoot, stat),
126 for _, path = range paths {
127 file, err = os.Open(path)
134 if pathWas := r.reportedStatFile[stat]; pathWas != path {
135 // Log whenever we start using a new/different cgroup
136 // stat file for a given statistic. This typically
137 // happens 1 to 3 times per statistic, depending on
138 // whether we happen to collect stats [a] before any
139 // processes have been created in the container and
140 // [b] after all contained processes have exited.
141 if path == "" && verbose {
142 r.Logger.Printf("notice: stats not available: stat %s, statgroup %s, cid %s, parent %s, root %s\n", stat, statgroup, r.CID, r.CgroupParent, r.CgroupRoot)
143 } else if pathWas != "" {
144 r.Logger.Printf("notice: stats moved from %s to %s\n", r.reportedStatFile[stat], path)
146 r.Logger.Printf("notice: reading stats from %s\n", path)
148 r.reportedStatFile[stat] = path
153 func (r *Reporter) getContainerNetStats() (io.Reader, error) {
154 procsFile, err := r.openStatFile("cpuacct", "cgroup.procs", true)
158 defer procsFile.Close()
159 reader := bufio.NewScanner(procsFile)
161 taskPid := reader.Text()
162 statsFilename := fmt.Sprintf("/proc/%s/net/dev", taskPid)
163 stats, err := ioutil.ReadFile(statsFilename)
168 return strings.NewReader(string(stats)), nil
170 return nil, errors.New("Could not read stats for any proc in container")
173 type ioSample struct {
179 func (r *Reporter) doBlkIOStats() {
180 c, err := r.openStatFile("blkio", "blkio.io_service_bytes", true)
185 b := bufio.NewScanner(c)
186 var sampleTime = time.Now()
187 newSamples := make(map[string]ioSample)
189 var device, op string
191 if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &val); err != nil {
194 var thisSample ioSample
196 if thisSample, ok = newSamples[device]; !ok {
197 thisSample = ioSample{sampleTime, -1, -1}
201 thisSample.rxBytes = val
203 thisSample.txBytes = val
205 newSamples[device] = thisSample
207 for dev, sample := range newSamples {
208 if sample.txBytes < 0 || sample.rxBytes < 0 {
212 if prev, ok := r.lastDiskSample[dev]; ok {
213 delta = fmt.Sprintf(" -- interval %.4f seconds %d write %d read",
214 sample.sampleTime.Sub(prev.sampleTime).Seconds(),
215 sample.txBytes-prev.txBytes,
216 sample.rxBytes-prev.rxBytes)
218 r.Logger.Printf("blkio:%s %d write %d read%s\n", dev, sample.txBytes, sample.rxBytes, delta)
219 r.lastDiskSample[dev] = sample
223 type memSample struct {
225 memStat map[string]int64
228 func (r *Reporter) doMemoryStats() {
229 c, err := r.openStatFile("memory", "memory.stat", true)
234 b := bufio.NewScanner(c)
235 thisSample := memSample{time.Now(), make(map[string]int64)}
236 wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
240 if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err != nil {
243 thisSample.memStat[stat] = val
245 var outstat bytes.Buffer
246 for _, key := range wantStats {
247 if val, ok := thisSample.memStat[key]; ok {
248 outstat.WriteString(fmt.Sprintf(" %d %s", val, key))
251 r.Logger.Printf("mem%s\n", outstat.String())
254 func (r *Reporter) doNetworkStats() {
255 sampleTime := time.Now()
256 stats, err := r.getContainerNetStats()
261 scanner := bufio.NewScanner(stats)
265 words := strings.Fields(scanner.Text())
266 if len(words) != 17 {
267 // Skip lines with wrong format
270 ifName = strings.TrimRight(words[0], ":")
271 if ifName == "lo" || ifName == "" {
272 // Skip loopback interface and lines with wrong format
275 if tx, err = strconv.ParseInt(words[9], 10, 64); err != nil {
278 if rx, err = strconv.ParseInt(words[1], 10, 64); err != nil {
281 nextSample := ioSample{}
282 nextSample.sampleTime = sampleTime
283 nextSample.txBytes = tx
284 nextSample.rxBytes = rx
286 if prev, ok := r.lastNetSample[ifName]; ok {
287 interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
288 delta = fmt.Sprintf(" -- interval %.4f seconds %d tx %d rx",
293 r.Logger.Printf("net:%s %d tx %d rx%s\n", ifName, tx, rx, delta)
294 r.lastNetSample[ifName] = nextSample
298 type cpuSample struct {
299 hasData bool // to distinguish the zero value from real data
306 // Return the number of CPUs available in the container. Return 0 if
307 // we can't figure out the real number of CPUs.
308 func (r *Reporter) getCPUCount() int64 {
309 cpusetFile, err := r.openStatFile("cpuset", "cpuset.cpus", true)
313 defer cpusetFile.Close()
314 b, err := r.readAllOrWarn(cpusetFile)
315 sp := strings.Split(string(b), ",")
317 for _, v := range sp {
319 n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
321 cpus += (max - min) + 1
329 func (r *Reporter) doCPUStats() {
330 statFile, err := r.openStatFile("cpuacct", "cpuacct.stat", true)
334 defer statFile.Close()
335 b, err := r.readAllOrWarn(statFile)
340 nextSample := cpuSample{true, time.Now(), 0, 0, r.getCPUCount()}
341 var userTicks, sysTicks int64
342 fmt.Sscanf(string(b), "user %d\nsystem %d", &userTicks, &sysTicks)
343 userHz := float64(C.sysconf(C._SC_CLK_TCK))
344 nextSample.user = float64(userTicks) / userHz
345 nextSample.sys = float64(sysTicks) / userHz
348 if r.lastCPUSample.hasData {
349 delta = fmt.Sprintf(" -- interval %.4f seconds %.4f user %.4f sys",
350 nextSample.sampleTime.Sub(r.lastCPUSample.sampleTime).Seconds(),
351 nextSample.user-r.lastCPUSample.user,
352 nextSample.sys-r.lastCPUSample.sys)
354 r.Logger.Printf("cpu %.4f user %.4f sys %d cpus%s\n",
355 nextSample.user, nextSample.sys, nextSample.cpus, delta)
356 r.lastCPUSample = nextSample
359 // Report stats periodically until r.done indicates someone called
361 func (r *Reporter) run() {
362 r.reportedStatFile = make(map[string]string)
364 if !r.waitForCIDFile() || !r.waitForCgroup() {
368 r.lastNetSample = make(map[string]ioSample)
369 r.lastDiskSample = make(map[string]ioSample)
371 ticker := time.NewTicker(r.PollPeriod)
385 // If CID is empty, wait for it to appear in CIDFile. Return true if
386 // we get it before r.done indicates someone called Stop.
387 func (r *Reporter) waitForCIDFile() bool {
388 if r.CID != "" || r.CIDFile == "" {
392 ticker := time.NewTicker(100 * time.Millisecond)
395 cid, err := ioutil.ReadFile(r.CIDFile)
396 if err == nil && len(cid) > 0 {
403 r.Logger.Printf("CID never appeared in %+q: %v", r.CIDFile, err)
409 // Wait for the cgroup stats files to appear under CgroupRoot. Return
410 // true if they appear before r.done indicates someone called Stop. If
411 // they don't appear within one poll interval, log a warning and keep
413 func (r *Reporter) waitForCgroup() bool {
414 ticker := time.NewTicker(100 * time.Millisecond)
416 warningTimer := time.After(r.PollPeriod)
418 c, err := r.openStatFile("cpuacct", "cgroup.procs", false)
426 r.Logger.Printf("cgroup stats files have not appeared after %v (config error?) -- still waiting...", r.PollPeriod)
428 r.Logger.Printf("cgroup stats files never appeared for %v", r.CID)