1 // Package crunchstat reports resource usage (CPU, memory, disk,
2 // network) for a cgroup.
19 // This magically allows us to look up userHz via _SC_CLK_TCK:
23 #include <sys/types.h>
29 // A Reporter gathers statistics for a cgroup and writes them to a
31 type Reporter struct {
32 // CID of the container to monitor. If empty, read the CID
33 // from CIDFile (first waiting until a non-empty file appears
34 // at CIDFile). If CIDFile is also empty, report host
38 // Path to a file we can read CID from.
41 // Where cgroup accounting files live on this system, e.g.,
45 // Parent cgroup, e.g., "docker".
48 // Interval between samples. Must be positive.
49 PollPeriod time.Duration
51 // Where to write statistics. Must not be nil.
// Path most recently recorded by openStatFile for each stat
// name; compared against the newly chosen path so that a
// notice is logged only when the path changes.
54 reportedStatFile map[string]string
// Previous sample for each network interface name, kept by
// doNetworkStats so it can report a delta on the next poll.
55 lastNetSample map[string]ioSample
// Previous sample for each block device, kept by doBlkIOStats
// so it can report a delta on the next poll.
56 lastDiskSample map[string]ioSample
// Previous CPU sample, kept by doCPUStats; its hasData flag is
// checked before a delta is reported.
57 lastCPUSample cpuSample
// Channels used to coordinate shutdown with the monitoring
// goroutine. Both are created in Start.
59 done chan struct{} // closed when we should stop reporting
60 flushed chan struct{} // closed when we have made our last report
63 // Start starts monitoring in a new goroutine, and returns
66 // The monitoring goroutine waits for a non-empty CIDFile to appear
67 // (unless CID is non-empty). Then it waits for the accounting files
68 // to appear for the monitored container. Then it collects and reports
69 // statistics until Stop is called.
71 // Callers should not call Start more than once.
73 // Callers should not modify public data fields after calling Start.
74 func (r *Reporter) Start() {
// Fresh channels for this run: done is closed to request a stop,
// and flushed is closed by run (via its deferred close) once it
// returns.
75 r.done = make(chan struct{})
76 r.flushed = make(chan struct{})
80 // Stop reporting. Do not call more than once, or before calling
83 // Nothing will be logged after Stop returns.
84 func (r *Reporter) Stop() {
// NOTE(review): the body is not visible in this excerpt. Going by
// the comments on the done/flushed fields, it presumably closes
// r.done and then waits for r.flushed -- confirm against the full
// source.
// readAllOrWarn reads everything from in using ioutil.ReadAll and
// returns the content along with any read error.
//
// NOTE(review): the name implies a read error is also logged as a
// warning; that part of the body is not visible here -- confirm.
89 func (r *Reporter) readAllOrWarn(in io.Reader) ([]byte, error) {
90 content, err := ioutil.ReadAll(in)
97 // Open the cgroup stats file in /sys/fs corresponding to the target
98 // cgroup, and return an io.ReadCloser. If no stats file is available,
101 // Log the file that was opened, if it isn't the same file opened on
102 // the last openStatFile for this stat.
104 // Log "not available" if no file is found and either this stat has
105 // been available in the past, or verbose==true.
107 // TODO: Instead of trying all options, choose a process in the
108 // container, and read /proc/PID/cgroup to determine the appropriate
109 // cgroup root for the given statgroup. (This will avoid falling back
110 // to host-level stats during container setup and teardown.)
111 func (r *Reporter) openStatFile(statgroup, stat string, verbose bool) (io.ReadCloser, error) {
// Two candidate paths are built for each case: first one that
// includes the statgroup path component, then one without it.
114 // Collect container's stats
116 fmt.Sprintf("%s/%s/%s/%s/%s", r.CgroupRoot, statgroup, r.CgroupParent, r.CID, stat),
117 fmt.Sprintf("%s/%s/%s/%s", r.CgroupRoot, r.CgroupParent, r.CID, stat),
120 // Collect this host's stats
122 fmt.Sprintf("%s/%s/%s", r.CgroupRoot, statgroup, stat),
123 fmt.Sprintf("%s/%s", r.CgroupRoot, stat),
// The loop assigns to already-declared path, file and err (plain
// "=", not ":="), so their last values remain usable after the
// loop ends.
129 for _, path = range paths {
130 file, err = os.Open(path)
// pathWas is the path recorded for this stat on an earlier call,
// or "" if the map has no entry for it yet.
137 if pathWas := r.reportedStatFile[stat]; pathWas != path {
138 // Log whenever we start using a new/different cgroup
139 // stat file for a given statistic. This typically
140 // happens 1 to 3 times per statistic, depending on
141 // whether we happen to collect stats [a] before any
142 // processes have been created in the container and
143 // [b] after all contained processes have exited.
// An empty path is reported as "stats not available", but only
// when the caller asked for verbose logging.
144 if path == "" && verbose {
145 r.Logger.Printf("notice: stats not available: stat %s, statgroup %s, cid %s, parent %s, root %s\n", stat, statgroup, r.CID, r.CgroupParent, r.CgroupRoot)
// NOTE(review): when path is "" and verbose is false, pathWas is
// necessarily non-empty here, so this branch logs a move to an
// empty destination path. r.reportedStatFile[stat] still equals
// pathWas at this point; it is only updated further down.
146 } else if pathWas != "" {
147 r.Logger.Printf("notice: stats moved from %s to %s\n", r.reportedStatFile[stat], path)
149 r.Logger.Printf("notice: reading stats from %s\n", path)
// Remember what was just reported so the next call with the same
// path stays quiet.
151 r.reportedStatFile[stat] = path
// getContainerNetStats returns a reader over the contents of
// /proc/PID/net/dev, where PID is taken from the cgroup.procs file
// of the monitored cgroup. It returns an error if no such file could
// be read.
156 func (r *Reporter) getContainerNetStats() (io.Reader, error) {
// cgroup.procs is opened through the "cpuacct" statgroup.
157 procsFile, err := r.openStatFile("cpuacct", "cgroup.procs", true)
161 defer procsFile.Close()
162 reader := bufio.NewScanner(procsFile)
// Each scanned line is used verbatim as a PID in the /proc path.
// NOTE(review): the loop header and the error check after ReadFile
// are not visible here; presumably PIDs are tried in turn until one
// can be read -- confirm.
164 taskPid := reader.Text()
165 statsFilename := fmt.Sprintf("/proc/%s/net/dev", taskPid)
166 stats, err := ioutil.ReadFile(statsFilename)
// The file content is handed back as an in-memory reader.
171 return strings.NewReader(string(stats)), nil
173 return nil, errors.New("Could not read stats for any proc in container")
// ioSample is one timestamped pair of byte counters. It is used for
// both network interfaces (lastNetSample) and block devices
// (lastDiskSample). Code in this file refers to its fields as
// sampleTime, txBytes and rxBytes.
176 type ioSample struct {
// doBlkIOStats reads blkio.io_service_bytes, collects one sample per
// device, and logs a "blkio:DEVICE" line for each device. The line
// also includes the change since the previous sample when one is
// on record.
182 func (r *Reporter) doBlkIOStats() {
183 c, err := r.openStatFile("blkio", "blkio.io_service_bytes", true)
188 b := bufio.NewScanner(c)
// All devices seen in this pass share a single timestamp.
189 var sampleTime = time.Now()
190 newSamples := make(map[string]ioSample)
192 var device, op string
// Each line is parsed as "device operation value".
194 if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &val); err != nil {
197 var thisSample ioSample
// The first time a device is seen, both counters start at -1 so
// that a counter which was never filled in can be recognized below.
199 if thisSample, ok = newSamples[device]; !ok {
200 thisSample = ioSample{sampleTime, -1, -1}
// NOTE(review): the conditions selecting between these two
// assignments are not visible here. The log format below labels
// rxBytes as "read" and txBytes as "write".
204 thisSample.rxBytes = val
206 thisSample.txBytes = val
208 newSamples[device] = thisSample
210 for dev, sample := range newSamples {
// A negative counter means that value never showed up for this
// device. NOTE(review): presumably such devices are skipped -- the
// branch body is not visible here.
211 if sample.txBytes < 0 || sample.rxBytes < 0 {
// A delta is reported only if this device has an earlier sample.
215 if prev, ok := r.lastDiskSample[dev]; ok {
216 delta = fmt.Sprintf(" -- interval %.4f seconds %d write %d read",
217 sample.sampleTime.Sub(prev.sampleTime).Seconds(),
218 sample.txBytes-prev.txBytes,
219 sample.rxBytes-prev.rxBytes)
221 r.Logger.Printf("blkio:%s %d write %d read%s\n", dev, sample.txBytes, sample.rxBytes, delta)
// Keep this sample as the baseline for the next call.
222 r.lastDiskSample[dev] = sample
// memSample holds the values parsed from memory.stat during one
// poll. doMemoryStats constructs it with the sample time first,
// followed by the memStat map.
226 type memSample struct {
// Parsed values keyed by the stat name found on each line.
228 memStat map[string]int64
// doMemoryStats reads memory.stat, parses each "name value" line into
// a sample, and logs a single "mem" line. The line contains the
// stats named in wantStats that were present in the file, in
// wantStats order.
231 func (r *Reporter) doMemoryStats() {
232 c, err := r.openStatFile("memory", "memory.stat", true)
237 b := bufio.NewScanner(c)
238 thisSample := memSample{time.Now(), make(map[string]int64)}
// Only these stats are reported; everything else that gets parsed
// is stored in the sample but not logged.
239 wantStats := [...]string{"cache", "swap", "pgmajfault", "rss"}
243 if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err != nil {
246 thisSample.memStat[stat] = val
// Assemble the log line; a wanted stat missing from the file is
// simply left out.
248 var outstat bytes.Buffer
249 for _, key := range wantStats {
250 if val, ok := thisSample.memStat[key]; ok {
251 outstat.WriteString(fmt.Sprintf(" %d %s", val, key))
254 r.Logger.Printf("mem%s\n", outstat.String())
// doNetworkStats parses the text returned by getContainerNetStats and
// logs one "net:IFACE" line for each non-loopback interface. The line
// also includes the change since the previous sample when one is on
// record.
257 func (r *Reporter) doNetworkStats() {
// The timestamp is taken before the stats are read and is shared
// by every interface in this pass.
258 sampleTime := time.Now()
259 stats, err := r.getContainerNetStats()
264 scanner := bufio.NewScanner(stats)
268 words := strings.Fields(scanner.Text())
// A usable line has exactly 17 whitespace-separated fields: the
// interface name plus 16 more columns (assumes the usual
// /proc/net/dev layout -- TODO confirm).
269 if len(words) != 17 {
270 // Skip lines with wrong format
// The first field is the interface name with a trailing colon.
273 ifName = strings.TrimRight(words[0], ":")
274 if ifName == "lo" || ifName == "" {
275 // Skip loopback interface and lines with wrong format
// Field 9 is used as the transmitted byte count and field 1 as
// the received byte count.
278 if tx, err = strconv.ParseInt(words[9], 10, 64); err != nil {
281 if rx, err = strconv.ParseInt(words[1], 10, 64); err != nil {
284 nextSample := ioSample{}
285 nextSample.sampleTime = sampleTime
286 nextSample.txBytes = tx
287 nextSample.rxBytes = rx
// A delta is reported only if this interface has an earlier sample.
289 if prev, ok := r.lastNetSample[ifName]; ok {
290 interval := nextSample.sampleTime.Sub(prev.sampleTime).Seconds()
291 delta = fmt.Sprintf(" -- interval %.4f seconds %d tx %d rx",
296 r.Logger.Printf("net:%s %d tx %d rx%s\n", ifName, tx, rx, delta)
// Keep this sample as the baseline for the next call.
297 r.lastNetSample[ifName] = nextSample
// cpuSample is one CPU usage sample. doCPUStats also populates
// sampleTime, user, sys and cpus, with user and sys computed as tick
// counts divided by userHz.
301 type cpuSample struct {
302 hasData bool // to distinguish the zero value from real data
309 // Return the number of CPUs available in the container. Return 0 if
310 // we can't figure out the real number of CPUs.
311 func (r *Reporter) getCPUCount() int64 {
312 cpusetFile, err := r.openStatFile("cpuset", "cpuset.cpus", true)
316 defer cpusetFile.Close()
317 b, err := r.readAllOrWarn(cpusetFile)
// The file content is treated as a comma-separated list of entries.
321 sp := strings.Split(string(b), ",")
323 for _, v := range sp {
// n is the number of integers Sscanf managed to parse from this
// entry; the scan error itself is discarded on purpose.
325 n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
// A "min-max" range is inclusive at both ends, hence the +1.
// NOTE(review): handling of entries that are not a full range is
// not visible in this excerpt.
327 cpus += (max - min) + 1
// doCPUStats reads cpuacct.stat, converts the user and system tick
// counts to seconds, and logs a "cpu" line with the CPU count. The
// line also includes the change since the previous sample when one
// is on record.
335 func (r *Reporter) doCPUStats() {
336 statFile, err := r.openStatFile("cpuacct", "cpuacct.stat", true)
340 defer statFile.Close()
341 b, err := r.readAllOrWarn(statFile)
346 var userTicks, sysTicks int64
// NOTE(review): the Sscanf result is discarded, so a parse failure
// leaves whichever counters were not scanned at zero.
347 fmt.Sscanf(string(b), "user %d\nsystem %d", &userTicks, &sysTicks)
// userHz comes from sysconf(_SC_CLK_TCK) via cgo; tick counts are
// divided by it below.
348 userHz := float64(C.sysconf(C._SC_CLK_TCK))
349 nextSample := cpuSample{
351 sampleTime: time.Now(),
352 user: float64(userTicks) / userHz,
353 sys: float64(sysTicks) / userHz,
354 cpus: r.getCPUCount(),
// A delta is reported only once an earlier sample has been stored.
358 if r.lastCPUSample.hasData {
359 delta = fmt.Sprintf(" -- interval %.4f seconds %.4f user %.4f sys",
360 nextSample.sampleTime.Sub(r.lastCPUSample.sampleTime).Seconds(),
361 nextSample.user-r.lastCPUSample.user,
362 nextSample.sys-r.lastCPUSample.sys)
364 r.Logger.Printf("cpu %.4f user %.4f sys %d cpus%s\n",
365 nextSample.user, nextSample.sys, nextSample.cpus, delta)
// Keep this sample as the baseline for the next call.
366 r.lastCPUSample = nextSample
369 // Report stats periodically until we learn (via r.done) that someone
371 func (r *Reporter) run() {
// Closing flushed on every exit path tells whoever is waiting on
// it that no further reports will be made.
372 defer close(r.flushed)
// Set up before the wait calls, since waitForCgroup calls
// openStatFile, which reads and writes this map.
374 r.reportedStatFile = make(map[string]string)
// Per their doc comments, each wait returns false when Stop was
// called first. NOTE(review): the branch body is not visible here;
// presumably it returns without reporting.
376 if !r.waitForCIDFile() || !r.waitForCgroup() {
// Baselines for delta reporting start out empty.
380 r.lastNetSample = make(map[string]ioSample)
381 r.lastDiskSample = make(map[string]ioSample)
// One tick per PollPeriod.
383 ticker := time.NewTicker(r.PollPeriod)
397 // If CID is empty, wait for it to appear in CIDFile. Return true if
398 // we get it before we learn (via r.done) that someone called Stop.
399 func (r *Reporter) waitForCIDFile() bool {
// Nothing to wait for when the CID is already set, or when no
// CIDFile was configured.
400 if r.CID != "" || r.CIDFile == "" {
// Retry interval for re-reading CIDFile.
404 ticker := time.NewTicker(100 * time.Millisecond)
407 cid, err := ioutil.ReadFile(r.CIDFile)
// A successful, non-empty read is the success condition.
// NOTE(review): the lines that store the CID and return are not
// visible here.
408 if err == nil && len(cid) > 0 {
// Logged together with the most recent ReadFile error.
// NOTE(review): presumably reached when r.done is closed before a
// CID shows up -- the surrounding select is not visible here.
415 r.Logger.Printf("CID never appeared in %+q: %v", r.CIDFile, err)
421 // Wait for the cgroup stats files to appear under CgroupRoot. Return
422 // true if they appear before r.done indicates someone called Stop. If
423 // they don't appear within one poll interval, log a warning and keep
425 func (r *Reporter) waitForCgroup() bool {
// Retry interval for probing the stats file.
426 ticker := time.NewTicker(100 * time.Millisecond)
// Fires once, PollPeriod after this function starts waiting.
428 warningTimer := time.After(r.PollPeriod)
// Probe with verbose=false so openStatFile does not log "stats not
// available" each time a retry fails.
430 c, err := r.openStatFile("cpuacct", "cgroup.procs", false)
438 r.Logger.Printf("cgroup stats files have not appeared after %v (config error?) -- still waiting...", r.PollPeriod)
// NOTE(review): presumably logged when r.done is closed while
// still waiting -- the surrounding select is not visible here.
440 r.Logger.Printf("cgroup stats files never appeared for %v", r.CID)