Bugfix in logging which cgroup stats files it uses. refs #2882
[arvados.git] / services / crunch / crunchstat / src / arvados.org / crunchstat / crunchstat.go
1 package main
2
3 import (
4         "bufio"
5         "flag"
6         "fmt"
7         "io"
8         "io/ioutil"
9         "log"
10         "os"
11         "os/exec"
12         "os/signal"
13         "strings"
14         "syscall"
15         "time"
16 )
17
18 func ReadLineByLine(inp io.ReadCloser, out chan string, finish chan bool) {
19         s := bufio.NewScanner(inp)
20         for s.Scan() {
21                 out <- s.Text()
22         }
23         finish <- true
24 }
25
26 func OutputChannel(stdout chan string, stderr chan string) {
27         for {
28                 select {
29                 case s, ok := <-stdout:
30                         if ok {
31                                 fmt.Fprintln(os.Stdout, s)
32                         } else {
33                                 return
34                         }
35                 case s, ok := <-stderr:
36                         if ok {
37                                 fmt.Fprintln(os.Stderr, s)
38                         } else {
39                                 return
40                         }
41                 }
42         }
43 }
44
45 func FindStat(cgroup_path string, statgroup string, stat string) string {
46         path := fmt.Sprintf("%s/%s.%s", cgroup_path, statgroup, stat)
47         if _, err := os.Stat(path); err == nil {
48                 return path
49         }
50         path = fmt.Sprintf("%s/%s/%s.%s", cgroup_path, statgroup, statgroup, stat)
51         if _, err := os.Stat(path); err == nil {
52                 return path
53         }
54         return ""
55 }
56
57 func PollCgroupStats(cgroup_path string, stderr chan string, poll int64) {
58         //var last_usage int64 = 0
59         var last_user int64 = 0
60         var last_sys int64 = 0
61         var last_cpucount int64 = 0
62
63         type Disk struct {
64                 last_read  int64
65                 next_read  int64
66                 last_write int64
67                 next_write int64
68         }
69
70         disk := make(map[string]*Disk)
71
72         //cpuacct_usage := FindStat(cgroup_path, "cpuacct", "usage")
73         cpuacct_stat := FindStat(cgroup_path, "cpuacct", "stat")
74         blkio_io_service_bytes := FindStat(cgroup_path, "blkio", "io_service_bytes")
75         cpuset_cpus := FindStat(cgroup_path, "cpuset", "cpus")
76         memory_stat := FindStat(cgroup_path, "memory", "stat")
77
78         if cpuacct_stat != "" {
79                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuacct_stat)
80         }
81         if blkio_io_service_bytes != "" {
82                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", blkio_io_service_bytes)
83         }
84         if cpuset_cpus != "" {
85                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuset_cpus)
86         }
87         if memory_stat != "" {
88                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", memory_stat)
89         }
90
91         var elapsed int64 = poll
92
93         for {
94                 /*{
95                         c, _ := os.Open(cpuacct_usage)
96                         b, _ := ioutil.ReadAll(c)
97                         var next int64
98                         fmt.Sscanf(string(b), "%d", &next)
99                         if last_usage != 0 {
100                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.usage %v", (next-last_usage)/10000000)
101                         }
102                         //fmt.Printf("usage %d %d %d %d%%\n", last_usage, next, next-last_usage, (next-last_usage)/10000000)
103                         last_usage = next
104                         c.Close()
105                 }*/
106                 var cpus int64 = 0
107                 if cpuset_cpus != "" {
108                         c, _ := os.Open(cpuset_cpus)
109                         b, _ := ioutil.ReadAll(c)
110                         sp := strings.Split(string(b), ",")
111                         for _, v := range sp {
112                                 var min, max int64
113                                 n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
114                                 if n == 2 {
115                                         cpus += (max - min) + 1
116                                 } else {
117                                         cpus += 1
118                                 }
119                         }
120
121                         if cpus != last_cpucount {
122                                 stderr <- fmt.Sprintf("crunchstat: cpuset.cpus %v", cpus)
123                         }
124                         last_cpucount = cpus
125
126                         c.Close()
127                 }
128                 if cpus == 0 {
129                         cpus = 1
130                 }
131                 if cpuacct_stat != "" {
132                         c, _ := os.Open(cpuacct_stat)
133                         b, _ := ioutil.ReadAll(c)
134                         var next_user int64
135                         var next_sys int64
136                         fmt.Sscanf(string(b), "user %d\nsystem %d", &next_user, &next_sys)
137                         c.Close()
138
139                         if last_user != 0 {
140                                 user_diff := next_user - last_user
141                                 sys_diff := next_sys - last_sys
142                                 // Assume we're reading stats based on 100
143                                 // jiffies per second.  Because the ellaspsed
144                                 // time is in milliseconds, we need to boost
145                                 // that to 1000 jiffies per second, then boost
146                                 // it by another 100x to get a percentage, then
147                                 // finally divide by the actual elapsed time
148                                 // and the number of cpus to get average load
149                                 // over the polling period.
150                                 user_pct := (user_diff * 10 * 100) / (elapsed * cpus)
151                                 sys_pct := (sys_diff * 10 * 100) / (elapsed * cpus)
152
153                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.stat user %v", user_pct)
154                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.stat sys %v", sys_pct)
155                         }
156
157                         /*fmt.Printf("user %d %d %d%%\n", last_user, next_user, next_user-last_user)
158                         fmt.Printf("sys %d %d %d%%\n", last_sys, next_sys, next_sys-last_sys)
159                         fmt.Printf("sum %d%%\n", (next_user-last_user)+(next_sys-last_sys))*/
160                         last_user = next_user
161                         last_sys = next_sys
162                 }
163                 if blkio_io_service_bytes != "" {
164                         c, _ := os.Open(blkio_io_service_bytes)
165                         b := bufio.NewScanner(c)
166                         var device, op string
167                         var next int64
168                         for b.Scan() {
169                                 if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &next); err == nil {
170                                         if disk[device] == nil {
171                                                 disk[device] = new(Disk)
172                                         }
173                                         if op == "Read" {
174                                                 disk[device].last_read = disk[device].next_read
175                                                 disk[device].next_read = next
176                                                 if disk[device].last_read > 0 {
177                                                         stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s read %v", device, disk[device].next_read-disk[device].last_read)
178                                                 }
179                                         }
180                                         if op == "Write" {
181                                                 disk[device].last_write = disk[device].next_write
182                                                 disk[device].next_write = next
183                                                 if disk[device].last_write > 0 {
184                                                         stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s write %v", device, disk[device].next_write-disk[device].last_write)
185                                                 }
186                                         }
187                                 }
188                         }
189                         c.Close()
190                 }
191
192                 if memory_stat != "" {
193                         c, _ := os.Open(memory_stat)
194                         b := bufio.NewScanner(c)
195                         var stat string
196                         var val int64
197                         for b.Scan() {
198                                 if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err == nil {
199                                         if stat == "rss" {
200                                                 stderr <- fmt.Sprintf("crunchstat: memory.stat rss %v", val)
201                                         }
202                                 }
203                         }
204                         c.Close()
205                 }
206
207                 bedtime := time.Now()
208                 time.Sleep(time.Duration(poll) * time.Millisecond)
209                 morning := time.Now()
210                 elapsed = morning.Sub(bedtime).Nanoseconds() / int64(time.Millisecond)
211         }
212 }
213
214 func main() {
215
216         var (
217                 cgroup_path    string
218                 cgroup_parent  string
219                 cgroup_cidfile string
220                 wait           int64
221                 poll           int64
222         )
223
224         flag.StringVar(&cgroup_path, "cgroup-path", "", "Direct path to cgroup")
225         flag.StringVar(&cgroup_parent, "cgroup-parent", "", "Path to parent cgroup")
226         flag.StringVar(&cgroup_cidfile, "cgroup-cid", "", "Path to container id file")
227         flag.Int64Var(&wait, "wait", 5, "Maximum time (in seconds) to wait for cid file to show up")
228         flag.Int64Var(&poll, "poll", 1000, "Polling frequency, in milliseconds")
229
230         flag.Parse()
231
232         logger := log.New(os.Stderr, "crunchstat: ", 0)
233
234         if cgroup_path == "" && cgroup_cidfile == "" {
235                 logger.Fatal("Must provide either -cgroup-path or -cgroup-cid")
236         }
237
238         // Make output channel
239         stdout_chan := make(chan string)
240         stderr_chan := make(chan string)
241         finish_chan := make(chan bool)
242         defer close(stdout_chan)
243         defer close(stderr_chan)
244         defer close(finish_chan)
245
246         go OutputChannel(stdout_chan, stderr_chan)
247
248         var cmd *exec.Cmd
249
250         if len(flag.Args()) > 0 {
251                 // Set up subprocess
252                 cmd = exec.Command(flag.Args()[0], flag.Args()[1:]...)
253
254                 logger.Print("Running ", flag.Args())
255
256                 // Forward SIGINT and SIGTERM to inner process
257                 term := make(chan os.Signal, 1)
258                 go func(sig <-chan os.Signal) {
259                         catch := <-sig
260                         if cmd.Process != nil {
261                                 cmd.Process.Signal(catch)
262                         }
263                         logger.Print("caught signal:", catch)
264                 }(term)
265                 signal.Notify(term, syscall.SIGTERM)
266                 signal.Notify(term, syscall.SIGINT)
267
268                 // Funnel stdout and stderr from subprocess to output channels
269                 stdout_pipe, err := cmd.StdoutPipe()
270                 if err != nil {
271                         logger.Fatal(err)
272                 }
273                 go ReadLineByLine(stdout_pipe, stdout_chan, finish_chan)
274
275                 stderr_pipe, err := cmd.StderrPipe()
276                 if err != nil {
277                         logger.Fatal(err)
278                 }
279                 go ReadLineByLine(stderr_pipe, stderr_chan, finish_chan)
280
281                 // Run subprocess
282                 if err := cmd.Start(); err != nil {
283                         logger.Fatal(err)
284                 }
285         }
286
287         // Read the cid file
288         if cgroup_cidfile != "" {
289                 // wait up to 'wait' seconds for the cid file to appear
290                 var i time.Duration
291                 for i = 0; i < time.Duration(wait)*time.Second; i += (100 * time.Millisecond) {
292                         f, err := os.Open(cgroup_cidfile)
293                         if err == nil {
294                                 cid, err2 := ioutil.ReadAll(f)
295                                 if err2 == nil && len(cid) > 0 {
296                                         cgroup_path = string(cid)
297                                         f.Close()
298                                         break
299                                 }
300                         }
301                         time.Sleep(100 * time.Millisecond)
302                 }
303                 if cgroup_path == "" {
304                         logger.Printf("Could not read cid file %s", cgroup_cidfile)
305                 }
306         }
307
308         // add the parent prefix
309         if cgroup_parent != "" {
310                 cgroup_path = fmt.Sprintf("%s/%s", cgroup_parent, cgroup_path)
311         }
312
313         logger.Print("Using cgroup ", cgroup_path)
314
315         go PollCgroupStats(cgroup_path, stderr_chan, poll)
316
317         // Wait for each of stdout and stderr to drain
318         <-finish_chan
319         <-finish_chan
320
321         if err := cmd.Wait(); err != nil {
322                 if exiterr, ok := err.(*exec.ExitError); ok {
323                         // The program has exited with an exit code != 0
324
325                         // This works on both Unix and Windows. Although package
326                         // syscall is generally platform dependent, WaitStatus is
327                         // defined for both Unix and Windows and in both cases has
328                         // an ExitStatus() method with the same signature.
329                         if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
330                                 os.Exit(status.ExitStatus())
331                         }
332                 } else {
333                         logger.Fatalf("cmd.Wait: %v", err)
334                 }
335         }
336 }