4044: Clean up channel and pipe usage.
[arvados.git] / services / crunchstat / crunchstat.go
1 package main
2
3 import (
4         "bufio"
5         "flag"
6         "fmt"
7         "io"
8         "io/ioutil"
9         "log"
10         "os"
11         "os/exec"
12         "os/signal"
13         "strings"
14         "syscall"
15         "time"
16 )
17
// CopyPipeToChan reads in one line at a time and sends each line, with
// its trailing "\n" (and a preceding "\r", if any) removed, to out. A
// final line that is not newline-terminated is forwarded as well. Once
// in reports EOF or any other read error, true is sent on done.
//
// Lines are read with bufio.Reader.ReadString rather than
// bufio.Scanner: a Scanner gives up at the first line that exceeds its
// maximum token size, which would silently end forwarding and leave
// the rest of the pipe unread.
func CopyPipeToChan(in io.Reader, out chan string, done chan<- bool) {
	r := bufio.NewReader(in)
	for {
		line, err := r.ReadString('\n')
		if len(line) > 0 {
			// Strip the terminator the same way bufio.ScanLines
			// does: one "\n", then one optional "\r".
			line = strings.TrimSuffix(line, "\n")
			line = strings.TrimSuffix(line, "\r")
			out <- line
		}
		if err != nil {
			// EOF and read failures both end the copy; any
			// partial data was already forwarded above.
			break
		}
	}
	done <- true
}
25
// CopyChanToPipe writes every string received from in to out, each
// followed by a newline, and returns once in has been closed and
// drained.
func CopyChanToPipe(in <-chan string, out io.Writer) {
	for {
		line, open := <-in
		if !open {
			return
		}
		fmt.Fprintln(out, line)
	}
}
31
// FindStat returns the path of the "<statgroup>.<stat>" file for a
// container, or "" if none of the candidate locations exists.
//
// The locations are probed with os.Stat in this order, and the first
// one that can be stat'ed wins:
//
//	<root>/<statgroup>/<parent>/<container>/<statgroup>.<stat>
//	<root>/<parent>/<container>/<statgroup>.<stat>
//	<root>/<statgroup>/<statgroup>.<stat>
//	<root>/<statgroup>.<stat>
func FindStat(cgroup_root string, cgroup_parent string, container_id string, statgroup string, stat string) string {
	candidates := [...]string{
		fmt.Sprintf("%s/%s/%s/%s/%s.%s", cgroup_root, statgroup, cgroup_parent, container_id, statgroup, stat),
		fmt.Sprintf("%s/%s/%s/%s.%s", cgroup_root, cgroup_parent, container_id, statgroup, stat),
		fmt.Sprintf("%s/%s/%s.%s", cgroup_root, statgroup, statgroup, stat),
		fmt.Sprintf("%s/%s.%s", cgroup_root, statgroup, stat),
	}
	for _, candidate := range candidates {
		if _, err := os.Stat(candidate); err == nil {
			return candidate
		}
	}
	return ""
}
52
53 func PollCgroupStats(cgroup_root string, cgroup_parent string, container_id string, stderr chan string, poll int64, stop_poll_chan <-chan bool) {
54         //var last_usage int64 = 0
55         var last_user int64 = -1
56         var last_sys int64 = -1
57         var last_cpucount int64 = 0
58
59         type Disk struct {
60                 last_read  int64
61                 next_read  int64
62                 last_write int64
63                 next_write int64
64         }
65
66         disk := make(map[string]*Disk)
67
68         //cpuacct_usage := FindStat(cgroup_path, "cpuacct", "usage")
69         cpuacct_stat := FindStat(cgroup_root, cgroup_parent, container_id, "cpuacct", "stat")
70         blkio_io_service_bytes := FindStat(cgroup_root, cgroup_parent, container_id, "blkio", "io_service_bytes")
71         cpuset_cpus := FindStat(cgroup_root, cgroup_parent, container_id, "cpuset", "cpus")
72         memory_stat := FindStat(cgroup_root, cgroup_parent, container_id, "memory", "stat")
73
74         if cpuacct_stat != "" {
75                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuacct_stat)
76         }
77         if blkio_io_service_bytes != "" {
78                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", blkio_io_service_bytes)
79         }
80         if cpuset_cpus != "" {
81                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuset_cpus)
82         }
83         if memory_stat != "" {
84                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", memory_stat)
85         }
86
87         poll_chan := make(chan bool, 1)
88         go func() {
89                 // Send periodic poll events.
90                 poll_chan <- true
91                 for {
92                         time.Sleep(time.Duration(poll) * time.Millisecond)
93                         poll_chan <- true
94                 }
95         }()
96         for {
97                 bedtime := time.Now()
98                 select {
99                 case <-stop_poll_chan:
100                         return
101                 case <-poll_chan:
102                         // Emit stats, then select again.
103                 }
104                 morning := time.Now()
105                 elapsed := morning.Sub(bedtime).Nanoseconds() / int64(time.Millisecond)
106                 /*{
107                         c, _ := os.Open(cpuacct_usage)
108                         b, _ := ioutil.ReadAll(c)
109                         var next int64
110                         fmt.Sscanf(string(b), "%d", &next)
111                         if last_usage != 0 {
112                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.usage %v", (next-last_usage)/10000000)
113                         }
114                         //fmt.Printf("usage %d %d %d %d%%\n", last_usage, next, next-last_usage, (next-last_usage)/10000000)
115                         last_usage = next
116                         c.Close()
117                 }*/
118                 var cpus int64 = 0
119                 if cpuset_cpus != "" {
120                         c, err := os.Open(cpuset_cpus)
121                         if err != nil {
122                                 stderr <- fmt.Sprintf("open %s: %s", cpuset_cpus, err)
123                                 continue
124                         }
125                         b, _ := ioutil.ReadAll(c)
126                         sp := strings.Split(string(b), ",")
127                         for _, v := range sp {
128                                 var min, max int64
129                                 n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
130                                 if n == 2 {
131                                         cpus += (max - min) + 1
132                                 } else {
133                                         cpus += 1
134                                 }
135                         }
136
137                         if cpus != last_cpucount {
138                                 stderr <- fmt.Sprintf("crunchstat: cpuset.cpus %v", cpus)
139                         }
140                         last_cpucount = cpus
141
142                         c.Close()
143                 }
144                 if cpus == 0 {
145                         cpus = 1
146                 }
147                 if cpuacct_stat != "" {
148                         c, err := os.Open(cpuacct_stat)
149                         if err != nil {
150                                 stderr <- fmt.Sprintf("open %s: %s", cpuacct_stat, err)
151                                 // Next time around, last_user would
152                                 // be >1 interval old, so stats will
153                                 // be incorrect. Start over instead.
154                                 last_user = -1
155                                 continue
156                         }
157                         b, _ := ioutil.ReadAll(c)
158                         var next_user int64
159                         var next_sys int64
160                         fmt.Sscanf(string(b), "user %d\nsystem %d", &next_user, &next_sys)
161                         c.Close()
162
163                         if elapsed > 0 && last_user != -1 {
164                                 user_diff := next_user - last_user
165                                 sys_diff := next_sys - last_sys
166                                 // Assume we're reading stats based on 100
167                                 // jiffies per second.  Because the elapsed
168                                 // time is in milliseconds, we need to boost
169                                 // that to 1000 jiffies per second, then boost
170                                 // it by another 100x to get a percentage, then
171                                 // finally divide by the actual elapsed time
172                                 // and the number of cpus to get average load
173                                 // over the polling period.
174                                 user_pct := (user_diff * 10 * 100) / (elapsed * cpus)
175                                 sys_pct := (sys_diff * 10 * 100) / (elapsed * cpus)
176
177                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.stat user %v", user_pct)
178                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.stat sys %v", sys_pct)
179                         }
180
181                         /*fmt.Printf("user %d %d %d%%\n", last_user, next_user, next_user-last_user)
182                         fmt.Printf("sys %d %d %d%%\n", last_sys, next_sys, next_sys-last_sys)
183                         fmt.Printf("sum %d%%\n", (next_user-last_user)+(next_sys-last_sys))*/
184                         last_user = next_user
185                         last_sys = next_sys
186                 }
187                 if blkio_io_service_bytes != "" {
188                         c, err := os.Open(blkio_io_service_bytes)
189                         if err != nil {
190                                 stderr <- fmt.Sprintf("open %s: %s", blkio_io_service_bytes, err)
191                                 continue
192                         }
193                         b := bufio.NewScanner(c)
194                         var device, op string
195                         var next int64
196                         for b.Scan() {
197                                 if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &next); err == nil {
198                                         if disk[device] == nil {
199                                                 disk[device] = new(Disk)
200                                         }
201                                         if op == "Read" {
202                                                 disk[device].last_read = disk[device].next_read
203                                                 disk[device].next_read = next
204                                                 if disk[device].last_read > 0 && (disk[device].next_read != disk[device].last_read) {
205                                                         stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s read %v", device, disk[device].next_read-disk[device].last_read)
206                                                 }
207                                         }
208                                         if op == "Write" {
209                                                 disk[device].last_write = disk[device].next_write
210                                                 disk[device].next_write = next
211                                                 if disk[device].last_write > 0 && (disk[device].next_write != disk[device].last_write) {
212                                                         stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s write %v", device, disk[device].next_write-disk[device].last_write)
213                                                 }
214                                         }
215                                 }
216                         }
217                         c.Close()
218                 }
219
220                 if memory_stat != "" {
221                         c, err := os.Open(memory_stat)
222                         if err != nil {
223                                 stderr <- fmt.Sprintf("open %s: %s", memory_stat, err)
224                                 continue
225                         }
226                         b := bufio.NewScanner(c)
227                         var stat string
228                         var val int64
229                         for b.Scan() {
230                                 if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err == nil {
231                                         if stat == "rss" {
232                                                 stderr <- fmt.Sprintf("crunchstat: memory.stat rss %v", val)
233                                         }
234                                 }
235                         }
236                         c.Close()
237                 }
238         }
239 }
240
241 func run(logger *log.Logger) error {
242
243         var (
244                 cgroup_root    string
245                 cgroup_parent  string
246                 cgroup_cidfile string
247                 wait           int64
248                 poll           int64
249         )
250
251         flag.StringVar(&cgroup_root, "cgroup-root", "", "Root of cgroup tree")
252         flag.StringVar(&cgroup_parent, "cgroup-parent", "", "Name of container parent under cgroup")
253         flag.StringVar(&cgroup_cidfile, "cgroup-cid", "", "Path to container id file")
254         flag.Int64Var(&wait, "wait", 5, "Maximum time (in seconds) to wait for cid file to show up")
255         flag.Int64Var(&poll, "poll", 1000, "Polling frequency, in milliseconds")
256
257         flag.Parse()
258
259         if cgroup_root == "" {
260                 logger.Fatal("Must provide -cgroup-root")
261         }
262
263         stderr_chan := make(chan string, 1)
264         defer close(stderr_chan)
265         finish_chan := make(chan bool)
266         defer close(finish_chan)
267
268         go CopyChanToPipe(stderr_chan, os.Stderr)
269
270         var cmd *exec.Cmd
271
272         if len(flag.Args()) > 0 {
273                 // Set up subprocess
274                 cmd = exec.Command(flag.Args()[0], flag.Args()[1:]...)
275
276                 logger.Print("Running ", flag.Args())
277
278                 // Child process will use our stdin and stdout pipes
279                 // (we close our copies below)
280                 cmd.Stdin = os.Stdin
281                 cmd.Stdout = os.Stdout
282
283                 // Forward SIGINT and SIGTERM to inner process
284                 term := make(chan os.Signal, 1)
285                 go func(sig <-chan os.Signal) {
286                         catch := <-sig
287                         if cmd.Process != nil {
288                                 cmd.Process.Signal(catch)
289                         }
290                         logger.Print("caught signal: ", catch)
291                 }(term)
292                 signal.Notify(term, syscall.SIGTERM)
293                 signal.Notify(term, syscall.SIGINT)
294
295                 // Funnel stderr through our channel
296                 stderr_pipe, err := cmd.StderrPipe()
297                 if err != nil {
298                         logger.Fatal(err)
299                 }
300                 go CopyPipeToChan(stderr_pipe, stderr_chan, finish_chan)
301
302                 // Run subprocess
303                 if err := cmd.Start(); err != nil {
304                         logger.Fatal(err)
305                 }
306
307                 // Close stdin/stdout in this (parent) process
308                 os.Stdin.Close()
309                 os.Stdout.Close()
310         }
311
312         // Read the cid file
313         var container_id string
314         if cgroup_cidfile != "" {
315                 // wait up to 'wait' seconds for the cid file to appear
316                 ok := false
317                 var i time.Duration
318                 for i = 0; i < time.Duration(wait)*time.Second; i += (100 * time.Millisecond) {
319                         f, err := os.Open(cgroup_cidfile)
320                         if err == nil {
321                                 defer f.Close()
322                                 cid, err2 := ioutil.ReadAll(f)
323                                 if err2 == nil && len(cid) > 0 {
324                                         ok = true
325                                         container_id = string(cid)
326                                         break
327                                 }
328                         }
329                         time.Sleep(100 * time.Millisecond)
330                 }
331                 if !ok {
332                         logger.Printf("Could not read cid file %s", cgroup_cidfile)
333                 }
334         }
335
336         stop_poll_chan := make(chan bool, 1)
337         go PollCgroupStats(cgroup_root, cgroup_parent, container_id, stderr_chan, poll, stop_poll_chan)
338
339         // When the child exits, tell the polling goroutine to stop.
340         defer func() { stop_poll_chan <- true }()
341
342         // Wait for CopyPipeToChan to consume child's stderr pipe
343         <-finish_chan
344
345         return cmd.Wait()
346 }
347
348 func main() {
349         logger := log.New(os.Stderr, "crunchstat: ", 0)
350         if err := run(logger); err != nil {
351                 if exiterr, ok := err.(*exec.ExitError); ok {
352                         // The program has exited with an exit code != 0
353
354                         // This works on both Unix and
355                         // Windows. Although package syscall is
356                         // generally platform dependent, WaitStatus is
357                         // defined for both Unix and Windows and in
358                         // both cases has an ExitStatus() method with
359                         // the same signature.
360                         if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
361                                 os.Exit(status.ExitStatus())
362                         }
363                 } else {
364                         logger.Fatalf("cmd.Wait: %v", err)
365                 }
366         }
367 }