75284856fad62e102433a92282fb3b17f52908f7
[arvados.git] / services / crunch / crunchstat / src / arvados.org / crunchstat / crunchstat.go
1 package main
2
3 import (
4         "bufio"
5         "flag"
6         "fmt"
7         "io"
8         "io/ioutil"
9         "log"
10         "os"
11         "os/exec"
12         "os/signal"
13         "strings"
14         "syscall"
15         "time"
16 )
17
// ReadLineByLine reads inp until EOF (or a read error), sending each line to
// out with its trailing "\n" (and a preceding "\r", if any) removed. A final
// line that is not newline-terminated is still delivered. When reading stops,
// true is sent on finish exactly once.
//
// Lines of any length are delivered intact. Read errors other than io.EOF are
// logged; reading stops at the first error.
func ReadLineByLine(inp io.ReadCloser, out chan string, finish chan bool) {
	r := bufio.NewReader(inp)
	for {
		line, err := r.ReadString('\n')
		if line != "" {
			// Mirror bufio.ScanLines: drop the newline, then one optional CR.
			line = strings.TrimSuffix(line, "\n")
			line = strings.TrimSuffix(line, "\r")
			out <- line
		}
		if err != nil {
			if err != io.EOF {
				log.Printf("crunchstat: error reading subprocess output: %v", err)
			}
			break
		}
	}
	finish <- true
}
25
// OutputChannel relays lines received on the stdout channel to os.Stdout and
// lines received on the stderr channel to os.Stderr, one line per write, each
// followed by a newline. It returns as soon as either channel is observed to
// be closed.
func OutputChannel(stdout chan string, stderr chan string) {
	for {
		var (
			line string
			open bool
			dest *os.File
		)

		// Take whichever stream has a line ready and remember where it goes.
		select {
		case line, open = <-stdout:
			dest = os.Stdout
		case line, open = <-stderr:
			dest = os.Stderr
		}

		if !open {
			return
		}
		fmt.Fprintln(dest, line)
	}
}
44
// countCpusetCPUs returns the number of CPUs named by a cpuset list such as
// "0-3,8": comma-separated single CPU numbers and inclusive ranges. Entries
// that cannot be parsed are ignored, so an empty string yields 0.
func countCpusetCPUs(spec string) int64 {
	var cpus int64
	for _, item := range strings.Split(strings.TrimSpace(spec), ",") {
		var lo, hi int64
		n, _ := fmt.Sscanf(strings.TrimSpace(item), "%d-%d", &lo, &hi)
		switch {
		case n == 2 && hi >= lo:
			cpus += hi - lo + 1
		case n == 1:
			cpus++
		}
	}
	return cpus
}

// PollCgroupStats loops forever, sampling the cgroup directory cgroup_path
// every poll milliseconds and sending human-readable "crunchstat: ..." lines
// to the stderr channel:
//
//   - cpuset.cpus: the CPU count, whenever it changes;
//   - cpuacct.stat: user and sys CPU use since the previous sample, as a
//     percentage averaged over the elapsed time and the CPU count;
//   - blkio.io_service_bytes: per-device read/write byte deltas, once a
//     non-zero previous counter exists for that device;
//   - memory.stat: the current rss value, on every poll.
//
// A file that cannot be read is reported once (and again only after it has
// been readable in between); the corresponding statistic is skipped for that
// poll. The function never returns.
func PollCgroupStats(cgroup_path string, stderr chan string, poll int64) {
	// diskCounters keeps the previous and current cumulative byte counters
	// for one device.
	type diskCounters struct {
		lastRead, nextRead   int64
		lastWrite, nextWrite int64
	}

	var (
		lastUser, lastSys int64
		lastCPUSample     time.Time
		haveCPUSample     bool
		lastCPUCount      int64
	)
	disk := make(map[string]*diskCounters)
	failed := make(map[string]bool)

	cpuacctStat := fmt.Sprintf("%s/cpuacct.stat", cgroup_path)
	blkioServiceBytes := fmt.Sprintf("%s/blkio.io_service_bytes", cgroup_path)
	cpusetCPUs := fmt.Sprintf("%s/cpuset.cpus", cgroup_path)
	memoryStat := fmt.Sprintf("%s/memory.stat", cgroup_path)

	// readStat returns the contents of path. Failures are reported on the
	// stderr channel only on the transition from readable to unreadable so
	// that a missing file does not produce a message on every poll.
	readStat := func(path string) (string, bool) {
		b, err := ioutil.ReadFile(path)
		if err != nil {
			if !failed[path] {
				failed[path] = true
				stderr <- fmt.Sprintf("crunchstat: error reading %s: %v", path, err)
			}
			return "", false
		}
		failed[path] = false
		return string(b), true
	}

	for {
		var cpus int64
		if text, ok := readStat(cpusetCPUs); ok {
			cpus = countCpusetCPUs(text)
			if cpus != lastCPUCount {
				stderr <- fmt.Sprintf("crunchstat: cpuset.cpus %v", cpus)
			}
			lastCPUCount = cpus
		}
		if cpus == 0 {
			// Unknown CPU count: fall back to 1 so the load average below
			// is still well defined.
			cpus = 1
		}

		if text, ok := readStat(cpuacctStat); ok {
			sampled := time.Now()
			var nextUser, nextSys int64
			if n, _ := fmt.Sscanf(text, "user %d\nsystem %d", &nextUser, &nextSys); n == 2 {
				var elapsed int64 // milliseconds since the previous sample
				if haveCPUSample {
					elapsed = sampled.Sub(lastCPUSample).Nanoseconds() / int64(time.Millisecond)
				}
				if elapsed > 0 {
					// Assumes the counters are in jiffies at 100 per
					// second. Elapsed time is in milliseconds, so scale
					// jiffies by 10 to get milliseconds of CPU time, by
					// another 100 to get a percentage, then divide by the
					// elapsed time and the CPU count to get the average
					// load over the period.
					userPct := ((nextUser - lastUser) * 10 * 100) / (elapsed * cpus)
					sysPct := ((nextSys - lastSys) * 10 * 100) / (elapsed * cpus)

					stderr <- fmt.Sprintf("crunchstat: cpuacct.stat user %v", userPct)
					stderr <- fmt.Sprintf("crunchstat: cpuacct.stat sys %v", sysPct)
				}
				// Keep the old baseline when less than 1ms has passed so the
				// delta keeps accumulating instead of dividing by zero.
				if elapsed > 0 || !haveCPUSample {
					lastUser, lastSys = nextUser, nextSys
					lastCPUSample = sampled
					haveCPUSample = true
				}
			}
		}

		if text, ok := readStat(blkioServiceBytes); ok {
			for _, line := range strings.Split(text, "\n") {
				var device, op string
				var next int64
				// Lines that are not "<device> <op> <bytes>" (such as the
				// trailing "Total" line) fail to scan and are skipped.
				if _, err := fmt.Sscanf(line, "%s %s %d", &device, &op, &next); err != nil {
					continue
				}
				d := disk[device]
				if d == nil {
					d = new(diskCounters)
					disk[device] = d
				}
				switch op {
				case "Read":
					d.lastRead, d.nextRead = d.nextRead, next
					if d.lastRead > 0 {
						stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s read %v", device, d.nextRead-d.lastRead)
					}
				case "Write":
					d.lastWrite, d.nextWrite = d.nextWrite, next
					if d.lastWrite > 0 {
						stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s write %v", device, d.nextWrite-d.lastWrite)
					}
				}
			}
		}

		if text, ok := readStat(memoryStat); ok {
			for _, line := range strings.Split(text, "\n") {
				var stat string
				var val int64
				if _, err := fmt.Sscanf(line, "%s %d", &stat, &val); err == nil && stat == "rss" {
					stderr <- fmt.Sprintf("crunchstat: memory.stat rss %v", val)
				}
			}
		}

		time.Sleep(time.Duration(poll) * time.Millisecond)
	}
}
188
189 func main() {
190
191         var (
192                 cgroup_path    string
193                 cgroup_parent  string
194                 cgroup_cidfile string
195                 wait           int64
196                 poll           int64
197         )
198
199         flag.StringVar(&cgroup_path, "cgroup-path", "", "Direct path to cgroup")
200         flag.StringVar(&cgroup_parent, "cgroup-parent", "", "Path to parent cgroup")
201         flag.StringVar(&cgroup_cidfile, "cgroup-cid", "", "Path to container id file")
202         flag.Int64Var(&wait, "wait", 5, "Maximum time (in seconds) to wait for cid file to show up")
203         flag.Int64Var(&poll, "poll", 1000, "Polling frequency, in milliseconds")
204
205         flag.Parse()
206
207         logger := log.New(os.Stderr, "crunchstat: ", 0)
208
209         if cgroup_path == "" && cgroup_cidfile == "" {
210                 logger.Fatal("Must provide either -cgroup-path or -cgroup-cid")
211         }
212
213         // Make output channel
214         stdout_chan := make(chan string)
215         stderr_chan := make(chan string)
216         finish_chan := make(chan bool)
217         defer close(stdout_chan)
218         defer close(stderr_chan)
219         defer close(finish_chan)
220
221         go OutputChannel(stdout_chan, stderr_chan)
222
223         var cmd *exec.Cmd
224
225         if len(flag.Args()) > 0 {
226                 // Set up subprocess
227                 cmd = exec.Command(flag.Args()[0], flag.Args()[1:]...)
228
229                 logger.Print("Running ", flag.Args())
230
231                 // Forward SIGINT and SIGTERM to inner process
232                 term := make(chan os.Signal, 1)
233                 go func(sig <-chan os.Signal) {
234                         catch := <-sig
235                         if cmd.Process != nil {
236                                 cmd.Process.Signal(catch)
237                         }
238                         logger.Print("caught signal:", catch)
239                 }(term)
240                 signal.Notify(term, syscall.SIGTERM)
241                 signal.Notify(term, syscall.SIGINT)
242
243                 // Funnel stdout and stderr from subprocess to output channels
244                 stdout_pipe, err := cmd.StdoutPipe()
245                 if err != nil {
246                         logger.Fatal(err)
247                 }
248                 go ReadLineByLine(stdout_pipe, stdout_chan, finish_chan)
249
250                 stderr_pipe, err := cmd.StderrPipe()
251                 if err != nil {
252                         logger.Fatal(err)
253                 }
254                 go ReadLineByLine(stderr_pipe, stderr_chan, finish_chan)
255
256                 // Run subprocess
257                 if err := cmd.Start(); err != nil {
258                         logger.Fatal(err)
259                 }
260         }
261
262         // Read the cid file
263         if cgroup_cidfile != "" {
264                 // wait up to 'wait' seconds for the cid file to appear
265                 var i time.Duration
266                 for i = 0; i < time.Duration(wait)*time.Second; i += (100 * time.Millisecond) {
267                         f, err := os.Open(cgroup_cidfile)
268                         if err == nil {
269                                 cid, err2 := ioutil.ReadAll(f)
270                                 if err2 == nil && len(cid) > 0 {
271                                         cgroup_path = string(cid)
272                                         f.Close()
273                                         break
274                                 }
275                         }
276                         time.Sleep(100 * time.Millisecond)
277                 }
278                 if cgroup_path == "" {
279                         logger.Printf("Could not read cid file %s", cgroup_cidfile)
280                 }
281         }
282
283         // add the parent prefix
284         if cgroup_parent != "" {
285                 cgroup_path = fmt.Sprintf("%s/%s", cgroup_parent, cgroup_path)
286         }
287
288         logger.Print("Using cgroup ", cgroup_path)
289
290         go PollCgroupStats(cgroup_path, stderr_chan, poll)
291
292         // Wait for each of stdout and stderr to drain
293         <-finish_chan
294         <-finish_chan
295
296         if err := cmd.Wait(); err != nil {
297                 if exiterr, ok := err.(*exec.ExitError); ok {
298                         // The program has exited with an exit code != 0
299
300                         // This works on both Unix and Windows. Although package
301                         // syscall is generally platform dependent, WaitStatus is
302                         // defined for both Unix and Windows and in both cases has
303                         // an ExitStatus() method with the same signature.
304                         if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
305                                 os.Exit(status.ExitStatus())
306                         }
307                 } else {
308                         logger.Fatalf("cmd.Wait: %v", err)
309                 }
310         }
311 }