Merge branch '3187-pipeline-instance-page' into 3605-improved-dashboard
[arvados.git] / services / crunchstat / crunchstat.go
1 package main
2
3 import (
4         "bufio"
5         "flag"
6         "fmt"
7         "io"
8         "io/ioutil"
9         "log"
10         "os"
11         "os/exec"
12         "os/signal"
13         "strings"
14         "syscall"
15         "time"
16 )
17
// ReadLineByLine reads inp until EOF (or a read error) and sends each line,
// without its trailing "\n" or "\r\n", to out. A final line that is not
// newline-terminated is sent as well. When the input is exhausted it sends
// true on finish. inp is not closed by this function.
//
// Lines are read with bufio.Reader rather than bufio.Scanner because Scanner
// gives up with ErrTooLong on lines longer than 64 KiB, which would silently
// drop the rest of the stream and leave the writer blocked on a full pipe.
func ReadLineByLine(inp io.ReadCloser, out chan string, finish chan bool) {
	r := bufio.NewReader(inp)
	for {
		line, err := r.ReadString('\n')
		if len(line) > 0 {
			// Strip the line terminator the same way bufio.ScanLines does.
			line = strings.TrimSuffix(line, "\n")
			line = strings.TrimSuffix(line, "\r")
			out <- line
		}
		if err != nil {
			// io.EOF and real read errors both end the stream; whatever
			// was read before the error has already been forwarded.
			break
		}
	}
	finish <- true
}
25
// OutputChannel relays lines to the process's own output streams: every
// string received on stdout is printed to os.Stdout and every string received
// on stderr is printed to os.Stderr, each followed by a newline. It returns
// as soon as either channel is found to be closed.
func OutputChannel(stdout chan string, stderr chan string) {
	for {
		var (
			line string
			open bool
			dest *os.File
		)
		// Wait for whichever stream has something to deliver first.
		select {
		case line, open = <-stdout:
			dest = os.Stdout
		case line, open = <-stderr:
			dest = os.Stderr
		}
		if !open {
			return
		}
		fmt.Fprintln(dest, line)
	}
}
44
// FindStat locates the file "<statgroup>.<stat>" for a container by probing,
// in order, these directories:
//
//	<cgroup_root>/<statgroup>/<cgroup_parent>/<container_id>
//	<cgroup_root>/<cgroup_parent>/<container_id>
//	<cgroup_root>/<statgroup>
//	<cgroup_root>
//
// It returns the first candidate path for which os.Stat succeeds, or "" if
// none of them exists.
func FindStat(cgroup_root string, cgroup_parent string, container_id string, statgroup string, stat string) string {
	leaf := fmt.Sprintf("%s.%s", statgroup, stat)
	dirs := []string{
		fmt.Sprintf("%s/%s/%s/%s", cgroup_root, statgroup, cgroup_parent, container_id),
		fmt.Sprintf("%s/%s/%s", cgroup_root, cgroup_parent, container_id),
		fmt.Sprintf("%s/%s", cgroup_root, statgroup),
		cgroup_root,
	}
	for _, dir := range dirs {
		candidate := dir + "/" + leaf
		if _, err := os.Stat(candidate); err == nil {
			return candidate
		}
	}
	return ""
}
65
66 func PollCgroupStats(cgroup_root string, cgroup_parent string, container_id string, stderr chan string, poll int64) {
67         //var last_usage int64 = 0
68         var last_user int64 = 0
69         var last_sys int64 = 0
70         var last_cpucount int64 = 0
71
72         type Disk struct {
73                 last_read  int64
74                 next_read  int64
75                 last_write int64
76                 next_write int64
77         }
78
79         disk := make(map[string]*Disk)
80
81         //cpuacct_usage := FindStat(cgroup_path, "cpuacct", "usage")
82         cpuacct_stat := FindStat(cgroup_root, cgroup_parent, container_id, "cpuacct", "stat")
83         blkio_io_service_bytes := FindStat(cgroup_root, cgroup_parent, container_id, "blkio", "io_service_bytes")
84         cpuset_cpus := FindStat(cgroup_root, cgroup_parent, container_id, "cpuset", "cpus")
85         memory_stat := FindStat(cgroup_root, cgroup_parent, container_id, "memory", "stat")
86
87         if cpuacct_stat != "" {
88                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuacct_stat)
89         }
90         if blkio_io_service_bytes != "" {
91                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", blkio_io_service_bytes)
92         }
93         if cpuset_cpus != "" {
94                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", cpuset_cpus)
95         }
96         if memory_stat != "" {
97                 stderr <- fmt.Sprintf("crunchstat: reading stats from %s", memory_stat)
98         }
99
100         var elapsed int64 = poll
101
102         for {
103                 /*{
104                         c, _ := os.Open(cpuacct_usage)
105                         b, _ := ioutil.ReadAll(c)
106                         var next int64
107                         fmt.Sscanf(string(b), "%d", &next)
108                         if last_usage != 0 {
109                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.usage %v", (next-last_usage)/10000000)
110                         }
111                         //fmt.Printf("usage %d %d %d %d%%\n", last_usage, next, next-last_usage, (next-last_usage)/10000000)
112                         last_usage = next
113                         c.Close()
114                 }*/
115                 var cpus int64 = 0
116                 if cpuset_cpus != "" {
117                         c, _ := os.Open(cpuset_cpus)
118                         b, _ := ioutil.ReadAll(c)
119                         sp := strings.Split(string(b), ",")
120                         for _, v := range sp {
121                                 var min, max int64
122                                 n, _ := fmt.Sscanf(v, "%d-%d", &min, &max)
123                                 if n == 2 {
124                                         cpus += (max - min) + 1
125                                 } else {
126                                         cpus += 1
127                                 }
128                         }
129
130                         if cpus != last_cpucount {
131                                 stderr <- fmt.Sprintf("crunchstat: cpuset.cpus %v", cpus)
132                         }
133                         last_cpucount = cpus
134
135                         c.Close()
136                 }
137                 if cpus == 0 {
138                         cpus = 1
139                 }
140                 if cpuacct_stat != "" {
141                         c, _ := os.Open(cpuacct_stat)
142                         b, _ := ioutil.ReadAll(c)
143                         var next_user int64
144                         var next_sys int64
145                         fmt.Sscanf(string(b), "user %d\nsystem %d", &next_user, &next_sys)
146                         c.Close()
147
148                         if last_user != 0 {
149                                 user_diff := next_user - last_user
150                                 sys_diff := next_sys - last_sys
151                                 // Assume we're reading stats based on 100
152                                 // jiffies per second.  Because the elapsed
153                                 // time is in milliseconds, we need to boost
154                                 // that to 1000 jiffies per second, then boost
155                                 // it by another 100x to get a percentage, then
156                                 // finally divide by the actual elapsed time
157                                 // and the number of cpus to get average load
158                                 // over the polling period.
159                                 user_pct := (user_diff * 10 * 100) / (elapsed * cpus)
160                                 sys_pct := (sys_diff * 10 * 100) / (elapsed * cpus)
161
162                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.stat user %v", user_pct)
163                                 stderr <- fmt.Sprintf("crunchstat: cpuacct.stat sys %v", sys_pct)
164                         }
165
166                         /*fmt.Printf("user %d %d %d%%\n", last_user, next_user, next_user-last_user)
167                         fmt.Printf("sys %d %d %d%%\n", last_sys, next_sys, next_sys-last_sys)
168                         fmt.Printf("sum %d%%\n", (next_user-last_user)+(next_sys-last_sys))*/
169                         last_user = next_user
170                         last_sys = next_sys
171                 }
172                 if blkio_io_service_bytes != "" {
173                         c, _ := os.Open(blkio_io_service_bytes)
174                         b := bufio.NewScanner(c)
175                         var device, op string
176                         var next int64
177                         for b.Scan() {
178                                 if _, err := fmt.Sscanf(string(b.Text()), "%s %s %d", &device, &op, &next); err == nil {
179                                         if disk[device] == nil {
180                                                 disk[device] = new(Disk)
181                                         }
182                                         if op == "Read" {
183                                                 disk[device].last_read = disk[device].next_read
184                                                 disk[device].next_read = next
185                                                 if disk[device].last_read > 0 && (disk[device].next_read != disk[device].last_read) {
186                                                         stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s read %v", device, disk[device].next_read-disk[device].last_read)
187                                                 }
188                                         }
189                                         if op == "Write" {
190                                                 disk[device].last_write = disk[device].next_write
191                                                 disk[device].next_write = next
192                                                 if disk[device].last_write > 0 && (disk[device].next_write != disk[device].last_write) {
193                                                         stderr <- fmt.Sprintf("crunchstat: blkio.io_service_bytes %s write %v", device, disk[device].next_write-disk[device].last_write)
194                                                 }
195                                         }
196                                 }
197                         }
198                         c.Close()
199                 }
200
201                 if memory_stat != "" {
202                         c, _ := os.Open(memory_stat)
203                         b := bufio.NewScanner(c)
204                         var stat string
205                         var val int64
206                         for b.Scan() {
207                                 if _, err := fmt.Sscanf(string(b.Text()), "%s %d", &stat, &val); err == nil {
208                                         if stat == "rss" {
209                                                 stderr <- fmt.Sprintf("crunchstat: memory.stat rss %v", val)
210                                         }
211                                 }
212                         }
213                         c.Close()
214                 }
215
216                 bedtime := time.Now()
217                 time.Sleep(time.Duration(poll) * time.Millisecond)
218                 morning := time.Now()
219                 elapsed = morning.Sub(bedtime).Nanoseconds() / int64(time.Millisecond)
220         }
221 }
222
223 func main() {
224
225         var (
226                 cgroup_root    string
227                 cgroup_parent  string
228                 cgroup_cidfile string
229                 wait           int64
230                 poll           int64
231         )
232
233         flag.StringVar(&cgroup_root, "cgroup-root", "", "Root of cgroup tree")
234         flag.StringVar(&cgroup_parent, "cgroup-parent", "", "Name of container parent under cgroup")
235         flag.StringVar(&cgroup_cidfile, "cgroup-cid", "", "Path to container id file")
236         flag.Int64Var(&wait, "wait", 5, "Maximum time (in seconds) to wait for cid file to show up")
237         flag.Int64Var(&poll, "poll", 1000, "Polling frequency, in milliseconds")
238
239         flag.Parse()
240
241         logger := log.New(os.Stderr, "crunchstat: ", 0)
242
243         if cgroup_root == "" {
244                 logger.Fatal("Must provide -cgroup-root")
245         }
246
247         // Make output channel
248         stdout_chan := make(chan string)
249         stderr_chan := make(chan string)
250         finish_chan := make(chan bool)
251         defer close(stdout_chan)
252         defer close(stderr_chan)
253         defer close(finish_chan)
254
255         go OutputChannel(stdout_chan, stderr_chan)
256
257         var cmd *exec.Cmd
258
259         if len(flag.Args()) > 0 {
260                 // Set up subprocess
261                 cmd = exec.Command(flag.Args()[0], flag.Args()[1:]...)
262
263                 logger.Print("Running ", flag.Args())
264
265                 // Child process will read from our stdin pipe (we
266                 // close our copy below)
267                 cmd.Stdin = os.Stdin
268
269                 // Forward SIGINT and SIGTERM to inner process
270                 term := make(chan os.Signal, 1)
271                 go func(sig <-chan os.Signal) {
272                         catch := <-sig
273                         if cmd.Process != nil {
274                                 cmd.Process.Signal(catch)
275                         }
276                         logger.Print("caught signal:", catch)
277                 }(term)
278                 signal.Notify(term, syscall.SIGTERM)
279                 signal.Notify(term, syscall.SIGINT)
280
281                 // Funnel stdout and stderr from subprocess to output channels
282                 stdout_pipe, err := cmd.StdoutPipe()
283                 if err != nil {
284                         logger.Fatal(err)
285                 }
286                 go ReadLineByLine(stdout_pipe, stdout_chan, finish_chan)
287
288                 stderr_pipe, err := cmd.StderrPipe()
289                 if err != nil {
290                         logger.Fatal(err)
291                 }
292                 go ReadLineByLine(stderr_pipe, stderr_chan, finish_chan)
293
294                 // Run subprocess
295                 if err := cmd.Start(); err != nil {
296                         logger.Fatal(err)
297                 }
298         }
299
300         // Close standard input in this (parent) process
301         os.Stdin.Close()
302
303         // Read the cid file
304         var container_id string
305         if cgroup_cidfile != "" {
306                 // wait up to 'wait' seconds for the cid file to appear
307                 var i time.Duration
308                 for i = 0; i < time.Duration(wait)*time.Second; i += (100 * time.Millisecond) {
309                         f, err := os.Open(cgroup_cidfile)
310                         if err == nil {
311                                 cid, err2 := ioutil.ReadAll(f)
312                                 if err2 == nil && len(cid) > 0 {
313                                         container_id = string(cid)
314                                         f.Close()
315                                         break
316                                 }
317                         }
318                         time.Sleep(100 * time.Millisecond)
319                 }
320                 if cgroup_root == "" {
321                         logger.Printf("Could not read cid file %s", cgroup_cidfile)
322                 }
323         }
324
325         go PollCgroupStats(cgroup_root, cgroup_parent, container_id, stderr_chan, poll)
326
327         // Wait for each of stdout and stderr to drain
328         <-finish_chan
329         <-finish_chan
330
331         if err := cmd.Wait(); err != nil {
332                 if exiterr, ok := err.(*exec.ExitError); ok {
333                         // The program has exited with an exit code != 0
334
335                         // This works on both Unix and Windows. Although package
336                         // syscall is generally platform dependent, WaitStatus is
337                         // defined for both Unix and Windows and in both cases has
338                         // an ExitStatus() method with the same signature.
339                         if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
340                                 os.Exit(status.ExitStatus())
341                         }
342                 } else {
343                         logger.Fatalf("cmd.Wait: %v", err)
344                 }
345         }
346 }