Add TagsPlacedNTimes stat.
[lightning.git] / stats.go
1 package main
2
3 import (
4         "bufio"
5         "encoding/gob"
6         "encoding/json"
7         "errors"
8         "flag"
9         "fmt"
10         "io"
11         "io/ioutil"
12         "net/http"
13         _ "net/http/pprof"
14         "os"
15
16         "git.arvados.org/arvados.git/sdk/go/arvados"
17         log "github.com/sirupsen/logrus"
18 )
19
20 type stats struct{}
21
22 func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
23         var err error
24         defer func() {
25                 if err != nil {
26                         fmt.Fprintf(stderr, "%s\n", err)
27                 }
28         }()
29         flags := flag.NewFlagSet("", flag.ContinueOnError)
30         flags.SetOutput(stderr)
31         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
32         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
33         projectUUID := flags.String("project", "", "project `UUID` for output data")
34         priority := flags.Int("priority", 500, "container request priority")
35         inputFilename := flags.String("i", "-", "input `file`")
36         outputFilename := flags.String("o", "-", "output `file`")
37         err = flags.Parse(args)
38         if err == flag.ErrHelp {
39                 err = nil
40                 return 0
41         } else if err != nil {
42                 return 2
43         }
44
45         if *pprof != "" {
46                 go func() {
47                         log.Println(http.ListenAndServe(*pprof, nil))
48                 }()
49         }
50
51         if !*runlocal {
52                 if *outputFilename != "-" {
53                         err = errors.New("cannot specify output file in container mode: not implemented")
54                         return 1
55                 }
56                 runner := arvadosContainerRunner{
57                         Name:        "lightning stats",
58                         Client:      arvados.NewClientFromEnv(),
59                         ProjectUUID: *projectUUID,
60                         RAM:         16000000000,
61                         VCPUs:       1,
62                         Priority:    *priority,
63                 }
64                 err = runner.TranslatePaths(inputFilename)
65                 if err != nil {
66                         return 1
67                 }
68                 runner.Args = []string{"stats", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/stats.json"}
69                 var output string
70                 output, err = runner.Run()
71                 if err != nil {
72                         return 1
73                 }
74                 fmt.Fprintln(stdout, output+"/stats.json")
75                 return 0
76         }
77
78         var input io.ReadCloser
79         if *inputFilename == "-" {
80                 input = ioutil.NopCloser(stdin)
81         } else {
82                 input, err = os.Open(*inputFilename)
83                 if err != nil {
84                         return 1
85                 }
86                 defer input.Close()
87         }
88
89         var output io.WriteCloser
90         if *outputFilename == "-" {
91                 output = nopCloser{stdout}
92         } else {
93                 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
94                 if err != nil {
95                         return 1
96                 }
97                 defer output.Close()
98         }
99
100         bufw := bufio.NewWriter(output)
101         cmd.doStats(input, bufw)
102         err = bufw.Flush()
103         if err != nil {
104                 return 1
105         }
106         err = output.Close()
107         if err != nil {
108                 return 1
109         }
110         return 0
111 }
112
113 func (cmd *stats) doStats(input io.Reader, output io.Writer) error {
114         var ret struct {
115                 Genomes          int
116                 Tags             int
117                 TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times
118                 TileVariants     int
119                 VariantsBySize   []int
120                 NCVariantsBySize []int
121         }
122
123         var tagPlacements []int
124         dec := gob.NewDecoder(bufio.NewReaderSize(input, 1<<26))
125         for {
126                 var ent LibraryEntry
127                 err := dec.Decode(&ent)
128                 if err == io.EOF {
129                         break
130                 } else if err != nil {
131                         return err
132                 }
133                 ret.Genomes += len(ent.CompactGenomes)
134                 ret.TileVariants += len(ent.TileVariants)
135                 if len(ent.TagSet) > 0 {
136                         if ret.Tags > 0 {
137                                 return errors.New("invalid input: contains multiple tagsets")
138                         }
139                         ret.Tags = len(ent.TagSet)
140                 }
141                 for _, g := range ent.CompactGenomes {
142                         if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 {
143                                 tagPlacements = append(tagPlacements, make([]int, need)...)
144                         }
145                         for idx, v := range g.Variants {
146                                 if v > 0 {
147                                         tagPlacements[idx/2]++
148                                 }
149                         }
150                 }
151                 for _, tv := range ent.TileVariants {
152                         if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 {
153                                 ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...)
154                                 ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...)
155                         }
156
157                         hasNoCalls := false
158                         for _, b := range tv.Sequence {
159                                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
160                                         hasNoCalls = true
161                                 }
162                         }
163
164                         if hasNoCalls {
165                                 ret.NCVariantsBySize[len(tv.Sequence)]++
166                         } else {
167                                 ret.VariantsBySize[len(tv.Sequence)]++
168                         }
169                 }
170         }
171         for _, p := range tagPlacements {
172                 for len(ret.TagsPlacedNTimes) <= p {
173                         ret.TagsPlacedNTimes = append(ret.TagsPlacedNTimes, 0)
174                 }
175                 ret.TagsPlacedNTimes[p]++
176         }
177
178         return json.NewEncoder(output).Encode(ret)
179 }