Option to output full list of unplaced tags.
[lightning.git] / stats.go
1 package main
2
3 import (
4         "bufio"
5         "encoding/gob"
6         "encoding/json"
7         "errors"
8         "flag"
9         "fmt"
10         "io"
11         "io/ioutil"
12         "net/http"
13         _ "net/http/pprof"
14         "os"
15
16         "git.arvados.org/arvados.git/sdk/go/arvados"
17         log "github.com/sirupsen/logrus"
18 )
19
// stats holds the settings for the "stats" subcommand.
type stats struct {
	// debugUnplaced is set by the -debug-unplaced flag. When true, the
	// report includes the full list of tags that were never placed.
	debugUnplaced bool
}
23
24 func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
25         var err error
26         defer func() {
27                 if err != nil {
28                         fmt.Fprintf(stderr, "%s\n", err)
29                 }
30         }()
31         flags := flag.NewFlagSet("", flag.ContinueOnError)
32         flags.SetOutput(stderr)
33         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
34         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
35         projectUUID := flags.String("project", "", "project `UUID` for output data")
36         priority := flags.Int("priority", 500, "container request priority")
37         inputFilename := flags.String("i", "-", "input `file`")
38         outputFilename := flags.String("o", "-", "output `file`")
39         flags.BoolVar(&cmd.debugUnplaced, "debug-unplaced", false, "output full list of unplaced tags")
40         err = flags.Parse(args)
41         if err == flag.ErrHelp {
42                 err = nil
43                 return 0
44         } else if err != nil {
45                 return 2
46         }
47
48         if *pprof != "" {
49                 go func() {
50                         log.Println(http.ListenAndServe(*pprof, nil))
51                 }()
52         }
53
54         if !*runlocal {
55                 if *outputFilename != "-" {
56                         err = errors.New("cannot specify output file in container mode: not implemented")
57                         return 1
58                 }
59                 runner := arvadosContainerRunner{
60                         Name:        "lightning stats",
61                         Client:      arvados.NewClientFromEnv(),
62                         ProjectUUID: *projectUUID,
63                         RAM:         16000000000,
64                         VCPUs:       1,
65                         Priority:    *priority,
66                 }
67                 err = runner.TranslatePaths(inputFilename)
68                 if err != nil {
69                         return 1
70                 }
71                 runner.Args = []string{"stats", "-local=true", fmt.Sprintf("-debug-unplaced=%v", cmd.debugUnplaced), "-i", *inputFilename, "-o", "/mnt/output/stats.json"}
72                 var output string
73                 output, err = runner.Run()
74                 if err != nil {
75                         return 1
76                 }
77                 fmt.Fprintln(stdout, output+"/stats.json")
78                 return 0
79         }
80
81         var input io.ReadCloser
82         if *inputFilename == "-" {
83                 input = ioutil.NopCloser(stdin)
84         } else {
85                 input, err = os.Open(*inputFilename)
86                 if err != nil {
87                         return 1
88                 }
89                 defer input.Close()
90         }
91
92         var output io.WriteCloser
93         if *outputFilename == "-" {
94                 output = nopCloser{stdout}
95         } else {
96                 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
97                 if err != nil {
98                         return 1
99                 }
100                 defer output.Close()
101         }
102
103         bufw := bufio.NewWriter(output)
104         cmd.doStats(input, bufw)
105         err = bufw.Flush()
106         if err != nil {
107                 return 1
108         }
109         err = output.Close()
110         if err != nil {
111                 return 1
112         }
113         return 0
114 }
115
116 func (cmd *stats) doStats(input io.Reader, output io.Writer) error {
117         var ret struct {
118                 Genomes          int
119                 Tags             int
120                 TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times
121                 TileVariants     int
122                 VariantsBySize   []int
123                 NCVariantsBySize []int
124                 UnplacedTags     []string `json:",omitempty"`
125         }
126
127         var tagSet [][]byte
128         var tagPlacements []int
129         dec := gob.NewDecoder(bufio.NewReaderSize(input, 1<<26))
130         for {
131                 var ent LibraryEntry
132                 err := dec.Decode(&ent)
133                 if err == io.EOF {
134                         break
135                 } else if err != nil {
136                         return err
137                 }
138                 ret.Genomes += len(ent.CompactGenomes)
139                 ret.TileVariants += len(ent.TileVariants)
140                 if len(ent.TagSet) > 0 {
141                         if ret.Tags > 0 {
142                                 return errors.New("invalid input: contains multiple tagsets")
143                         }
144                         ret.Tags = len(ent.TagSet)
145                         tagSet = ent.TagSet
146                 }
147                 for _, g := range ent.CompactGenomes {
148                         if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 {
149                                 tagPlacements = append(tagPlacements, make([]int, need)...)
150                         }
151                         for idx, v := range g.Variants {
152                                 if v > 0 {
153                                         tagPlacements[idx/2]++
154                                 }
155                         }
156                 }
157                 for _, tv := range ent.TileVariants {
158                         if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 {
159                                 ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...)
160                                 ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...)
161                         }
162
163                         hasNoCalls := false
164                         for _, b := range tv.Sequence {
165                                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
166                                         hasNoCalls = true
167                                 }
168                         }
169
170                         if hasNoCalls {
171                                 ret.NCVariantsBySize[len(tv.Sequence)]++
172                         } else {
173                                 ret.VariantsBySize[len(tv.Sequence)]++
174                         }
175                 }
176         }
177         for id, p := range tagPlacements {
178                 for len(ret.TagsPlacedNTimes) <= p {
179                         ret.TagsPlacedNTimes = append(ret.TagsPlacedNTimes, 0)
180                 }
181                 ret.TagsPlacedNTimes[p]++
182                 if cmd.debugUnplaced && p == 0 {
183                         ret.UnplacedTags = append(ret.UnplacedTags, fmt.Sprintf("%d %s", id, tagSet[id]))
184                 }
185         }
186
187         return json.NewEncoder(output).Encode(ret)
188 }