Write import stats to stats.json.
[lightning.git] / stats.go
1 package main
2
3 import (
4         "bufio"
5         "encoding/gob"
6         "encoding/json"
7         "errors"
8         "flag"
9         "fmt"
10         "io"
11         "io/ioutil"
12         "net/http"
13         _ "net/http/pprof"
14         "os"
15
16         "git.arvados.org/arvados.git/sdk/go/arvados"
17         log "github.com/sirupsen/logrus"
18 )
19
// statscmd holds the options of the "stats" command, which summarizes
// the contents of a gob-encoded library as JSON.
type statscmd struct {
	// debugUnplaced is set by the -debug-unplaced flag. When true,
	// the stats output also lists every tag that was never placed.
	debugUnplaced bool
}
23
24 func (cmd *statscmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
25         var err error
26         defer func() {
27                 if err != nil {
28                         fmt.Fprintf(stderr, "%s\n", err)
29                 }
30         }()
31         flags := flag.NewFlagSet("", flag.ContinueOnError)
32         flags.SetOutput(stderr)
33         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
34         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
35         projectUUID := flags.String("project", "", "project `UUID` for output data")
36         priority := flags.Int("priority", 500, "container request priority")
37         inputFilename := flags.String("i", "-", "input `file`")
38         outputFilename := flags.String("o", "-", "output `file`")
39         flags.BoolVar(&cmd.debugUnplaced, "debug-unplaced", false, "output full list of unplaced tags")
40         err = flags.Parse(args)
41         if err == flag.ErrHelp {
42                 err = nil
43                 return 0
44         } else if err != nil {
45                 return 2
46         }
47
48         if *pprof != "" {
49                 go func() {
50                         log.Println(http.ListenAndServe(*pprof, nil))
51                 }()
52         }
53
54         if !*runlocal {
55                 if *outputFilename != "-" {
56                         err = errors.New("cannot specify output file in container mode: not implemented")
57                         return 1
58                 }
59                 runner := arvadosContainerRunner{
60                         Name:        "lightning stats",
61                         Client:      arvados.NewClientFromEnv(),
62                         ProjectUUID: *projectUUID,
63                         RAM:         16000000000,
64                         VCPUs:       1,
65                         Priority:    *priority,
66                 }
67                 err = runner.TranslatePaths(inputFilename)
68                 if err != nil {
69                         return 1
70                 }
71                 runner.Args = []string{"stats", "-local=true", fmt.Sprintf("-debug-unplaced=%v", cmd.debugUnplaced), "-i", *inputFilename, "-o", "/mnt/output/stats.json"}
72                 var output string
73                 output, err = runner.Run()
74                 if err != nil {
75                         return 1
76                 }
77                 fmt.Fprintln(stdout, output+"/stats.json")
78                 return 0
79         }
80
81         var input io.ReadCloser
82         if *inputFilename == "-" {
83                 input = ioutil.NopCloser(stdin)
84         } else {
85                 input, err = os.Open(*inputFilename)
86                 if err != nil {
87                         return 1
88                 }
89                 defer input.Close()
90         }
91
92         var output io.WriteCloser
93         if *outputFilename == "-" {
94                 output = nopCloser{stdout}
95         } else {
96                 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
97                 if err != nil {
98                         return 1
99                 }
100                 defer output.Close()
101         }
102
103         bufw := bufio.NewWriter(output)
104         err = cmd.doStats(input, bufw)
105         if err != nil {
106                 return 1
107         }
108         err = bufw.Flush()
109         if err != nil {
110                 return 1
111         }
112         err = output.Close()
113         if err != nil {
114                 return 1
115         }
116         return 0
117 }
118
119 func (cmd *statscmd) doStats(input io.Reader, output io.Writer) error {
120         var ret struct {
121                 Genomes          int
122                 CalledBases      []int64
123                 Tags             int
124                 TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times
125                 TileVariants     int
126                 VariantsBySize   []int
127                 NCVariantsBySize []int
128                 UnplacedTags     []string `json:",omitempty"`
129         }
130
131         var tagSet [][]byte
132         var tagPlacements []int
133         tileVariantCalls := map[tileLibRef]int{}
134         dec := gob.NewDecoder(bufio.NewReaderSize(input, 1<<26))
135         for {
136                 var ent LibraryEntry
137                 err := dec.Decode(&ent)
138                 if err == io.EOF {
139                         break
140                 } else if err != nil {
141                         return fmt.Errorf("gob decode: %w", err)
142                 }
143                 ret.Genomes += len(ent.CompactGenomes)
144                 ret.TileVariants += len(ent.TileVariants)
145                 if len(ent.TagSet) > 0 {
146                         if ret.Tags > 0 {
147                                 return errors.New("invalid input: contains multiple tagsets")
148                         }
149                         ret.Tags = len(ent.TagSet)
150                         tagSet = ent.TagSet
151                 }
152                 for _, tv := range ent.TileVariants {
153                         if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 {
154                                 ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...)
155                                 ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...)
156                         }
157
158                         calls := 0
159                         hasNoCalls := false
160                         for _, b := range tv.Sequence {
161                                 if b == 'a' || b == 'c' || b == 'g' || b == 't' {
162                                         calls++
163                                 } else {
164                                         hasNoCalls = true
165                                 }
166                         }
167
168                         if hasNoCalls {
169                                 ret.NCVariantsBySize[len(tv.Sequence)]++
170                         } else {
171                                 ret.VariantsBySize[len(tv.Sequence)]++
172                         }
173
174                         tileVariantCalls[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = calls
175                 }
176                 for _, g := range ent.CompactGenomes {
177                         if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 {
178                                 tagPlacements = append(tagPlacements, make([]int, need)...)
179                         }
180                         calledBases := int64(0)
181                         for idx, v := range g.Variants {
182                                 if v > 0 {
183                                         tagPlacements[idx/2]++
184                                         calledBases += int64(tileVariantCalls[tileLibRef{Tag: tagID(idx / 2), Variant: v}])
185                                 }
186                         }
187                         ret.CalledBases = append(ret.CalledBases, calledBases)
188                 }
189         }
190         for id, p := range tagPlacements {
191                 for len(ret.TagsPlacedNTimes) <= p {
192                         ret.TagsPlacedNTimes = append(ret.TagsPlacedNTimes, 0)
193                 }
194                 ret.TagsPlacedNTimes[p]++
195                 if cmd.debugUnplaced && p == 0 {
196                         ret.UnplacedTags = append(ret.UnplacedTags, fmt.Sprintf("%d %s", id, tagSet[id]))
197                 }
198         }
199
200         return json.NewEncoder(output).Encode(ret)
201 }