Merge branch '19527-training-set'
[lightning.git] / stats.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "bufio"
9         "encoding/json"
10         "errors"
11         "flag"
12         "fmt"
13         "io"
14         "io/ioutil"
15         "net/http"
16         _ "net/http/pprof"
17         "os"
18         "strings"
19
20         "git.arvados.org/arvados.git/sdk/go/arvados"
21         log "github.com/sirupsen/logrus"
22 )
23
24 type statscmd struct {
25         debugUnplaced bool
26 }
27
28 func (cmd *statscmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
29         var err error
30         defer func() {
31                 if err != nil {
32                         fmt.Fprintf(stderr, "%s\n", err)
33                 }
34         }()
35         flags := flag.NewFlagSet("", flag.ContinueOnError)
36         flags.SetOutput(stderr)
37         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
38         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
39         projectUUID := flags.String("project", "", "project `UUID` for output data")
40         priority := flags.Int("priority", 500, "container request priority")
41         inputFilename := flags.String("i", "-", "input `file`")
42         outputFilename := flags.String("o", "-", "output `file`")
43         flags.BoolVar(&cmd.debugUnplaced, "debug-unplaced", false, "output full list of unplaced tags")
44         err = flags.Parse(args)
45         if err == flag.ErrHelp {
46                 err = nil
47                 return 0
48         } else if err != nil {
49                 return 2
50         }
51
52         if *pprof != "" {
53                 go func() {
54                         log.Println(http.ListenAndServe(*pprof, nil))
55                 }()
56         }
57
58         if !*runlocal {
59                 if *outputFilename != "-" {
60                         err = errors.New("cannot specify output file in container mode: not implemented")
61                         return 1
62                 }
63                 runner := arvadosContainerRunner{
64                         Name:        "lightning stats",
65                         Client:      arvados.NewClientFromEnv(),
66                         ProjectUUID: *projectUUID,
67                         RAM:         16000000000,
68                         VCPUs:       1,
69                         Priority:    *priority,
70                 }
71                 err = runner.TranslatePaths(inputFilename)
72                 if err != nil {
73                         return 1
74                 }
75                 runner.Args = []string{"stats", "-local=true", fmt.Sprintf("-debug-unplaced=%v", cmd.debugUnplaced), "-i", *inputFilename, "-o", "/mnt/output/stats.json"}
76                 var output string
77                 output, err = runner.Run()
78                 if err != nil {
79                         return 1
80                 }
81                 fmt.Fprintln(stdout, output+"/stats.json")
82                 return 0
83         }
84
85         var input io.ReadCloser
86         if *inputFilename == "-" {
87                 input = ioutil.NopCloser(stdin)
88         } else {
89                 input, err = os.Open(*inputFilename)
90                 if err != nil {
91                         return 1
92                 }
93                 defer input.Close()
94         }
95
96         var output io.WriteCloser
97         if *outputFilename == "-" {
98                 output = nopCloser{stdout}
99         } else {
100                 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
101                 if err != nil {
102                         return 1
103                 }
104                 defer output.Close()
105         }
106
107         bufw := bufio.NewWriter(output)
108         err = cmd.doStats(input, strings.HasSuffix(*inputFilename, ".gz"), bufw)
109         if err != nil {
110                 return 1
111         }
112         err = bufw.Flush()
113         if err != nil {
114                 return 1
115         }
116         err = output.Close()
117         if err != nil {
118                 return 1
119         }
120         return 0
121 }
122
123 func (cmd *statscmd) doStats(input io.Reader, gz bool, output io.Writer) error {
124         var ret struct {
125                 Genomes          int
126                 CalledBases      []int64
127                 Tags             int
128                 TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times
129                 TileVariants     int
130                 VariantsBySize   []int
131                 NCVariantsBySize []int
132                 UnplacedTags     []string `json:",omitempty"`
133         }
134
135         var tagSet [][]byte
136         var tagPlacements []int
137         tileVariantCalls := map[tileLibRef]int{}
138         err := DecodeLibrary(input, gz, func(ent *LibraryEntry) error {
139                 ret.Genomes += len(ent.CompactGenomes)
140                 ret.TileVariants += len(ent.TileVariants)
141                 if len(ent.TagSet) > 0 {
142                         if ret.Tags > 0 {
143                                 return errors.New("invalid input: contains multiple tagsets")
144                         }
145                         ret.Tags = len(ent.TagSet)
146                         tagSet = ent.TagSet
147                 }
148                 for _, tv := range ent.TileVariants {
149                         if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 {
150                                 ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...)
151                                 ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...)
152                         }
153
154                         calls := 0
155                         hasNoCalls := false
156                         for _, b := range tv.Sequence {
157                                 if b == 'a' || b == 'c' || b == 'g' || b == 't' {
158                                         calls++
159                                 } else {
160                                         hasNoCalls = true
161                                 }
162                         }
163
164                         if hasNoCalls {
165                                 ret.NCVariantsBySize[len(tv.Sequence)]++
166                         } else {
167                                 ret.VariantsBySize[len(tv.Sequence)]++
168                         }
169
170                         tileVariantCalls[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = calls
171                 }
172                 for _, g := range ent.CompactGenomes {
173                         if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 {
174                                 tagPlacements = append(tagPlacements, make([]int, need)...)
175                         }
176                         calledBases := int64(0)
177                         for idx, v := range g.Variants {
178                                 if v > 0 {
179                                         tagPlacements[idx/2]++
180                                         calledBases += int64(tileVariantCalls[tileLibRef{Tag: tagID(idx / 2), Variant: v}])
181                                 }
182                         }
183                         ret.CalledBases = append(ret.CalledBases, calledBases)
184                 }
185                 return nil
186         })
187         if err != nil {
188                 return err
189         }
190         for id, p := range tagPlacements {
191                 for len(ret.TagsPlacedNTimes) <= p {
192                         ret.TagsPlacedNTimes = append(ret.TagsPlacedNTimes, 0)
193                 }
194                 ret.TagsPlacedNTimes[p]++
195                 if cmd.debugUnplaced && p == 0 {
196                         ret.UnplacedTags = append(ret.UnplacedTags, fmt.Sprintf("%d %s", id, tagSet[id]))
197                 }
198         }
199
200         return json.NewEncoder(output).Encode(ret)
201 }