Add copyright headers.
[lightning.git] / filter.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "bufio"
9         "encoding/gob"
10         "errors"
11         "flag"
12         "fmt"
13         "io"
14         "io/ioutil"
15         "net/http"
16         _ "net/http/pprof"
17         "os"
18         "strings"
19
20         "git.arvados.org/arvados.git/sdk/go/arvados"
21         log "github.com/sirupsen/logrus"
22 )
23
24 type filter struct {
25         MaxVariants int
26         MinCoverage float64
27         MaxTag      int
28 }
29
30 func (f *filter) Flags(flags *flag.FlagSet) {
31         flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants")
32         flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
33         flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`")
34 }
35
36 func (f *filter) Args() []string {
37         return []string{
38                 fmt.Sprintf("-max-variants=%d", f.MaxVariants),
39                 fmt.Sprintf("-min-coverage=%f", f.MinCoverage),
40                 fmt.Sprintf("-max-tag=%d", f.MaxTag),
41         }
42 }
43
44 func (f *filter) Apply(tilelib *tileLibrary) {
45         // Zero out variants at tile positions that have more than
46         // f.MaxVariants tile variants.
47         if f.MaxVariants >= 0 {
48                 for tag, variants := range tilelib.variant {
49                         if f.MaxTag >= 0 && tag >= f.MaxTag {
50                                 break
51                         }
52                         if len(variants) <= f.MaxVariants {
53                                 continue
54                         }
55                         for _, cg := range tilelib.compactGenomes {
56                                 if len(cg) > tag*2 {
57                                         cg[tag*2] = 0
58                                         cg[tag*2+1] = 0
59                                 }
60                         }
61                 }
62         }
63
64         // Zero out variants at tile positions that have less than
65         // f.MinCoverage.
66         mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1)
67 TAG:
68         for tag := 0; tag < len(tilelib.variant) && tag < f.MaxTag; tag++ {
69                 tagcov := 0
70                 for _, cg := range tilelib.compactGenomes {
71                         if cg[tag*2] > 0 {
72                                 tagcov++
73                         }
74                         if cg[tag*2+1] > 0 {
75                                 tagcov++
76                         }
77                         if tagcov >= mincov {
78                                 continue TAG
79                         }
80                 }
81                 for _, cg := range tilelib.compactGenomes {
82                         cg[tag*2] = 0
83                         cg[tag*2+1] = 0
84                 }
85         }
86
87         // Truncate genomes and tile data to f.MaxTag (TODO: truncate
88         // refseqs too)
89         if f.MaxTag >= 0 {
90                 if len(tilelib.variant) > f.MaxTag {
91                         tilelib.variant = tilelib.variant[:f.MaxTag]
92                 }
93                 for name, cg := range tilelib.compactGenomes {
94                         if len(cg) > 2*f.MaxTag {
95                                 tilelib.compactGenomes[name] = cg[:2*f.MaxTag]
96                         }
97                 }
98         }
99 }
100
101 type filtercmd struct {
102         output io.Writer
103         filter
104 }
105
106 func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
107         var err error
108         defer func() {
109                 if err != nil {
110                         fmt.Fprintf(stderr, "%s\n", err)
111                 }
112         }()
113         flags := flag.NewFlagSet("", flag.ContinueOnError)
114         flags.SetOutput(stderr)
115         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
116         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
117         projectUUID := flags.String("project", "", "project `UUID` for output data")
118         priority := flags.Int("priority", 500, "container request priority")
119         inputFilename := flags.String("i", "-", "input `file`")
120         outputFilename := flags.String("o", "-", "output `file`")
121         cmd.filter.Flags(flags)
122         err = flags.Parse(args)
123         if err == flag.ErrHelp {
124                 err = nil
125                 return 0
126         } else if err != nil {
127                 return 2
128         }
129         cmd.output = stdout
130
131         if *pprof != "" {
132                 go func() {
133                         log.Println(http.ListenAndServe(*pprof, nil))
134                 }()
135         }
136
137         if !*runlocal {
138                 if *outputFilename != "-" {
139                         err = errors.New("cannot specify output file in container mode: not implemented")
140                         return 1
141                 }
142                 runner := arvadosContainerRunner{
143                         Name:        "lightning filter",
144                         Client:      arvados.NewClientFromEnv(),
145                         ProjectUUID: *projectUUID,
146                         RAM:         64000000000,
147                         VCPUs:       2,
148                         Priority:    *priority,
149                 }
150                 err = runner.TranslatePaths(inputFilename)
151                 if err != nil {
152                         return 1
153                 }
154                 runner.Args = []string{"filter", "-local=true",
155                         "-i", *inputFilename,
156                         "-o", "/mnt/output/library.gob",
157                         "-max-variants", fmt.Sprintf("%d", cmd.MaxVariants),
158                         "-min-coverage", fmt.Sprintf("%f", cmd.MinCoverage),
159                         "-max-tag", fmt.Sprintf("%d", cmd.MaxTag),
160                 }
161                 var output string
162                 output, err = runner.Run()
163                 if err != nil {
164                         return 1
165                 }
166                 fmt.Fprintln(stdout, output+"/library.gob")
167                 return 0
168         }
169
170         var infile io.ReadCloser
171         if *inputFilename == "-" {
172                 infile = ioutil.NopCloser(stdin)
173         } else {
174                 infile, err = os.Open(*inputFilename)
175                 if err != nil {
176                         return 1
177                 }
178                 defer infile.Close()
179         }
180         log.Print("reading")
181         cgs, err := ReadCompactGenomes(infile, strings.HasSuffix(*inputFilename, ".gz"))
182         if err != nil {
183                 return 1
184         }
185         err = infile.Close()
186         if err != nil {
187                 return 1
188         }
189         log.Printf("reading done, %d genomes", len(cgs))
190
191         log.Print("filtering")
192         ntags := 0
193         for _, cg := range cgs {
194                 if ntags < len(cg.Variants)/2 {
195                         ntags = len(cg.Variants) / 2
196                 }
197                 if cmd.MaxVariants < 0 {
198                         continue
199                 }
200                 for idx, variant := range cg.Variants {
201                         if variant > tileVariantID(cmd.MaxVariants) {
202                                 for _, cg := range cgs {
203                                         if len(cg.Variants) > idx {
204                                                 cg.Variants[idx & ^1] = 0
205                                                 cg.Variants[idx|1] = 0
206                                         }
207                                 }
208                         }
209                 }
210         }
211
212         if cmd.MaxTag >= 0 && ntags > cmd.MaxTag {
213                 ntags = cmd.MaxTag
214                 for i, cg := range cgs {
215                         if len(cg.Variants) > cmd.MaxTag*2 {
216                                 cgs[i].Variants = cg.Variants[:cmd.MaxTag*2]
217                         }
218                 }
219         }
220
221         if cmd.MinCoverage > 0 {
222                 mincov := int(cmd.MinCoverage * float64(len(cgs)*2))
223                 cov := make([]int, ntags)
224                 for _, cg := range cgs {
225                         for idx, variant := range cg.Variants {
226                                 if variant > 0 {
227                                         cov[idx>>1]++
228                                 }
229                         }
230                 }
231                 for tag, c := range cov {
232                         if c < mincov {
233                                 for _, cg := range cgs {
234                                         if len(cg.Variants) > tag*2 {
235                                                 cg.Variants[tag*2] = 0
236                                                 cg.Variants[tag*2+1] = 0
237                                         }
238                                 }
239                         }
240                 }
241         }
242
243         log.Print("filtering done")
244
245         var outfile io.WriteCloser
246         if *outputFilename == "-" {
247                 outfile = nopCloser{cmd.output}
248         } else {
249                 outfile, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
250                 if err != nil {
251                         return 1
252                 }
253                 defer outfile.Close()
254         }
255         w := bufio.NewWriter(outfile)
256         enc := gob.NewEncoder(w)
257         log.Print("writing")
258         err = enc.Encode(LibraryEntry{
259                 CompactGenomes: cgs,
260         })
261         if err != nil {
262                 return 1
263         }
264         log.Print("writing done")
265         err = w.Flush()
266         if err != nil {
267                 return 1
268         }
269         err = outfile.Close()
270         if err != nil {
271                 return 1
272         }
273         return 0
274 }