a4ffce8e7aa4d0c6ec355d7625a77ef6c52ff87d
[lightning.git] / filter.go
1 package main
2
3 import (
4         "bufio"
5         "encoding/gob"
6         "errors"
7         "flag"
8         "fmt"
9         "io"
10         "io/ioutil"
11         "net/http"
12         _ "net/http/pprof"
13         "os"
14
15         "git.arvados.org/arvados.git/sdk/go/arvados"
16         log "github.com/sirupsen/logrus"
17 )
18
19 type filter struct {
20         MaxVariants int
21         MinCoverage float64
22         MaxTag      int
23 }
24
25 func (f *filter) Flags(flags *flag.FlagSet) {
26         flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants")
27         flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
28         flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`")
29 }
30
31 func (f *filter) Apply(tilelib *tileLibrary) {
32         // Zero out variants at tile positions that have more than
33         // f.MaxVariants tile variants.
34         if f.MaxVariants >= 0 {
35                 for tag, variants := range tilelib.variant {
36                         if f.MaxTag >= 0 && tag >= f.MaxTag {
37                                 break
38                         }
39                         if len(variants) <= f.MaxVariants {
40                                 continue
41                         }
42                         for _, cg := range tilelib.compactGenomes {
43                                 if len(cg) > tag*2 {
44                                         cg[tag*2] = 0
45                                         cg[tag*2+1] = 0
46                                 }
47                         }
48                 }
49         }
50
51         // Zero out variants at tile positions that have less than
52         // f.MinCoverage.
53         mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1)
54 TAG:
55         for tag := 0; tag < len(tilelib.variant) && tag < f.MaxTag; tag++ {
56                 tagcov := 0
57                 for _, cg := range tilelib.compactGenomes {
58                         if cg[tag*2] > 0 {
59                                 tagcov++
60                         }
61                         if cg[tag*2+1] > 0 {
62                                 tagcov++
63                         }
64                         if tagcov >= mincov {
65                                 continue TAG
66                         }
67                 }
68                 for _, cg := range tilelib.compactGenomes {
69                         cg[tag*2] = 0
70                         cg[tag*2+1] = 0
71                 }
72         }
73
74         // Truncate genomes and tile data to f.MaxTag (TODO: truncate
75         // refseqs too)
76         if f.MaxTag >= 0 {
77                 if len(tilelib.variant) > f.MaxTag {
78                         tilelib.variant = tilelib.variant[:f.MaxTag]
79                 }
80                 for name, cg := range tilelib.compactGenomes {
81                         if len(cg) > 2*f.MaxTag {
82                                 tilelib.compactGenomes[name] = cg[:2*f.MaxTag]
83                         }
84                 }
85         }
86 }
87
88 type filtercmd struct {
89         output io.Writer
90         filter
91 }
92
93 func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
94         var err error
95         defer func() {
96                 if err != nil {
97                         fmt.Fprintf(stderr, "%s\n", err)
98                 }
99         }()
100         flags := flag.NewFlagSet("", flag.ContinueOnError)
101         flags.SetOutput(stderr)
102         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
103         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
104         projectUUID := flags.String("project", "", "project `UUID` for output data")
105         priority := flags.Int("priority", 500, "container request priority")
106         inputFilename := flags.String("i", "-", "input `file`")
107         outputFilename := flags.String("o", "-", "output `file`")
108         cmd.filter.Flags(flags)
109         err = flags.Parse(args)
110         if err == flag.ErrHelp {
111                 err = nil
112                 return 0
113         } else if err != nil {
114                 return 2
115         }
116         cmd.output = stdout
117
118         if *pprof != "" {
119                 go func() {
120                         log.Println(http.ListenAndServe(*pprof, nil))
121                 }()
122         }
123
124         if !*runlocal {
125                 if *outputFilename != "-" {
126                         err = errors.New("cannot specify output file in container mode: not implemented")
127                         return 1
128                 }
129                 runner := arvadosContainerRunner{
130                         Name:        "lightning filter",
131                         Client:      arvados.NewClientFromEnv(),
132                         ProjectUUID: *projectUUID,
133                         RAM:         64000000000,
134                         VCPUs:       2,
135                         Priority:    *priority,
136                 }
137                 err = runner.TranslatePaths(inputFilename)
138                 if err != nil {
139                         return 1
140                 }
141                 runner.Args = []string{"filter", "-local=true",
142                         "-i", *inputFilename,
143                         "-o", "/mnt/output/library.gob",
144                         "-max-variants", fmt.Sprintf("%d", cmd.MaxVariants),
145                         "-min-coverage", fmt.Sprintf("%f", cmd.MinCoverage),
146                         "-max-tag", fmt.Sprintf("%d", cmd.MaxTag),
147                 }
148                 var output string
149                 output, err = runner.Run()
150                 if err != nil {
151                         return 1
152                 }
153                 fmt.Fprintln(stdout, output+"/library.gob")
154                 return 0
155         }
156
157         var infile io.ReadCloser
158         if *inputFilename == "-" {
159                 infile = ioutil.NopCloser(stdin)
160         } else {
161                 infile, err = os.Open(*inputFilename)
162                 if err != nil {
163                         return 1
164                 }
165                 defer infile.Close()
166         }
167         log.Print("reading")
168         cgs, err := ReadCompactGenomes(infile)
169         if err != nil {
170                 return 1
171         }
172         err = infile.Close()
173         if err != nil {
174                 return 1
175         }
176         log.Printf("reading done, %d genomes", len(cgs))
177
178         log.Print("filtering")
179         ntags := 0
180         for _, cg := range cgs {
181                 if ntags < len(cg.Variants)/2 {
182                         ntags = len(cg.Variants) / 2
183                 }
184                 if cmd.MaxVariants < 0 {
185                         continue
186                 }
187                 for idx, variant := range cg.Variants {
188                         if variant > tileVariantID(cmd.MaxVariants) {
189                                 for _, cg := range cgs {
190                                         if len(cg.Variants) > idx {
191                                                 cg.Variants[idx & ^1] = 0
192                                                 cg.Variants[idx|1] = 0
193                                         }
194                                 }
195                         }
196                 }
197         }
198
199         if cmd.MaxTag >= 0 && ntags > cmd.MaxTag {
200                 ntags = cmd.MaxTag
201                 for i, cg := range cgs {
202                         if len(cg.Variants) > cmd.MaxTag*2 {
203                                 cgs[i].Variants = cg.Variants[:cmd.MaxTag*2]
204                         }
205                 }
206         }
207
208         if cmd.MinCoverage > 0 {
209                 mincov := int(cmd.MinCoverage * float64(len(cgs)*2))
210                 cov := make([]int, ntags)
211                 for _, cg := range cgs {
212                         for idx, variant := range cg.Variants {
213                                 if variant > 0 {
214                                         cov[idx>>1]++
215                                 }
216                         }
217                 }
218                 for tag, c := range cov {
219                         if c < mincov {
220                                 for _, cg := range cgs {
221                                         if len(cg.Variants) > tag*2 {
222                                                 cg.Variants[tag*2] = 0
223                                                 cg.Variants[tag*2+1] = 0
224                                         }
225                                 }
226                         }
227                 }
228         }
229
230         log.Print("filtering done")
231
232         var outfile io.WriteCloser
233         if *outputFilename == "-" {
234                 outfile = nopCloser{cmd.output}
235         } else {
236                 outfile, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
237                 if err != nil {
238                         return 1
239                 }
240                 defer outfile.Close()
241         }
242         w := bufio.NewWriter(outfile)
243         enc := gob.NewEncoder(w)
244         log.Print("writing")
245         err = enc.Encode(LibraryEntry{
246                 CompactGenomes: cgs,
247         })
248         if err != nil {
249                 return 1
250         }
251         log.Print("writing done")
252         err = w.Flush()
253         if err != nil {
254                 return 1
255         }
256         err = outfile.Close()
257         if err != nil {
258                 return 1
259         }
260         return 0
261 }