Fix zeroing everything if max-variants not specified.
[lightning.git] / filter.go
1 package main
2
3 import (
4         "bufio"
5         "encoding/gob"
6         "flag"
7         "fmt"
8         "io"
9         "log"
10         "net/http"
11         _ "net/http/pprof"
12 )
13
14 type filterer struct {
15         output io.Writer
16 }
17
18 func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
19         var err error
20         defer func() {
21                 if err != nil {
22                         fmt.Fprintf(stderr, "%s\n", err)
23                 }
24         }()
25         flags := flag.NewFlagSet("", flag.ContinueOnError)
26         flags.SetOutput(stderr)
27         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
28         maxvariants := flags.Int("max-variants", -1, "drop tiles with more than `N` variants")
29         mincoverage := flags.Float64("min-coverage", 1, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
30         maxtag := flags.Int("max-tag", -1, "drop tiles with tag ID > `N`")
31         err = flags.Parse(args)
32         if err == flag.ErrHelp {
33                 err = nil
34                 return 0
35         } else if err != nil {
36                 return 2
37         }
38         cmd.output = stdout
39
40         if *pprof != "" {
41                 go func() {
42                         log.Println(http.ListenAndServe(*pprof, nil))
43                 }()
44         }
45
46         log.Print("reading")
47         cgs, err := ReadCompactGenomes(stdin)
48         if err != nil {
49                 return 1
50         }
51         log.Printf("reading done, %d genomes", len(cgs))
52
53         log.Print("filtering")
54         ntags := 0
55         for _, cg := range cgs {
56                 if ntags < len(cg.Variants)/2 {
57                         ntags = len(cg.Variants) / 2
58                 }
59                 if *maxvariants < 0 {
60                         continue
61                 }
62                 maxVariantID := tileVariantID(*maxVariants)
63                 for idx, variant := range cg.Variants {
64                         if variant > maxVariantID {
65                                 for _, cg := range cgs {
66                                         if len(cg.Variants) > idx {
67                                                 cg.Variants[idx & ^1] = 0
68                                                 cg.Variants[idx|1] = 0
69                                         }
70                                 }
71                         }
72                 }
73         }
74
75         if *maxtag >= 0 && ntags > *maxtag {
76                 ntags = *maxtag
77                 for i, cg := range cgs {
78                         if len(cg.Variants) > *maxtag*2 {
79                                 cgs[i].Variants = cg.Variants[:*maxtag*2]
80                         }
81                 }
82         }
83
84         if *mincoverage < 1 {
85                 mincov := int(*mincoverage * float64(len(cgs)*2))
86                 cov := make([]int, ntags)
87                 for _, cg := range cgs {
88                         for idx, variant := range cg.Variants {
89                                 if variant > 0 {
90                                         cov[idx>>1]++
91                                 }
92                         }
93                 }
94                 for tag, c := range cov {
95                         if c < mincov {
96                                 for _, cg := range cgs {
97                                         if len(cg.Variants) > tag*2 {
98                                                 cg.Variants[tag*2] = 0
99                                                 cg.Variants[tag*2+1] = 0
100                                         }
101                                 }
102                         }
103                 }
104         }
105
106         log.Print("filtering done")
107
108         w := bufio.NewWriter(cmd.output)
109         enc := gob.NewEncoder(w)
110         log.Print("writing")
111         err = enc.Encode(LibraryEntry{
112                 CompactGenomes: cgs,
113         })
114         if err != nil {
115                 return 1
116         }
117         log.Print("writing done")
118         err = w.Flush()
119         if err != nil {
120                 return 1
121         }
122         return 0
123 }