26e55bafa4efea47cb8ffda6c41ca45587272869
[lightning.git] / filter.go
1 package main
2
3 import (
4         "bufio"
5         "encoding/gob"
6         "errors"
7         "flag"
8         "fmt"
9         "io"
10         "io/ioutil"
11         "log"
12         "net/http"
13         _ "net/http/pprof"
14         "os"
15
16         "git.arvados.org/arvados.git/sdk/go/arvados"
17 )
18
19 type filterer struct {
20         output io.Writer
21 }
22
23 func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
24         var err error
25         defer func() {
26                 if err != nil {
27                         fmt.Fprintf(stderr, "%s\n", err)
28                 }
29         }()
30         flags := flag.NewFlagSet("", flag.ContinueOnError)
31         flags.SetOutput(stderr)
32         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
33         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
34         projectUUID := flags.String("project", "", "project `UUID` for output data")
35         inputFilename := flags.String("i", "-", "input `file`")
36         outputFilename := flags.String("o", "-", "output `file`")
37         maxvariants := flags.Int("max-variants", -1, "drop tiles with more than `N` variants")
38         mincoverage := flags.Float64("min-coverage", 1, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
39         maxtag := flags.Int("max-tag", -1, "drop tiles with tag ID > `N`")
40         err = flags.Parse(args)
41         if err == flag.ErrHelp {
42                 err = nil
43                 return 0
44         } else if err != nil {
45                 return 2
46         }
47         cmd.output = stdout
48
49         if *pprof != "" {
50                 go func() {
51                         log.Println(http.ListenAndServe(*pprof, nil))
52                 }()
53         }
54
55         if !*runlocal {
56                 if *outputFilename != "-" {
57                         err = errors.New("cannot specify output file in container mode: not implemented")
58                         return 1
59                 }
60                 runner := arvadosContainerRunner{
61                         Name:        "lightning filter",
62                         Client:      arvados.NewClientFromEnv(),
63                         ProjectUUID: *projectUUID,
64                         RAM:         64000000000,
65                         VCPUs:       2,
66                 }
67                 err = runner.TranslatePaths(inputFilename)
68                 if err != nil {
69                         return 1
70                 }
71                 runner.Args = []string{"filter", "-local=true",
72                         "-i", *inputFilename,
73                         "-o", "/mnt/output/library.gob",
74                         "-max-variants", fmt.Sprintf("%d", *maxvariants),
75                         "-min-coverage", fmt.Sprintf("%f", *mincoverage),
76                         "-max-tag", fmt.Sprintf("%d", *maxtag),
77                 }
78                 err = runner.Run()
79                 if err != nil {
80                         return 1
81                 }
82                 return 0
83         }
84
85         var infile io.ReadCloser
86         if *inputFilename == "-" {
87                 infile = ioutil.NopCloser(stdin)
88         } else {
89                 infile, err = os.Open(*inputFilename)
90                 if err != nil {
91                         return 1
92                 }
93                 defer infile.Close()
94         }
95         log.Print("reading")
96         cgs, err := ReadCompactGenomes(infile)
97         if err != nil {
98                 return 1
99         }
100         err = infile.Close()
101         if err != nil {
102                 return 1
103         }
104         log.Printf("reading done, %d genomes", len(cgs))
105
106         log.Print("filtering")
107         ntags := 0
108         for _, cg := range cgs {
109                 if ntags < len(cg.Variants)/2 {
110                         ntags = len(cg.Variants) / 2
111                 }
112                 if *maxvariants < 0 {
113                         continue
114                 }
115                 maxVariantID := tileVariantID(*maxvariants)
116                 for idx, variant := range cg.Variants {
117                         if variant > maxVariantID {
118                                 for _, cg := range cgs {
119                                         if len(cg.Variants) > idx {
120                                                 cg.Variants[idx & ^1] = 0
121                                                 cg.Variants[idx|1] = 0
122                                         }
123                                 }
124                         }
125                 }
126         }
127
128         if *maxtag >= 0 && ntags > *maxtag {
129                 ntags = *maxtag
130                 for i, cg := range cgs {
131                         if len(cg.Variants) > *maxtag*2 {
132                                 cgs[i].Variants = cg.Variants[:*maxtag*2]
133                         }
134                 }
135         }
136
137         if *mincoverage < 1 {
138                 mincov := int(*mincoverage * float64(len(cgs)*2))
139                 cov := make([]int, ntags)
140                 for _, cg := range cgs {
141                         for idx, variant := range cg.Variants {
142                                 if variant > 0 {
143                                         cov[idx>>1]++
144                                 }
145                         }
146                 }
147                 for tag, c := range cov {
148                         if c < mincov {
149                                 for _, cg := range cgs {
150                                         if len(cg.Variants) > tag*2 {
151                                                 cg.Variants[tag*2] = 0
152                                                 cg.Variants[tag*2+1] = 0
153                                         }
154                                 }
155                         }
156                 }
157         }
158
159         log.Print("filtering done")
160
161         var outfile io.WriteCloser
162         if *outputFilename == "-" {
163                 outfile = nopCloser{cmd.output}
164         } else {
165                 outfile, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
166                 if err != nil {
167                         return 1
168                 }
169                 defer outfile.Close()
170         }
171         w := bufio.NewWriter(outfile)
172         enc := gob.NewEncoder(w)
173         log.Print("writing")
174         err = enc.Encode(LibraryEntry{
175                 CompactGenomes: cgs,
176         })
177         if err != nil {
178                 return 1
179         }
180         log.Print("writing done")
181         err = w.Flush()
182         if err != nil {
183                 return 1
184         }
185         err = outfile.Close()
186         if err != nil {
187                 return 1
188         }
189         return 0
190 }