1 // Copyright (C) The Lightning Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
21 "git.arvados.org/arvados.git/sdk/go/arvados"
22 log "github.com/sirupsen/logrus"
32 func (f *filter) Flags(flags *flag.FlagSet) {
33 flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants")
34 flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
35 flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`")
36 flags.StringVar(&f.MatchGenome, "match-genome", "", "keep genomes whose names contain `regexp`, drop the rest")
39 func (f *filter) Args() []string {
41 fmt.Sprintf("-max-variants=%d", f.MaxVariants),
42 fmt.Sprintf("-min-coverage=%f", f.MinCoverage),
43 fmt.Sprintf("-max-tag=%d", f.MaxTag),
44 fmt.Sprintf("-match-genome=%s", f.MatchGenome),
48 func (f *filter) Apply(tilelib *tileLibrary) {
49 // Zero out variants at tile positions that have more than
50 // f.MaxVariants tile variants.
51 if f.MaxVariants >= 0 {
52 for tag, variants := range tilelib.variant {
53 if f.MaxTag >= 0 && tag >= f.MaxTag {
56 if len(variants) <= f.MaxVariants {
59 for _, cg := range tilelib.compactGenomes {
68 // Zero out variants at tile positions that have less than
70 mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1)
72 for tag := 0; tag < len(tilelib.variant) && (tag < f.MaxTag || f.MaxTag < 0); tag++ {
74 for _, cg := range tilelib.compactGenomes {
75 if len(cg) < tag*2+2 {
88 for _, cg := range tilelib.compactGenomes {
96 // Truncate genomes and tile data to f.MaxTag (TODO: truncate
99 if len(tilelib.variant) > f.MaxTag {
100 tilelib.variant = tilelib.variant[:f.MaxTag]
102 for name, cg := range tilelib.compactGenomes {
103 if len(cg) > 2*f.MaxTag {
104 tilelib.compactGenomes[name] = cg[:2*f.MaxTag]
109 re, err := regexp.Compile(f.MatchGenome)
111 log.Errorf("invalid regexp %q does not match anything, dropping all genomes", f.MatchGenome)
113 for name := range tilelib.compactGenomes {
114 if !re.MatchString(name) {
115 delete(tilelib.compactGenomes, name)
120 type filtercmd struct {
125 func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
129 fmt.Fprintf(stderr, "%s\n", err)
132 flags := flag.NewFlagSet("", flag.ContinueOnError)
133 flags.SetOutput(stderr)
134 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
135 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
136 projectUUID := flags.String("project", "", "project `UUID` for output data")
137 priority := flags.Int("priority", 500, "container request priority")
138 inputFilename := flags.String("i", "-", "input `file`")
139 outputFilename := flags.String("o", "-", "output `file`")
140 cmd.filter.Flags(flags)
141 err = flags.Parse(args)
142 if err == flag.ErrHelp {
145 } else if err != nil {
147 } else if flags.NArg() > 0 {
148 err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
155 log.Println(http.ListenAndServe(*pprof, nil))
160 if *outputFilename != "-" {
161 err = errors.New("cannot specify output file in container mode: not implemented")
164 runner := arvadosContainerRunner{
165 Name: "lightning filter",
166 Client: arvados.NewClientFromEnv(),
167 ProjectUUID: *projectUUID,
172 err = runner.TranslatePaths(inputFilename)
176 runner.Args = []string{"filter", "-local=true",
177 "-i", *inputFilename,
178 "-o", "/mnt/output/library.gob",
179 "-max-variants", fmt.Sprintf("%d", cmd.MaxVariants),
180 "-min-coverage", fmt.Sprintf("%f", cmd.MinCoverage),
181 "-max-tag", fmt.Sprintf("%d", cmd.MaxTag),
184 output, err = runner.Run()
188 fmt.Fprintln(stdout, output+"/library.gob")
192 var infile io.ReadCloser
193 if *inputFilename == "-" {
194 infile = ioutil.NopCloser(stdin)
196 infile, err = os.Open(*inputFilename)
203 cgs, err := ReadCompactGenomes(infile, strings.HasSuffix(*inputFilename, ".gz"))
211 log.Printf("reading done, %d genomes", len(cgs))
213 log.Print("filtering")
215 for _, cg := range cgs {
216 if ntags < len(cg.Variants)/2 {
217 ntags = len(cg.Variants) / 2
219 if cmd.MaxVariants < 0 {
222 for idx, variant := range cg.Variants {
223 if variant > tileVariantID(cmd.MaxVariants) {
224 for _, cg := range cgs {
225 if len(cg.Variants) > idx {
226 cg.Variants[idx & ^1] = 0
227 cg.Variants[idx|1] = 0
234 if cmd.MaxTag >= 0 && ntags > cmd.MaxTag {
236 for i, cg := range cgs {
237 if len(cg.Variants) > cmd.MaxTag*2 {
238 cgs[i].Variants = cg.Variants[:cmd.MaxTag*2]
243 if cmd.MinCoverage > 0 {
244 mincov := int(cmd.MinCoverage * float64(len(cgs)*2))
245 cov := make([]int, ntags)
246 for _, cg := range cgs {
247 for idx, variant := range cg.Variants {
253 for tag, c := range cov {
255 for _, cg := range cgs {
256 if len(cg.Variants) > tag*2 {
257 cg.Variants[tag*2] = 0
258 cg.Variants[tag*2+1] = 0
265 log.Print("filtering done")
267 var outfile io.WriteCloser
268 if *outputFilename == "-" {
269 outfile = nopCloser{cmd.output}
271 outfile, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
275 defer outfile.Close()
277 w := bufio.NewWriter(outfile)
278 enc := gob.NewEncoder(w)
280 err = enc.Encode(LibraryEntry{
286 log.Print("writing done")
291 err = outfile.Close()