X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/16af77bb805d4641c33cc8920ff1c48d1471aed7..ec86ad521dff7cb8529e7931f7756e41f03029ff:/filter.go diff --git a/filter.go b/filter.go index b90f6eabc3..9fe150e63d 100644 --- a/filter.go +++ b/filter.go @@ -1,4 +1,8 @@ -package main +// Copyright (C) The Lightning Authors. All rights reserved. +// +// SPDX-License-Identifier: AGPL-3.0 + +package lightning import ( "bufio" @@ -11,6 +15,8 @@ import ( "net/http" _ "net/http/pprof" "os" + "regexp" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" log "github.com/sirupsen/logrus" @@ -20,12 +26,23 @@ type filter struct { MaxVariants int MinCoverage float64 MaxTag int + MatchGenome string } func (f *filter) Flags(flags *flag.FlagSet) { flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants") flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)") flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`") + flags.StringVar(&f.MatchGenome, "match-genome", "", "keep genomes whose names contain `regexp`, drop the rest") +} + +func (f *filter) Args() []string { + return []string{ + fmt.Sprintf("-max-variants=%d", f.MaxVariants), + fmt.Sprintf("-min-coverage=%f", f.MinCoverage), + fmt.Sprintf("-max-tag=%d", f.MaxTag), + fmt.Sprintf("-match-genome=%s", f.MatchGenome), + } } func (f *filter) Apply(tilelib *tileLibrary) { @@ -39,7 +56,6 @@ func (f *filter) Apply(tilelib *tileLibrary) { if len(variants) <= f.MaxVariants { continue } - tilelib.variant[tag] = nil for _, cg := range tilelib.compactGenomes { if len(cg) > tag*2 { cg[tag*2] = 0 @@ -53,9 +69,12 @@ func (f *filter) Apply(tilelib *tileLibrary) { // f.MinCoverage. mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1) TAG: - for tag := 0; tag < len(tilelib.variant) && tag < f.MaxTag; tag++ { + for tag := 0; tag < len(tilelib.variant) && (tag < f.MaxTag || f.MaxTag < 0); tag++ { tagcov := 0 for _, cg := range tilelib.compactGenomes { + if len(cg) < tag*2+2 { + continue + } if cg[tag*2] > 0 { tagcov++ } @@ -67,8 +86,10 @@ TAG: } } for _, cg := range tilelib.compactGenomes { - cg[tag*2] = 0 - cg[tag*2+1] = 0 + if len(cg) > tag*2 { + cg[tag*2] = 0 + cg[tag*2+1] = 0 + } } } @@ -84,6 +105,16 @@ TAG: } } } + + re, err := regexp.Compile(f.MatchGenome) + if err != nil { + log.Errorf("invalid regexp %q does not match anything, dropping all genomes", f.MatchGenome) + } + for name := range tilelib.compactGenomes { + if !re.MatchString(name) { + delete(tilelib.compactGenomes, name) + } + } } type filtercmd struct { @@ -166,7 +197,7 @@ func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, st defer infile.Close() } log.Print("reading") - cgs, err := ReadCompactGenomes(infile) + cgs, err := ReadCompactGenomes(infile, strings.HasSuffix(*inputFilename, ".gz")) if err != nil { return 1 }