Add -match-genome=regexp filter.
author Tom Clegg <tom@tomclegg.ca>
Wed, 11 Aug 2021 22:10:26 +0000 (18:10 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Wed, 11 Aug 2021 22:10:26 +0000 (18:10 -0400)
refs #17939
refs #17922

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

export_test.go
filter.go

index d87b3ae51c3d9dbac364a28499921ac07b72f66a..a2452c6ca32a5eb9c2e99ce5012796e4d6314aa9 100644 (file)
@@ -164,6 +164,7 @@ chr2        471     .       GG      AA      .       .       AC=1
                "-output-dir=" + outdir,
                "-output-format=hgvs-numpy",
                "-ref=testdata/ref.fasta",
+               "-match-genome=input[12]",
        }, nil, os.Stderr, os.Stderr)
        c.Check(exited, check.Equals, 0)
 
index 9d43abea608d98d9be8aec53934f2383c86fb0b1..9fe150e63df697b0b878cc495cc1623afb63059f 100644 (file)
--- a/filter.go
+++ b/filter.go
@@ -15,6 +15,7 @@ import (
        "net/http"
        _ "net/http/pprof"
        "os"
+       "regexp"
        "strings"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
@@ -25,12 +26,14 @@ type filter struct {
        MaxVariants int
        MinCoverage float64
        MaxTag      int
+       MatchGenome string
 }
 
 func (f *filter) Flags(flags *flag.FlagSet) {
        flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants")
        flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
        flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`")
+       flags.StringVar(&f.MatchGenome, "match-genome", "", "keep genomes whose names contain `regexp`, drop the rest") // default "" compiles to a pattern that matches every name, so no genome is dropped
 }
 
 func (f *filter) Args() []string {
@@ -38,6 +41,7 @@ func (f *filter) Args() []string {
                fmt.Sprintf("-max-variants=%d", f.MaxVariants),
                fmt.Sprintf("-min-coverage=%f", f.MinCoverage),
                fmt.Sprintf("-max-tag=%d", f.MaxTag),
+               fmt.Sprintf("-match-genome=%s", f.MatchGenome),
        }
 }
 
@@ -101,6 +105,16 @@ TAG:
                        }
                }
        }
+
+       re, err := regexp.Compile(f.MatchGenome)
+       if err != nil {
+               log.Errorf("invalid regexp %q does not match anything, dropping all genomes", f.MatchGenome)
+       }
+       for name := range tilelib.compactGenomes {
+               if !re.MatchString(name) {
+                       delete(tilelib.compactGenomes, name)
+               }
+       }
 }
 
 type filtercmd struct {