Fix some tests.
[lightning.git] / filter.go
index b90f6eabc3f64959a31bcfd5a2c40f68e250ae60..4c86c1b85b486f6b9bf2e6961ed6ce9606ae7c08 100644 (file)
--- a/filter.go
+++ b/filter.go
@@ -1,4 +1,8 @@
-package main
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
 
 import (
        "bufio"
@@ -11,6 +15,8 @@ import (
        "net/http"
        _ "net/http/pprof"
        "os"
+       "regexp"
+       "strings"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
        log "github.com/sirupsen/logrus"
@@ -20,12 +26,23 @@ type filter struct {
        MaxVariants int
        MinCoverage float64
        MaxTag      int
+       MatchGenome string
 }
 
 func (f *filter) Flags(flags *flag.FlagSet) {
        flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants")
        flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
        flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`")
+       flags.StringVar(&f.MatchGenome, "match-genome", "", "keep genomes whose names contain `regexp`, drop the rest") // default "" compiles to a regexp that matches every name
+}
+
+func (f *filter) Args() []string { // Args renders the current filter settings as command-line flags, using the flag names registered in Flags.
+       return []string{
+               fmt.Sprintf("-max-variants=%d", f.MaxVariants),
+               fmt.Sprintf("-min-coverage=%v", f.MinCoverage), // %v prints the shortest form that parses back to the same float64; %f would round to 6 decimals
+               fmt.Sprintf("-max-tag=%d", f.MaxTag),
+               fmt.Sprintf("-match-genome=%s", f.MatchGenome),
+       }
+}
 
 func (f *filter) Apply(tilelib *tileLibrary) {
@@ -39,7 +56,6 @@ func (f *filter) Apply(tilelib *tileLibrary) {
                        if len(variants) <= f.MaxVariants {
                                continue
                        }
-                       tilelib.variant[tag] = nil
                        for _, cg := range tilelib.compactGenomes {
                                if len(cg) > tag*2 {
                                        cg[tag*2] = 0
@@ -53,9 +69,12 @@ func (f *filter) Apply(tilelib *tileLibrary) {
        // f.MinCoverage.
        mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1)
 TAG:
-       for tag := 0; tag < len(tilelib.variant) && tag < f.MaxTag; tag++ {
+       for tag := 0; tag < len(tilelib.variant) && (tag < f.MaxTag || f.MaxTag < 0); tag++ {
                tagcov := 0
                for _, cg := range tilelib.compactGenomes {
+                       if len(cg) < tag*2+2 {
+                               continue
+                       }
                        if cg[tag*2] > 0 {
                                tagcov++
                        }
@@ -67,8 +86,10 @@ TAG:
                        }
                }
                for _, cg := range tilelib.compactGenomes {
-                       cg[tag*2] = 0
-                       cg[tag*2+1] = 0
+                       if len(cg) > tag*2 {
+                               cg[tag*2] = 0
+                               cg[tag*2+1] = 0
+                       }
                }
        }
 
@@ -84,6 +105,16 @@ TAG:
                        }
                }
        }
+
+       re, err := regexp.Compile(f.MatchGenome) // an empty pattern matches every name, so the default keeps all genomes
+       if err != nil {
+               log.Errorf("invalid regexp %q does not match anything, dropping all genomes", f.MatchGenome)
+       }
+       for name := range tilelib.compactGenomes {
+               if err != nil || !re.MatchString(name) { // re is nil when Compile failed: test err first so we drop everything instead of panicking
+                       delete(tilelib.compactGenomes, name)
+               }
+       }
 }
 
 type filtercmd struct {
@@ -113,6 +144,9 @@ func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, st
                return 0
        } else if err != nil {
                return 2
+       } else if flags.NArg() > 0 {
+               err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
+               return 2
        }
        cmd.output = stdout
 
@@ -166,7 +200,7 @@ func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, st
                defer infile.Close()
        }
        log.Print("reading")
-       cgs, err := ReadCompactGenomes(infile)
+       cgs, err := ReadCompactGenomes(infile, strings.HasSuffix(*inputFilename, ".gz"))
        if err != nil {
                return 1
        }