Fix some tests.
[lightning.git] / filter.go
index 239040dbaf453dd3f64290e91745eec8f6611101..4c86c1b85b486f6b9bf2e6961ed6ce9606ae7c08 100644 (file)
--- a/filter.go
+++ b/filter.go
-package main
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
 
 import (
        "bufio"
        "encoding/gob"
+       "errors"
        "flag"
        "fmt"
        "io"
-       "log"
+       "io/ioutil"
        "net/http"
        _ "net/http/pprof"
+       "os"
+       "regexp"
+       "strings"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       log "github.com/sirupsen/logrus"
 )
 
-type filterer struct {
+type filter struct {
+       MaxVariants int
+       MinCoverage float64
+       MaxTag      int
+       MatchGenome string
+}
+
+func (f *filter) Flags(flags *flag.FlagSet) {
+       flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants")
+       flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
+       flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`")
+       flags.StringVar(&f.MatchGenome, "match-genome", "", "keep genomes whose names contain `regexp`, drop the rest")
+}
+
+func (f *filter) Args() []string {
+       return []string{
+               fmt.Sprintf("-max-variants=%d", f.MaxVariants),
+               fmt.Sprintf("-min-coverage=%f", f.MinCoverage),
+               fmt.Sprintf("-max-tag=%d", f.MaxTag),
+               fmt.Sprintf("-match-genome=%s", f.MatchGenome),
+       }
+}
+
+func (f *filter) Apply(tilelib *tileLibrary) {
+       // Zero out variants at tile positions that have more than
+       // f.MaxVariants tile variants.
+       if f.MaxVariants >= 0 {
+               for tag, variants := range tilelib.variant {
+                       if f.MaxTag >= 0 && tag >= f.MaxTag {
+                               break
+                       }
+                       if len(variants) <= f.MaxVariants {
+                               continue
+                       }
+                       for _, cg := range tilelib.compactGenomes {
+                               if len(cg) > tag*2 {
+                                       cg[tag*2] = 0
+                                       cg[tag*2+1] = 0
+                               }
+                       }
+               }
+       }
+
+       // Zero out variants at tile positions that have less than
+       // f.MinCoverage.
+       mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1)
+TAG:
+       for tag := 0; tag < len(tilelib.variant) && (tag < f.MaxTag || f.MaxTag < 0); tag++ {
+               tagcov := 0
+               for _, cg := range tilelib.compactGenomes {
+                       if len(cg) < tag*2+2 {
+                               continue
+                       }
+                       if cg[tag*2] > 0 {
+                               tagcov++
+                       }
+                       if cg[tag*2+1] > 0 {
+                               tagcov++
+                       }
+                       if tagcov >= mincov {
+                               continue TAG
+                       }
+               }
+               for _, cg := range tilelib.compactGenomes {
+                       if len(cg) > tag*2 {
+                               cg[tag*2] = 0
+                               cg[tag*2+1] = 0
+                       }
+               }
+       }
+
+       // Truncate genomes and tile data to f.MaxTag (TODO: truncate
+       // refseqs too)
+       if f.MaxTag >= 0 {
+               if len(tilelib.variant) > f.MaxTag {
+                       tilelib.variant = tilelib.variant[:f.MaxTag]
+               }
+               for name, cg := range tilelib.compactGenomes {
+                       if len(cg) > 2*f.MaxTag {
+                               tilelib.compactGenomes[name] = cg[:2*f.MaxTag]
+                       }
+               }
+       }
+
+       re, err := regexp.Compile(f.MatchGenome)
+       if err != nil {
+               log.Errorf("invalid regexp %q does not match anything, dropping all genomes", f.MatchGenome)
+       }
+       for name := range tilelib.compactGenomes {
+               if !re.MatchString(name) {
+                       delete(tilelib.compactGenomes, name)
+               }
+       }
+}
+
+type filtercmd struct {
        output io.Writer
+       filter
 }
 
-func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
        var err error
        defer func() {
                if err != nil {
@@ -25,15 +132,21 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std
        flags := flag.NewFlagSet("", flag.ContinueOnError)
        flags.SetOutput(stderr)
        pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
-       maxvariants := flags.Int("max-variants", -1, "drop tiles with more than `N` variants")
-       mincoverage := flags.Float64("min-coverage", 1, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
-       maxtag := flags.Int("max-tag", -1, "drop tiles with tag ID > `N`")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       priority := flags.Int("priority", 500, "container request priority")
+       inputFilename := flags.String("i", "-", "input `file`")
+       outputFilename := flags.String("o", "-", "output `file`")
+       cmd.filter.Flags(flags)
        err = flags.Parse(args)
        if err == flag.ErrHelp {
                err = nil
                return 0
        } else if err != nil {
                return 2
+       } else if flags.NArg() > 0 {
+               err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
+               return 2
        }
        cmd.output = stdout
 
@@ -43,8 +156,55 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std
                }()
        }
 
+       if !*runlocal {
+               if *outputFilename != "-" {
+                       err = errors.New("cannot specify output file in container mode: not implemented")
+                       return 1
+               }
+               runner := arvadosContainerRunner{
+                       Name:        "lightning filter",
+                       Client:      arvados.NewClientFromEnv(),
+                       ProjectUUID: *projectUUID,
+                       RAM:         64000000000,
+                       VCPUs:       2,
+                       Priority:    *priority,
+               }
+               err = runner.TranslatePaths(inputFilename)
+               if err != nil {
+                       return 1
+               }
+               runner.Args = []string{"filter", "-local=true",
+                       "-i", *inputFilename,
+                       "-o", "/mnt/output/library.gob",
+                       "-max-variants", fmt.Sprintf("%d", cmd.MaxVariants),
+                       "-min-coverage", fmt.Sprintf("%f", cmd.MinCoverage),
+                       "-max-tag", fmt.Sprintf("%d", cmd.MaxTag),
+               }
+               var output string
+               output, err = runner.Run()
+               if err != nil {
+                       return 1
+               }
+               fmt.Fprintln(stdout, output+"/library.gob")
+               return 0
+       }
+
+       var infile io.ReadCloser
+       if *inputFilename == "-" {
+               infile = ioutil.NopCloser(stdin)
+       } else {
+               infile, err = os.Open(*inputFilename)
+               if err != nil {
+                       return 1
+               }
+               defer infile.Close()
+       }
        log.Print("reading")
-       cgs, err := ReadCompactGenomes(stdin)
+       cgs, err := ReadCompactGenomes(infile, strings.HasSuffix(*inputFilename, ".gz"))
+       if err != nil {
+               return 1
+       }
+       err = infile.Close()
        if err != nil {
                return 1
        }
@@ -56,12 +216,11 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std
                if ntags < len(cg.Variants)/2 {
                        ntags = len(cg.Variants) / 2
                }
-               if *maxvariants < 0 {
+               if cmd.MaxVariants < 0 {
                        continue
                }
-               maxVariantID := tileVariantID(*maxVariants)
                for idx, variant := range cg.Variants {
-                       if variant > maxVariantID {
+                       if variant > tileVariantID(cmd.MaxVariants) {
                                for _, cg := range cgs {
                                        if len(cg.Variants) > idx {
                                                cg.Variants[idx & ^1] = 0
@@ -72,17 +231,17 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std
                }
        }
 
-       if *maxtag >= 0 && ntags > *maxtag {
-               ntags = *maxtag
+       if cmd.MaxTag >= 0 && ntags > cmd.MaxTag {
+               ntags = cmd.MaxTag
                for i, cg := range cgs {
-                       if len(cg.Variants) > *maxtag*2 {
-                               cgs[i].Variants = cg.Variants[:*maxtag*2]
+                       if len(cg.Variants) > cmd.MaxTag*2 {
+                               cgs[i].Variants = cg.Variants[:cmd.MaxTag*2]
                        }
                }
        }
 
-       if *mincoverage < 1 {
-               mincov := int(*mincoverage * float64(len(cgs)*2))
+       if cmd.MinCoverage > 0 {
+               mincov := int(cmd.MinCoverage * float64(len(cgs)*2))
                cov := make([]int, ntags)
                for _, cg := range cgs {
                        for idx, variant := range cg.Variants {
@@ -105,7 +264,17 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std
 
        log.Print("filtering done")
 
-       w := bufio.NewWriter(cmd.output)
+       var outfile io.WriteCloser
+       if *outputFilename == "-" {
+               outfile = nopCloser{cmd.output}
+       } else {
+               outfile, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
+               if err != nil {
+                       return 1
+               }
+               defer outfile.Close()
+       }
+       w := bufio.NewWriter(outfile)
        enc := gob.NewEncoder(w)
        log.Print("writing")
        err = enc.Encode(LibraryEntry{
@@ -119,5 +288,9 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std
        if err != nil {
                return 1
        }
+       err = outfile.Close()
+       if err != nil {
+               return 1
+       }
        return 0
 }