X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/8d4af49bcb5f054d23ea0739bd018e407e4bd441..ec86ad521dff7cb8529e7931f7756e41f03029ff:/filter.go diff --git a/filter.go b/filter.go index 26e55bafa4..9fe150e63d 100644 --- a/filter.go +++ b/filter.go @@ -1,4 +1,8 @@ -package main +// Copyright (C) The Lightning Authors. All rights reserved. +// +// SPDX-License-Identifier: AGPL-3.0 + +package lightning import ( "bufio" @@ -8,19 +12,117 @@ import ( "fmt" "io" "io/ioutil" - "log" "net/http" _ "net/http/pprof" "os" + "regexp" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" + log "github.com/sirupsen/logrus" ) -type filterer struct { +type filter struct { + MaxVariants int + MinCoverage float64 + MaxTag int + MatchGenome string +} + +func (f *filter) Flags(flags *flag.FlagSet) { + flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants") + flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)") + flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`") + flags.StringVar(&f.MatchGenome, "match-genome", "", "keep genomes whose names contain `regexp`, drop the rest") +} + +func (f *filter) Args() []string { + return []string{ + fmt.Sprintf("-max-variants=%d", f.MaxVariants), + fmt.Sprintf("-min-coverage=%f", f.MinCoverage), + fmt.Sprintf("-max-tag=%d", f.MaxTag), + fmt.Sprintf("-match-genome=%s", f.MatchGenome), + } +} + +func (f *filter) Apply(tilelib *tileLibrary) { + // Zero out variants at tile positions that have more than + // f.MaxVariants tile variants. + if f.MaxVariants >= 0 { + for tag, variants := range tilelib.variant { + if f.MaxTag >= 0 && tag >= f.MaxTag { + break + } + if len(variants) <= f.MaxVariants { + continue + } + for _, cg := range tilelib.compactGenomes { + if len(cg) > tag*2 { + cg[tag*2] = 0 + cg[tag*2+1] = 0 + } + } + } + } + + // Zero out variants at tile positions that have less than + // f.MinCoverage. + mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1) +TAG: + for tag := 0; tag < len(tilelib.variant) && (tag < f.MaxTag || f.MaxTag < 0); tag++ { + tagcov := 0 + for _, cg := range tilelib.compactGenomes { + if len(cg) < tag*2+2 { + continue + } + if cg[tag*2] > 0 { + tagcov++ + } + if cg[tag*2+1] > 0 { + tagcov++ + } + if tagcov >= mincov { + continue TAG + } + } + for _, cg := range tilelib.compactGenomes { + if len(cg) > tag*2 { + cg[tag*2] = 0 + cg[tag*2+1] = 0 + } + } + } + + // Truncate genomes and tile data to f.MaxTag (TODO: truncate + // refseqs too) + if f.MaxTag >= 0 { + if len(tilelib.variant) > f.MaxTag { + tilelib.variant = tilelib.variant[:f.MaxTag] + } + for name, cg := range tilelib.compactGenomes { + if len(cg) > 2*f.MaxTag { + tilelib.compactGenomes[name] = cg[:2*f.MaxTag] + } + } + } + + re, err := regexp.Compile(f.MatchGenome) + if err != nil { + log.Errorf("invalid regexp %q does not match anything, dropping all genomes", f.MatchGenome) + } + for name := range tilelib.compactGenomes { + if !re.MatchString(name) { + delete(tilelib.compactGenomes, name) + } + } +} + +type filtercmd struct { output io.Writer + filter } -func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { +func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var err error defer func() { if err != nil { @@ -32,11 +134,10 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)") projectUUID := flags.String("project", "", "project `UUID` for output data") + priority := flags.Int("priority", 500, "container request priority") inputFilename := flags.String("i", "-", "input `file`") outputFilename := flags.String("o", "-", "output `file`") - maxvariants := flags.Int("max-variants", -1, "drop tiles with more than `N` variants") - mincoverage := flags.Float64("min-coverage", 1, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)") - maxtag := flags.Int("max-tag", -1, "drop tiles with tag ID > `N`") + cmd.filter.Flags(flags) err = flags.Parse(args) if err == flag.ErrHelp { err = nil @@ -63,6 +164,7 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std ProjectUUID: *projectUUID, RAM: 64000000000, VCPUs: 2, + Priority: *priority, } err = runner.TranslatePaths(inputFilename) if err != nil { @@ -71,14 +173,16 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std runner.Args = []string{"filter", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/library.gob", - "-max-variants", fmt.Sprintf("%d", *maxvariants), - "-min-coverage", fmt.Sprintf("%f", *mincoverage), - "-max-tag", fmt.Sprintf("%d", *maxtag), + "-max-variants", fmt.Sprintf("%d", cmd.MaxVariants), + "-min-coverage", fmt.Sprintf("%f", cmd.MinCoverage), + "-max-tag", fmt.Sprintf("%d", cmd.MaxTag), } - err = runner.Run() + var output string + output, err = runner.Run() if err != nil { return 1 } + fmt.Fprintln(stdout, output+"/library.gob") return 0 } @@ -93,7 +197,7 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std defer infile.Close() } log.Print("reading") - cgs, err := ReadCompactGenomes(infile) + cgs, err := ReadCompactGenomes(infile, strings.HasSuffix(*inputFilename, ".gz")) if err != nil { return 1 } @@ -109,12 +213,11 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std if ntags < len(cg.Variants)/2 { ntags = len(cg.Variants) / 2 } - if *maxvariants < 0 { + if cmd.MaxVariants < 0 { continue } - maxVariantID := tileVariantID(*maxvariants) for idx, variant := range cg.Variants { - if variant > maxVariantID { + if variant > tileVariantID(cmd.MaxVariants) { for _, cg := range cgs { if len(cg.Variants) > idx { cg.Variants[idx & ^1] = 0 @@ -125,17 +228,17 @@ func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, std } } - if *maxtag >= 0 && ntags > *maxtag { - ntags = *maxtag + if cmd.MaxTag >= 0 && ntags > cmd.MaxTag { + ntags = cmd.MaxTag for i, cg := range cgs { - if len(cg.Variants) > *maxtag*2 { - cgs[i].Variants = cg.Variants[:*maxtag*2] + if len(cg.Variants) > cmd.MaxTag*2 { + cgs[i].Variants = cg.Variants[:cmd.MaxTag*2] } } } - if *mincoverage < 1 { - mincov := int(*mincoverage * float64(len(cgs)*2)) + if cmd.MinCoverage > 0 { + mincov := int(cmd.MinCoverage * float64(len(cgs)*2)) cov := make([]int, ntags) for _, cg := range cgs { for idx, variant := range cg.Variants {