From abd4e656fc7ed53bc4f31d7d8b2a86e92cb20a3f Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 5 Feb 2020 16:05:55 -0500 Subject: [PATCH] Add filter subcommand. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- cmd.go | 1 + filter.go | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 filter.go diff --git a/cmd.go b/cmd.go index 1e86dc7010..a321848f58 100644 --- a/cmd.go +++ b/cmd.go @@ -18,6 +18,7 @@ var ( "import": &importer{}, "export-numpy": &exportNumpy{}, + "filter": &filterer{}, "build-docker-image": &buildDockerImage{}, }) ) diff --git a/filter.go b/filter.go new file mode 100644 index 0000000000..679c319fbf --- /dev/null +++ b/filter.go @@ -0,0 +1,119 @@ +package main + +import ( + "bufio" + "encoding/gob" + "flag" + "fmt" + "io" + "log" + "net/http" + _ "net/http/pprof" +) + +type filterer struct { + output io.Writer +} + +func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { + var err error + defer func() { + if err != nil { + fmt.Fprintf(stderr, "%s\n", err) + } + }() + flags := flag.NewFlagSet("", flag.ContinueOnError) + flags.SetOutput(stderr) + pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") + maxvariants := flags.Int("max-variants", -1, "drop tiles with more than `N` variants") + mincoverage := flags.Float64("min-coverage", 1, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)") + maxtag := flags.Int("max-tag", -1, "drop tiles with tag ID > `N`") + err = flags.Parse(args) + if err == flag.ErrHelp { + err = nil + return 0 + } else if err != nil { + return 2 + } + cmd.output = stdout + + if *pprof != "" { + go func() { + log.Println(http.ListenAndServe(*pprof, nil)) + }() + } + + log.Print("reading") + cgs, err := ReadCompactGenomes(stdin) + if err != nil { + return 1 + } + log.Printf("reading done, %d genomes", len(cgs)) + + log.Print("filtering") + ntags := 0 + for _, cg := range cgs { + if ntags < len(cg.Variants)/2 { + ntags = len(cg.Variants) / 2 + } + for idx, variant := range cg.Variants { + if int(variant) > *maxvariants { + for _, cg := range cgs { + if len(cg.Variants) > idx { + cg.Variants[idx & ^1] = 0 + cg.Variants[idx|1] = 0 + } + } + } + } + } + + if *maxtag >= 0 && ntags > *maxtag { + ntags = *maxtag + for i, cg := range cgs { + if len(cg.Variants) > *maxtag*2 { + cgs[i].Variants = cg.Variants[:*maxtag*2] + } + } + } + + if *mincoverage < 1 { + mincov := int(*mincoverage * float64(len(cgs)*2)) + cov := make([]int, ntags) + for _, cg := range cgs { + for idx, variant := range cg.Variants { + if variant > 0 { + cov[idx>>1]++ + } + } + } + for tag, c := range cov { + if c < mincov { + for _, cg := range cgs { + if len(cg.Variants) > tag*2 { + cg.Variants[tag*2] = 0 + cg.Variants[tag*2+1] = 0 + } + } + } + } + } + + log.Print("filtering done") + + w := bufio.NewWriter(cmd.output) + enc := gob.NewEncoder(w) + log.Print("writing") + err = enc.Encode(LibraryEntry{ + CompactGenomes: cgs, + }) + if err != nil { + return 1 + } + log.Print("writing done") + err = w.Flush() + if err != nil { + return 1 + } + return 0 +} -- 2.30.2