+++ /dev/null
-// Copyright (C) The Lightning Authors. All rights reserved.
-//
-// SPDX-License-Identifier: AGPL-3.0
-
-package lightning
-
-import (
- "bufio"
- "encoding/gob"
- "errors"
- "flag"
- "fmt"
- "io"
- "io/ioutil"
- "net/http"
- _ "net/http/pprof"
- "os"
- "strings"
-
- "git.arvados.org/arvados.git/sdk/go/arvados"
- log "github.com/sirupsen/logrus"
-)
-
-type filter struct {
- MaxVariants int
- MinCoverage float64
- MaxTag int
-}
-
-func (f *filter) Flags(flags *flag.FlagSet) {
- flags.IntVar(&f.MaxVariants, "max-variants", -1, "drop tiles with more than `N` variants")
- flags.Float64Var(&f.MinCoverage, "min-coverage", 0, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
- flags.IntVar(&f.MaxTag, "max-tag", -1, "drop tiles with tag ID > `N`")
-}
-
-func (f *filter) Args() []string {
- return []string{
- fmt.Sprintf("-max-variants=%d", f.MaxVariants),
- fmt.Sprintf("-min-coverage=%f", f.MinCoverage),
- fmt.Sprintf("-max-tag=%d", f.MaxTag),
- }
-}
-
-func (f *filter) Apply(tilelib *tileLibrary) {
- // Zero out variants at tile positions that have more than
- // f.MaxVariants tile variants.
- if f.MaxVariants >= 0 {
- for tag, variants := range tilelib.variant {
- if f.MaxTag >= 0 && tag >= f.MaxTag {
- break
- }
- if len(variants) <= f.MaxVariants {
- continue
- }
- for _, cg := range tilelib.compactGenomes {
- if len(cg) > tag*2 {
- cg[tag*2] = 0
- cg[tag*2+1] = 0
- }
- }
- }
- }
-
- // Zero out variants at tile positions that have less than
- // f.MinCoverage.
- mincov := int(2*f.MinCoverage*float64(len(tilelib.compactGenomes)) + 1)
-TAG:
- for tag := 0; tag < len(tilelib.variant) && tag < f.MaxTag; tag++ {
- tagcov := 0
- for _, cg := range tilelib.compactGenomes {
- if cg[tag*2] > 0 {
- tagcov++
- }
- if cg[tag*2+1] > 0 {
- tagcov++
- }
- if tagcov >= mincov {
- continue TAG
- }
- }
- for _, cg := range tilelib.compactGenomes {
- cg[tag*2] = 0
- cg[tag*2+1] = 0
- }
- }
-
- // Truncate genomes and tile data to f.MaxTag (TODO: truncate
- // refseqs too)
- if f.MaxTag >= 0 {
- if len(tilelib.variant) > f.MaxTag {
- tilelib.variant = tilelib.variant[:f.MaxTag]
- }
- for name, cg := range tilelib.compactGenomes {
- if len(cg) > 2*f.MaxTag {
- tilelib.compactGenomes[name] = cg[:2*f.MaxTag]
- }
- }
- }
-}
-
-type filtercmd struct {
- output io.Writer
- filter
-}
-
-func (cmd *filtercmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
- var err error
- defer func() {
- if err != nil {
- fmt.Fprintf(stderr, "%s\n", err)
- }
- }()
- flags := flag.NewFlagSet("", flag.ContinueOnError)
- flags.SetOutput(stderr)
- pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
- runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
- projectUUID := flags.String("project", "", "project `UUID` for output data")
- priority := flags.Int("priority", 500, "container request priority")
- inputFilename := flags.String("i", "-", "input `file`")
- outputFilename := flags.String("o", "-", "output `file`")
- cmd.filter.Flags(flags)
- err = flags.Parse(args)
- if err == flag.ErrHelp {
- err = nil
- return 0
- } else if err != nil {
- return 2
- }
- cmd.output = stdout
-
- if *pprof != "" {
- go func() {
- log.Println(http.ListenAndServe(*pprof, nil))
- }()
- }
-
- if !*runlocal {
- if *outputFilename != "-" {
- err = errors.New("cannot specify output file in container mode: not implemented")
- return 1
- }
- runner := arvadosContainerRunner{
- Name: "lightning filter",
- Client: arvados.NewClientFromEnv(),
- ProjectUUID: *projectUUID,
- RAM: 64000000000,
- VCPUs: 2,
- Priority: *priority,
- }
- err = runner.TranslatePaths(inputFilename)
- if err != nil {
- return 1
- }
- runner.Args = []string{"filter", "-local=true",
- "-i", *inputFilename,
- "-o", "/mnt/output/library.gob",
- "-max-variants", fmt.Sprintf("%d", cmd.MaxVariants),
- "-min-coverage", fmt.Sprintf("%f", cmd.MinCoverage),
- "-max-tag", fmt.Sprintf("%d", cmd.MaxTag),
- }
- var output string
- output, err = runner.Run()
- if err != nil {
- return 1
- }
- fmt.Fprintln(stdout, output+"/library.gob")
- return 0
- }
-
- var infile io.ReadCloser
- if *inputFilename == "-" {
- infile = ioutil.NopCloser(stdin)
- } else {
- infile, err = os.Open(*inputFilename)
- if err != nil {
- return 1
- }
- defer infile.Close()
- }
- log.Print("reading")
- cgs, err := ReadCompactGenomes(infile, strings.HasSuffix(*inputFilename, ".gz"))
- if err != nil {
- return 1
- }
- err = infile.Close()
- if err != nil {
- return 1
- }
- log.Printf("reading done, %d genomes", len(cgs))
-
- log.Print("filtering")
- ntags := 0
- for _, cg := range cgs {
- if ntags < len(cg.Variants)/2 {
- ntags = len(cg.Variants) / 2
- }
- if cmd.MaxVariants < 0 {
- continue
- }
- for idx, variant := range cg.Variants {
- if variant > tileVariantID(cmd.MaxVariants) {
- for _, cg := range cgs {
- if len(cg.Variants) > idx {
- cg.Variants[idx & ^1] = 0
- cg.Variants[idx|1] = 0
- }
- }
- }
- }
- }
-
- if cmd.MaxTag >= 0 && ntags > cmd.MaxTag {
- ntags = cmd.MaxTag
- for i, cg := range cgs {
- if len(cg.Variants) > cmd.MaxTag*2 {
- cgs[i].Variants = cg.Variants[:cmd.MaxTag*2]
- }
- }
- }
-
- if cmd.MinCoverage > 0 {
- mincov := int(cmd.MinCoverage * float64(len(cgs)*2))
- cov := make([]int, ntags)
- for _, cg := range cgs {
- for idx, variant := range cg.Variants {
- if variant > 0 {
- cov[idx>>1]++
- }
- }
- }
- for tag, c := range cov {
- if c < mincov {
- for _, cg := range cgs {
- if len(cg.Variants) > tag*2 {
- cg.Variants[tag*2] = 0
- cg.Variants[tag*2+1] = 0
- }
- }
- }
- }
- }
-
- log.Print("filtering done")
-
- var outfile io.WriteCloser
- if *outputFilename == "-" {
- outfile = nopCloser{cmd.output}
- } else {
- outfile, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
- if err != nil {
- return 1
- }
- defer outfile.Close()
- }
- w := bufio.NewWriter(outfile)
- enc := gob.NewEncoder(w)
- log.Print("writing")
- err = enc.Encode(LibraryEntry{
- CompactGenomes: cgs,
- })
- if err != nil {
- return 1
- }
- log.Print("writing done")
- err = w.Flush()
- if err != nil {
- return 1
- }
- err = outfile.Close()
- if err != nil {
- return 1
- }
- return 0
-}