Generate genome file from fasta.
authorTom Clegg <tom@tomclegg.ca>
Tue, 10 Mar 2020 21:01:00 +0000 (17:01 -0400)
committerTom Clegg <tom@tomclegg.ca>
Tue, 10 Mar 2020 21:01:00 +0000 (17:01 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

cmd.go
ref2genome.go [new file with mode: 0644]
vcf2fasta.go

diff --git a/cmd.go b/cmd.go
index 65ce7e2a0ca6cee8da0c6d9670d97c1b35d207f3..17578b43e7ab112d3f40a20b7545e75f421cd31c 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -16,6 +16,7 @@ var (
                "-version":  cmd.Version,
                "--version": cmd.Version,
 
+               "ref2genome":         &ref2genome{},
                "vcf2fasta":          &vcf2fasta{},
                "import":             &importer{},
                "export-numpy":       &exportNumpy{},
diff --git a/ref2genome.go b/ref2genome.go
new file mode 100644 (file)
index 0000000..79ea29d
--- /dev/null
@@ -0,0 +1,132 @@
+package main
+
+import (
+       "bufio"
+       "bytes"
+       "compress/gzip"
+       "errors"
+       "flag"
+       "fmt"
+       "io"
+       "net/http"
+       _ "net/http/pprof"
+       "os"
+       "strings"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       log "github.com/sirupsen/logrus"
+)
+
+type ref2genome struct {
+       refFile        string
+       projectUUID    string
+       outputFilename string
+       runLocal       bool
+}
+
+func (cmd *ref2genome) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       flags.StringVar(&cmd.refFile, "ref", "", "reference fasta `file`")
+       flags.StringVar(&cmd.projectUUID, "project", "", "project `UUID` for containers and output data")
+       flags.StringVar(&cmd.outputFilename, "o", "", "output filename")
+       flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
+       priority := flags.Int("priority", 500, "container request priority")
+       pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       } else if cmd.refFile == "" {
+               err = errors.New("reference data (-ref) not specified")
+               return 2
+       }
+
+       if *pprof != "" {
+               go func() {
+                       log.Println(http.ListenAndServe(*pprof, nil))
+               }()
+       }
+
+       if !cmd.runLocal {
+               if cmd.outputFilename != "" {
+                       err = errors.New("cannot specify output filename in non-local mode")
+                       return 2
+               }
+               runner := arvadosContainerRunner{
+                       Name:        "lightning ref2genome",
+                       Client:      arvados.NewClientFromEnv(),
+                       ProjectUUID: cmd.projectUUID,
+                       RAM:         1 << 30,
+                       Priority:    *priority,
+               }
+               err = runner.TranslatePaths(&cmd.refFile)
+               if err != nil {
+                       return 1
+               }
+               runner.Args = []string{"ref2genome", "-local=true", "-ref", cmd.refFile, "-o", "/mnt/output/ref.genome"}
+               var output string
+               output, err = runner.Run()
+               if err != nil {
+                       return 1
+               }
+               fmt.Fprintln(stdout, output)
+               return 0
+       }
+
+       var out io.WriteCloser
+       if cmd.outputFilename == "" {
+               out = nopCloser{stdout}
+       } else {
+               out, err = os.OpenFile(cmd.outputFilename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666)
+               if err != nil {
+                       return 1
+               }
+       }
+       f, err := os.Open(cmd.refFile)
+       if err != nil {
+               return 1
+       }
+       defer f.Close()
+       var in io.Reader
+       if strings.HasSuffix(cmd.refFile, ".gz") {
+               in, err = gzip.NewReader(f)
+               if err != nil {
+                       return 1
+               }
+       } else {
+               in = f
+       }
+       label, seqlen := "", 0
+       scanner := bufio.NewScanner(in)
+       for scanner.Scan() {
+               buf := scanner.Bytes()
+               if len(buf) > 0 && buf[0] == '>' {
+                       if label != "" {
+                               fmt.Fprintf(out, "%s\t%d\n", label, seqlen)
+                       }
+                       label = strings.TrimSpace(string(buf[1:]))
+                       seqlen = 0
+               } else {
+                       seqlen += len(bytes.TrimSpace(buf))
+               }
+       }
+       if label != "" {
+               fmt.Fprintf(out, "%s\t%d\n", label, seqlen)
+       }
+       if err = scanner.Err(); err != nil {
+               return 1
+       }
+       if err = out.Close(); err != nil {
+               return 1
+       }
+       return 0
+}
index 0e93e55407ec8e293e90f07943e26513ecb361dc..6c8ec5fbccdba042b945d2d800d91384429f1e3e 100644 (file)
@@ -23,6 +23,7 @@ import (
 
 type vcf2fasta struct {
        refFile           string
+       genomeFile        string
        mask              bool
        gvcfRegionsPy     string
        gvcfRegionsPyData []byte
@@ -44,6 +45,7 @@ func (cmd *vcf2fasta) RunCommand(prog string, args []string, stdin io.Reader, st
        flags := flag.NewFlagSet("", flag.ContinueOnError)
        flags.SetOutput(stderr)
        flags.StringVar(&cmd.refFile, "ref", "", "reference fasta `file`")
+       flags.StringVar(&cmd.genomeFile, "genome", "", "reference genome `file`")
        flags.BoolVar(&cmd.mask, "mask", false, "mask uncalled regions (default: output hom ref)")
        flags.StringVar(&cmd.gvcfRegionsPy, "gvcf-regions.py", "https://raw.githubusercontent.com/lijiayong/gvcf_regions/master/gvcf_regions.py", "source of gvcf_regions.py")
        flags.StringVar(&cmd.projectUUID, "project", "", "project `UUID` for containers and output data")
@@ -212,6 +214,9 @@ func (cmd *vcf2fasta) vcf2fasta(infile string, phase int) error {
        var wg sync.WaitGroup
        errs := make(chan error, 3)
        if cmd.mask {
+               if cmd.genomeFile == "" {
+                       return errors.New("cannot apply mask without -genome argument")
+               }
                bedr, bedw, err := os.Pipe()
                if err != nil {
                        return err
@@ -232,7 +237,7 @@ func (cmd *vcf2fasta) vcf2fasta(infile string, phase int) error {
                if err != nil {
                        return err
                }
-               bedcompargs := []string{"bedtools", "complement", "-i", "/dev/stdin", "-g", cmd.refFile}
+               bedcompargs := []string{"bedtools", "complement", "-i", "/dev/stdin", "-g", cmd.genomeFile}
                bedcompargs = maybeInDocker(bedcompargs, []string{cmd.refFile, infile})
                bedcomp := exec.Command(bedcompargs[0], bedcompargs[1:]...)
                bedcomp.Stdin = bedr
@@ -289,6 +294,7 @@ func (cmd *vcf2fasta) vcf2fasta(infile string, phase int) error {
 
        for err := range errs {
                if err != nil {
+                       wg.Wait()
                        return err
                }
        }