Separate pvcf/vcf output.
authorTom Clegg <tom@tomclegg.ca>
Mon, 12 Jul 2021 14:20:11 +0000 (10:20 -0400)
committerTom Clegg <tom@tomclegg.ca>
Mon, 12 Jul 2021 14:20:11 +0000 (10:20 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

export.go
export_test.go
pipeline_test.go

index ac7f5292beb9812fe81187242973819365fd69c7..496f5c9cd593f4e2bdbac42bafbc7321d6abdac3 100644 (file)
--- a/export.go
+++ b/export.go
@@ -14,6 +14,7 @@ import (
        "path/filepath"
        "runtime"
        "sort"
+       "strconv"
        "strings"
        "sync"
        "time"
@@ -33,10 +34,12 @@ var (
        outputFormats = map[string]outputFormat{
                "hgvs-onehot": outputFormatHGVSOneHot,
                "hgvs":        outputFormatHGVS,
+               "pvcf":        outputFormatPVCF,
                "vcf":         outputFormatVCF,
        }
        outputFormatHGVS       = outputFormat{Filename: "out.csv", Print: printHGVS}
        outputFormatHGVSOneHot = outputFormat{Filename: "out.csv", Print: printHGVSOneHot}
+       outputFormatPVCF       = outputFormat{Filename: "out.vcf", Print: printPVCF, PadLeft: true}
        outputFormatVCF        = outputFormat{Filename: "out.vcf", Print: printVCF, PadLeft: true}
 )
 
@@ -63,7 +66,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        refname := flags.String("ref", "", "reference genome `name`")
        inputDir := flags.String("input-dir", ".", "input `directory`")
        outputDir := flags.String("output-dir", ".", "output `directory`")
-       outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs or vcf")
+       outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs, pvcf, or vcf")
        outputBed := flags.String("output-bed", "", "also output bed `file`")
        flags.BoolVar(&cmd.outputPerChrom, "output-per-chromosome", true, "output one file per chromosome")
        labelsFilename := flags.String("output-labels", "", "also output genome labels csv `file`")
@@ -330,7 +333,7 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string,
                case <-progressbar.C:
                        var eta interface{}
                        if refstep > 0 {
-                               fin := t0.Add(time.Now().Sub(t0) * time.Duration(len(reftiles)) / time.Duration(refstep))
+                               fin := t0.Add(time.Duration(float64(time.Now().Sub(t0)) * float64(len(reftiles)) / float64(refstep)))
                                eta = fmt.Sprintf("%v (%v)", fin.Format(time.RFC3339), fin.Sub(time.Now()))
                        } else {
                                eta = "N/A"
@@ -449,21 +452,44 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string,
        }
 }
 
-func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) {
-       refs := map[string]map[string]int{}
+func bucketVarsliceByRef(varslice []hgvs.Variant) map[string]map[string]int {
+       byref := map[string]map[string]int{}
        for _, v := range varslice {
                if v.Ref == "" && v.New == "" {
                        continue
                }
-               alts := refs[v.Ref]
+               alts := byref[v.Ref]
                if alts == nil {
                        alts = map[string]int{}
-                       refs[v.Ref] = alts
+                       byref[v.Ref] = alts
                }
-               alts[v.New] = 0
+               alts[v.New]++
        }
-       for ref, alts := range refs {
-               var altslice []string
+       return byref
+}
+
+func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) {
+       for ref, alts := range bucketVarsliceByRef(varslice) {
+               altslice := make([]string, 0, len(alts))
+               for alt := range alts {
+                       altslice = append(altslice, alt)
+               }
+               sort.Strings(altslice)
+
+               info := "AC="
+               for i, a := range altslice {
+                       if i > 0 {
+                               info += ","
+                       }
+                       info += strconv.Itoa(alts[a])
+               }
+               fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\t%s\tGT\t0/1\n", seqname, varslice[0].Position, ref, strings.Join(altslice, ","), info)
+       }
+}
+
+func printPVCF(out io.Writer, seqname string, varslice []hgvs.Variant) {
+       for ref, alts := range bucketVarsliceByRef(varslice) {
+               altslice := make([]string, 0, len(alts))
                for alt := range alts {
                        altslice = append(altslice, alt)
                }
@@ -471,11 +497,14 @@ func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) {
                for i, a := range altslice {
                        alts[a] = i + 1
                }
-               fmt.Fprintf(out, "%s\t%d\t%s\t%s", seqname, varslice[0].Position, ref, strings.Join(altslice, ","))
+               fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\tGT", seqname, varslice[0].Position, ref, strings.Join(altslice, ","))
                for i := 0; i < len(varslice); i += 2 {
                        v1, v2 := varslice[i], varslice[i+1]
                        a1, a2 := alts[v1.New], alts[v2.New]
                        if v1.Ref != ref {
+                               // variant on allele 0 belongs on a
+                               // different output line -- same
+                               // chr,pos but different "ref" length
                                a1 = 0
                        }
                        if v2.Ref != ref {
index 8ee3ab48ef66e7e1c589f0d88ee3095356fdab82..c3a0b845bdc063201dab37b622a65d21ab64475d 100644 (file)
@@ -57,6 +57,36 @@ chr2.471_472delinsAA 1       0
 1,"input2","out.csv"
 `)
 
+       exited = (&exporter{}).RunCommand("export", []string{
+               "-local=true",
+               "-input-dir=" + tmpdir,
+               "-output-dir=" + tmpdir,
+               "-output-format=pvcf",
+               "-ref=testdata/ref.fasta",
+       }, &buffer, os.Stderr, os.Stderr)
+       c.Check(exited, check.Equals, 0)
+       output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf")
+       c.Check(err, check.IsNil)
+       c.Log(string(output))
+       c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1        1       NNN     GGC     .       .       GT      1/1     0/0
+chr1   41      TT      AA      .       .       GT      1/0     0/0
+chr1   161     A       T       .       .       GT      0/1     0/0
+chr1   178     A       T       .       .       GT      0/1     0/0
+chr1   221     TCCA    T       .       .       GT      1/1     0/0
+chr1   302     TTTT    AAAA    .       .       GT      0/1     0/0
+`))
+       output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf")
+       c.Check(err, check.IsNil)
+       c.Log(string(output))
+       c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2        1       TTT     AAA     .       .       GT      0/0     0/1
+chr2   125     CTT     AAA     .       .       GT      0/0     1/1
+chr2   240     ATTTTTCTTGCTCTC A       .       .       GT      1/0     0/0
+chr2   258     CCTTGTATTTTT    AA      .       .       GT      1/0     0/0
+chr2   315     C       A       .       .       GT      1/0     0/0
+chr2   469     GTGG    G       .       .       GT      1/0     0/0
+chr2   471     GG      AA      .       .       GT      0/1     0/0
+`))
+
        exited = (&exporter{}).RunCommand("export", []string{
                "-local=true",
                "-input-dir=" + tmpdir,
@@ -68,22 +98,22 @@ chr2.471_472delinsAA        1       0
        output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf")
        c.Check(err, check.IsNil)
        c.Log(string(output))
-       c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1        1       NNN     GGC     1/1     0/0
-chr1   41      TT      AA      1/0     0/0
-chr1   161     A       T       0/1     0/0
-chr1   178     A       T       0/1     0/0
-chr1   221     TCCA    T       1/1     0/0
-chr1   302     TTTT    AAAA    0/1     0/0
+       c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1        1       NNN     GGC     .       .       AC=2    GT      0/1
+chr1   41      TT      AA      .       .       AC=1    GT      0/1
+chr1   161     A       T       .       .       AC=1    GT      0/1
+chr1   178     A       T       .       .       AC=1    GT      0/1
+chr1   221     TCCA    T       .       .       AC=2    GT      0/1
+chr1   302     TTTT    AAAA    .       .       AC=1    GT      0/1
 `))
        output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf")
        c.Check(err, check.IsNil)
        c.Log(string(output))
-       c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2        1       TTT     AAA     0/0     0/1
-chr2   125     CTT     AAA     0/0     1/1
-chr2   240     ATTTTTCTTGCTCTC A       1/0     0/0
-chr2   258     CCTTGTATTTTT    AA      1/0     0/0
-chr2   315     C       A       1/0     0/0
-chr2   469     GTGG    G       1/0     0/0
-chr2   471     GG      AA      0/1     0/0
+       c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2        1       TTT     AAA     .       .       AC=1    GT      0/1
+chr2   125     CTT     AAA     .       .       AC=2    GT      0/1
+chr2   240     ATTTTTCTTGCTCTC A       .       .       AC=1    GT      0/1
+chr2   258     CCTTGTATTTTT    AA      .       .       AC=1    GT      0/1
+chr2   315     C       A       .       .       AC=1    GT      0/1
+chr2   469     GTGG    G       .       .       AC=1    GT      0/1
+chr2   471     GG      AA      .       .       AC=1    GT      0/1
 `))
 }
index 38718306f5e92efdc36bb267d05d6769e0f52791..2cd0db872b5af88099847a086597e431298ab0b6 100644 (file)
@@ -106,23 +106,23 @@ chr2:g.[470_472del];[470=]        .
 chr2:g.[471=];[471_472delinsAA]        .
 `))
 
-       code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-dir", tmpdir, "-output-format", "vcf", "-input-dir", tmpdir + "/merged", "-output-bed", tmpdir + "/export.bed", "-output-per-chromosome=false"}, bytes.NewReader(nil), os.Stderr, os.Stderr)
+       code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-dir", tmpdir, "-output-format", "pvcf", "-input-dir", tmpdir + "/merged", "-output-bed", tmpdir + "/export.bed", "-output-per-chromosome=false"}, bytes.NewReader(nil), os.Stderr, os.Stderr)
        c.Check(code, check.Equals, 0)
        vcfout, err := ioutil.ReadFile(tmpdir + "/out.vcf")
        c.Check(err, check.IsNil)
-       c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`chr1        1       NNN     GGC     1/1     0/0
-chr1   41      TT      AA      1/0     0/0
-chr1   161     A       T       0/1     0/0
-chr1   178     A       T       0/1     0/0
-chr1   221     TCCA    T       1/1     0/0
-chr1   302     TTTT    AAAA    0/1     0/0
-chr2   1       TTT     AAA     0/0     0/1
-chr2   125     CTT     AAA     0/0     1/1
-chr2   240     ATTTTTCTTGCTCTC A       1/0     0/0
-chr2   258     CCTTGTATTTTT    AA      1/0     0/0
-chr2   315     C       A       1/0     0/0
-chr2   469     GTGG    G       1/0     0/0
-chr2   471     GG      AA      0/1     0/0
+       c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`chr1        1       NNN     GGC     .       .       GT      1/1     0/0
+chr1   41      TT      AA      .       .       GT      1/0     0/0
+chr1   161     A       T       .       .       GT      0/1     0/0
+chr1   178     A       T       .       .       GT      0/1     0/0
+chr1   221     TCCA    T       .       .       GT      1/1     0/0
+chr1   302     TTTT    AAAA    .       .       GT      0/1     0/0
+chr2   1       TTT     AAA     .       .       GT      0/0     0/1
+chr2   125     CTT     AAA     .       .       GT      0/0     1/1
+chr2   240     ATTTTTCTTGCTCTC A       .       .       GT      1/0     0/0
+chr2   258     CCTTGTATTTTT    AA      .       .       GT      1/0     0/0
+chr2   315     C       A       .       .       GT      1/0     0/0
+chr2   469     GTGG    G       .       .       GT      1/0     0/0
+chr2   471     GG      AA      .       .       GT      0/1     0/0
 `))
        bedout, err := ioutil.ReadFile(tmpdir + "/export.bed")
        c.Check(err, check.IsNil)