"path/filepath"
"runtime"
"sort"
+ "strconv"
"strings"
"sync"
"time"
outputFormats = map[string]outputFormat{
"hgvs-onehot": outputFormatHGVSOneHot,
"hgvs": outputFormatHGVS,
+ "pvcf": outputFormatPVCF,
"vcf": outputFormatVCF,
}
outputFormatHGVS = outputFormat{Filename: "out.csv", Print: printHGVS}
outputFormatHGVSOneHot = outputFormat{Filename: "out.csv", Print: printHGVSOneHot}
+ outputFormatPVCF = outputFormat{Filename: "out.vcf", Print: printPVCF, PadLeft: true}
outputFormatVCF = outputFormat{Filename: "out.vcf", Print: printVCF, PadLeft: true}
)
refname := flags.String("ref", "", "reference genome `name`")
inputDir := flags.String("input-dir", ".", "input `directory`")
outputDir := flags.String("output-dir", ".", "output `directory`")
- outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs or vcf")
+ outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs, pvcf, or vcf")
outputBed := flags.String("output-bed", "", "also output bed `file`")
flags.BoolVar(&cmd.outputPerChrom, "output-per-chromosome", true, "output one file per chromosome")
labelsFilename := flags.String("output-labels", "", "also output genome labels csv `file`")
case <-progressbar.C:
var eta interface{}
if refstep > 0 {
- fin := t0.Add(time.Now().Sub(t0) * time.Duration(len(reftiles)) / time.Duration(refstep))
+ fin := t0.Add(time.Duration(float64(time.Now().Sub(t0)) * float64(len(reftiles)) / float64(refstep)))
eta = fmt.Sprintf("%v (%v)", fin.Format(time.RFC3339), fin.Sub(time.Now()))
} else {
eta = "N/A"
}
}
-func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) {
- refs := map[string]map[string]int{}
+func bucketVarsliceByRef(varslice []hgvs.Variant) map[string]map[string]int {
+ byref := map[string]map[string]int{}
for _, v := range varslice {
if v.Ref == "" && v.New == "" {
continue
}
- alts := refs[v.Ref]
+ alts := byref[v.Ref]
if alts == nil {
alts = map[string]int{}
- refs[v.Ref] = alts
+ byref[v.Ref] = alts
}
- alts[v.New] = 0
+ alts[v.New]++
}
- for ref, alts := range refs {
- var altslice []string
+ return byref
+}
+
+func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) {
+ for ref, alts := range bucketVarsliceByRef(varslice) {
+ altslice := make([]string, 0, len(alts))
+ for alt := range alts {
+ altslice = append(altslice, alt)
+ }
+ sort.Strings(altslice)
+
+ info := "AC="
+ for i, a := range altslice {
+ if i > 0 {
+ info += ","
+ }
+ info += strconv.Itoa(alts[a])
+ }
+ fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\t%s\tGT\t0/1\n", seqname, varslice[0].Position, ref, strings.Join(altslice, ","), info)
+ }
+}
+
+func printPVCF(out io.Writer, seqname string, varslice []hgvs.Variant) {
+ for ref, alts := range bucketVarsliceByRef(varslice) {
+ altslice := make([]string, 0, len(alts))
for alt := range alts {
altslice = append(altslice, alt)
}
for i, a := range altslice {
alts[a] = i + 1
}
- fmt.Fprintf(out, "%s\t%d\t%s\t%s", seqname, varslice[0].Position, ref, strings.Join(altslice, ","))
+ fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\tGT", seqname, varslice[0].Position, ref, strings.Join(altslice, ","))
for i := 0; i < len(varslice); i += 2 {
v1, v2 := varslice[i], varslice[i+1]
a1, a2 := alts[v1.New], alts[v2.New]
if v1.Ref != ref {
+ // variant on allele 0 belongs on a
+ // different output line -- same
+ // chr,pos but different "ref" length
a1 = 0
}
if v2.Ref != ref {
1,"input2","out.csv"
`)
+ exited = (&exporter{}).RunCommand("export", []string{
+ "-local=true",
+ "-input-dir=" + tmpdir,
+ "-output-dir=" + tmpdir,
+ "-output-format=pvcf",
+ "-ref=testdata/ref.fasta",
+ }, &buffer, os.Stderr, os.Stderr)
+ c.Check(exited, check.Equals, 0)
+ output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf")
+ c.Check(err, check.IsNil)
+ c.Log(string(output))
+ c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC . . GT 1/1 0/0
+chr1 41 TT AA . . GT 1/0 0/0
+chr1 161 A T . . GT 0/1 0/0
+chr1 178 A T . . GT 0/1 0/0
+chr1 221 TCCA T . . GT 1/1 0/0
+chr1 302 TTTT AAAA . . GT 0/1 0/0
+`))
+ output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf")
+ c.Check(err, check.IsNil)
+ c.Log(string(output))
+ c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA . . GT 0/0 0/1
+chr2 125 CTT AAA . . GT 0/0 1/1
+chr2 240 ATTTTTCTTGCTCTC A . . GT 1/0 0/0
+chr2 258 CCTTGTATTTTT AA . . GT 1/0 0/0
+chr2 315 C A . . GT 1/0 0/0
+chr2 469 GTGG G . . GT 1/0 0/0
+chr2 471 GG AA . . GT 0/1 0/0
+`))
+
exited = (&exporter{}).RunCommand("export", []string{
"-local=true",
"-input-dir=" + tmpdir,
output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf")
c.Check(err, check.IsNil)
c.Log(string(output))
- c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC 1/1 0/0
-chr1 41 TT AA 1/0 0/0
-chr1 161 A T 0/1 0/0
-chr1 178 A T 0/1 0/0
-chr1 221 TCCA T 1/1 0/0
-chr1 302 TTTT AAAA 0/1 0/0
+ c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC . . AC=2 GT 0/1
+chr1 41 TT AA . . AC=1 GT 0/1
+chr1 161 A T . . AC=1 GT 0/1
+chr1 178 A T . . AC=1 GT 0/1
+chr1 221 TCCA T . . AC=2 GT 0/1
+chr1 302 TTTT AAAA . . AC=1 GT 0/1
`))
output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf")
c.Check(err, check.IsNil)
c.Log(string(output))
- c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA 0/0 0/1
-chr2 125 CTT AAA 0/0 1/1
-chr2 240 ATTTTTCTTGCTCTC A 1/0 0/0
-chr2 258 CCTTGTATTTTT AA 1/0 0/0
-chr2 315 C A 1/0 0/0
-chr2 469 GTGG G 1/0 0/0
-chr2 471 GG AA 0/1 0/0
+ c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA . . AC=1 GT 0/1
+chr2 125 CTT AAA . . AC=2 GT 0/1
+chr2 240 ATTTTTCTTGCTCTC A . . AC=1 GT 0/1
+chr2 258 CCTTGTATTTTT AA . . AC=1 GT 0/1
+chr2 315 C A . . AC=1 GT 0/1
+chr2 469 GTGG G . . AC=1 GT 0/1
+chr2 471 GG AA . . AC=1 GT 0/1
`))
}
chr2:g.[471=];[471_472delinsAA] .
`))
- code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-dir", tmpdir, "-output-format", "vcf", "-input-dir", tmpdir + "/merged", "-output-bed", tmpdir + "/export.bed", "-output-per-chromosome=false"}, bytes.NewReader(nil), os.Stderr, os.Stderr)
+ code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-dir", tmpdir, "-output-format", "pvcf", "-input-dir", tmpdir + "/merged", "-output-bed", tmpdir + "/export.bed", "-output-per-chromosome=false"}, bytes.NewReader(nil), os.Stderr, os.Stderr)
c.Check(code, check.Equals, 0)
vcfout, err := ioutil.ReadFile(tmpdir + "/out.vcf")
c.Check(err, check.IsNil)
- c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`chr1 1 NNN GGC 1/1 0/0
-chr1 41 TT AA 1/0 0/0
-chr1 161 A T 0/1 0/0
-chr1 178 A T 0/1 0/0
-chr1 221 TCCA T 1/1 0/0
-chr1 302 TTTT AAAA 0/1 0/0
-chr2 1 TTT AAA 0/0 0/1
-chr2 125 CTT AAA 0/0 1/1
-chr2 240 ATTTTTCTTGCTCTC A 1/0 0/0
-chr2 258 CCTTGTATTTTT AA 1/0 0/0
-chr2 315 C A 1/0 0/0
-chr2 469 GTGG G 1/0 0/0
-chr2 471 GG AA 0/1 0/0
+ c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`chr1 1 NNN GGC . . GT 1/1 0/0
+chr1 41 TT AA . . GT 1/0 0/0
+chr1 161 A T . . GT 0/1 0/0
+chr1 178 A T . . GT 0/1 0/0
+chr1 221 TCCA T . . GT 1/1 0/0
+chr1 302 TTTT AAAA . . GT 0/1 0/0
+chr2 1 TTT AAA . . GT 0/0 0/1
+chr2 125 CTT AAA . . GT 0/0 1/1
+chr2 240 ATTTTTCTTGCTCTC A . . GT 1/0 0/0
+chr2 258 CCTTGTATTTTT AA . . GT 1/0 0/0
+chr2 315 C A . . GT 1/0 0/0
+chr2 469 GTGG G . . GT 1/0 0/0
+chr2 471 GG AA . . GT 0/1 0/0
`))
bedout, err := ioutil.ReadFile(tmpdir + "/export.bed")
c.Check(err, check.IsNil)