From 0df66c8fae2cdaf70c811379a4c1522211838a9b Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Mon, 12 Jul 2021 10:20:11 -0400 Subject: [PATCH] Separate pvcf/vcf output. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- export.go | 49 +++++++++++++++++++++++++++++++++--------- export_test.go | 56 +++++++++++++++++++++++++++++++++++++----------- pipeline_test.go | 28 ++++++++++++------------ 3 files changed, 96 insertions(+), 37 deletions(-) diff --git a/export.go b/export.go index ac7f5292be..496f5c9cd5 100644 --- a/export.go +++ b/export.go @@ -14,6 +14,7 @@ import ( "path/filepath" "runtime" "sort" + "strconv" "strings" "sync" "time" @@ -33,10 +34,12 @@ var ( outputFormats = map[string]outputFormat{ "hgvs-onehot": outputFormatHGVSOneHot, "hgvs": outputFormatHGVS, + "pvcf": outputFormatPVCF, "vcf": outputFormatVCF, } outputFormatHGVS = outputFormat{Filename: "out.csv", Print: printHGVS} outputFormatHGVSOneHot = outputFormat{Filename: "out.csv", Print: printHGVSOneHot} + outputFormatPVCF = outputFormat{Filename: "out.vcf", Print: printPVCF, PadLeft: true} outputFormatVCF = outputFormat{Filename: "out.vcf", Print: printVCF, PadLeft: true} ) @@ -63,7 +66,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std refname := flags.String("ref", "", "reference genome `name`") inputDir := flags.String("input-dir", ".", "input `directory`") outputDir := flags.String("output-dir", ".", "output `directory`") - outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs or vcf") + outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs, pvcf, or vcf") outputBed := flags.String("output-bed", "", "also output bed `file`") flags.BoolVar(&cmd.outputPerChrom, "output-per-chromosome", true, "output one file per chromosome") labelsFilename := flags.String("output-labels", "", "also output genome labels csv `file`") @@ -330,7 +333,7 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string, case <-progressbar.C: var eta interface{} if refstep > 0 { - fin := t0.Add(time.Now().Sub(t0) * time.Duration(len(reftiles)) / time.Duration(refstep)) + fin := t0.Add(time.Duration(float64(time.Now().Sub(t0)) * float64(len(reftiles)) / float64(refstep))) eta = fmt.Sprintf("%v (%v)", fin.Format(time.RFC3339), fin.Sub(time.Now())) } else { eta = "N/A" @@ -449,21 +452,44 @@ func (cmd *exporter) exportSeq(outw, bedw io.Writer, taglen int, seqname string, } } -func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { - refs := map[string]map[string]int{} +func bucketVarsliceByRef(varslice []hgvs.Variant) map[string]map[string]int { + byref := map[string]map[string]int{} for _, v := range varslice { if v.Ref == "" && v.New == "" { continue } - alts := refs[v.Ref] + alts := byref[v.Ref] if alts == nil { alts = map[string]int{} - refs[v.Ref] = alts + byref[v.Ref] = alts } - alts[v.New] = 0 + alts[v.New]++ } - for ref, alts := range refs { - var altslice []string + return byref +} + +func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { + for ref, alts := range bucketVarsliceByRef(varslice) { + altslice := make([]string, 0, len(alts)) + for alt := range alts { + altslice = append(altslice, alt) + } + sort.Strings(altslice) + + info := "AC=" + for i, a := range altslice { + if i > 0 { + info += "," + } + info += strconv.Itoa(alts[a]) + } + fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\t%s\tGT\t0/1\n", seqname, varslice[0].Position, ref, strings.Join(altslice, ","), info) + } +} + +func printPVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { + for ref, alts := range bucketVarsliceByRef(varslice) { + altslice := make([]string, 0, len(alts)) for alt := range alts { altslice = append(altslice, alt) } @@ -471,11 +497,14 @@ func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { for i, a := range altslice { alts[a] = i + 1 } - fmt.Fprintf(out, "%s\t%d\t%s\t%s", seqname, varslice[0].Position, ref, strings.Join(altslice, ",")) + fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\tGT", seqname, varslice[0].Position, ref, strings.Join(altslice, ",")) for i := 0; i < len(varslice); i += 2 { v1, v2 := varslice[i], varslice[i+1] a1, a2 := alts[v1.New], alts[v2.New] if v1.Ref != ref { + // variant on allele 0 belongs on a + // different output line -- same + // chr,pos but different "ref" length a1 = 0 } if v2.Ref != ref { diff --git a/export_test.go b/export_test.go index 8ee3ab48ef..c3a0b845bd 100644 --- a/export_test.go +++ b/export_test.go @@ -57,6 +57,36 @@ chr2.471_472delinsAA 1 0 1,"input2","out.csv" `) + exited = (&exporter{}).RunCommand("export", []string{ + "-local=true", + "-input-dir=" + tmpdir, + "-output-dir=" + tmpdir, + "-output-format=pvcf", + "-ref=testdata/ref.fasta", + }, &buffer, os.Stderr, os.Stderr) + c.Check(exited, check.Equals, 0) + output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf") + c.Check(err, check.IsNil) + c.Log(string(output)) + c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC . . GT 1/1 0/0 +chr1 41 TT AA . . GT 1/0 0/0 +chr1 161 A T . . GT 0/1 0/0 +chr1 178 A T . . GT 0/1 0/0 +chr1 221 TCCA T . . GT 1/1 0/0 +chr1 302 TTTT AAAA . . GT 0/1 0/0 +`)) + output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf") + c.Check(err, check.IsNil) + c.Log(string(output)) + c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA . . GT 0/0 0/1 +chr2 125 CTT AAA . . GT 0/0 1/1 +chr2 240 ATTTTTCTTGCTCTC A . . GT 1/0 0/0 +chr2 258 CCTTGTATTTTT AA . . GT 1/0 0/0 +chr2 315 C A . . GT 1/0 0/0 +chr2 469 GTGG G . . GT 1/0 0/0 +chr2 471 GG AA . . GT 0/1 0/0 +`)) + exited = (&exporter{}).RunCommand("export", []string{ "-local=true", "-input-dir=" + tmpdir, @@ -68,22 +98,22 @@ chr2.471_472delinsAA 1 0 output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf") c.Check(err, check.IsNil) c.Log(string(output)) - c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC 1/1 0/0 -chr1 41 TT AA 1/0 0/0 -chr1 161 A T 0/1 0/0 -chr1 178 A T 0/1 0/0 -chr1 221 TCCA T 1/1 0/0 -chr1 302 TTTT AAAA 0/1 0/0 + c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC . . AC=2 GT 0/1 +chr1 41 TT AA . . AC=1 GT 0/1 +chr1 161 A T . . AC=1 GT 0/1 +chr1 178 A T . . AC=1 GT 0/1 +chr1 221 TCCA T . . AC=2 GT 0/1 +chr1 302 TTTT AAAA . . AC=1 GT 0/1 `)) output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf") c.Check(err, check.IsNil) c.Log(string(output)) - c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA 0/0 0/1 -chr2 125 CTT AAA 0/0 1/1 -chr2 240 ATTTTTCTTGCTCTC A 1/0 0/0 -chr2 258 CCTTGTATTTTT AA 1/0 0/0 -chr2 315 C A 1/0 0/0 -chr2 469 GTGG G 1/0 0/0 -chr2 471 GG AA 0/1 0/0 + c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA . . AC=1 GT 0/1 +chr2 125 CTT AAA . . AC=2 GT 0/1 +chr2 240 ATTTTTCTTGCTCTC A . . AC=1 GT 0/1 +chr2 258 CCTTGTATTTTT AA . . AC=1 GT 0/1 +chr2 315 C A . . AC=1 GT 0/1 +chr2 469 GTGG G . . AC=1 GT 0/1 +chr2 471 GG AA . . AC=1 GT 0/1 `)) } diff --git a/pipeline_test.go b/pipeline_test.go index 38718306f5..2cd0db872b 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -106,23 +106,23 @@ chr2:g.[470_472del];[470=] . chr2:g.[471=];[471_472delinsAA] . `)) - code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-dir", tmpdir, "-output-format", "vcf", "-input-dir", tmpdir + "/merged", "-output-bed", tmpdir + "/export.bed", "-output-per-chromosome=false"}, bytes.NewReader(nil), os.Stderr, os.Stderr) + code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-dir", tmpdir, "-output-format", "pvcf", "-input-dir", tmpdir + "/merged", "-output-bed", tmpdir + "/export.bed", "-output-per-chromosome=false"}, bytes.NewReader(nil), os.Stderr, os.Stderr) c.Check(code, check.Equals, 0) vcfout, err := ioutil.ReadFile(tmpdir + "/out.vcf") c.Check(err, check.IsNil) - c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`chr1 1 NNN GGC 1/1 0/0 -chr1 41 TT AA 1/0 0/0 -chr1 161 A T 0/1 0/0 -chr1 178 A T 0/1 0/0 -chr1 221 TCCA T 1/1 0/0 -chr1 302 TTTT AAAA 0/1 0/0 -chr2 1 TTT AAA 0/0 0/1 -chr2 125 CTT AAA 0/0 1/1 -chr2 240 ATTTTTCTTGCTCTC A 1/0 0/0 -chr2 258 CCTTGTATTTTT AA 1/0 0/0 -chr2 315 C A 1/0 0/0 -chr2 469 GTGG G 1/0 0/0 -chr2 471 GG AA 0/1 0/0 + c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`chr1 1 NNN GGC . . GT 1/1 0/0 +chr1 41 TT AA . . GT 1/0 0/0 +chr1 161 A T . . GT 0/1 0/0 +chr1 178 A T . . GT 0/1 0/0 +chr1 221 TCCA T . . GT 1/1 0/0 +chr1 302 TTTT AAAA . . GT 0/1 0/0 +chr2 1 TTT AAA . . GT 0/0 0/1 +chr2 125 CTT AAA . . GT 0/0 1/1 +chr2 240 ATTTTTCTTGCTCTC A . . GT 1/0 0/0 +chr2 258 CCTTGTATTTTT AA . . GT 1/0 0/0 +chr2 315 C A . . GT 1/0 0/0 +chr2 469 GTGG G . . GT 1/0 0/0 +chr2 471 GG AA . . GT 0/1 0/0 `)) bedout, err := ioutil.ReadFile(tmpdir + "/export.bed") c.Check(err, check.IsNil) -- 2.30.2