From: Tom Clegg Date: Tue, 13 Jul 2021 14:09:16 +0000 (-0400) Subject: Fix up VCF format. X-Git-Url: https://git.arvados.org/lightning.git/commitdiff_plain/1c140a306fb88fc24d0e6b5742e7afd127b2a000 Fix up VCF format. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/export.go b/export.go index fb07fd329e..67d7d56e6c 100644 --- a/export.go +++ b/export.go @@ -26,6 +26,7 @@ import ( type outputFormat struct { Filename string + Head func(out io.Writer, cgs []CompactGenome) Print func(out io.Writer, seqname string, varslice []hgvs.Variant) PadLeft bool } @@ -37,10 +38,11 @@ var ( "pvcf": outputFormatPVCF, "vcf": outputFormatVCF, } - outputFormatHGVS = outputFormat{Filename: "out.csv", Print: printHGVS} - outputFormatHGVSOneHot = outputFormat{Filename: "out.csv", Print: printHGVSOneHot} - outputFormatPVCF = outputFormat{Filename: "out.vcf", Print: printPVCF, PadLeft: true} - outputFormatVCF = outputFormat{Filename: "out.vcf", Print: printVCF, PadLeft: true} + outputFormatHGVS = outputFormat{Filename: "out.csv", Head: headNone, Print: printHGVS} + outputFormatHGVSOneHot = outputFormat{Filename: "out.csv", Head: headNone, Print: printHGVSOneHot} + outputFormatPVCF = outputFormat{Filename: "out.vcf", Head: headPVCF, Print: printPVCF, PadLeft: true} + outputFormatVCF = outputFormat{Filename: "out.vcf", Head: headVCF, Print: printVCF, PadLeft: true} + headNone = func(io.Writer, []CompactGenome) {} ) type exporter struct { @@ -272,22 +274,24 @@ func (cmd *exporter) export(outdir string, bedout io.Writer, tilelib *tileLibrar } if cmd.outputPerChrom { for i, seqname := range seqnames { - f, err := os.OpenFile(filepath.Join(outdir, strings.Replace(cmd.outputFormat.Filename, ".", "."+seqname+".", 1)), os.O_CREATE|os.O_WRONLY, 0666) + f, err := os.OpenFile(filepath.Join(outdir, strings.Replace(cmd.outputFormat.Filename, ".", "."+seqname+".", 1)), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666) if err != nil { return err } defer f.Close() log.Infof("writing %q", f.Name()) + cmd.outputFormat.Head(f, cgs) outw[i] = f } } else { fnm := filepath.Join(outdir, cmd.outputFormat.Filename) log.Infof("writing %q", fnm) - out, err := os.OpenFile(fnm, os.O_CREATE|os.O_WRONLY, 0666) + out, err := os.OpenFile(fnm, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666) if err != nil { return err } defer out.Close() + cmd.outputFormat.Head(out, cgs) merge(out, outw, "output") } if bedout != nil { @@ -477,6 +481,10 @@ func bucketVarsliceByRef(varslice []hgvs.Variant) map[string]map[string]int { return byref } +func headVCF(out io.Writer, cgs []CompactGenome) { + fmt.Fprint(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n") +} + func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { for ref, alts := range bucketVarsliceByRef(varslice) { altslice := make([]string, 0, len(alts)) @@ -492,8 +500,17 @@ func printVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { } info += strconv.Itoa(alts[a]) } - fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\t%s\tGT\t0/1\n", seqname, varslice[0].Position, ref, strings.Join(altslice, ","), info) + fmt.Fprintf(out, "%s\t%d\t.\t%s\t%s\t.\t.\t%s\n", seqname, varslice[0].Position, ref, strings.Join(altslice, ","), info) + } +} + +func headPVCF(out io.Writer, cgs []CompactGenome) { + fmt.Fprintln(out, `##FORMAT=`) + fmt.Fprintf(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT") + for _, cg := range cgs { + fmt.Fprintf(out, "\t%s", cg.Name) } + fmt.Fprintf(out, "\n") } func printPVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { @@ -506,7 +523,7 @@ func printPVCF(out io.Writer, seqname string, varslice []hgvs.Variant) { for i, a := range altslice { alts[a] = i + 1 } - fmt.Fprintf(out, "%s\t%d\t%s\t%s\t.\t.\tGT", seqname, varslice[0].Position, ref, strings.Join(altslice, ",")) + fmt.Fprintf(out, "%s\t%d\t.\t%s\t%s\t.\t.\t.\tGT", seqname, varslice[0].Position, ref, strings.Join(altslice, ",")) for i := 0; i < len(varslice); i += 2 { v1, v2 := varslice[i], varslice[i+1] a1, a2 := alts[v1.New], alts[v2.New] diff --git a/export_test.go b/export_test.go index c3a0b845bd..0971a155bc 100644 --- a/export_test.go +++ b/export_test.go @@ -68,23 +68,27 @@ chr2.471_472delinsAA 1 0 output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf") c.Check(err, check.IsNil) c.Log(string(output)) - c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC . . GT 1/1 0/0 -chr1 41 TT AA . . GT 1/0 0/0 -chr1 161 A T . . GT 0/1 0/0 -chr1 178 A T . . GT 0/1 0/0 -chr1 221 TCCA T . . GT 1/1 0/0 -chr1 302 TTTT AAAA . . GT 0/1 0/0 + c.Check(sortLines(string(output)), check.Equals, sortLines(`##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testdata/pipeline1/input1.1.fasta testdata/pipeline1/input2.1.fasta +chr1 1 . NNN GGC . . . GT 1/1 0/0 +chr1 41 . TT AA . . . GT 1/0 0/0 +chr1 161 . A T . . . GT 0/1 0/0 +chr1 178 . A T . . . GT 0/1 0/0 +chr1 221 . TCCA T . . . GT 1/1 0/0 +chr1 302 . TTTT AAAA . . . GT 0/1 0/0 `)) output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf") c.Check(err, check.IsNil) c.Log(string(output)) - c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA . . GT 0/0 0/1 -chr2 125 CTT AAA . . GT 0/0 1/1 -chr2 240 ATTTTTCTTGCTCTC A . . GT 1/0 0/0 -chr2 258 CCTTGTATTTTT AA . . GT 1/0 0/0 -chr2 315 C A . . GT 1/0 0/0 -chr2 469 GTGG G . . GT 1/0 0/0 -chr2 471 GG AA . . GT 0/1 0/0 + c.Check(sortLines(string(output)), check.Equals, sortLines(`##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testdata/pipeline1/input1.1.fasta testdata/pipeline1/input2.1.fasta +chr2 1 . TTT AAA . . . GT 0/0 0/1 +chr2 125 . CTT AAA . . . GT 0/0 1/1 +chr2 240 . ATTTTTCTTGCTCTC A . . . GT 1/0 0/0 +chr2 258 . CCTTGTATTTTT AA . . . GT 1/0 0/0 +chr2 315 . C A . . . GT 1/0 0/0 +chr2 469 . GTGG G . . . GT 1/0 0/0 +chr2 471 . GG AA . . . GT 0/1 0/0 `)) exited = (&exporter{}).RunCommand("export", []string{ @@ -98,22 +102,24 @@ chr2 471 GG AA . . GT 0/1 0/0 output, err = ioutil.ReadFile(tmpdir + "/out.chr1.vcf") c.Check(err, check.IsNil) c.Log(string(output)) - c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1 1 NNN GGC . . AC=2 GT 0/1 -chr1 41 TT AA . . AC=1 GT 0/1 -chr1 161 A T . . AC=1 GT 0/1 -chr1 178 A T . . AC=1 GT 0/1 -chr1 221 TCCA T . . AC=2 GT 0/1 -chr1 302 TTTT AAAA . . AC=1 GT 0/1 + c.Check(sortLines(string(output)), check.Equals, sortLines(`#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 1 . NNN GGC . . AC=2 +chr1 41 . TT AA . . AC=1 +chr1 161 . A T . . AC=1 +chr1 178 . A T . . AC=1 +chr1 221 . TCCA T . . AC=2 +chr1 302 . TTTT AAAA . . AC=1 `)) output, err = ioutil.ReadFile(tmpdir + "/out.chr2.vcf") c.Check(err, check.IsNil) c.Log(string(output)) - c.Check(sortLines(string(output)), check.Equals, sortLines(`chr2 1 TTT AAA . . AC=1 GT 0/1 -chr2 125 CTT AAA . . AC=2 GT 0/1 -chr2 240 ATTTTTCTTGCTCTC A . . AC=1 GT 0/1 -chr2 258 CCTTGTATTTTT AA . . AC=1 GT 0/1 -chr2 315 C A . . AC=1 GT 0/1 -chr2 469 GTGG G . . AC=1 GT 0/1 -chr2 471 GG AA . . AC=1 GT 0/1 + c.Check(sortLines(string(output)), check.Equals, sortLines(`#CHROM POS ID REF ALT QUAL FILTER INFO +chr2 1 . TTT AAA . . AC=1 +chr2 125 . CTT AAA . . AC=2 +chr2 240 . ATTTTTCTTGCTCTC A . . AC=1 +chr2 258 . CCTTGTATTTTT AA . . AC=1 +chr2 315 . C A . . AC=1 +chr2 469 . GTGG G . . AC=1 +chr2 471 . GG AA . . AC=1 `)) } diff --git a/pipeline_test.go b/pipeline_test.go index 2cd0db872b..e1b6427c2f 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -110,19 +110,21 @@ chr2:g.[471=];[471_472delinsAA] . c.Check(code, check.Equals, 0) vcfout, err := ioutil.ReadFile(tmpdir + "/out.vcf") c.Check(err, check.IsNil) - c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`chr1 1 NNN GGC . . GT 1/1 0/0 -chr1 41 TT AA . . GT 1/0 0/0 -chr1 161 A T . . GT 0/1 0/0 -chr1 178 A T . . GT 0/1 0/0 -chr1 221 TCCA T . . GT 1/1 0/0 -chr1 302 TTTT AAAA . . GT 0/1 0/0 -chr2 1 TTT AAA . . GT 0/0 0/1 -chr2 125 CTT AAA . . GT 0/0 1/1 -chr2 240 ATTTTTCTTGCTCTC A . . GT 1/0 0/0 -chr2 258 CCTTGTATTTTT AA . . GT 1/0 0/0 -chr2 315 C A . . GT 1/0 0/0 -chr2 469 GTGG G . . GT 1/0 0/0 -chr2 471 GG AA . . GT 0/1 0/0 + c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testdata/pipeline1/input1.1.fasta testdata/pipeline1/input2.1.fasta +chr1 1 . NNN GGC . . . GT 1/1 0/0 +chr1 41 . TT AA . . . GT 1/0 0/0 +chr1 161 . A T . . . GT 0/1 0/0 +chr1 178 . A T . . . GT 0/1 0/0 +chr1 221 . TCCA T . . . GT 1/1 0/0 +chr1 302 . TTTT AAAA . . . GT 0/1 0/0 +chr2 1 . TTT AAA . . . GT 0/0 0/1 +chr2 125 . CTT AAA . . . GT 0/0 1/1 +chr2 240 . ATTTTTCTTGCTCTC A . . . GT 1/0 0/0 +chr2 258 . CCTTGTATTTTT AA . . . GT 1/0 0/0 +chr2 315 . C A . . . GT 1/0 0/0 +chr2 469 . GTGG G . . . GT 1/0 0/0 +chr2 471 . GG AA . . . GT 0/1 0/0 `)) bedout, err := ioutil.ReadFile(tmpdir + "/export.bed") c.Check(err, check.IsNil)