From 7e4204348ddd8b46ec417d3a04b8bf55168c934e Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Sat, 4 Dec 2021 01:40:19 -0500 Subject: [PATCH] Call 2-base deletion-insertion as two adjacent SNPs. refs #18496 refs #18438 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff_test.go | 3 ++- export_test.go | 44 ++++++++++++++++++++++++++------------------ hgvs/diff.go | 22 ++++++++++++++++++++++ hgvs/diff_test.go | 25 ++++++++++++++++++++----- pipeline_test.go | 18 ++++++++++++------ 5 files changed, 82 insertions(+), 30 deletions(-) diff --git a/diff_test.go b/diff_test.go index 14ec86058d..e1848b786c 100644 --- a/diff_test.go +++ b/diff_test.go @@ -31,7 +31,8 @@ func (s *diffSuite) TestDiff(c *check.C) { c.Check(exited, check.Equals, 0) c.Check("\n"+output.String(), check.Equals, ` chr2:g.1008C>G chr2 1008 C G false -chr2:g.1028_1029delinsTT chr2 1028 AA TT false +chr2:g.1028A>T chr2 1028 A T false +chr2:g.1029A>T chr2 1029 A T false chr2:g.1032_1033insA chr2 1033 A false `) } diff --git a/export_test.go b/export_test.go index ac7063d2fc..f904e76f4a 100644 --- a/export_test.go +++ b/export_test.go @@ -68,7 +68,8 @@ func (s *exportSuite) TestFastaToHGVS(c *check.C) { c.Logf("%s", out) } c.Check(sortLines(string(output)), check.Equals, sortLines(`chr1.1_3delinsGGC 1 0 -chr1.41_42delinsAA 1 0 +chr1.41T>A 1 0 +chr1.42T>A 1 0 chr1.161A>T 1 0 chr1.178A>T 1 0 chr1.222_224del 1 0 @@ -82,7 +83,8 @@ chr2.241_254del 1 0 chr2.258_269delinsAA 1 0 chr2.315C>A 1 0 chr2.470_472del 1 0 -chr2.471_472delinsAA 1 0 +chr2.471G>A 1 0 +chr2.472G>A 1 0 `)) labels, err := ioutil.ReadFile(tmpdir + "/labels.csv") c.Check(err, check.IsNil) @@ -104,7 +106,8 @@ chr2.471_472delinsAA 1 0 c.Check(sortLines(string(output)), check.Equals, sortLines(`##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testdata/pipeline1/input1.1.fasta testdata/pipeline1/input2.1.fasta chr1 1 . NNN GGC . . . GT 1/1 0/0 -chr1 41 . TT AA . . . GT 1/0 0/0 +chr1 41 . T A . . . GT 1/0 0/0 +chr1 42 . T A . . . GT 1/0 0/0 chr1 161 . A T . . . GT 0/1 0/0 chr1 178 . A T . . . GT 0/1 0/0 chr1 221 . TCCA T . . . GT 1/1 0/0 @@ -121,7 +124,8 @@ chr2 240 . ATTTTTCTTGCTCTC A . . . GT 1/0 0/0 chr2 258 . CCTTGTATTTTT AA . . . GT 1/0 0/0 chr2 315 . C A . . . GT 1/0 0/0 chr2 469 . GTGG G . . . GT 1/0 0/0 -chr2 471 . GG AA . . . GT 0/1 0/0 +chr2 471 . G A . . . GT 0/1 0/0 +chr2 472 . G A . . . GT 0/1 0/0 `)) exited = (&exporter{}).RunCommand("export", []string{ @@ -137,7 +141,8 @@ chr2 471 . GG AA . . . GT 0/1 0/0 c.Log(string(output)) c.Check(sortLines(string(output)), check.Equals, sortLines(`#CHROM POS ID REF ALT QUAL FILTER INFO chr1 1 . NNN GGC . . AC=2 -chr1 41 . TT AA . . AC=1 +chr1 41 . T A . . AC=1 +chr1 42 . T A . . AC=1 chr1 161 . A T . . AC=1 chr1 178 . A T . . AC=1 chr1 221 . TCCA T . . AC=2 @@ -153,7 +158,8 @@ chr2 240 . ATTTTTCTTGCTCTC A . . AC=1 chr2 258 . CCTTGTATTTTT AA . . AC=1 chr2 315 . C A . . AC=1 chr2 469 . GTGG G . . AC=1 -chr2 471 . GG AA . . AC=1 +chr2 471 . G A . . AC=1 +chr2 472 . G A . . AC=1 `)) c.Logf("export hgvs-numpy") @@ -175,10 +181,10 @@ chr2 471 . GG AA . . AC=1 c.Assert(err, check.IsNil) variants, err := npy.GetInt8() c.Assert(err, check.IsNil) - c.Check(variants, check.HasLen, 6*2*2) // 6 variants * 2 alleles * 2 genomes + c.Check(variants, check.HasLen, 7*2*2) // 7 variants * 2 alleles * 2 genomes c.Check(variants, check.DeepEquals, []int8{ - 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, // input1.1.fasta - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, // input2.1.fasta + 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, // input1.1.fasta + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, // input2.1.fasta }) f, err = os.Open(outdir + "/matrix.chr2.npy") @@ -188,21 +194,22 @@ chr2 471 . GG AA . . AC=1 c.Assert(err, check.IsNil) variants, err = npy.GetInt8() c.Assert(err, check.IsNil) - c.Check(variants, check.HasLen, 7*2*2) // 6 variants * 2 alleles * 2 genomes + c.Check(variants, check.HasLen, 8*2*2) // 8 variants * 2 alleles * 2 genomes c.Check(variants, check.DeepEquals, []int8{ - 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // input1.1.fasta - 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // input2.1.fasta + 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // input1.1.fasta + 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // input2.1.fasta }) annotations, err := ioutil.ReadFile(outdir + "/annotations.chr1.csv") c.Check(err, check.IsNil) c.Logf("%s", string(annotations)) c.Check(string(annotations), check.Equals, `0,"chr1.1_3delinsGGC" -1,"chr1.41_42delinsAA" -2,"chr1.161A>T" -3,"chr1.178A>T" -4,"chr1.222_224del" -5,"chr1.302_305delinsAAAA" +1,"chr1.41T>A" +2,"chr1.42T>A" +3,"chr1.161A>T" +4,"chr1.178A>T" +5,"chr1.222_224del" +6,"chr1.302_305delinsAAAA" `) annotations, err = ioutil.ReadFile(outdir + "/annotations.chr2.csv") c.Check(err, check.IsNil) @@ -212,7 +219,8 @@ chr2 471 . GG AA . . AC=1 3,"chr2.258_269delinsAA" 4,"chr2.315C>A" 5,"chr2.470_472del" -6,"chr2.471_472delinsAA" +6,"chr2.471G>A" +7,"chr2.472G>A" `) c.Logf("export hgvs-numpy with p-value threshold") diff --git a/hgvs/diff.go b/hgvs/diff.go index 12fa7c77c4..206cf7e56c 100644 --- a/hgvs/diff.go +++ b/hgvs/diff.go @@ -93,6 +93,17 @@ func Diff(a, b string, timeout time.Duration) ([]Variant, bool) { v.New += diffs[i].Text } } + if len(v.Ref) == 2 && len(v.New) == 2 { + v1 := v + v1.Ref = v1.Ref[:1] + v1.New = v1.New[:1] + v.Ref = v.Ref[1:] + v.New = v.New[1:] + v.Position++ + v.Left = v1.Ref + pos++ + variants = append(variants, v1) + } pos += len(v.Ref) variants = append(variants, v) left = "" @@ -193,6 +204,17 @@ func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) { } out = append(out, d) } + // for i := 0; i < len(out)-1; i++ { + // if out[i].Type == diffmatchpatch.DiffDelete && len(out[i].Text) == 2 && + // out[i+1].Type == diffmatchpatch.DiffInsert && len(out[i+1].Text) == 2 { + // out = append(out, diffmatchpatch.Diff{}, diffmatchpatch.Diff{}) + // copy(out[i+4:], out[i+2:]) + // out[i+2] = diffmatchpatch.Diff{diffmatchpatch.DiffDelete, out[i].Text[1:]} + // out[i+3] = diffmatchpatch.Diff{diffmatchpatch.DiffInsert, out[i+1].Text[1:]} + // out[i].Text = out[i].Text[:1] + // out[i+1].Text = out[i+1].Text[:1] + // } + // } return } diff --git a/hgvs/diff_test.go b/hgvs/diff_test.go index 8a0d0c8c94..a5e861e35a 100644 --- a/hgvs/diff_test.go +++ b/hgvs/diff_test.go @@ -61,7 +61,7 @@ func (s *diffSuite) TestDiff(c *check.C) { { a: "aaGGttAAtttt", b: "aaCCttttttC", - expect: []string{"3_4delinsCC", "7_8del", "12_13insC"}, + expect: []string{"3G>C", "4G>C", "7_8del", "12_13insC"}, }, { // without cleanup, diffmatchpatch solves this as {"3del", "=A", "4_5insA"} @@ -73,13 +73,13 @@ func (s *diffSuite) TestDiff(c *check.C) { // without cleanup, diffmatchpatch solves this as {"3_4del", "=A", "5_6insAA"} a: "agggaggggg", b: "agAAaggggg", - expect: []string{"3_4delinsAA"}, + expect: []string{"3G>A", "4G>A"}, }, { // without cleanup, diffmatchpatch solves this as {"3_4del", "=A", "5_6insCA"} a: "agggaggggg", b: "agACaggggg", - expect: []string{"3_4delinsAC"}, + expect: []string{"3G>A", "4G>C"}, }, { // without cleanup, diffmatchpatch solves this as {"3_7del", "=A", "8_9insAAACA"} @@ -96,12 +96,17 @@ func (s *diffSuite) TestDiff(c *check.C) { { a: "agggaggggg", b: "agCAaggggg", - expect: []string{"3_4delinsCA"}, + expect: []string{"3G>C", "4G>A"}, }, { a: "agggg", b: "agAAg", - expect: []string{"3_4delinsAA"}, + expect: []string{"3G>A", "4G>A"}, + }, + { + a: "aggggg", + b: "agAAAg", + expect: []string{"3_5delinsAAA"}, }, { a: "acgtgaa", @@ -133,6 +138,16 @@ func (s *diffSuite) TestDiff(c *check.C) { b: "tcacaagac", expect: []string{"4T>C", "6del"}, }, + { + a: "tcatcgagac", + b: "tcGCcgagac", + expect: []string{"3A>G", "4T>C"}, + }, + { + a: "tcatcgagac", + b: "tcGCcggac", + expect: []string{"3A>G", "4T>C", "7del"}, + }, } { c.Log(trial) var vars []string diff --git a/pipeline_test.go b/pipeline_test.go index c208c645cc..391312e507 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -96,7 +96,8 @@ func (s *pipelineSuite) TestImportMerge(c *check.C) { hgvsout, err := ioutil.ReadFile(tmpdir + "/out.tsv") c.Check(err, check.IsNil) c.Check(sortLines(string(hgvsout)), check.Equals, sortLines(`chr1:g.1_3delinsGGC N -chr1:g.[41_42delinsAA];[41=] N +chr1:g.[41T>A];[41=] N +chr1:g.[42T>A];[42=] N chr1:g.[161=];[161A>T] N chr1:g.[178=];[178A>T] N chr1:g.222_224del N @@ -107,7 +108,8 @@ chr2:g.[241_254del];[241=] . chr2:g.[258_269delinsAA];[258=] . chr2:g.[315C>A];[315=] . chr2:g.[470_472del];[470=] . -chr2:g.[471=];[471_472delinsAA] . +chr2:g.[471=];[471G>A] . +chr2:g.[472=];[472G>A] . `)) code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-dir", tmpdir, "-output-format", "pvcf", "-input-dir", tmpdir + "/merged", "-output-bed", tmpdir + "/export.bed", "-output-per-chromosome=false"}, bytes.NewReader(nil), os.Stderr, os.Stderr) @@ -117,7 +119,8 @@ chr2:g.[471=];[471_472delinsAA] . c.Check(sortLines(string(vcfout)), check.Equals, sortLines(`##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testdata/pipeline1/input1.1.fasta testdata/pipeline1/input2.1.fasta chr1 1 . NNN GGC . . . GT 1/1 0/0 -chr1 41 . TT AA . . . GT 1/0 0/0 +chr1 41 . T A . . . GT 1/0 0/0 +chr1 42 . T A . . . GT 1/0 0/0 chr1 161 . A T . . . GT 0/1 0/0 chr1 178 . A T . . . GT 0/1 0/0 chr1 221 . TCCA T . . . GT 1/1 0/0 @@ -128,7 +131,8 @@ chr2 240 . ATTTTTCTTGCTCTC A . . . GT 1/0 0/0 chr2 258 . CCTTGTATTTTT AA . . . GT 1/0 0/0 chr2 315 . C A . . . GT 1/0 0/0 chr2 469 . GTGG G . . . GT 1/0 0/0 -chr2 471 . GG AA . . . GT 0/1 0/0 +chr2 471 . G A . . . GT 0/1 0/0 +chr2 472 . G A . . . GT 0/1 0/0 `)) bedout, err := ioutil.ReadFile(tmpdir + "/export.bed") c.Check(err, check.IsNil) @@ -155,14 +159,16 @@ chr2 472 572 7 1000 . 496 572 0,0,8d4fe9a63921b,chr1:g.222_224del 0,0,ba4263ca4199c,chr1:g.1_3delinsGGC 0,0,ba4263ca4199c,chr1:g.222_224del -0,0,ba4263ca4199c,chr1:g.41_42delinsAA +0,0,ba4263ca4199c,chr1:g.41T>A +0,0,ba4263ca4199c,chr1:g.42T>A 1,1,139890345dbb8,chr1:g.302_305delinsAAAA 4,4,cbfca15d241d3,chr2:g.125_127delinsAAA 4,4,cbfca15d241d3,chr2:g.1_3delinsAAA 4,4,f5fafe9450b02,chr2:g.241_245delinsAAAAA 4,4,f5fafe9450b02,chr2:g.291C>A 4,4,fe9a71a42adb4,chr2:g.125_127delinsAAA -6,6,e36dce85efbef,chr2:g.471_472delinsAA +6,6,e36dce85efbef,chr2:g.471G>A +6,6,e36dce85efbef,chr2:g.472G>A 6,6,f81388b184f4a,chr2:g.470_472del `)) } -- 2.30.2