From 3194c2deab4750f319ac3e66943863694902be57 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 8 Apr 2021 16:04:05 -0400 Subject: [PATCH] Fix HGVS diff: GGAA>AAAA is GG>AA, not delGG,=AA,insAA. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- hgvs/diff.go | 27 +++++++++++++++++++++++++++ hgvs/diff_test.go | 45 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/hgvs/diff.go b/hgvs/diff.go index 0f6365db26..5a6b625e6b 100644 --- a/hgvs/diff.go +++ b/hgvs/diff.go @@ -2,6 +2,7 @@ package hgvs import ( "fmt" + "strings" "time" "github.com/sergi/go-diff/diffmatchpatch" @@ -92,13 +93,39 @@ func Diff(a, b string, timeout time.Duration) ([]Variant, bool) { } func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) { + out = make([]diffmatchpatch.Diff, 0, len(in)) for i := 0; i < len(in); i++ { d := in[i] + // Merge consecutive entries of same type (e.g., + // "insert A; insert B") for i < len(in)-1 && in[i].Type == in[i+1].Type { d.Text += in[i+1].Text i++ } out = append(out, d) } + in, out = out, make([]diffmatchpatch.Diff, 0, len(in)) + for i := 0; i < len(in); i++ { + d := in[i] + // diffmatchpatch solves diff("AAX","XTX") with + // [delAA,=X,insTX] but we prefer to spell it + // [delAA,insXT,=X]. + // + // So, when we see a [del,=,ins] sequence where the + // "=" part is a suffix of the "ins" part -- e.g., + // [delAAA,=CGG,insTTTCGG] -- we rearrange it to the + // equivalent spelling [delAAA,insCGGTTT,=CGG]. + if i < len(in)-2 && + d.Type == diffmatchpatch.DiffDelete && + in[i+1].Type == diffmatchpatch.DiffEqual && + in[i+2].Type == diffmatchpatch.DiffInsert && + strings.HasSuffix(in[i+2].Text, in[i+1].Text) { + eq, ins := in[i+1], in[i+2] + ins.Text = eq.Text + ins.Text[:len(ins.Text)-len(eq.Text)] + in[i+1] = ins + in[i+2] = eq + } + out = append(out, d) + } return } diff --git a/hgvs/diff_test.go b/hgvs/diff_test.go index 48360efd54..8beb7979d5 100644 --- a/hgvs/diff_test.go +++ b/hgvs/diff_test.go @@ -1,6 +1,7 @@ package hgvs import ( + "strings" "testing" "gopkg.in/check.v1" @@ -21,7 +22,7 @@ func (s *diffSuite) TestDiff(c *check.C) { { a: "aaaaaaaaaa", b: "aaaaCaaaaa", - expect: []string{"5a>C"}, + expect: []string{"5A>C"}, }, { a: "aaaacGcaaa", @@ -58,10 +59,50 @@ func (s *diffSuite) TestDiff(c *check.C) { b: "aaCCttttttC", expect: []string{"3_4delinsCC", "7_8del", "12_13insC"}, }, + { + // without cleanup, diffmatchpatch solves this as {"3del", "=A", "4_5insA"} + a: "aggaggggg", + b: "agAaggggg", + expect: []string{"3G>A"}, + }, + { + // without cleanup, diffmatchpatch solves this as {"3_4del", "=A", "5_6insAA"} + a: "agggaggggg", + b: "agAAaggggg", + expect: []string{"3_4delinsAA"}, + }, + { + // without cleanup, diffmatchpatch solves this as {"3_4del", "=A", "5_6insCA"} + a: "agggaggggg", + b: "agACaggggg", + expect: []string{"3_4delinsAC"}, + }, + { + // without cleanup, diffmatchpatch solves this as {"3_7del", "=A", "8_9insAAACA"} + a: "aggggggaggggg", + b: "agAAAACaggggg", + expect: []string{"3_7delinsAAAAC"}, + }, + { + // without cleanup, diffmatchpatch solves this as {"3_7del", "=AAAA", "11_12insCAAAA"} + a: "aggggggaaaaggggg", + b: "agAAAACaaaaggggg", + expect: []string{"3_7delinsAAAAC"}, + }, + { + a: "agggaggggg", + b: "agCAaggggg", + expect: []string{"3_4delinsCA"}, + }, + { + a: "agggg", + b: "agAAg", + expect: []string{"3_4delinsAA"}, + }, } { c.Log(trial) var vars []string - diffs, _ := Diff(trial.a, trial.b, 0) + diffs, _ := Diff(strings.ToUpper(trial.a), strings.ToUpper(trial.b), 0) for _, v := range diffs { vars = append(vars, v.String()) } -- 2.30.2