Fix HGVS diff: GGAA>AAAA is GG>AA, not delGG,=AA,insAA.
author Tom Clegg <tom@tomclegg.ca>
Thu, 8 Apr 2021 20:04:05 +0000 (16:04 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Thu, 8 Apr 2021 20:04:05 +0000 (16:04 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

hgvs/diff.go
hgvs/diff_test.go

index 0f6365db267e3d4051e9066aeb7a0fcda0f4c84b..5a6b625e6b7b6d9e640feb3ef00e2c0ca83d888d 100644 (file)
@@ -2,6 +2,7 @@ package hgvs
 
 import (
        "fmt"
+       "strings"
        "time"
 
        "github.com/sergi/go-diff/diffmatchpatch"
@@ -92,13 +93,39 @@ func Diff(a, b string, timeout time.Duration) ([]Variant, bool) {
 }
 
 func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) {
+       out = make([]diffmatchpatch.Diff, 0, len(in))
        for i := 0; i < len(in); i++ {
                d := in[i]
+               // Merge consecutive entries of same type (e.g.,
+               // "insert A; insert B")
                for i < len(in)-1 && in[i].Type == in[i+1].Type {
                        d.Text += in[i+1].Text
                        i++
                }
                out = append(out, d)
        }
+       in, out = out, make([]diffmatchpatch.Diff, 0, len(in))
+       for i := 0; i < len(in); i++ {
+               d := in[i]
+               // diffmatchpatch solves diff("AAX","XTX") with
+               // [delAA,=X,insTX] but we prefer to spell it
+               // [delAA,insXT,=X].
+               //
+               // So, when we see a [del,=,ins] sequence where the
+               // "=" part is a suffix of the "ins" part -- e.g.,
+               // [delAAA,=CGG,insTTTCGG] -- we rearrange it to the
+               // equivalent spelling [delAAA,insCGGTTT,=CGG].
+               if i < len(in)-2 &&
+                       d.Type == diffmatchpatch.DiffDelete &&
+                       in[i+1].Type == diffmatchpatch.DiffEqual &&
+                       in[i+2].Type == diffmatchpatch.DiffInsert &&
+                       strings.HasSuffix(in[i+2].Text, in[i+1].Text) {
+                       eq, ins := in[i+1], in[i+2]
+                       ins.Text = eq.Text + ins.Text[:len(ins.Text)-len(eq.Text)]
+                       in[i+1] = ins
+                       in[i+2] = eq
+               }
+               out = append(out, d)
+       }
        return
 }
index 48360efd54ec5ddb28155400b0722c9689ab36b6..8beb7979d54df2e7d321ec519eed0f86a50e7167 100644 (file)
@@ -1,6 +1,7 @@
 package hgvs
 
 import (
+       "strings"
        "testing"
 
        "gopkg.in/check.v1"
@@ -21,7 +22,7 @@ func (s *diffSuite) TestDiff(c *check.C) {
                {
                        a:      "aaaaaaaaaa",
                        b:      "aaaaCaaaaa",
-                       expect: []string{"5a>C"},
+                       expect: []string{"5A>C"},
                },
                {
                        a:      "aaaacGcaaa",
@@ -58,10 +59,50 @@ func (s *diffSuite) TestDiff(c *check.C) {
                        b:      "aaCCttttttC",
                        expect: []string{"3_4delinsCC", "7_8del", "12_13insC"},
                },
+               {
+                       // without cleanup, diffmatchpatch solves this as {"3del", "=A", "4_5insA"}
+                       a:      "aggaggggg",
+                       b:      "agAaggggg",
+                       expect: []string{"3G>A"},
+               },
+               {
+                       // without cleanup, diffmatchpatch solves this as {"3_4del", "=A", "5_6insAA"}
+                       a:      "agggaggggg",
+                       b:      "agAAaggggg",
+                       expect: []string{"3_4delinsAA"},
+               },
+               {
+                       // without cleanup, diffmatchpatch solves this as {"3_4del", "=A", "5_6insCA"}
+                       a:      "agggaggggg",
+                       b:      "agACaggggg",
+                       expect: []string{"3_4delinsAC"},
+               },
+               {
+                       // without cleanup, diffmatchpatch solves this as {"3_7del", "=A", "8_9insAAACA"}
+                       a:      "aggggggaggggg",
+                       b:      "agAAAACaggggg",
+                       expect: []string{"3_7delinsAAAAC"},
+               },
+               {
+                       // without cleanup, diffmatchpatch solves this as {"3_7del", "=AAAA", "11_12insCAAAA"}
+                       a:      "aggggggaaaaggggg",
+                       b:      "agAAAACaaaaggggg",
+                       expect: []string{"3_7delinsAAAAC"},
+               },
+               {
+                       a:      "agggaggggg",
+                       b:      "agCAaggggg",
+                       expect: []string{"3_4delinsCA"},
+               },
+               {
+                       a:      "agggg",
+                       b:      "agAAg",
+                       expect: []string{"3_4delinsAA"},
+               },
        } {
                c.Log(trial)
                var vars []string
-               diffs, _ := Diff(trial.a, trial.b, 0)
+               diffs, _ := Diff(strings.ToUpper(trial.a), strings.ToUpper(trial.b), 0)
                for _, v := range diffs {
                        vars = append(vars, v.String())
                }