Fix left-most diff cases.
[lightning.git] / hgvs / diff.go
index d920663f4403b6b06f75ef977951d4368168ba06..780cfacf10c1d3ba3b802f8c249fce4d1eef6031 100644 (file)
@@ -1,7 +1,12 @@
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
 package hgvs
 
 import (
        "fmt"
+       "strings"
        "time"
 
        "github.com/sergi/go-diff/diffmatchpatch"
@@ -18,6 +23,10 @@ func (v *Variant) String() string {
        switch {
        case len(v.New) == 0 && len(v.Ref) == 0:
                return fmt.Sprintf("%d=", v.Position)
+       case len(v.New) == 1 && v.New == v.Ref:
+               return fmt.Sprintf("%d=", v.Position)
+       case v.New == v.Ref:
+               return fmt.Sprintf("%d_%d=", v.Position, v.Position+len(v.Ref)-1)
        case len(v.New) == 0 && len(v.Ref) == 1:
                return fmt.Sprintf("%ddel", v.Position)
        case len(v.New) == 0:
@@ -63,52 +72,179 @@ func Diff(a, b string, timeout time.Duration) ([]Variant, bool) {
                timedOut = true
        }
        diffs = cleanup(dmp.DiffCleanupEfficiency(diffs))
-       left := "" // last char before an insertion or deletion
        pos := 1
        var variants []Variant
-       for i := 0; i < len(diffs); i++ {
-               switch diffs[i].Type {
-               case diffmatchpatch.DiffEqual:
+       for i := 0; i < len(diffs); {
+               left := "" // last char before an insertion or deletion
+               for ; i < len(diffs) && diffs[i].Type == diffmatchpatch.DiffEqual; i++ {
                        pos += len(diffs[i].Text)
                        if tlen := len(diffs[i].Text); tlen > 0 {
                                left = diffs[i].Text[tlen-1:]
-                       } else {
-                               left = ""
                        }
-               case diffmatchpatch.DiffDelete:
-                       if i+1 < len(diffs) && diffs[i+1].Type == diffmatchpatch.DiffInsert {
-                               // deletion followed by insertion
-                               variants = append(variants, Variant{Position: pos, Ref: diffs[i].Text, New: diffs[i+1].Text})
-                               pos += len(diffs[i].Text)
-                               i++
-                       } else {
-                               variants = append(variants, Variant{Position: pos, Ref: diffs[i].Text, Left: left})
-                               pos += len(diffs[i].Text)
-                       }
-                       left = ""
-               case diffmatchpatch.DiffInsert:
-                       if i+1 < len(diffs) && diffs[i+1].Type == diffmatchpatch.DiffDelete {
-                               // insertion followed by deletion
-                               variants = append(variants, Variant{Position: pos, Ref: diffs[i+1].Text, New: diffs[i].Text})
-                               pos += len(diffs[i+1].Text)
-                               i++
+               }
+               if i >= len(diffs) {
+                       break
+               }
+               v := Variant{Position: pos, Left: left}
+               for ; i < len(diffs) && diffs[i].Type != diffmatchpatch.DiffEqual; i++ {
+                       if diffs[i].Type == diffmatchpatch.DiffDelete {
+                               v.Ref += diffs[i].Text
                        } else {
-                               variants = append(variants, Variant{Position: pos, New: diffs[i].Text, Left: left})
+                               v.New += diffs[i].Text
                        }
-                       left = ""
                }
+               if len(v.Ref) == 2 && len(v.New) == 2 {
+                       v1 := v
+                       v1.Ref = v1.Ref[:1]
+                       v1.New = v1.New[:1]
+                       v.Ref = v.Ref[1:]
+                       v.New = v.New[1:]
+                       v.Position++
+                       v.Left = v1.Ref
+                       pos++
+                       variants = append(variants, v1)
+               }
+               pos += len(v.Ref)
+               variants = append(variants, v)
+               left = ""
        }
        return variants, timedOut
 }
 
 func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) {
+       out = make([]diffmatchpatch.Diff, 0, len(in))
        for i := 0; i < len(in); i++ {
                d := in[i]
+               // Merge consecutive entries of same type (e.g.,
+               // "insert A; insert B")
                for i < len(in)-1 && in[i].Type == in[i+1].Type {
                        d.Text += in[i+1].Text
                        i++
                }
                out = append(out, d)
        }
+       in, out = out, make([]diffmatchpatch.Diff, 0, len(in))
+       for i := 0; i < len(in); i++ {
+               d := in[i]
+               // diffmatchpatch solves diff("AAX","XTX") with
+               // [delAA,=X,insTX] but we prefer to spell it
+               // [delAA,insXT,=X].
+               //
+               // So, when we see a [del,=,ins] sequence where the
+               // "=" part is a suffix of the "ins" part -- e.g.,
+               // [delAAA,=CGG,insTTTCGG] -- we rearrange it to the
+               // equivalent spelling [delAAA,insCGGTTT,=CGG].
+               if i < len(in)-2 &&
+                       d.Type == diffmatchpatch.DiffDelete &&
+                       in[i+1].Type == diffmatchpatch.DiffEqual &&
+                       in[i+2].Type == diffmatchpatch.DiffInsert &&
+                       strings.HasSuffix(in[i+2].Text, in[i+1].Text) {
+                       eq, ins := in[i+1], in[i+2]
+                       ins.Text = eq.Text + ins.Text[:len(ins.Text)-len(eq.Text)]
+                       in[i+1] = ins
+                       in[i+2] = eq
+               }
+               // when diffmatchpatch says [=yyyyXXXX, delX, =zzz],
+               // we really want [=yyyy, delX, =XXXXzzz] (ditto for
+               // ins instead of del)
+               if i < len(in)-2 &&
+                       d.Type == diffmatchpatch.DiffEqual &&
+                       in[i+1].Type != diffmatchpatch.DiffEqual &&
+                       in[i+2].Type == diffmatchpatch.DiffEqual &&
+                       len(in[i+1].Text) <= len(d.Text) {
+                       for cut := 0; cut < len(d.Text); cut++ {
+                               skip := strings.Index(d.Text[cut:], in[i+1].Text)
+                               if skip < 0 {
+                                       break
+                               }
+                               cut += skip
+                               if d.Text[cut:]+in[i+1].Text == in[i+1].Text+d.Text[cut:] {
+                                       in[i+2].Text = d.Text[cut:] + in[i+2].Text
+                                       d.Text = d.Text[:cut]
+                                       break
+                               }
+                       }
+               }
+               // diffmatchpatch solves diff("AXX","XXX") with
+               // [delA,=XX,insX] but we prefer to spell it
+               // [delA,insX,=XX].
+               //
+               // So, when we see a [del,=,ins] sequence that has the
+               // same effect after swapping the "=" and "ins" parts,
+               // we swap them.
+               if i < len(in)-2 &&
+                       d.Type == diffmatchpatch.DiffDelete &&
+                       in[i+1].Type == diffmatchpatch.DiffEqual &&
+                       in[i+2].Type == diffmatchpatch.DiffInsert &&
+                       in[i+1].Text+in[i+2].Text == in[i+2].Text+in[i+1].Text {
+                       in[i+2], in[i+1] = in[i+1], in[i+2]
+               }
+               // when diffmatchpatch says [delAAA, insXAY] and
+               // len(X)==1, we prefer to treat the A>X as a snp.
+               if i < len(in)-1 &&
+                       d.Type == diffmatchpatch.DiffDelete &&
+                       in[i+1].Type == diffmatchpatch.DiffInsert &&
+                       len(d.Text) >= 2 &&
+                       len(in[i+1].Text) >= 2 &&
+                       d.Text[1] == in[i+1].Text[1] {
+                       eqend := 2
+                       for ; eqend < len(d.Text) && eqend < len(in[i+1].Text) && d.Text[eqend] == in[i+1].Text[eqend]; eqend++ {
+                       }
+                       out = append(out,
+                               diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[:1]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[:1]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffEqual, d.Text[1:eqend]})
+                       in[i].Text, in[i+1].Text = in[i].Text[eqend:], in[i+1].Text[eqend:]
+                       i--
+                       continue
+               }
+               // when diffmatchpatch says [delAAA, insXaY] and
+               // len(Y)==1, we prefer to treat the A>Y as a snp.
+               if i < len(in)-1 &&
+                       d.Type == diffmatchpatch.DiffDelete &&
+                       in[i+1].Type == diffmatchpatch.DiffInsert &&
+                       len(d.Text) >= 2 &&
+                       len(in[i+1].Text) >= 2 &&
+                       d.Text[len(d.Text)-2] == in[i+1].Text[len(in[i+1].Text)-2] {
+                       // eqstart will be the number of equal chars
+                       // before the terminal snp, plus 1 for the snp
+                       // itself. Example, for [delAAAA, insTTAAG],
+                       // eqstart will be 3.
+                       eqstart := 2
+                       for ; eqstart < len(d.Text) && eqstart < len(in[i+1].Text) && d.Text[len(d.Text)-eqstart] == in[i+1].Text[len(in[i+1].Text)-eqstart]; eqstart++ {
+                       }
+                       eqstart--
+                       out = append(out,
+                               diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[:len(d.Text)-eqstart]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[:len(in[i+1].Text)-eqstart]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffEqual, d.Text[len(d.Text)-eqstart : len(d.Text)-1]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[len(d.Text)-1:]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[len(in[i+1].Text)-1:]})
+                       i++
+                       continue
+               }
+               out = append(out, d)
+       }
+       // for i := 0; i < len(out)-1; i++ {
+       //      if out[i].Type == diffmatchpatch.DiffDelete && len(out[i].Text) == 2 &&
+       //              out[i+1].Type == diffmatchpatch.DiffInsert && len(out[i+1].Text) == 2 {
+       //              out = append(out, diffmatchpatch.Diff{}, diffmatchpatch.Diff{})
+       //              copy(out[i+4:], out[i+2:])
+       //              out[i+2] = diffmatchpatch.Diff{diffmatchpatch.DiffDelete, out[i].Text[1:]}
+       //              out[i+3] = diffmatchpatch.Diff{diffmatchpatch.DiffInsert, out[i+1].Text[1:]}
+       //              out[i].Text = out[i].Text[:1]
+       //              out[i+1].Text = out[i+1].Text[:1]
+       //      }
+       // }
        return
 }
+
+func Less(a, b Variant) bool {
+       if a.Position != b.Position {
+               return a.Position < b.Position
+       } else if a.New != b.New {
+               return a.New < b.New
+       } else {
+               return a.Ref < b.Ref
+       }
+}