Fix diff case
[lightning.git] / hgvs / diff.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package hgvs
6
7 import (
8         "fmt"
9         "strings"
10         "time"
11
12         "github.com/sergi/go-diff/diffmatchpatch"
13 )
14
15 type Variant struct {
16         Position int
17         Ref      string
18         New      string
19         Left     string // base preceding an indel, if Ref or New is empty
20 }
21
22 func (v *Variant) String() string {
23         switch {
24         case len(v.New) == 0 && len(v.Ref) == 0:
25                 return fmt.Sprintf("%d=", v.Position)
26         case len(v.New) == 1 && v.New == v.Ref:
27                 return fmt.Sprintf("%d=", v.Position)
28         case v.New == v.Ref:
29                 return fmt.Sprintf("%d_%d=", v.Position, v.Position+len(v.Ref)-1)
30         case len(v.New) == 0 && len(v.Ref) == 1:
31                 return fmt.Sprintf("%ddel", v.Position)
32         case len(v.New) == 0:
33                 return fmt.Sprintf("%d_%ddel", v.Position, v.Position+len(v.Ref)-1)
34         case len(v.Ref) == 1 && len(v.New) == 1:
35                 return fmt.Sprintf("%d%s>%s", v.Position, v.Ref, v.New)
36         case len(v.Ref) == 0:
37                 return fmt.Sprintf("%d_%dins%s", v.Position-1, v.Position, v.New)
38         case len(v.Ref) == 1 && len(v.New) > 0:
39                 return fmt.Sprintf("%ddelins%s", v.Position, v.New)
40         default:
41                 return fmt.Sprintf("%d_%ddelins%s", v.Position, v.Position+len(v.Ref)-1, v.New)
42         }
43 }
44
45 // PadLeft returns a Variant that is equivalent to v but (if possible)
46 // uses the stashed preceding base (the Left field) to avoid having a
47 // non-empty Ref or New part, even for an insertion or deletion.
48 //
49 // For example, if v is {Position: 45, Ref: "", New: "A"}, PadLeft
50 // might return {Position: 44, Ref: "T", New: "TA"}.
51 func (v *Variant) PadLeft() Variant {
52         if len(v.Ref) == 0 || len(v.New) == 0 {
53                 return Variant{
54                         Position: v.Position - len(v.Left),
55                         Ref:      v.Left + v.Ref,
56                         New:      v.Left + v.New,
57                 }
58         } else {
59                 return *v
60         }
61 }
62
63 func Diff(a, b string, timeout time.Duration) ([]Variant, bool) {
64         dmp := diffmatchpatch.New()
65         var deadline time.Time
66         if timeout > 0 {
67                 deadline = time.Now().Add(timeout)
68         }
69         diffs := dmp.DiffBisect(a, b, deadline)
70         timedOut := false
71         if timeout > 0 && time.Now().After(deadline) {
72                 timedOut = true
73         }
74         diffs = cleanup(dmp.DiffCleanupEfficiency(diffs))
75         pos := 1
76         var variants []Variant
77         for i := 0; i < len(diffs); {
78                 left := "" // last char before an insertion or deletion
79                 for ; i < len(diffs) && diffs[i].Type == diffmatchpatch.DiffEqual; i++ {
80                         pos += len(diffs[i].Text)
81                         if tlen := len(diffs[i].Text); tlen > 0 {
82                                 left = diffs[i].Text[tlen-1:]
83                         }
84                 }
85                 if i >= len(diffs) {
86                         break
87                 }
88                 v := Variant{Position: pos, Left: left}
89                 for ; i < len(diffs) && diffs[i].Type != diffmatchpatch.DiffEqual; i++ {
90                         if diffs[i].Type == diffmatchpatch.DiffDelete {
91                                 v.Ref += diffs[i].Text
92                         } else {
93                                 v.New += diffs[i].Text
94                         }
95                 }
96                 if len(v.Ref) == 2 && len(v.New) == 2 {
97                         v1 := v
98                         v1.Ref = v1.Ref[:1]
99                         v1.New = v1.New[:1]
100                         v.Ref = v.Ref[1:]
101                         v.New = v.New[1:]
102                         v.Position++
103                         v.Left = v1.Ref
104                         pos++
105                         variants = append(variants, v1)
106                 }
107                 pos += len(v.Ref)
108                 variants = append(variants, v)
109                 left = ""
110         }
111         return variants, timedOut
112 }
113
114 func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) {
115         out = make([]diffmatchpatch.Diff, 0, len(in))
116         for i := 0; i < len(in); i++ {
117                 d := in[i]
118                 // Merge consecutive entries of same type (e.g.,
119                 // "insert A; insert B")
120                 for i < len(in)-1 && in[i].Type == in[i+1].Type {
121                         d.Text += in[i+1].Text
122                         i++
123                 }
124                 out = append(out, d)
125         }
126         in, out = out, make([]diffmatchpatch.Diff, 0, len(in))
127         for i := 0; i < len(in); i++ {
128                 d := in[i]
129                 // when diffmatchpatch says [=yyyyXXXX, delX, =zzz],
130                 // we really want [=yyyy, delX, =XXXXzzz] (ditto for
131                 // ins instead of del)
132                 if i < len(in)-2 &&
133                         d.Type == diffmatchpatch.DiffEqual &&
134                         in[i+1].Type != diffmatchpatch.DiffEqual &&
135                         in[i+2].Type == diffmatchpatch.DiffEqual &&
136                         len(in[i+1].Text) <= len(d.Text) {
137                         for cut := 0; cut < len(d.Text)-len(in[i+1].Text); cut++ {
138                                 if d.Text[cut:] == d.Text[cut+len(in[i+1].Text):]+in[i+1].Text {
139                                         in[i+2].Text = d.Text[cut+len(in[i+1].Text):] + in[i+1].Text + in[i+2].Text
140                                         in[i+1].Text = d.Text[cut : cut+len(in[i+1].Text)]
141                                         d.Text = d.Text[:cut]
142                                         break
143                                 }
144                         }
145                 }
146                 // diffmatchpatch solves diff("AAX","XTX") with
147                 // [delAA,=X,insTX] but we prefer to spell it
148                 // [delAA,insXT,=X].
149                 //
150                 // So, when we see a [del,=,ins] sequence where the
151                 // "=" part is a suffix of the "ins" part -- e.g.,
152                 // [delAAA,=CGG,insTTTCGG] -- we rearrange it to the
153                 // equivalent spelling [delAAA,insCGGTTT,=CGG].
154                 if i < len(in)-2 &&
155                         d.Type == diffmatchpatch.DiffDelete &&
156                         in[i+1].Type == diffmatchpatch.DiffEqual &&
157                         in[i+2].Type == diffmatchpatch.DiffInsert &&
158                         strings.HasSuffix(in[i+2].Text, in[i+1].Text) {
159                         eq, ins := in[i+1], in[i+2]
160                         ins.Text = eq.Text + ins.Text[:len(ins.Text)-len(eq.Text)]
161                         in[i+1] = ins
162                         in[i+2] = eq
163                 }
164                 // diffmatchpatch solves diff("AXX","XXX") with
165                 // [delA,=XX,insX] but we prefer to spell it
166                 // [delA,insX,=XX].
167                 //
168                 // So, when we see a [del,=,ins] sequence that has the
169                 // same effect after swapping the "=" and "ins" parts,
170                 // we swap them.
171                 if i < len(in)-2 &&
172                         d.Type == diffmatchpatch.DiffDelete &&
173                         in[i+1].Type == diffmatchpatch.DiffEqual &&
174                         in[i+2].Type == diffmatchpatch.DiffInsert &&
175                         in[i+1].Text+in[i+2].Text == in[i+2].Text+in[i+1].Text {
176                         in[i+2], in[i+1] = in[i+1], in[i+2]
177                 }
178                 // Likewise, diffmatchpatch solves
179                 // diff("XXXA","XXAA") with [delX,=XXA,insA], we
180                 // prefer [=XX,delX,insA,=A]
181                 if i < len(in)-2 &&
182                         d.Type == diffmatchpatch.DiffDelete &&
183                         in[i+1].Type == diffmatchpatch.DiffEqual &&
184                         in[i+2].Type == diffmatchpatch.DiffInsert {
185                         redo := false
186                         for x := len(d.Text); x <= len(in[i+1].Text)-len(in[i+2].Text); x++ {
187                                 // d  in[i+1]  in[i+2]
188                                 // x  xxx aaa  a
189                                 //       ^
190                                 // x  xx
191                                 //    xxx
192                                 //        aaa
193                                 //         aa  a
194                                 if d.Text+in[i+1].Text[:x-len(d.Text)] == in[i+1].Text[:x] &&
195                                         in[i+1].Text[x:] == in[i+1].Text[x+len(in[i+2].Text):]+in[i+2].Text {
196                                         out = append(out, diffmatchpatch.Diff{diffmatchpatch.DiffEqual, d.Text + in[i+1].Text[:x-len(d.Text)]})
197                                         in[i], in[i+1], in[i+2] = diffmatchpatch.Diff{diffmatchpatch.DiffDelete, in[i+1].Text[x-len(d.Text) : x]},
198                                                 diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[x : x+len(in[i+2].Text)]},
199                                                 diffmatchpatch.Diff{diffmatchpatch.DiffEqual, in[i+1].Text[x+len(in[i+2].Text):] + in[i+2].Text}
200                                         redo = true
201                                         break
202                                 }
203                         }
204                         if redo {
205                                 i--
206                                 continue
207                         }
208                 }
209                 // when diffmatchpatch says [delAAA, insXAY] and
210                 // len(X)==1, we prefer to treat the A>X as a snp.
211                 if i < len(in)-1 &&
212                         d.Type == diffmatchpatch.DiffDelete &&
213                         in[i+1].Type == diffmatchpatch.DiffInsert &&
214                         len(d.Text) >= 2 &&
215                         len(in[i+1].Text) >= 2 &&
216                         d.Text[1] == in[i+1].Text[1] {
217                         eqend := 2
218                         for ; eqend < len(d.Text) && eqend < len(in[i+1].Text) && d.Text[eqend] == in[i+1].Text[eqend]; eqend++ {
219                         }
220                         out = append(out,
221                                 diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[:1]},
222                                 diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[:1]},
223                                 diffmatchpatch.Diff{diffmatchpatch.DiffEqual, d.Text[1:eqend]})
224                         in[i].Text, in[i+1].Text = in[i].Text[eqend:], in[i+1].Text[eqend:]
225                         i--
226                         continue
227                 }
228                 // when diffmatchpatch says [delAAA, insXaY] and
229                 // len(Y)==1, we prefer to treat the A>Y as a snp.
230                 if i < len(in)-1 &&
231                         d.Type == diffmatchpatch.DiffDelete &&
232                         in[i+1].Type == diffmatchpatch.DiffInsert &&
233                         len(d.Text) >= 2 &&
234                         len(in[i+1].Text) >= 2 &&
235                         d.Text[len(d.Text)-2] == in[i+1].Text[len(in[i+1].Text)-2] {
236                         // eqstart will be the number of equal chars
237                         // before the terminal snp, plus 1 for the snp
238                         // itself. Example, for [delAAAA, insTTAAG],
239                         // eqstart will be 3.
240                         eqstart := 2
241                         for ; eqstart < len(d.Text) && eqstart < len(in[i+1].Text) && d.Text[len(d.Text)-eqstart] == in[i+1].Text[len(in[i+1].Text)-eqstart]; eqstart++ {
242                         }
243                         eqstart--
244                         out = append(out,
245                                 diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[:len(d.Text)-eqstart]},
246                                 diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[:len(in[i+1].Text)-eqstart]},
247                                 diffmatchpatch.Diff{diffmatchpatch.DiffEqual, d.Text[len(d.Text)-eqstart : len(d.Text)-1]},
248                                 diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[len(d.Text)-1:]},
249                                 diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[len(in[i+1].Text)-1:]})
250                         i++
251                         continue
252                 }
253                 // [=AB,insCB,=D] => [=A,insBC,=BD]
254                 // and
255                 // [=AB,delCB,=D] => [=A,delBC,=BD]
256                 if i < len(in)-2 &&
257                         d.Type == diffmatchpatch.DiffEqual &&
258                         in[i+1].Type != diffmatchpatch.DiffEqual &&
259                         in[i+2].Type == diffmatchpatch.DiffEqual &&
260                         len(d.Text) > 0 && len(in[i+1].Text) > 0 &&
261                         !(i+3 < len(in) &&
262                                 // Except: leave deletion alone if an
263                                 // upcoming insertion will be moved up
264                                 // against it: e.g., for
265                                 // [=AB,delCB,=D,insED] we want
266                                 // [=AB,delCB,=D,insED] for now, so it
267                                 // can become [=AB,delCB,insDE,=D] on
268                                 // the next iteration.
269                                 in[i+1].Type == diffmatchpatch.DiffDelete &&
270                                 in[i+3].Type == diffmatchpatch.DiffInsert &&
271                                 strings.HasSuffix(in[i+3].Text, in[i+2].Text)) {
272                         // Find x, length of common suffix B
273                         x := 1
274                         for ; x <= len(d.Text) && x <= len(in[i+1].Text); x++ {
275                                 if d.Text[len(d.Text)-x] != in[i+1].Text[len(in[i+1].Text)-x] {
276                                         break
277                                 }
278                         }
279                         x--
280                         d.Text, in[i+1].Text, in[i+2].Text =
281                                 d.Text[:len(d.Text)-x],
282                                 d.Text[len(d.Text)-x:]+
283                                         in[i+1].Text[:len(in[i+1].Text)-x],
284                                 in[i+1].Text[len(in[i+1].Text)-x:]+in[i+2].Text
285                 }
286                 // [=X,delAX] => [delXA,=X]
287                 if i < len(in)-1 &&
288                         d.Type == diffmatchpatch.DiffEqual &&
289                         in[i+1].Type == diffmatchpatch.DiffDelete && false {
290                 }
291                 out = append(out, d)
292         }
293         in, out = out, make([]diffmatchpatch.Diff, 0, len(in))
294         for _, d := range in {
295                 if len(d.Text) > 0 {
296                         out = append(out, d)
297                 }
298         }
299         // for i := 0; i < len(out)-1; i++ {
300         //      if out[i].Type == diffmatchpatch.DiffDelete && len(out[i].Text) == 2 &&
301         //              out[i+1].Type == diffmatchpatch.DiffInsert && len(out[i+1].Text) == 2 {
302         //              out = append(out, diffmatchpatch.Diff{}, diffmatchpatch.Diff{})
303         //              copy(out[i+4:], out[i+2:])
304         //              out[i+2] = diffmatchpatch.Diff{diffmatchpatch.DiffDelete, out[i].Text[1:]}
305         //              out[i+3] = diffmatchpatch.Diff{diffmatchpatch.DiffInsert, out[i+1].Text[1:]}
306         //              out[i].Text = out[i].Text[:1]
307         //              out[i+1].Text = out[i+1].Text[:1]
308         //      }
309         // }
310         return
311 }
312
313 func Less(a, b Variant) bool {
314         if a.Position != b.Position {
315                 return a.Position < b.Position
316         } else if a.New != b.New {
317                 return a.New < b.New
318         } else {
319                 return a.Ref < b.Ref
320         }
321 }