Avoid empty "ref" field in anno2vcf output.
[lightning.git] / hgvs / diff.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package hgvs
6
7 import (
8         "fmt"
9         "strings"
10         "time"
11
12         "github.com/sergi/go-diff/diffmatchpatch"
13 )
14
15 type Variant struct {
16         Position int
17         Ref      string
18         New      string
19         Left     string // base preceding an indel, if Ref or New is empty
20 }
21
22 func (v *Variant) String() string {
23         switch {
24         case len(v.New) == 0 && len(v.Ref) == 0:
25                 return fmt.Sprintf("%d=", v.Position)
26         case len(v.New) == 0 && len(v.Ref) == 1:
27                 return fmt.Sprintf("%ddel", v.Position)
28         case len(v.New) == 0:
29                 return fmt.Sprintf("%d_%ddel", v.Position, v.Position+len(v.Ref)-1)
30         case len(v.Ref) == 1 && len(v.New) == 1:
31                 return fmt.Sprintf("%d%s>%s", v.Position, v.Ref, v.New)
32         case len(v.Ref) == 0:
33                 return fmt.Sprintf("%d_%dins%s", v.Position-1, v.Position, v.New)
34         case len(v.Ref) == 1 && len(v.New) > 0:
35                 return fmt.Sprintf("%ddelins%s", v.Position, v.New)
36         default:
37                 return fmt.Sprintf("%d_%ddelins%s", v.Position, v.Position+len(v.Ref)-1, v.New)
38         }
39 }
40
41 // PadLeft returns a Variant that is equivalent to v but (if possible)
42 // uses the stashed preceding base (the Left field) to avoid having a
43 // non-empty Ref or New part, even for an insertion or deletion.
44 //
45 // For example, if v is {Position: 45, Ref: "", New: "A"}, PadLeft
46 // might return {Position: 44, Ref: "T", New: "TA"}.
47 func (v *Variant) PadLeft() Variant {
48         if len(v.Ref) == 0 || len(v.New) == 0 {
49                 return Variant{
50                         Position: v.Position - len(v.Left),
51                         Ref:      v.Left + v.Ref,
52                         New:      v.Left + v.New,
53                 }
54         } else {
55                 return *v
56         }
57 }
58
59 func Diff(a, b string, timeout time.Duration) ([]Variant, bool) {
60         dmp := diffmatchpatch.New()
61         var deadline time.Time
62         if timeout > 0 {
63                 deadline = time.Now().Add(timeout)
64         }
65         diffs := dmp.DiffBisect(a, b, deadline)
66         timedOut := false
67         if timeout > 0 && time.Now().After(deadline) {
68                 timedOut = true
69         }
70         diffs = cleanup(dmp.DiffCleanupEfficiency(diffs))
71         pos := 1
72         var variants []Variant
73         for i := 0; i < len(diffs); {
74                 left := "" // last char before an insertion or deletion
75                 for ; i < len(diffs) && diffs[i].Type == diffmatchpatch.DiffEqual; i++ {
76                         pos += len(diffs[i].Text)
77                         if tlen := len(diffs[i].Text); tlen > 0 {
78                                 left = diffs[i].Text[tlen-1:]
79                         }
80                 }
81                 if i >= len(diffs) {
82                         break
83                 }
84                 v := Variant{Position: pos, Left: left}
85                 for ; i < len(diffs) && diffs[i].Type != diffmatchpatch.DiffEqual; i++ {
86                         if diffs[i].Type == diffmatchpatch.DiffDelete {
87                                 v.Ref += diffs[i].Text
88                         } else {
89                                 v.New += diffs[i].Text
90                         }
91                 }
92                 pos += len(v.Ref)
93                 variants = append(variants, v)
94                 left = ""
95         }
96         return variants, timedOut
97 }
98
99 func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) {
100         out = make([]diffmatchpatch.Diff, 0, len(in))
101         for i := 0; i < len(in); i++ {
102                 d := in[i]
103                 // Merge consecutive entries of same type (e.g.,
104                 // "insert A; insert B")
105                 for i < len(in)-1 && in[i].Type == in[i+1].Type {
106                         d.Text += in[i+1].Text
107                         i++
108                 }
109                 out = append(out, d)
110         }
111         in, out = out, make([]diffmatchpatch.Diff, 0, len(in))
112         for i := 0; i < len(in); i++ {
113                 d := in[i]
114                 // diffmatchpatch solves diff("AAX","XTX") with
115                 // [delAA,=X,insTX] but we prefer to spell it
116                 // [delAA,insXT,=X].
117                 //
118                 // So, when we see a [del,=,ins] sequence where the
119                 // "=" part is a suffix of the "ins" part -- e.g.,
120                 // [delAAA,=CGG,insTTTCGG] -- we rearrange it to the
121                 // equivalent spelling [delAAA,insCGGTTT,=CGG].
122                 if i < len(in)-2 &&
123                         d.Type == diffmatchpatch.DiffDelete &&
124                         in[i+1].Type == diffmatchpatch.DiffEqual &&
125                         in[i+2].Type == diffmatchpatch.DiffInsert &&
126                         strings.HasSuffix(in[i+2].Text, in[i+1].Text) {
127                         eq, ins := in[i+1], in[i+2]
128                         ins.Text = eq.Text + ins.Text[:len(ins.Text)-len(eq.Text)]
129                         in[i+1] = ins
130                         in[i+2] = eq
131                 }
132                 // diffmatchpatch solves diff("AXX","XXX") with
133                 // [delA,=XX,insX] but we prefer to spell it
134                 // [delA,insX,=XX].
135                 //
136                 // So, when we see a [del,=,ins] sequence that has the
137                 // same effect after swapping the "=" and "ins" parts,
138                 // we swap them.
139                 if i < len(in)-2 &&
140                         d.Type == diffmatchpatch.DiffDelete &&
141                         in[i+1].Type == diffmatchpatch.DiffEqual &&
142                         in[i+2].Type == diffmatchpatch.DiffInsert &&
143                         in[i+1].Text+in[i+2].Text == in[i+2].Text+in[i+1].Text {
144                         in[i+2], in[i+1] = in[i+1], in[i+2]
145                 }
146                 out = append(out, d)
147         }
148         return
149 }
150
151 func Less(a, b Variant) bool {
152         if a.Position != b.Position {
153                 return a.Position < b.Position
154         } else if a.New != b.New {
155                 return a.New < b.New
156         } else {
157                 return a.Ref < b.Ref
158         }
159 }