Call SNPs separately when called within 1bp of start/end of indels.
author Tom Clegg <tom@curii.com>
Tue, 30 Nov 2021 19:59:45 +0000 (14:59 -0500)
committer Tom Clegg <tom@curii.com>
Tue, 30 Nov 2021 19:59:45 +0000 (14:59 -0500)
fixes #18496

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

hgvs/diff.go
hgvs/diff_test.go

index 835fcc2b97c704b0ff7d4d1ad205148c77cd28bd..57025e358862a79407f58dff30b84ed046bd0ea0 100644 (file)
@@ -143,6 +143,50 @@ func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) {
                        in[i+1].Text+in[i+2].Text == in[i+2].Text+in[i+1].Text {
                        in[i+2], in[i+1] = in[i+1], in[i+2]
                }
+               // when diffmatchpatch says [delAAA, insXAY] and
+               // len(X)==1, we prefer to treat the A>X as a snp.
+               if i < len(in)-1 &&
+                       d.Type == diffmatchpatch.DiffDelete &&
+                       in[i+1].Type == diffmatchpatch.DiffInsert &&
+                       len(d.Text) > 2 &&
+                       len(in[i+1].Text) > 2 &&
+                       d.Text[1] == in[i+1].Text[1] {
+                       eqend := 2
+                       for ; eqend < len(d.Text) && eqend < len(in[i+1].Text) && d.Text[eqend] == in[i+1].Text[eqend]; eqend++ {
+                       }
+                       out = append(out,
+                               diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[:1]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[:1]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffEqual, d.Text[1:eqend]})
+                       in[i].Text, in[i+1].Text = in[i].Text[eqend:], in[i+1].Text[eqend:]
+                       i--
+                       continue
+               }
+               // when diffmatchpatch says [delAAA, insXaY] and
+               // len(Y)==1, we prefer to treat the A>Y as a snp.
+               if i < len(in)-1 &&
+                       d.Type == diffmatchpatch.DiffDelete &&
+                       in[i+1].Type == diffmatchpatch.DiffInsert &&
+                       len(d.Text) > 2 &&
+                       len(in[i+1].Text) > 2 &&
+                       d.Text[len(d.Text)-2] == in[i+1].Text[len(in[i+1].Text)-2] {
+                       // eqstart will be the number of equal chars
+                       // before the terminal snp, plus 1 for the snp
+                       // itself. Example, for [delAAAA, insTTAAG],
+                       // eqstart will be 3.
+                       eqstart := 2
+                       for ; eqstart < len(d.Text) && eqstart < len(in[i+1].Text) && d.Text[len(d.Text)-eqstart] == in[i+1].Text[len(in[i+1].Text)-eqstart]; eqstart++ {
+                       }
+                       eqstart--
+                       out = append(out,
+                               diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[:len(d.Text)-eqstart]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[:len(in[i+1].Text)-eqstart]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffEqual, d.Text[len(d.Text)-eqstart : len(d.Text)-1]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffDelete, d.Text[len(d.Text)-1:]},
+                               diffmatchpatch.Diff{diffmatchpatch.DiffInsert, in[i+1].Text[len(in[i+1].Text)-1:]})
+                       i++
+                       continue
+               }
                out = append(out, d)
        }
        return
index cc211b5f2deecdacd25b562f998263ed4fcf574a..1032f4e3d2dbbc5cf24d6aa35c7ce97f653c85f8 100644 (file)
@@ -113,6 +113,21 @@ func (s *diffSuite) TestDiff(c *check.C) {
                        b:      "tcaAaagac",
                        expect: []string{"4G>A"},
                },
+               {
+                       a:      "tcagatggac",
+                       b:      "tcaAaCggac",
+                       expect: []string{"4G>A", "6T>C"},
+               },
+               {
+                       a:      "tcagatggac",
+                       b:      "tcaAaCggTc",
+                       expect: []string{"4G>A", "6T>C", "9A>T"},
+               },
+               {
+                       a:      "tcagatggac",
+                       b:      "tcaAaCCggTc",
+                       expect: []string{"4G>A", "6delinsCC", "9A>T"},
+               },
        } {
                c.Log(trial)
                var vars []string