Add diff-fasta command.
authorTom Clegg <tom@tomclegg.ca>
Tue, 12 May 2020 23:56:43 +0000 (19:56 -0400)
committerTom Clegg <tom@tomclegg.ca>
Tue, 12 May 2020 23:56:43 +0000 (19:56 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

cmd.go
diff.go [new file with mode: 0644]
diff_test.go [new file with mode: 0644]
go.mod
go.sum
hgvs/diff.go [new file with mode: 0644]
hgvs/diff_test.go [new file with mode: 0644]

diff --git a/cmd.go b/cmd.go
index 266efcecfb4ac410195fe73f8890f6565138f3a0..c40c06369e8fcd94a20aacc0bc2dce6abd55c965 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -24,6 +24,7 @@ var (
                "build-docker-image": &buildDockerImage{},
                "pca":                &pythonPCA{},
                "plot":               &pythonPlot{},
+               "diff-fasta":         &diffFasta{},
        })
 )
 
diff --git a/diff.go b/diff.go
new file mode 100644 (file)
index 0000000..43ed90c
--- /dev/null
+++ b/diff.go
@@ -0,0 +1,76 @@
+package main
+
+import (
+       "bufio"
+       "bytes"
+       "flag"
+       "fmt"
+       "io"
+       "os"
+
+       "github.com/curii/lightning/hgvs"
+)
+
+type diffFasta struct{}
+
+func (cmd *diffFasta) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       offset := flags.Int("offset", 0, "coordinate offset")
+       sequence := flags.String("sequence", "chr1", "sequence label")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       }
+       if len(flags.Args()) != 2 {
+               err = fmt.Errorf("usage: %s [options] a.fasta b.fasta", prog)
+               return 2
+       }
+
+       var fasta [2][]byte
+       errs := make(chan error, 2)
+       for idx, fnm := range flags.Args() {
+               idx, fnm := idx, fnm
+               go func() {
+                       f, err := os.Open(fnm)
+                       if err != nil {
+                               errs <- err
+                               return
+                       }
+                       defer f.Close()
+                       scanner := bufio.NewScanner(f)
+                       for scanner.Scan() {
+                               buf := scanner.Bytes()
+                               if len(buf) > 0 && buf[0] != '>' {
+                                       fasta[idx] = append(fasta[idx], bytes.ToUpper(buf)...)
+                               }
+                       }
+                       errs <- scanner.Err()
+               }()
+       }
+       for range flags.Args() {
+               if err = <-errs; err != nil {
+                       return 1
+               }
+       }
+
+       variants := hgvs.Diff(string(fasta[0]), string(fasta[1]))
+       if *offset != 0 {
+               for i := range variants {
+                       variants[i].Position += *offset
+               }
+       }
+       for _, v := range variants {
+               fmt.Fprintf(stdout, "%s:g.%s\t%s\t%d\t%s\t%s\n", *sequence, v.String(), *sequence, v.Position, v.Ref, v.New)
+       }
+       return 0
+}
diff --git a/diff_test.go b/diff_test.go
new file mode 100644 (file)
index 0000000..622d6c4
--- /dev/null
@@ -0,0 +1,33 @@
+package main
+
+import (
+       "bytes"
+       "io/ioutil"
+       "os"
+
+       "gopkg.in/check.v1"
+)
+
+type diffSuite struct{}
+
+var _ = check.Suite(&diffSuite{})
+
+func (s *diffSuite) TestDiff(c *check.C) {
+       tempdir, err := ioutil.TempDir("", "")
+       c.Assert(err, check.IsNil)
+       defer os.RemoveAll(tempdir)
+
+       err = ioutil.WriteFile(tempdir+"/f1.fa", []byte(">f1\nactgactCacgtacgt\nactgactgacgAAcgt\n"), 0700)
+       c.Assert(err, check.IsNil)
+       err = ioutil.WriteFile(tempdir+"/f2.fa", []byte(">f2\nactgactGacgtacgt\nactgactgacgTTcgtA\n"), 0700)
+       c.Assert(err, check.IsNil)
+
+       var output bytes.Buffer
+       exited := (&diffFasta{}).RunCommand("diff-fasta", []string{"-sequence", "chr2", "-offset", "1000", tempdir + "/f1.fa", tempdir + "/f2.fa"}, nil, &output, os.Stderr)
+       c.Check(exited, check.Equals, 0)
+       c.Check("\n"+output.String(), check.Equals, `
+chr2:g.1008C>G chr2    1008    C       G
+chr2:g.1028_1029delinsTT       chr2    1028    AA      TT
+chr2:g.1032_1033insA   chr2    1033            A
+`)
+}
diff --git a/go.mod b/go.mod
index f5dadd9c43a13bbe3ffdb221e321f65c9d143f61..904ec294d43a7e6e0c5ef2a199df0690b7525786 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -10,6 +10,7 @@ require (
        github.com/kshedden/gonpy v0.0.0-20190510000443-66c21fac4672
        github.com/prometheus/client_golang v1.5.0 // indirect
        github.com/prometheus/procfs v0.0.10 // indirect
+       github.com/sergi/go-diff v1.0.0
        github.com/sirupsen/logrus v1.4.2
        golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073
        golang.org/x/net v0.0.0-20200301022130-244492dfa37a
diff --git a/go.sum b/go.sum
index 66c33a063c5e47a8e53ad3b175532a6569b99aaa..a0bc59103c398609d9be48c2c740b8f2a836263d 100644 (file)
--- a/go.sum
+++ b/go.sum
@@ -143,6 +143,7 @@ github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+Gx
 github.com/prometheus/procfs v0.0.10 h1:QJQN3jYQhkamO4mhfUWqdDH2asK7ONOI9MTWjyAxNKM=
 github.com/prometheus/procfs v0.0.10/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
 github.com/satori/go.uuid v1.2.1-0.20180103174451-36e9d2ebbde5/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
+github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ=
 github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
 github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
 github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
diff --git a/hgvs/diff.go b/hgvs/diff.go
new file mode 100644 (file)
index 0000000..e6e3b33
--- /dev/null
@@ -0,0 +1,76 @@
+package hgvs
+
+import (
+       "fmt"
+       "time"
+
+       "github.com/sergi/go-diff/diffmatchpatch"
+)
+
+type Variant struct {
+       Position int
+       Ref      string
+       New      string
+}
+
+func (v *Variant) String() string {
+       switch {
+       case len(v.New) == 0 && len(v.Ref) == 1:
+               return fmt.Sprintf("%ddel", v.Position)
+       case len(v.New) == 0:
+               return fmt.Sprintf("%d_%ddel", v.Position, v.Position+len(v.Ref)-1)
+       case len(v.Ref) == 1 && len(v.New) == 1:
+               return fmt.Sprintf("%d%s>%s", v.Position, v.Ref, v.New)
+       case len(v.Ref) == 0:
+               return fmt.Sprintf("%d_%dins%s", v.Position-1, v.Position, v.New)
+       case len(v.Ref) == 1 && len(v.New) > 0:
+               return fmt.Sprintf("%ddelins%s", v.Position, v.New)
+       default:
+               return fmt.Sprintf("%d_%ddelins%s", v.Position, v.Position+len(v.Ref)-1, v.New)
+       }
+}
+
+func Diff(a, b string) []Variant {
+       dmp := diffmatchpatch.New()
+       diffs := cleanup(dmp.DiffCleanupEfficiency(dmp.DiffBisect(a, b, time.Time{})))
+       pos := 1
+       var variants []Variant
+       for i := 0; i < len(diffs); i++ {
+               switch diffs[i].Type {
+               case diffmatchpatch.DiffEqual:
+                       pos += len(diffs[i].Text)
+               case diffmatchpatch.DiffDelete:
+                       if i+1 < len(diffs) && diffs[i+1].Type == diffmatchpatch.DiffInsert {
+                               // deletion followed by insertion
+                               variants = append(variants, Variant{Position: pos, Ref: diffs[i].Text, New: diffs[i+1].Text})
+                               pos += len(diffs[i].Text)
+                               i++
+                       } else {
+                               variants = append(variants, Variant{Position: pos, Ref: diffs[i].Text})
+                               pos += len(diffs[i].Text)
+                       }
+               case diffmatchpatch.DiffInsert:
+                       if i+1 < len(diffs) && diffs[i+1].Type == diffmatchpatch.DiffDelete {
+                               // insertion followed by deletion
+                               variants = append(variants, Variant{Position: pos, Ref: diffs[i+1].Text, New: diffs[i].Text})
+                               pos += len(diffs[i+1].Text)
+                               i++
+                       } else {
+                               variants = append(variants, Variant{Position: pos, New: diffs[i].Text})
+                       }
+               }
+       }
+       return variants
+}
+
+func cleanup(in []diffmatchpatch.Diff) (out []diffmatchpatch.Diff) {
+       for i := 0; i < len(in); i++ {
+               d := in[i]
+               for i < len(in)-1 && in[i].Type == in[i+1].Type {
+                       d.Text += in[i+1].Text
+                       i++
+               }
+               out = append(out, d)
+       }
+       return
+}
diff --git a/hgvs/diff_test.go b/hgvs/diff_test.go
new file mode 100644 (file)
index 0000000..6bad311
--- /dev/null
@@ -0,0 +1,69 @@
+package hgvs
+
+import (
+       "testing"
+
+       "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) { check.TestingT(t) }
+
+type diffSuite struct{}
+
+var _ = check.Suite(&diffSuite{})
+
+func (s *diffSuite) TestDiff(c *check.C) {
+       for _, trial := range []struct {
+               a      string
+               b      string
+               expect []string
+       }{
+               {
+                       a:      "aaaaaaaaaa",
+                       b:      "aaaaCaaaaa",
+                       expect: []string{"5a>C"},
+               },
+               {
+                       a:      "aaaacGcaaa",
+                       b:      "aaaaccaaa",
+                       expect: []string{"6del"},
+               },
+               {
+                       a:      "aaaacGGcaaa",
+                       b:      "aaaaccaaa",
+                       expect: []string{"6_7del"},
+               },
+               {
+                       a:      "aaaac",
+                       b:      "aaaa",
+                       expect: []string{"5del"},
+               },
+               {
+                       a:      "aaaa",
+                       b:      "aaCaa",
+                       expect: []string{"2_3insC"},
+               },
+               {
+                       a:      "aaGGGtt",
+                       b:      "aaCCCtt",
+                       expect: []string{"3_5delinsCCC"},
+               },
+               {
+                       a:      "aa",
+                       b:      "aaCCC",
+                       expect: []string{"2_3insCCC"},
+               },
+               {
+                       a:      "aaGGttAAtttt",
+                       b:      "aaCCttttttC",
+                       expect: []string{"3_4delinsCC", "7_8del", "12_13insC"},
+               },
+       } {
+               c.Log(trial)
+               var vars []string
+               for _, v := range Diff(trial.a, trial.b) {
+                       vars = append(vars, v.String())
+               }
+               c.Check(vars, check.DeepEquals, trial.expect)
+       }
+}