Use real longest increasing subsequence algorithm for -skip-ooo.
authorTom Clegg <tom@tomclegg.ca>
Wed, 2 Sep 2020 02:22:30 +0000 (22:22 -0400)
committerTom Clegg <tom@tomclegg.ca>
Wed, 2 Sep 2020 02:22:30 +0000 (22:22 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

lis.go [new file with mode: 0644]
lis_test.go [new file with mode: 0644]
tilelib.go

diff --git a/lis.go b/lis.go
new file mode 100644 (file)
index 0000000..2103af0
--- /dev/null
+++ b/lis.go
@@ -0,0 +1,34 @@
+package main
+
+func longestIncreasingSubsequence(srclen int, X func(int) int) []int {
+       if srclen == 0 {
+               return nil
+       }
+       M := make([]int, srclen+1) // M[j] == index into X such that X[M[j]] is the smallest X[k] where k<=i and an increasing subsequence with length j ends at X[k]
+       P := make([]int, srclen)   // P[k] == index in X of predecessor of X[k] in longest increasing subsequence ending at X[k]
+       L := 0                     // length of longest increasing subsequence found so far
+       for i := range P {
+               lo, hi := 1, L
+               for lo <= hi {
+                       mid := (lo + hi + 1) / 2
+                       if X(M[mid]) < X(i) {
+                               lo = mid + 1
+                       } else {
+                               hi = mid - 1
+                       }
+               }
+               newL := lo
+               if i > 0 {
+                       P[i] = M[newL-1]
+               }
+               M[newL] = i
+               if newL > L {
+                       L = newL
+               }
+       }
+       ret := make([]int, L)
+       for k, i := M[L], len(ret)-1; i >= 0; k, i = P[k], i-1 {
+               ret[i] = k
+       }
+       return ret
+}
diff --git a/lis_test.go b/lis_test.go
new file mode 100644 (file)
index 0000000..9a33e5e
--- /dev/null
@@ -0,0 +1,52 @@
+package main
+
+import (
+       "gopkg.in/check.v1"
+)
+
+type lisSuite struct{}
+
+var _ = check.Suite(&lisSuite{})
+
+func (s *lisSuite) Test(c *check.C) {
+       for _, trial := range []struct {
+               in  []int
+               out []int
+       }{
+               {},
+               {
+                       in: []int{},
+               },
+               {
+                       in:  []int{0},
+                       out: []int{0},
+               },
+               {
+                       in:  []int{1, 2, 3, 4},
+                       out: []int{0, 1, 2, 3},
+               },
+               {
+                       in:  []int{1, 2, 2, 4},
+                       out: []int{0, 2, 3},
+               },
+               {
+                       in:  []int{4, 3, 2, 1},
+                       out: []int{3},
+               },
+               {
+                       in:  []int{1, 3, 2, 4},
+                       out: []int{0, 2, 3},
+               },
+               {
+                       in:  []int{1, 0, 0, 0, 4},
+                       out: []int{3, 4},
+               },
+               {
+                       in:  []int{0, 1, 2, 1, 4, 5},
+                       out: []int{0, 1, 2, 4, 5},
+               },
+       } {
+               c.Logf("=== %v", trial)
+               c.Check(longestIncreasingSubsequence(len(trial.in), func(i int) int { return trial.in[i] }), check.DeepEquals, trial.out)
+       }
+}
index 5b4b30754c689f2b09a2c7012e31cfb0462a9e68..e87a1f46fc1a01237d8da3e888c91d02244e0ca7 100644 (file)
@@ -104,23 +104,19 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
                })
                totalFoundTags += len(found)
 
+               skipped := 0
                path = path[:0]
                last := foundtag{tagid: -1}
+               if tilelib.skipOOO {
+                       keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
+                       for i, x := range keep {
+                               found[i] = found[x]
+                       }
+                       skipped = len(found) - len(keep)
+                       found = found[:len(keep)]
+               }
                for i, f := range found {
                        log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f)
-                       if tilelib.skipOOO {
-                               if f.tagid < last.tagid+1 {
-                                       log.Debugf("%s %s skipped out-of-order tag %d (found at %d) because it appears after tag %d (found at %d)", filelabel, job.label, f.tagid, f.pos, last.tagid, last.pos)
-                                       continue
-                               }
-                               if f.tagid > last.tagid+1 && // accepting this tag would mean skipping some tags
-                                       i+1 < len(found) && // there is a "next" found tag after this one
-                                       found[i+1].tagid > last.tagid && // next found tag is usable (we haven't already passed it in accepted sequence)
-                                       found[i+1].tagid <= f.tagid { // next found tag is expected before this one (so we can't use both)
-                                       log.Debugf("%s %s skipped out-of-order tag %d (found at %d) because it appears between tag %d (found at %d) and %d (found at %d)", filelabel, job.label, f.tagid, f.pos, last.tagid, last.pos, found[i+1].tagid, found[i+1].pos)
-                                       continue
-                               }
-                       }
                        if last.taglen > 0 {
                                path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
                        }
@@ -133,7 +129,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
                pathcopy := make([]tileLibRef, len(path))
                copy(pathcopy, path)
                ret[job.label] = pathcopy
-               log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), len(found)-len(path))
+               log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), skipped)
                totalPathLen += len(path)
        }
        log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)