From 04f618563a9603f6e56e5d6480f263c09e0cf89f Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Tue, 1 Sep 2020 22:22:30 -0400 Subject: [PATCH] Use real longest increasing subsequence algorithm for -skip-ooo. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- lis.go | 34 ++++++++++++++++++++++++++++++++++ lis_test.go | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ tilelib.go | 24 ++++++++++-------------- 3 files changed, 96 insertions(+), 14 deletions(-) create mode 100644 lis.go create mode 100644 lis_test.go diff --git a/lis.go b/lis.go new file mode 100644 index 0000000000..2103af07bd --- /dev/null +++ b/lis.go @@ -0,0 +1,34 @@ +package main + +func longestIncreasingSubsequence(srclen int, X func(int) int) []int { + if srclen == 0 { + return nil + } + M := make([]int, srclen+1) // M[j] == index into X such that X[M[j]] is the smallest X[k] where k<=i and an increasing subsequence with length j ends at X[k] + P := make([]int, srclen) // P[k] == index in X of predecessor of X[k] in longest increasing subsequence ending at X[k] + L := 0 // length of longest increasing subsequence found so far + for i := range P { + lo, hi := 1, L + for lo <= hi { + mid := (lo + hi + 1) / 2 + if X(M[mid]) < X(i) { + lo = mid + 1 + } else { + hi = mid - 1 + } + } + newL := lo + if i > 0 { + P[i] = M[newL-1] + } + M[newL] = i + if newL > L { + L = newL + } + } + ret := make([]int, L) + for k, i := M[L], len(ret)-1; i >= 0; k, i = P[k], i-1 { + ret[i] = k + } + return ret +} diff --git a/lis_test.go b/lis_test.go new file mode 100644 index 0000000000..9a33e5ec5c --- /dev/null +++ b/lis_test.go @@ -0,0 +1,52 @@ +package main + +import ( + "gopkg.in/check.v1" +) + +type lisSuite struct{} + +var _ = check.Suite(&lisSuite{}) + +func (s *lisSuite) Test(c *check.C) { + for _, trial := range []struct { + in []int + out []int + }{ + {}, + { + in: []int{}, + }, + { + in: []int{0}, + out: []int{0}, + }, + { + in: []int{1, 2, 3, 4}, + out: []int{0, 1, 2, 3}, + }, + { + in: []int{1, 2, 2, 4}, + out: []int{0, 2, 3}, + }, + { + in: []int{4, 3, 2, 1}, + out: []int{3}, + }, + { + in: []int{1, 3, 2, 4}, + out: []int{0, 2, 3}, + }, + { + in: []int{1, 0, 0, 0, 4}, + out: []int{3, 4}, + }, + { + in: []int{0, 1, 2, 1, 4, 5}, + out: []int{0, 1, 2, 4, 5}, + }, + } { + c.Logf("=== %v", trial) + c.Check(longestIncreasingSubsequence(len(trial.in), func(i int) int { return trial.in[i] }), check.DeepEquals, trial.out) + } +} diff --git a/tilelib.go b/tilelib.go index 5b4b30754c..e87a1f46fc 100644 --- a/tilelib.go +++ b/tilelib.go @@ -104,23 +104,19 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, }) totalFoundTags += len(found) + skipped := 0 path = path[:0] last := foundtag{tagid: -1} + if tilelib.skipOOO { + keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) }) + for i, x := range keep { + found[i] = found[x] + } + skipped = len(found) - len(keep) + found = found[:len(keep)] + } for i, f := range found { log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f) - if tilelib.skipOOO { - if f.tagid < last.tagid+1 { - log.Debugf("%s %s skipped out-of-order tag %d (found at %d) because it appears after tag %d (found at %d)", filelabel, job.label, f.tagid, f.pos, last.tagid, last.pos) - continue - } - if f.tagid > last.tagid+1 && // accepting this tag would mean skipping some tags - i+1 < len(found) && // there is a "next" found tag after this one - found[i+1].tagid > last.tagid && // next found tag is usable (we haven't already passed it in accepted sequence) - found[i+1].tagid <= f.tagid { // next found tag is expected before this one (so we can't use both) - log.Debugf("%s %s skipped out-of-order tag %d (found at %d) because it appears between tag %d (found at %d) and %d (found at %d)", filelabel, job.label, f.tagid, f.pos, last.tagid, last.pos, found[i+1].tagid, found[i+1].pos) - continue - } - } if last.taglen > 0 { path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen])) } @@ -133,7 +129,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, pathcopy := make([]tileLibRef, len(path)) copy(pathcopy, path) ret[job.label] = pathcopy - log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), len(found)-len(path)) + log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), skipped) totalPathLen += len(path) } log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen) -- 2.39.5