Don't use tags that appear more than once per sequence.
author Tom Clegg <tom@curii.com>
Fri, 4 Feb 2022 05:41:40 +0000 (00:41 -0500)
committer Tom Clegg <tom@curii.com>
Fri, 4 Feb 2022 05:41:40 +0000 (00:41 -0500)
refs #18664

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

tilelib.go
tilelib_test.go

index 550563342c3ac9728c9d05eeb7e85c8f7ab3abb0..0b7fc599182031bb6faab854a8f712aa09c4165d 100644 (file)
@@ -61,6 +61,7 @@ type tileLibrary struct {
        retainNoCalls       bool
        skipOOO             bool
        retainTileSequences bool
+       useDups             bool
 
        taglib         *tagLibrary
        variant        [][][blake2b.Size256]byte
@@ -606,13 +607,31 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChro
                }
 
                skipped := 0
+
+               if !tilelib.useDups {
+                       // Remove any tags that appeared more than once
+                       dup := map[tagID]bool{}
+                       for _, ft := range found {
+                               _, dup[ft.tagid] = dup[ft.tagid]
+                       }
+                       dst := 0
+                       for _, ft := range found {
+                               if !dup[ft.tagid] {
+                                       found[dst] = ft
+                                       dst++
+                               }
+                       }
+                       skipped += len(found) - dst
+                       found = found[:dst]
+               }
+
                if tilelib.skipOOO {
                        log.Infof("%s %s keeping longest increasing subsequence", filelabel, job.label)
                        keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
                        for i, x := range keep {
                                found[i] = found[x]
                        }
-                       skipped = len(found) - len(keep)
+                       skipped += len(found) - len(keep)
                        found = found[:len(keep)]
                }
 
index 5801b9baf3b6c864b09a3882b7d4b2eabd05e68c..7a2910b120723c8f0b83f8eb05108ef7e2595c24 100644 (file)
@@ -100,7 +100,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}})
 
        // tags appear in seq: 0, 1, 1, 2 -> skip second tag1
-       tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true}
+       tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true, useDups: true}
        tseq, _, err = tilelib.TileFasta("test-label", bytes.NewBufferString(">test-seq\n"+
                s.tag[0]+
                "cccccccccccccccccccc\n"+
@@ -114,7 +114,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
 
        // tags appear in seq: 0, 1, 3, 0, 4 -> skip second tag0
-       tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true}
+       tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true, useDups: true}
        tseq, _, err = tilelib.TileFasta("test-label", bytes.NewBufferString(">test-seq\n"+
                s.tag[0]+
                "cccccccccccccccccccc\n"+