From 7b5c102a97755babf09f438881c0b676bb281878 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Fri, 4 Feb 2022 00:41:40 -0500 Subject: [PATCH] Don't use tags that appear more than once per sequence. refs #18664 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- tilelib.go | 21 ++++++++++++++++++++- tilelib_test.go | 4 ++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/tilelib.go b/tilelib.go index 550563342c..0b7fc59918 100644 --- a/tilelib.go +++ b/tilelib.go @@ -61,6 +61,7 @@ type tileLibrary struct { retainNoCalls bool skipOOO bool retainTileSequences bool + useDups bool taglib *tagLibrary variant [][][blake2b.Size256]byte @@ -606,13 +607,31 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChro } skipped := 0 + + if !tilelib.useDups { + // Remove any tags that appeared more than once + dup := map[tagID]bool{} + for _, ft := range found { + _, dup[ft.tagid] = dup[ft.tagid] + } + dst := 0 + for _, ft := range found { + if !dup[ft.tagid] { + found[dst] = ft + dst++ + } + } + skipped += len(found) - dst + found = found[:dst] + } + if tilelib.skipOOO { log.Infof("%s %s keeping longest increasing subsequence", filelabel, job.label) keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) }) for i, x := range keep { found[i] = found[x] } - skipped = len(found) - len(keep) + skipped += len(found) - len(keep) found = found[:len(keep)] } diff --git a/tilelib_test.go b/tilelib_test.go index 5801b9baf3..7a2910b120 100644 --- a/tilelib_test.go +++ b/tilelib_test.go @@ -100,7 +100,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}}) // tags appear in seq: 0, 1, 1, 2 -> skip second tag1 - tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true} + tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true, useDups: true} tseq, _, err = tilelib.TileFasta("test-label", bytes.NewBufferString(">test-seq\n"+ s.tag[0]+ "cccccccccccccccccccc\n"+ @@ -114,7 +114,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}}) // tags appear in seq: 0, 1, 3, 0, 4 -> skip second tag0 - tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true} + tilelib = &tileLibrary{taglib: &s.taglib, skipOOO: true, useDups: true} tseq, _, err = tilelib.TileFasta("test-label", bytes.NewBufferString(">test-seq\n"+ s.tag[0]+ "cccccccccccccccccccc\n"+ -- 2.30.2