From 5d939b4ccdab73d0729db56219ba674fb9995c26 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 2 Feb 2022 21:37:19 -0500 Subject: [PATCH] Skip tags that appear twice in the same chromosome. refs #18664 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- slicenumpy.go | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/slicenumpy.go b/slicenumpy.go index 91413824fb..1dcf4cb61c 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -155,10 +155,10 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s } cmd.cgnames = nil - taglen := -1 + var tagset [][]byte DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error { if len(ent.TagSet) > 0 { - taglen = len(ent.TagSet[0]) + tagset = ent.TagSet } for _, cseq := range ent.CompactSequences { if cseq.Name == *ref || *ref == "" { @@ -185,10 +185,18 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s err = fmt.Errorf("%s: reference sequence not found", infiles[0]) return 1 } - if taglen < 0 { + if len(tagset) == 0 { err = fmt.Errorf("tagset not found") return 1 } + + taglib := &tagLibrary{} + err = taglib.setTags(tagset) + if err != nil { + return 1 + } + taglen := taglib.TagLen() + if len(cmd.cgnames) == 0 { err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome) return 1 @@ -244,6 +252,20 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s err = fmt.Errorf("missing tiledata for tag %d variant %d in %s in ref", libref.Tag, libref.Variant, seqname) return 1 } + foundthistag := false + taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) { + if !foundthistag && tagid == libref.Tag { + foundthistag = true + return + } + if dupref, ok := reftile[tagid]; ok { + log.Printf("dropping reference tile %+v, tag not unique, also found inside %+v from %s @ %d", dupref, libref, seqname, pos+offset+1) + delete(reftile, tagid) + } else { + log.Printf("found tag %d at offset %d inside tile variant %+v on %s @ %d", tagid, offset, libref, seqname, pos+offset+1) + } + isdup[tagid] = true + }) if isdup[libref.Tag] { log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos) } else if reftile[libref.Tag] != nil { -- 2.30.2