X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/cf6bba16548ea76aef580fce9fdd04abebcd7d3f..292a9459c8dde94b7b9a4245a1d96d10b527f193:/slicenumpy.go diff --git a/slicenumpy.go b/slicenumpy.go index 9e5e315c50..77877b3fca 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -155,10 +155,10 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s } cmd.cgnames = nil - taglen := -1 + var tagset [][]byte DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error { if len(ent.TagSet) > 0 { - taglen = len(ent.TagSet[0]) + tagset = ent.TagSet } for _, cseq := range ent.CompactSequences { if cseq.Name == *ref || *ref == "" { @@ -185,10 +185,18 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s err = fmt.Errorf("%s: reference sequence not found", infiles[0]) return 1 } - if taglen < 0 { + if len(tagset) == 0 { err = fmt.Errorf("tagset not found") return 1 } + + taglib := &tagLibrary{} + err = taglib.setTags(tagset) + if err != nil { + return 1 + } + taglen := taglib.TagLen() + if len(cmd.cgnames) == 0 { err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome) return 1 @@ -244,6 +252,20 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s err = fmt.Errorf("missing tiledata for tag %d variant %d in %s in ref", libref.Tag, libref.Variant, seqname) return 1 } + foundthistag := false + taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) { + if !foundthistag && tagid == libref.Tag { + foundthistag = true + return + } + if dupref, ok := reftile[tagid]; ok { + log.Printf("dropping reference tile %+v from %s @ %d, tag not unique, also found inside %+v from %s @ %d", tileLibRef{Tag: tagid, Variant: dupref.variant}, dupref.seqname, dupref.pos, libref, seqname, pos+offset+1) + delete(reftile, tagid) + } else { + log.Printf("found tag %d at offset %d inside tile variant %+v on %s @ %d", tagid, offset, libref, seqname, pos+offset+1) + } + isdup[tagid] = true + }) if isdup[libref.Tag] { log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos) } else if reftile[libref.Tag] != nil { @@ -596,9 +618,10 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s throttleNumpyMem.Release() } if *onehotSingle { - log.Infof("%04d: keeping onehot chunk in memory (rows=%d, cols=%d, mem=%d)", infileIdx, len(cmd.cgnames), len(onehotChunk), (len(cmd.cgnames)+int(onehotXrefSize))*len(onehotChunk)) onehotIndirect[infileIdx] = onehotChunk2Indirect(onehotChunk) onehotXrefs[infileIdx] = onehotXref + n := len(onehotIndirect[infileIdx][0]) + log.Infof("%04d: keeping onehot coordinates in memory (n=%d, mem=%d)", infileIdx, n, n*8) } if !(*onehotSingle || *onehotChunked) || *mergeOutput || *hgvsSingle { log.Infof("%04d: preparing numpy", infileIdx) @@ -1194,20 +1217,17 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI var onehot [][]int8 var xref []onehotXref for homcol := 4; homcol < len(obs); homcol += 2 { - p := [2]float64{ - pvalue(obs[homcol], cmd.chi2Cases), - pvalue(obs[homcol+1], cmd.chi2Cases), - } - if cmd.chi2PValue < 1 && !(p[0] < cmd.chi2PValue || p[1] < cmd.chi2PValue) { - continue - } for het := 0; het < 2; het++ { + p := pvalue(obs[homcol+het], cmd.chi2Cases) + if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) { + continue + } onehot = append(onehot, bool2int8(obs[homcol+het])) xref = append(xref, onehotXref{ tag: tag, variant: tileVariantID(homcol / 2), het: het == 1, - pvalue: p[het], + pvalue: p, }) } }