"strconv"
"strings"
"sync/atomic"
+ "unsafe"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/arvados/lightning/hgvs"
}
cmd.cgnames = nil
- taglen := -1
+ var tagset [][]byte
DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
if len(ent.TagSet) > 0 {
- taglen = len(ent.TagSet[0])
+ tagset = ent.TagSet
}
for _, cseq := range ent.CompactSequences {
if cseq.Name == *ref || *ref == "" {
err = fmt.Errorf("%s: reference sequence not found", infiles[0])
return 1
}
- if taglen < 0 {
+ if len(tagset) == 0 {
err = fmt.Errorf("tagset not found")
return 1
}
+
+ taglib := &tagLibrary{}
+ err = taglib.setTags(tagset)
+ if err != nil {
+ return 1
+ }
+ taglen := taglib.TagLen()
+
if len(cmd.cgnames) == 0 {
err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome)
return 1
for seqname, cseq := range refseq {
pos := 0
for _, libref := range cseq {
+ if libref.Tag > tagID(cmd.filter.MaxTag) {
+ continue
+ }
tiledata := reftiledata[libref]
if len(tiledata) == 0 {
err = fmt.Errorf("missing tiledata for tag %d variant %d in %s in ref", libref.Tag, libref.Variant, seqname)
return 1
}
+ foundthistag := false
+ taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) {
+ if !foundthistag && tagid == libref.Tag {
+ foundthistag = true
+ return
+ }
+ if dupref, ok := reftile[tagid]; ok {
+ log.Printf("dropping reference tile %+v from %s @ %d, tag not unique, also found inside %+v from %s @ %d", tileLibRef{Tag: tagid, Variant: dupref.variant}, dupref.seqname, dupref.pos, libref, seqname, pos+offset+1)
+ delete(reftile, tagid)
+ } else {
+ log.Printf("found tag %d at offset %d inside tile variant %+v on %s @ %d", tagid, offset, libref, seqname, pos+offset+1)
+ }
+ isdup[tagid] = true
+ })
if isdup[libref.Tag] {
log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos)
} else if reftile[libref.Tag] != nil {
if *mergeOutput || *hgvsSingle {
toMerge = make([][]int16, len(infiles))
}
- var onehotChunks [][][]int8
+ var onehotIndirect [][2][]uint32 // [chunkIndex][axis][index]
var onehotXrefs [][]onehotXref
if *onehotSingle {
- onehotChunks = make([][][]int8, len(infiles))
+ onehotIndirect = make([][2][]uint32, len(infiles))
onehotXrefs = make([][]onehotXref, len(infiles))
}
annow := bufio.NewWriterSize(annof, 1<<20)
outcol := 0
for tag := tagstart; tag < tagend; tag++ {
- rt, ok := reftile[tag]
- if !ok {
- if mask == nil {
- outcol++
- }
- // Excluded by specified
- // regions, or reference does
- // not use any variant of this
- // tile. (TODO: log this?
- // mention it in annotations?)
+ rt := reftile[tag]
+ if rt == nil && mask != nil {
+ // Excluded by specified regions
+ continue
+ }
+ if tag > tagID(cmd.filter.MaxTag) {
continue
}
remap := variantRemap[tag-tagstart]
onehotChunk = append(onehotChunk, onehot...)
onehotXref = append(onehotXref, xrefs...)
}
+ if rt == nil {
+ // Reference does not use any
+ // variant of this tile
+ outcol++
+ continue
+ }
fmt.Fprintf(annow, "%d,%d,%d,=,%s,%d,,,\n", tag, outcol, rt.variant, rt.seqname, rt.pos)
variants := seq[tag]
reftilestr := strings.ToUpper(string(rt.tiledata))
throttleNumpyMem.Release()
}
if *onehotSingle {
- onehotChunks[infileIdx] = onehotChunk
+ onehotIndirect[infileIdx] = onehotChunk2Indirect(onehotChunk)
onehotXrefs[infileIdx] = onehotXref
+ n := len(onehotIndirect[infileIdx][0])
+ log.Infof("%04d: keeping onehot coordinates in memory (n=%d, mem=%d)", infileIdx, n, n*8)
}
if !(*onehotSingle || *onehotChunked) || *mergeOutput || *hgvsSingle {
log.Infof("%04d: preparing numpy", infileIdx)
}
}
if *onehotSingle {
- var onehot [][]int8
+ nzCount := 0
+ for _, part := range onehotIndirect {
+ nzCount += len(part[0])
+ }
+ onehot := make([]uint32, nzCount*2) // [r,r,r,...,c,c,c,...]
var xrefs []onehotXref
- for i := range onehotChunks {
- onehot = append(onehot, onehotChunks[i]...)
- onehotChunks[i] = nil
+ outcol := 0
+ for i, part := range onehotIndirect {
+ for i := range part[1] {
+ part[1][i] += uint32(outcol)
+ }
+ copy(onehot[outcol:], part[0])
+ copy(onehot[outcol+nzCount:], part[1])
+ outcol += len(part[0])
xrefs = append(xrefs, onehotXrefs[i]...)
+
+ part[0] = nil
+ part[1] = nil
onehotXrefs[i] = nil
+ debug.FreeOSMemory()
}
fnm := fmt.Sprintf("%s/onehot.npy", *outputDir)
- err = writeNumpyInt8(fnm, onehotcols2int8(onehot), len(cmd.cgnames), len(onehot))
+ err = writeNumpyUint32(fnm, onehot, 2, nzCount)
if err != nil {
return 1
}
cases = append(cases, c)
}
return len(cases) >= cmd.minCoverage &&
- (pvalue(cases, col0) <= cmd.chi2PValue || pvalue(cases, col1) <= cmd.chi2PValue)
+ (pvalue(col0, cases) <= cmd.chi2PValue || pvalue(col1, cases) <= cmd.chi2PValue)
+}
+
+func writeNumpyUint32(fnm string, out []uint32, rows, cols int) error {
+ output, err := os.Create(fnm)
+ if err != nil {
+ return err
+ }
+ defer output.Close()
+ bufw := bufio.NewWriterSize(output, 1<<26)
+ npw, err := gonpy.NewWriter(nopCloser{bufw})
+ if err != nil {
+ return err
+ }
+ log.WithFields(log.Fields{
+ "filename": fnm,
+ "rows": rows,
+ "cols": cols,
+ "bytes": rows * cols * 4,
+ }).Infof("writing numpy: %s", fnm)
+ npw.Shape = []int{rows, cols}
+ npw.WriteUint32(out)
+ err = bufw.Flush()
+ if err != nil {
+ return err
+ }
+ return output.Close()
}
func writeNumpyInt32(fnm string, out []int32, rows, cols int) error {
pvalue float64
}
+const onehotXrefSize = unsafe.Sizeof(onehotXref{})
+
// Build onehot matrix (m[variant*2+isHet][genome] == 0 or 1) for all
// variants of a single tile/tag#.
//
var onehot [][]int8
var xref []onehotXref
for homcol := 4; homcol < len(obs); homcol += 2 {
- p := [2]float64{
- pvalue(cmd.chi2Cases, obs[homcol]),
- pvalue(cmd.chi2Cases, obs[homcol+1]),
- }
- if cmd.chi2PValue < 1 && !(p[0] < cmd.chi2PValue || p[1] < cmd.chi2PValue) {
- continue
- }
for het := 0; het < 2; het++ {
+ p := pvalue(obs[homcol+het], cmd.chi2Cases)
+ if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
+ continue
+ }
onehot = append(onehot, bool2int8(obs[homcol+het]))
xref = append(xref, onehotXref{
tag: tag,
variant: tileVariantID(homcol / 2),
het: het == 1,
- pvalue: p[het],
+ pvalue: p,
})
}
}
}
return out
}
+
+// Return [2][]uint32{rowIndices, colIndices} indicating which
+// elements of matrixT[c][r] have non-zero values.
+func onehotChunk2Indirect(matrixT [][]int8) [2][]uint32 {
+ var nz [2][]uint32
+ for c, col := range matrixT {
+ for r, val := range col {
+ if val != 0 {
+ nz[0] = append(nz[0], uint32(r))
+ nz[1] = append(nz[1], uint32(c))
+ }
+ }
+ }
+ return nz
+}