"strconv"
"strings"
"sync/atomic"
+ "unsafe"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/arvados/lightning/hgvs"
}
cmd.cgnames = nil
- taglen := -1
+ var tagset [][]byte
DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
if len(ent.TagSet) > 0 {
- taglen = len(ent.TagSet[0])
+ tagset = ent.TagSet
}
for _, cseq := range ent.CompactSequences {
if cseq.Name == *ref || *ref == "" {
err = fmt.Errorf("%s: reference sequence not found", infiles[0])
return 1
}
- if taglen < 0 {
+ if len(tagset) == 0 {
err = fmt.Errorf("tagset not found")
return 1
}
+
+ taglib := &tagLibrary{}
+ err = taglib.setTags(tagset)
+ if err != nil {
+ return 1
+ }
+ taglen := taglib.TagLen()
+
if len(cmd.cgnames) == 0 {
err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome)
return 1
err = fmt.Errorf("missing tiledata for tag %d variant %d in %s in ref", libref.Tag, libref.Variant, seqname)
return 1
}
+ foundthistag := false
+ taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) {
+ if !foundthistag && tagid == libref.Tag {
+ foundthistag = true
+ return
+ }
+ if dupref, ok := reftile[tagid]; ok {
+ log.Printf("dropping reference tile %+v, tag not unique, also found inside %+v from %s @ %d", dupref, libref, seqname, pos+offset+1)
+ delete(reftile, tagid)
+ } else {
+ log.Printf("found tag %d at offset %d inside tile variant %+v on %s @ %d", tagid, offset, libref, seqname, pos+offset+1)
+ }
+ isdup[tagid] = true
+ })
if isdup[libref.Tag] {
log.Printf("dropping reference tile %+v from %s @ %d, tag not unique", libref, seqname, pos)
} else if reftile[libref.Tag] != nil {
if *mergeOutput || *hgvsSingle {
toMerge = make([][]int16, len(infiles))
}
- var onehotChunks [][][]int8
+ var onehotIndirect [][2][]uint32 // [chunkIndex][axis][index]
var onehotXrefs [][]onehotXref
if *onehotSingle {
- onehotChunks = make([][][]int8, len(infiles))
+ onehotIndirect = make([][2][]uint32, len(infiles))
onehotXrefs = make([][]onehotXref, len(infiles))
}
throttleNumpyMem.Release()
}
if *onehotSingle {
- onehotChunks[infileIdx] = onehotChunk
+ onehotIndirect[infileIdx] = onehotChunk2Indirect(onehotChunk)
onehotXrefs[infileIdx] = onehotXref
+ n := len(onehotIndirect[infileIdx][0])
+ log.Infof("%04d: keeping onehot coordinates in memory (n=%d, mem=%d)", infileIdx, n, n*8)
}
if !(*onehotSingle || *onehotChunked) || *mergeOutput || *hgvsSingle {
log.Infof("%04d: preparing numpy", infileIdx)
}
}
if *onehotSingle {
- var onehot [][]int8
+ nzCount := 0
+ for _, part := range onehotIndirect {
+ nzCount += len(part[0])
+ }
+ onehot := make([]uint32, nzCount*2) // [r,r,r,...,c,c,c,...]
var xrefs []onehotXref
- for i := range onehotChunks {
- onehot = append(onehot, onehotChunks[i]...)
- onehotChunks[i] = nil
+ outcol := 0
+ for i, part := range onehotIndirect {
+ for i := range part[1] {
+ part[1][i] += uint32(outcol)
+ }
+ copy(onehot[outcol:], part[0])
+ copy(onehot[outcol+nzCount:], part[1])
+ outcol += len(part[0])
xrefs = append(xrefs, onehotXrefs[i]...)
+
+ part[0] = nil
+ part[1] = nil
onehotXrefs[i] = nil
+ debug.FreeOSMemory()
}
fnm := fmt.Sprintf("%s/onehot.npy", *outputDir)
- err = writeNumpyInt8(fnm, onehotcols2int8(onehot), len(cmd.cgnames), len(onehot))
+ err = writeNumpyUint32(fnm, onehot, 2, nzCount)
if err != nil {
return 1
}
(pvalue(col0, cases) <= cmd.chi2PValue || pvalue(col1, cases) <= cmd.chi2PValue)
}
+func writeNumpyUint32(fnm string, out []uint32, rows, cols int) error {
+ output, err := os.Create(fnm)
+ if err != nil {
+ return err
+ }
+ defer output.Close()
+ bufw := bufio.NewWriterSize(output, 1<<26)
+ npw, err := gonpy.NewWriter(nopCloser{bufw})
+ if err != nil {
+ return err
+ }
+ log.WithFields(log.Fields{
+ "filename": fnm,
+ "rows": rows,
+ "cols": cols,
+ "bytes": rows * cols * 4,
+ }).Infof("writing numpy: %s", fnm)
+ npw.Shape = []int{rows, cols}
+ npw.WriteUint32(out)
+ err = bufw.Flush()
+ if err != nil {
+ return err
+ }
+ return output.Close()
+}
+
func writeNumpyInt32(fnm string, out []int32, rows, cols int) error {
output, err := os.Create(fnm)
if err != nil {
pvalue float64
}
+const onehotXrefSize = unsafe.Sizeof(onehotXref{})
+
// Build onehot matrix (m[variant*2+isHet][genome] == 0 or 1) for all
// variants of a single tile/tag#.
//
}
return out
}
+
+// Return [2][]uint32{rowIndices, colIndices} indicating which
+// elements of matrixT[c][r] have non-zero values.
+func onehotChunk2Indirect(matrixT [][]int8) [2][]uint32 {
+ var nz [2][]uint32
+ for c, col := range matrixT {
+ for r, val := range col {
+ if val != 0 {
+ nz[0] = append(nz[0], uint32(r))
+ nz[1] = append(nz[1], uint32(c))
+ }
+ }
+ }
+ return nz
+}