X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/c33e73e5c68b94a8d7bda06886c945ffd1a8f5ba..77a836b5883e33c11e8e9ef933d0abb6e08bc1a5:/slicenumpy.go diff --git a/slicenumpy.go b/slicenumpy.go index 2bd892eb15..db4b0d367b 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -166,7 +166,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s cmd.cgnames = nil var tagset [][]byte - DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error { + err = DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error { if len(ent.TagSet) > 0 { tagset = ent.TagSet } @@ -360,6 +360,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s onehotChunkSize = make([]uint32, len(infiles)) onehotXrefs = make([][]onehotXref, len(infiles)) } + chunkStartTag := make([]tagID, len(infiles)) throttleMem := throttle{Max: cmd.threads} // TODO: estimate using mem and data size throttleNumpyMem := throttle{Max: cmd.threads/2 + 1} @@ -425,6 +426,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s } tagstart := cgs[cmd.cgnames[0]].StartTag tagend := cgs[cmd.cgnames[0]].EndTag + chunkStartTag[infileIdx] = tagstart // TODO: filters @@ -691,11 +693,11 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s continue } if v == 0 { - out[outidx] = -1 // 0 better? + out[outidx] = 0 // tag not found / spanning tile } else if variants, ok := seq[tag]; ok && int(v) < len(variants) && len(variants[v].Sequence) > 0 { out[outidx] = int16(variantRemap[tag-tagstart][v]) } else { - out[outidx] = -1 + out[outidx] = -1 // low quality tile variant } if tag == cmd.debugTag { log.Printf("tag %d row %d col %d outidx %d v %d out %d", tag, row, col, outidx, v, out[outidx]) @@ -999,11 +1001,33 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s return 1 } fnm = fmt.Sprintf("%s/onehot-columns.npy", *outputDir) - err = writeNumpyInt32(fnm, onehotXref2int32(xrefs), 4, len(xrefs)) + err = writeNumpyInt32(fnm, onehotXref2int32(xrefs), 5, len(xrefs)) if err != nil { return 1 } } + if !*mergeOutput && !*onehotChunked && !*onehotSingle { + tagoffsetFilename := *outputDir + "/chunk-tag-offset.csv" + log.Infof("writing tag offsets to %s", tagoffsetFilename) + var f *os.File + f, err = os.Create(tagoffsetFilename) + if err != nil { + return 1 + } + defer f.Close() + for idx, offset := range chunkStartTag { + _, err = fmt.Fprintf(f, "%q,%d\n", fmt.Sprintf("matrix.%04d.npy", idx), offset) + if err != nil { + err = fmt.Errorf("write %s: %w", tagoffsetFilename, err) + return 1 + } + } + err = f.Close() + if err != nil { + err = fmt.Errorf("close %s: %w", tagoffsetFilename, err) + return 1 + } + } return 0 } @@ -1236,7 +1260,7 @@ func allele2homhet(colpair [2][]int8) { type onehotXref struct { tag tagID variant tileVariantID - het bool + hom bool pvalue float64 } @@ -1306,7 +1330,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI xref = append(xref, onehotXref{ tag: tag, variant: tileVariantID(col >> 1), - het: col&1 == 1, + hom: col&1 == 0, pvalue: p, }) } @@ -1331,14 +1355,15 @@ func bool2int8(in []bool) []int8 { // P-value row contains 1000000x actual p-value. func onehotXref2int32(xrefs []onehotXref) []int32 { xcols := len(xrefs) - xdata := make([]int32, 4*xcols) + xdata := make([]int32, 5*xcols) for i, xref := range xrefs { xdata[i] = int32(xref.tag) xdata[xcols+i] = int32(xref.variant) - if xref.het { + if xref.hom { xdata[xcols*2+i] = 1 } xdata[xcols*3+i] = int32(xref.pvalue * 1000000) + xdata[xcols*4+i] = int32(-math.Log10(xref.pvalue) * 1000000) } return xdata }