19073: Fix dup tag detection.
[lightning.git] / slicenumpy.go
index 2bd892eb15ed6882b1dec04e65694944d85da9c2..db4b0d367b6210ca6ec1f802b7889693f5d45047 100644 (file)
@@ -166,7 +166,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
 
        cmd.cgnames = nil
        var tagset [][]byte
-       DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
+       err = DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
                if len(ent.TagSet) > 0 {
                        tagset = ent.TagSet
                }
@@ -360,6 +360,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                onehotChunkSize = make([]uint32, len(infiles))
                onehotXrefs = make([][]onehotXref, len(infiles))
        }
+       chunkStartTag := make([]tagID, len(infiles))
 
        throttleMem := throttle{Max: cmd.threads} // TODO: estimate using mem and data size
        throttleNumpyMem := throttle{Max: cmd.threads/2 + 1}
@@ -425,6 +426,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                        }
                        tagstart := cgs[cmd.cgnames[0]].StartTag
                        tagend := cgs[cmd.cgnames[0]].EndTag
+                       chunkStartTag[infileIdx] = tagstart
 
                        // TODO: filters
 
@@ -691,11 +693,11 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                                        continue
                                                }
                                                if v == 0 {
-                                                       out[outidx] = -1 // 0 better?
+                                                       out[outidx] = 0 // tag not found / spanning tile
                                                } else if variants, ok := seq[tag]; ok && int(v) < len(variants) && len(variants[v].Sequence) > 0 {
                                                        out[outidx] = int16(variantRemap[tag-tagstart][v])
                                                } else {
-                                                       out[outidx] = -1
+                                                       out[outidx] = -1 // low quality tile variant
                                                }
                                                if tag == cmd.debugTag {
                                                        log.Printf("tag %d row %d col %d outidx %d v %d out %d", tag, row, col, outidx, v, out[outidx])
@@ -999,11 +1001,33 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                        return 1
                }
                fnm = fmt.Sprintf("%s/onehot-columns.npy", *outputDir)
-               err = writeNumpyInt32(fnm, onehotXref2int32(xrefs), 4, len(xrefs))
+               err = writeNumpyInt32(fnm, onehotXref2int32(xrefs), 5, len(xrefs))
                if err != nil {
                        return 1
                }
        }
+       if !*mergeOutput && !*onehotChunked && !*onehotSingle {
+               tagoffsetFilename := *outputDir + "/chunk-tag-offset.csv"
+               log.Infof("writing tag offsets to %s", tagoffsetFilename)
+               var f *os.File
+               f, err = os.Create(tagoffsetFilename)
+               if err != nil {
+                       return 1
+               }
+               defer f.Close()
+               for idx, offset := range chunkStartTag {
+                       _, err = fmt.Fprintf(f, "%q,%d\n", fmt.Sprintf("matrix.%04d.npy", idx), offset)
+                       if err != nil {
+                               err = fmt.Errorf("write %s: %w", tagoffsetFilename, err)
+                               return 1
+                       }
+               }
+               err = f.Close()
+               if err != nil {
+                       err = fmt.Errorf("close %s: %w", tagoffsetFilename, err)
+                       return 1
+               }
+       }
        return 0
 }
 
@@ -1236,7 +1260,7 @@ func allele2homhet(colpair [2][]int8) {
 type onehotXref struct {
        tag     tagID
        variant tileVariantID
-       het     bool
+       hom     bool
        pvalue  float64
 }
 
@@ -1306,7 +1330,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
                xref = append(xref, onehotXref{
                        tag:     tag,
                        variant: tileVariantID(col >> 1),
-                       het:     col&1 == 1,
+                       hom:     col&1 == 0,
                        pvalue:  p,
                })
        }
@@ -1331,14 +1355,15 @@ func bool2int8(in []bool) []int8 {
 // P-value row contains 1000000x actual p-value.
 func onehotXref2int32(xrefs []onehotXref) []int32 {
        xcols := len(xrefs)
-       xdata := make([]int32, 4*xcols)
+       xdata := make([]int32, 5*xcols)
        for i, xref := range xrefs {
                xdata[i] = int32(xref.tag)
                xdata[xcols+i] = int32(xref.variant)
-               if xref.het {
+               if xref.hom {
                        xdata[xcols*2+i] = 1
                }
                xdata[xcols*3+i] = int32(xref.pvalue * 1000000)
+               xdata[xcols*4+i] = int32(-math.Log10(xref.pvalue) * 1000000)
        }
        return xdata
 }