Fix low-coverage tiles counting toward min coverage threshold.
authorTom Clegg <tom@curii.com>
Fri, 1 Jul 2022 20:28:23 +0000 (16:28 -0400)
committerTom Clegg <tom@curii.com>
Fri, 1 Jul 2022 20:28:23 +0000 (16:28 -0400)
refs #19168

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

slice_test.go
slicenumpy.go

index 8af51614ae430493faaa588509097b056bbdc63d..bda561a8567c014fe20c7645de4089be540fbefa 100644 (file)
@@ -381,6 +381,7 @@ func (s *sliceSuite) Test_tv2homhet(c *check.C) {
                chi2Cases:       []bool{false, true, true, false},
                chi2PValue:      .5,
                includeVariant1: true,
+               minCoverage:     3,
        }
        cgs := map[string]CompactGenome{
                "sample1": CompactGenome{
@@ -399,9 +400,12 @@ func (s *sliceSuite) Test_tv2homhet(c *check.C) {
        maxv := tileVariantID(3)
        remap := []tileVariantID{0, 1, 0, 0, 0, 2, 0, 0, 0, 3}
        chunkstarttag := tagID(10)
+       fakevariant := TileVariant{Sequence: []byte("ACGT")}
+       seq := map[tagID][]TileVariant{}
        for tag := tagID(10); tag < 12; tag++ {
+               seq[tag-chunkstarttag] = []TileVariant{TileVariant{}, fakevariant, TileVariant{}, TileVariant{}, TileVariant{}, fakevariant}
                c.Logf("=== tag %d", tag)
-               chunk, xref := cmd.tv2homhet(cgs, maxv, remap, tag, chunkstarttag)
+               chunk, xref := cmd.tv2homhet(cgs, maxv, remap, tag, chunkstarttag, seq)
                c.Logf("chunk len=%d", len(chunk))
                for _, x := range chunk {
                        c.Logf("%+v", x)
index db4b0d367b6210ca6ec1f802b7889693f5d45047..4728b7f32c27858f5ad28aae595ab81b04d3c7bd 100644 (file)
@@ -537,7 +537,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                        }
                                }
                                if *onehotChunked || *onehotSingle {
-                                       onehot, xrefs := cmd.tv2homhet(cgs, maxv, remap, tag, tagstart)
+                                       onehot, xrefs := cmd.tv2homhet(cgs, maxv, remap, tag, tagstart, seq)
                                        if tag == cmd.debugTag {
                                                log.WithFields(logrus.Fields{
                                                        "onehot": onehot,
@@ -1270,7 +1270,7 @@ const onehotXrefSize = unsafe.Sizeof(onehotXref{})
 // variants of a single tile/tag#.
 //
 // Return nil if no tile variant passes Χ² filter.
-func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantID, remap []tileVariantID, tag, chunkstarttag tagID) ([][]int8, []onehotXref) {
+func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantID, remap []tileVariantID, tag, chunkstarttag tagID, seq map[tagID][]TileVariant) ([][]int8, []onehotXref) {
        if tag == cmd.debugTag {
                tv := make([]tileVariantID, len(cmd.cgnames)*2)
                for i, name := range cmd.cgnames {
@@ -1291,7 +1291,13 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
        tagoffset := tag - chunkstarttag
        coverage := 0
        for _, cg := range cgs {
-               if cg.Variants[tagoffset*2] > 0 && cg.Variants[tagoffset*2+1] > 0 {
+               alleles := 0
+               for _, v := range cg.Variants[tagoffset*2 : tagoffset*2+2] {
+                       if v > 0 && int(v) < len(seq[tagoffset]) && len(seq[tagoffset][v].Sequence) > 0 {
+                               alleles++
+                       }
+               }
+               if alleles == 2 {
                        coverage++
                }
        }