Fix Χ² calculation.
authorTom Clegg <tom@curii.com>
Fri, 21 Jan 2022 19:05:34 +0000 (14:05 -0500)
committerTom Clegg <tom@curii.com>
Fri, 21 Jan 2022 19:05:34 +0000 (14:05 -0500)
refs #18581

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

chisquare.go
chisquare_test.go
slice_test.go
slicenumpy.go

index 31142a1710686d2763f794cff54d0a27debcd3fa..a044ced1eb127532124297b3604c7360423e552c 100644 (file)
@@ -11,35 +11,34 @@ import (
 
 var chisquared = distuv.ChiSquared{K: 1, Src: rand.NewSource(rand.Uint64())}
 
-func pvalue(a, b []bool) float64 {
-       //     !b        b
-       // !a  tab[0]    tab[1]
-       // a   tab[2]    tab[3]
-       tab := make([]int, 4)
-       for ai, aval := range []bool{false, true} {
-               for bi, bval := range []bool{false, true} {
-                       obs := 0
-                       for i := range a {
-                               if a[i] == aval && b[i] == bval {
-                                       obs++
-                               }
+func pvalue(x, y []bool) float64 {
+       var (
+               obs, exp [2]float64
+               sum      float64
+               sz       = float64(len(y))
+       )
+       for i, yi := range y {
+               if x[i] {
+                       if yi {
+                               obs[0]++
+                       } else {
+                               obs[1]++
                        }
-                       tab[ai*2+bi] = obs
                }
-       }
-       var sum float64
-       for ai := 0; ai < 2; ai++ {
-               for bi := 0; bi < 2; bi++ {
-                       rowtotal := tab[ai*2] + tab[ai*2+1]
-                       coltotal := tab[bi] + tab[2+bi]
-                       if rowtotal == 0 || coltotal == 0 {
-                               return 1
-                       }
-                       exp := float64(rowtotal) * float64(coltotal) / float64(len(a))
-                       obs := tab[ai*2+bi]
-                       d := float64(obs) - exp
-                       sum += (d * d) / exp
+               if yi {
+                       exp[0]++
+               } else {
+                       exp[1]++
                }
        }
+       if exp[0] == 0 || exp[1] == 0 || obs[0]+obs[1] == 0 {
+               return 1
+       }
+       exp[0] = (obs[0] + obs[1]) * exp[0] / sz
+       exp[1] = (obs[0] + obs[1]) * exp[1] / sz
+       for i := range exp {
+               d := obs[i] - exp[i]
+               sum += d * d / exp[i]
+       }
        return 1 - chisquared.CDF(sum)
 }
index 3a59671c407d5ca70f6584b5e85de32e4237c443..c9db0c6710f0a9d7b9070daeb848516e3f7fd295 100644 (file)
@@ -24,12 +24,28 @@ func (s *pvalueSuite) TestPvalue(c *check.C) {
        for i := 25; i < 31; i++ {
                a[i] = true
        }
-       for i := 31; i < 39; i++ {
+       for i := 31; i < 40; i++ {
                b[i] = true
        }
-       c.Check(fmt.Sprintf("%.7f", pvalue(a, b)), check.Equals, "0.0006297")
+       c.Check(fmt.Sprintf("%.8f", pvalue(a, b)), check.Equals, "0.04147853")
+
+       a = make([]bool, 54)
+       b = make([]bool, 54)
+       for i := 0; i < 25; i++ {
+               a[i] = true
+               b[i] = true
+       }
+       c.Check(fmt.Sprintf("%.9f", pvalue(a, b)), check.Equals, "0.000000072")
+       for i := range a {
+               a[i] = !a[i]
+       }
+       c.Check(fmt.Sprintf("%.9f", pvalue(a, b)), check.Equals, "0.000000573")
+
+       a = []bool{true, true, true, false, true, false, false, false}
+       b = []bool{true, true, true, true, false, false, false, false}
+       c.Check(fmt.Sprintf("%.8f", pvalue(a, b)), check.Equals, "0.31731051")
        for i := range a {
                a[i] = !a[i]
        }
-       c.Check(fmt.Sprintf("%.7f", pvalue(a, b)), check.Equals, "0.0006297")
+       c.Check(fmt.Sprintf("%.8f", pvalue(a, b)), check.Equals, "0.31731051")
 }
index 4278f13e203aa31e371f989d956488a51f98114a..be89a764c084964e15fee5bd10d57ca3d9f80744 100644 (file)
@@ -227,7 +227,7 @@ pipeline1dup/input2 0
                        "-chunked-hgvs-matrix=true",
                        "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
                        "-chi2-case-control-column=CC",
-                       "-chi2-p-value=0.05",
+                       "-chi2-p-value=0.5",
                        "-min-coverage=0.75",
                        "-input-dir=" + slicedir,
                        "-output-dir=" + npydir,
@@ -259,7 +259,7 @@ pipeline1dup/input2 0
                        "-chunked-onehot=true",
                        "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
                        "-chi2-case-control-column=CC",
-                       "-chi2-p-value=0.05",
+                       "-chi2-p-value=0.5",
                        "-min-coverage=0.75",
                        "-input-dir=" + slicedir,
                        "-output-dir=" + npydir,
@@ -303,7 +303,7 @@ pipeline1dup/input2 0
                        "-single-onehot=true",
                        "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv",
                        "-chi2-case-control-column=CC",
-                       "-chi2-p-value=0.05",
+                       "-chi2-p-value=0.5",
                        "-min-coverage=0.75",
                        "-input-dir=" + slicedir,
                        "-output-dir=" + npydir,
index 0fa3f2196feaf4cb5593bae5e513fda56ca9e10b..5ef777bf21cd820a61ad944036cd860f0135b456 100644 (file)
@@ -1007,7 +1007,7 @@ func (cmd *sliceNumpy) filterHGVScolpair(colpair [2][]int8) bool {
                cases = append(cases, c)
        }
        return len(cases) >= cmd.minCoverage &&
-               (pvalue(cases, col0) <= cmd.chi2PValue || pvalue(cases, col1) <= cmd.chi2PValue)
+               (pvalue(col0, cases) <= cmd.chi2PValue || pvalue(col1, cases) <= cmd.chi2PValue)
 }
 
 func writeNumpyInt32(fnm string, out []int32, rows, cols int) error {
@@ -1152,8 +1152,8 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
        var xref []onehotXref
        for homcol := 4; homcol < len(obs); homcol += 2 {
                p := [2]float64{
-                       pvalue(cmd.chi2Cases, obs[homcol]),
-                       pvalue(cmd.chi2Cases, obs[homcol+1]),
+                       pvalue(obs[homcol], cmd.chi2Cases),
+                       pvalue(obs[homcol+1], cmd.chi2Cases),
                }
                if cmd.chi2PValue < 1 && !(p[0] < cmd.chi2PValue || p[1] < cmd.chi2PValue) {
                        continue