From c9e64a7e3421b77890f048cad4740cc43767e694 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Fri, 21 Jan 2022 14:05:34 -0500 Subject: [PATCH] =?utf8?q?Fix=20=CE=A7=C2=B2=20calculation.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit refs #18581 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- chisquare.go | 51 +++++++++++++++++++++++------------------------ chisquare_test.go | 22 +++++++++++++++++--- slice_test.go | 6 +++--- slicenumpy.go | 6 +++--- 4 files changed, 50 insertions(+), 35 deletions(-) diff --git a/chisquare.go b/chisquare.go index 31142a1710..a044ced1eb 100644 --- a/chisquare.go +++ b/chisquare.go @@ -11,35 +11,34 @@ import ( var chisquared = distuv.ChiSquared{K: 1, Src: rand.NewSource(rand.Uint64())} -func pvalue(a, b []bool) float64 { - // !b b - // !a tab[0] tab[1] - // a tab[2] tab[3] - tab := make([]int, 4) - for ai, aval := range []bool{false, true} { - for bi, bval := range []bool{false, true} { - obs := 0 - for i := range a { - if a[i] == aval && b[i] == bval { - obs++ - } +func pvalue(x, y []bool) float64 { + var ( + obs, exp [2]float64 + sum float64 + sz = float64(len(y)) + ) + for i, yi := range y { + if x[i] { + if yi { + obs[0]++ + } else { + obs[1]++ } - tab[ai*2+bi] = obs } - } - var sum float64 - for ai := 0; ai < 2; ai++ { - for bi := 0; bi < 2; bi++ { - rowtotal := tab[ai*2] + tab[ai*2+1] - coltotal := tab[bi] + tab[2+bi] - if rowtotal == 0 || coltotal == 0 { - return 1 - } - exp := float64(rowtotal) * float64(coltotal) / float64(len(a)) - obs := tab[ai*2+bi] - d := float64(obs) - exp - sum += (d * d) / exp + if yi { + exp[0]++ + } else { + exp[1]++ } } + if exp[0] == 0 || exp[1] == 0 || obs[0]+obs[1] == 0 { + return 1 + } + exp[0] = (obs[0] + obs[1]) * exp[0] / sz + exp[1] = (obs[0] + obs[1]) * exp[1] / sz + for i := range exp { + d := obs[i] - exp[i] + sum += d * d / exp[i] + } return 1 - chisquared.CDF(sum) } diff --git a/chisquare_test.go b/chisquare_test.go index 3a59671c40..c9db0c6710 100644 --- a/chisquare_test.go +++ b/chisquare_test.go @@ -24,12 +24,28 @@ func (s *pvalueSuite) TestPvalue(c *check.C) { for i := 25; i < 31; i++ { a[i] = true } - for i := 31; i < 39; i++ { + for i := 31; i < 40; i++ { b[i] = true } - c.Check(fmt.Sprintf("%.7f", pvalue(a, b)), check.Equals, "0.0006297") + c.Check(fmt.Sprintf("%.8f", pvalue(a, b)), check.Equals, "0.04147853") + + a = make([]bool, 54) + b = make([]bool, 54) + for i := 0; i < 25; i++ { + a[i] = true + b[i] = true + } + c.Check(fmt.Sprintf("%.9f", pvalue(a, b)), check.Equals, "0.000000072") + for i := range a { + a[i] = !a[i] + } + c.Check(fmt.Sprintf("%.9f", pvalue(a, b)), check.Equals, "0.000000573") + + a = []bool{true, true, true, false, true, false, false, false} + b = []bool{true, true, true, true, false, false, false, false} + c.Check(fmt.Sprintf("%.8f", pvalue(a, b)), check.Equals, "0.31731051") for i := range a { a[i] = !a[i] } - c.Check(fmt.Sprintf("%.7f", pvalue(a, b)), check.Equals, "0.0006297") + c.Check(fmt.Sprintf("%.8f", pvalue(a, b)), check.Equals, "0.31731051") } diff --git a/slice_test.go b/slice_test.go index 4278f13e20..be89a764c0 100644 --- a/slice_test.go +++ b/slice_test.go @@ -227,7 +227,7 @@ pipeline1dup/input2 0 "-chunked-hgvs-matrix=true", "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv", "-chi2-case-control-column=CC", - "-chi2-p-value=0.05", + "-chi2-p-value=0.5", "-min-coverage=0.75", "-input-dir=" + slicedir, "-output-dir=" + npydir, @@ -259,7 +259,7 @@ pipeline1dup/input2 0 "-chunked-onehot=true", "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv", "-chi2-case-control-column=CC", - "-chi2-p-value=0.05", + "-chi2-p-value=0.5", "-min-coverage=0.75", "-input-dir=" + slicedir, "-output-dir=" + npydir, @@ -303,7 +303,7 @@ pipeline1dup/input2 0 "-single-onehot=true", "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv", "-chi2-case-control-column=CC", - "-chi2-p-value=0.05", + "-chi2-p-value=0.5", "-min-coverage=0.75", "-input-dir=" + slicedir, "-output-dir=" + npydir, diff --git a/slicenumpy.go b/slicenumpy.go index 0fa3f2196f..5ef777bf21 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -1007,7 +1007,7 @@ func (cmd *sliceNumpy) filterHGVScolpair(colpair [2][]int8) bool { cases = append(cases, c) } return len(cases) >= cmd.minCoverage && - (pvalue(cases, col0) <= cmd.chi2PValue || pvalue(cases, col1) <= cmd.chi2PValue) + (pvalue(col0, cases) <= cmd.chi2PValue || pvalue(col1, cases) <= cmd.chi2PValue) } func writeNumpyInt32(fnm string, out []int32, rows, cols int) error { @@ -1152,8 +1152,8 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI var xref []onehotXref for homcol := 4; homcol < len(obs); homcol += 2 { p := [2]float64{ - pvalue(cmd.chi2Cases, obs[homcol]), - pvalue(cmd.chi2Cases, obs[homcol+1]), + pvalue(obs[homcol], cmd.chi2Cases), + pvalue(obs[homcol+1], cmd.chi2Cases), } if cmd.chi2PValue < 1 && !(p[0] < cmd.chi2PValue || p[1] < cmd.chi2PValue) { continue -- 2.30.2