19995: Skip GLM on variants below frequency threshold.

author Tom Clegg <tom@curii.com>

Mon, 6 Mar 2023 16:21:25 +0000 (11:21 -0500)

committer Tom Clegg <tom@curii.com>

Mon, 6 Mar 2023 16:21:25 +0000 (11:21 -0500)
author Tom Clegg <tom@curii.com>
Mon, 6 Mar 2023 16:21:25 +0000 (11:21 -0500)
committer Tom Clegg <tom@curii.com>
Mon, 6 Mar 2023 16:21:25 +0000 (11:21 -0500)
diff --git a/glm.go b/glm.go

index b68bdd1898419ffab3c15d5516c8259ad4646073..20721a6d503316ac4e31ffe48dc61dae17d54047 100644 (file)
--- a/glm.go
+++ b/glm.go
@@ -35,7 +35,7 @@ func normalize(a []float64) {
  // onehot is the observed outcome, in same order as sampleInfo, but
  // shorter because it only has entries for samples with
  // isTraining==true.
-func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float64 {
+func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int, minFrequency float64) func(onehot []bool) float64 {
         pcaNames := make([]string, 0, nPCA)
         data := make([][]statmodel.Dtype, 0, nPCA)
         for pca := 0; pca < nPCA; pca++ {
@@ -86,16 +86,21 @@ func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float6
  
                 variant := make([]statmodel.Dtype, 0, len(sampleInfo))
                 row := 0
+               ones := 0
                 for _, si := range sampleInfo {
                         if si.isTraining {
                                 if onehot[row] {
                                         variant = append(variant, 1)
+                                       ones++
                                 } else {
                                         variant = append(variant, 0)
                                 }
                                 row++
                         }
                 }
+               if float64(ones) < float64(len(variant))*minFrequency {
+                       return math.NaN()
+               }
  
                 data := append([][]statmodel.Dtype{data[0], variant}, data[1:]...)
                 names := append([]string{"outcome", "variant"}, names[1:]...)
diff --git a/glm_test.go b/glm_test.go

index ec3c000f9594d51c60ea4fdd2783f69a072b287b..8ca5d099f3fd7bbc77f6f486467a0c2aa48ce0f3 100644 (file)
--- a/glm_test.go
+++ b/glm_test.go
@@ -174,7 +174,7 @@ func (s *glmSuite) TestPvalueRealDataVsPython(c *check.C) {
                 }
         }
  
-       pGo := glmPvalueFunc(samples, nPCA)(onehot)
+       pGo := glmPvalueFunc(samples, nPCA, 1)(onehot)
         c.Logf("pGo = %g", pGo)
  
         var pydata bytes.Buffer
@@ -275,7 +275,7 @@ func (s *glmSuite) TestPvalue(c *check.C) {
  1, 1, 2, 1.22, 2.32
  1, 1, 3, 1.21, 2.31
  `)
-       c.Check(glmPvalueFunc(samples, npca)(onehot), check.Equals, 0.002789665435066107)
+       c.Check(glmPvalueFunc(samples, npca, 1)(onehot), check.Equals, 0.002789665435066107)
  
         samples, onehot, npca = csv2test(`
  # case=1, onehot=1, pca1, pca2, pca3
@@ -289,7 +289,7 @@ func (s *glmSuite) TestPvalue(c *check.C) {
  1, 1, 2, 1.22, 2.32
  1, 1, 3, 1.21, 2.31
  `)
-       c.Check(math.IsNaN(glmPvalueFunc(samples, npca)(onehot)), check.Equals, true)
+       c.Check(math.IsNaN(glmPvalueFunc(samples, npca, 1)(onehot)), check.Equals, true)
  }
  
  var benchSamples, benchOnehot = func() ([]sampleInfo, []bool) {
@@ -315,7 +315,7 @@ var benchSamples, benchOnehot = func() ([]sampleInfo, []bool) {
  
  func (s *glmSuite) BenchmarkPvalue(c *check.C) {
         for i := 0; i < c.N; i++ {
-               p := glmPvalueFunc(benchSamples, len(benchSamples[0].pcaComponents))(benchOnehot)
+               p := glmPvalueFunc(benchSamples, len(benchSamples[0].pcaComponents), 1)(benchOnehot)
                 c.Check(p, check.Equals, 0.0)
         }
  }
diff --git a/slicenumpy.go b/slicenumpy.go

index 895c3c15fa2d91cfd7b153f65d6b78a9c59ad806..c3d02a99bc91a9bfa98093c9d95b1f2568361268 100644 (file)
--- a/slicenumpy.go
+++ b/slicenumpy.go
@@ -44,6 +44,7 @@ type sliceNumpy struct {
         threads         int
         chi2Cases       []bool
         chi2PValue      float64
+       glmMinFrequency float64
         pcaComponents   int
         minCoverage     int
         includeVariant1 bool
@@ -94,6 +95,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
         debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
         flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
         flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold")
+       flags.Float64Var(&cmd.glmMinFrequency, "glm-min-frequency", 0.01, "skip GLM calculation on tile variants below this frequency in the training set")
         flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix")
         cmd.filter.Flags(flags)
         err := flags.Parse(args)
@@ -151,6 +153,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                         "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
                         "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
                         "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
+                       "-glm-min-frequency=" + fmt.Sprintf("%f", cmd.glmMinFrequency),
                         "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
                         "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag),
                 }
@@ -302,7 +305,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
         }
  
         if len(cmd.samples[0].pcaComponents) > 0 {
-               cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
+               cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents, cmd.glmMinFrequency)
                 // Unfortunately, statsmodel/glm lib logs stuff to
                 // os.Stdout when it panics on an unsolvable
                 // problem. We recover() from the panic in glm.go, but
author	Tom Clegg <tom@curii.com>
	Mon, 6 Mar 2023 16:21:25 +0000 (11:21 -0500)
committer	Tom Clegg <tom@curii.com>
	Mon, 6 Mar 2023 16:21:25 +0000 (11:21 -0500)
glm.go		patch \| blob \| history
glm_test.go		patch \| blob \| history
slicenumpy.go		patch \| blob \| history