19995: Use allele frequency, apply min freq cutoff to Χ² pvalue too.

author Tom Clegg <tom@curii.com>

Wed, 29 Mar 2023 15:56:42 +0000 (11:56 -0400)

committer Tom Clegg <tom@curii.com>

Wed, 29 Mar 2023 15:56:42 +0000 (11:56 -0400)
author Tom Clegg <tom@curii.com>
Wed, 29 Mar 2023 15:56:42 +0000 (11:56 -0400)
committer Tom Clegg <tom@curii.com>
Wed, 29 Mar 2023 15:56:42 +0000 (11:56 -0400)
diff --git a/glm.go b/glm.go

index 20721a6d503316ac4e31ffe48dc61dae17d54047..b68bdd1898419ffab3c15d5516c8259ad4646073 100644 (file)
--- a/glm.go
+++ b/glm.go
@@ -35,7 +35,7 @@ func normalize(a []float64) {
  // onehot is the observed outcome, in same order as sampleInfo, but
  // shorter because it only has entries for samples with
  // isTraining==true.
-func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int, minFrequency float64) func(onehot []bool) float64 {
+func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float64 {
         pcaNames := make([]string, 0, nPCA)
         data := make([][]statmodel.Dtype, 0, nPCA)
         for pca := 0; pca < nPCA; pca++ {
@@ -86,21 +86,16 @@ func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int, minFrequency float64) func
  
                 variant := make([]statmodel.Dtype, 0, len(sampleInfo))
                 row := 0
-               ones := 0
                 for _, si := range sampleInfo {
                         if si.isTraining {
                                 if onehot[row] {
                                         variant = append(variant, 1)
-                                       ones++
                                 } else {
                                         variant = append(variant, 0)
                                 }
                                 row++
                         }
                 }
-               if float64(ones) < float64(len(variant))*minFrequency {
-                       return math.NaN()
-               }
  
                 data := append([][]statmodel.Dtype{data[0], variant}, data[1:]...)
                 names := append([]string{"outcome", "variant"}, names[1:]...)
diff --git a/slicenumpy.go b/slicenumpy.go

index c3d02a99bc91a9bfa98093c9d95b1f2568361268..94bbb978496cf634011b0529cad0cf5f04adb913 100644 (file)
--- a/slicenumpy.go
+++ b/slicenumpy.go
@@ -40,15 +40,15 @@ import (
  const annotationMaxTileSpan = 100
  
  type sliceNumpy struct {
-       filter          filter
-       threads         int
-       chi2Cases       []bool
-       chi2PValue      float64
-       glmMinFrequency float64
-       pcaComponents   int
-       minCoverage     int
-       includeVariant1 bool
-       debugTag        tagID
+       filter             filter
+       threads            int
+       chi2Cases          []bool
+       chi2PValue         float64
+       pvalueMinFrequency float64
+       pcaComponents      int
+       minCoverage        int
+       includeVariant1    bool
+       debugTag           tagID
  
         cgnames         []string
         samples         []sampleInfo
@@ -95,7 +95,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
         debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
         flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
         flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold")
-       flags.Float64Var(&cmd.glmMinFrequency, "glm-min-frequency", 0.01, "skip GLM calculation on tile variants below this frequency in the training set")
+       flags.Float64Var(&cmd.pvalueMinFrequency, "pvalue-min-frequency", 0.01, "skip p-value calculation on tile variants below this frequency in the training set")
         flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix")
         cmd.filter.Flags(flags)
         err := flags.Parse(args)
@@ -153,7 +153,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                         "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
                         "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
                         "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
-                       "-glm-min-frequency=" + fmt.Sprintf("%f", cmd.glmMinFrequency),
+                       "-pvalue-min-frequency=" + fmt.Sprintf("%f", cmd.pvalueMinFrequency),
                         "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
                         "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag),
                 }
@@ -305,7 +305,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
         }
  
         if len(cmd.samples[0].pcaComponents) > 0 {
-               cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents, cmd.glmMinFrequency)
+               cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
                 // Unfortunately, statsmodel/glm lib logs stuff to
                 // os.Stdout when it panics on an unsolvable
                 // problem. We recover() from the panic in glm.go, but
@@ -1642,6 +1642,12 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
                 if col < 4 && !cmd.includeVariant1 {
                         continue
                 }
+               if col&1 == 0 && cmd.pvalueMinFrequency < 1 && homhet2maf(obs[col:col+2]) < cmd.pvalueMinFrequency {
+                       // Skip both columns (hom and het) if allele
+                       // frequency is below threshold
+                       col++
+                       continue
+               }
                 atomic.AddInt64(&cmd.pvalueCallCount, 1)
                 p := cmd.pvalue(obs[col])
                 if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
@@ -1658,6 +1664,23 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
         return onehot, xref
  }
  
+func homhet2maf(onehot [][]bool) float64 {
+       if len(onehot[0]) == 0 {
+               return 0
+       }
+       n := 0
+       for i := range onehot[0] {
+               if onehot[0][i] {
+                       // hom
+                       n += 2
+               } else if onehot[1][i] {
+                       // het
+                       n += 1
+               }
+       }
+       return float64(n) / float64(len(onehot[0])*2)
+}
+
  // convert a []onehotXref with length N to a numpy-style []int32
  // matrix with N columns, one row per field of onehotXref struct.
  //
author	Tom Clegg <tom@curii.com>
	Wed, 29 Mar 2023 15:56:42 +0000 (11:56 -0400)
committer	Tom Clegg <tom@curii.com>
	Wed, 29 Mar 2023 15:56:42 +0000 (11:56 -0400)
glm.go		patch \| blob \| history
slicenumpy.go		patch \| blob \| history