19566: Option to limit pca components used in glm. Fix onehot use.

author Tom Clegg <tom@curii.com>

Tue, 29 Nov 2022 16:22:12 +0000 (11:22 -0500)

committer Tom Clegg <tom@curii.com>

Tue, 29 Nov 2022 16:22:12 +0000 (11:22 -0500)
author Tom Clegg <tom@curii.com>
Tue, 29 Nov 2022 16:22:12 +0000 (11:22 -0500)
committer Tom Clegg <tom@curii.com>
Tue, 29 Nov 2022 16:22:12 +0000 (11:22 -0500)
diff --git a/glm.go b/glm.go

index 35ed37891898f989dc2619c13f17e8a74c1d6ae0..2f2327479ba4582555b1d4240e85dd80b1647df7 100644 (file)
--- a/glm.go
+++ b/glm.go
@@ -18,8 +18,12 @@ var glmConfig = &glm.Config{
         ConcurrentIRLS: 1000,
  }
  
-func pvalueGLM(sampleInfo []sampleInfo, onehot []bool) float64 {
-       nPCA := len(sampleInfo[0].pcaComponents)
+// Logistic regression.
+//
+// onehot is the observed outcome, in same order as sampleInfo, but
+// shorter because it only has entries for samples with
+// isTraining==true.
+func pvalueGLM(sampleInfo []sampleInfo, onehot []bool, nPCA int) float64 {
         pcaNames := make([]string, 0, nPCA)
         data := make([][]statmodel.Dtype, 0, nPCA)
         for pca := 0; pca < nPCA; pca++ {
@@ -35,7 +39,8 @@ func pvalueGLM(sampleInfo []sampleInfo, onehot []bool) float64 {
  
         variant := make([]statmodel.Dtype, 0, len(sampleInfo))
         outcome := make([]statmodel.Dtype, 0, len(sampleInfo))
-       for row, si := range sampleInfo {
+       row := 0
+       for _, si := range sampleInfo {
                 if si.isTraining {
                         if onehot[row] {
                                 variant = append(variant, 1)
@@ -47,6 +52,7 @@ func pvalueGLM(sampleInfo []sampleInfo, onehot []bool) float64 {
                         } else {
                                 outcome = append(outcome, 0)
                         }
+                       row++
                 }
         }
         data = append(data, variant, outcome)
diff --git a/glm_test.go b/glm_test.go

index 8cd06233f2dd352550040d6ad85410b70fba3946..e2f61f514cddc0c24395b1b8e2fb75cc6104d743 100644 (file)
--- a/glm_test.go
+++ b/glm_test.go
@@ -23,7 +23,7 @@ func (s *glmSuite) TestPvalue(c *check.C) {
                 {id: "sample4", isCase: true, isTraining: true, pcaComponents: []float64{-4, 1.1, -2}},
         }, []bool{
                 false, false, true, true,
-       }), check.Equals, 0.09589096738494937)
+       }, 3), check.Equals, 0.09589096738494937)
  
         c.Check(pvalueGLM([]sampleInfo{
                 {id: "sample1", isCase: false, isTraining: true, pcaComponents: []float64{1, 1.21, 2.37}},
@@ -37,7 +37,7 @@ func (s *glmSuite) TestPvalue(c *check.C) {
                 {id: "sample9", isCase: true, isTraining: true, pcaComponents: []float64{3, 1.21, 2.31}},
         }, []bool{
                 false, false, false, false, false, true, true, true, true,
-       }), check.Equals, 0.001028375654911555)
+       }, 3), check.Equals, 0.001028375654911555)
  
         c.Check(pvalueGLM([]sampleInfo{
                 {id: "sample1", isCase: false, isTraining: true, pcaComponents: []float64{1.001, -1.01, 2.39}},
@@ -60,8 +60,8 @@ func (s *glmSuite) TestPvalue(c *check.C) {
                 {id: "sample18", isCase: true, isTraining: true, pcaComponents: []float64{2.009, 8.10, -2.10}},
                 {id: "sample19", isCase: true, isTraining: true, pcaComponents: []float64{2.010, 8.11, -2.11}},
         }, []bool{
-               false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true,
-       }), check.Equals, 0.9999944849940106)
+               false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true,
+       }, 3), check.Equals, 0.9999944849940106)
  }
  
  var benchSamples, benchOnehot = func() ([]sampleInfo, []bool) {
@@ -87,7 +87,7 @@ var benchSamples, benchOnehot = func() ([]sampleInfo, []bool) {
  
  func (s *glmSuite) BenchmarkPvalue(c *check.C) {
         for i := 0; i < c.N; i++ {
-               p := pvalueGLM(benchSamples, benchOnehot)
+               p := pvalueGLM(benchSamples, benchOnehot, len(benchSamples[0].pcaComponents))
                 c.Check(p, check.Equals, 0.0)
         }
  }
diff --git a/slicenumpy.go b/slicenumpy.go

index 25fc466c116c4b9cc0a9508378ff33db91247c0b..a718cbf8148035f9105d216c9f22a1c4e33931b3 100644 (file)
--- a/slicenumpy.go
+++ b/slicenumpy.go
@@ -43,6 +43,7 @@ type sliceNumpy struct {
         threads         int
         chi2Cases       []bool
         chi2PValue      float64
+       pcaComponents   int
         minCoverage     int
         includeVariant1 bool
         debugTag        tagID
@@ -84,7 +85,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
         samplesFilename := flags.String("samples", "", "`samples.csv` file with training/validation and case/control groups (see 'lightning choose-samples')")
         caseControlOnly := flags.Bool("case-control-only", false, "drop samples that are not in case/control groups")
         onlyPCA := flags.Bool("pca", false, "run principal component analysis, write components to pca.npy and samples.csv")
-       pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
+       flags.IntVar(&cmd.pcaComponents, "pca-components", 4, "number of PCA components to compute / use in logistic regression")
         maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
         debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
         flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
@@ -142,7 +143,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                         "-samples=" + *samplesFilename,
                         "-case-control-only=" + fmt.Sprintf("%v", *caseControlOnly),
                         "-pca=" + fmt.Sprintf("%v", *onlyPCA),
-                       "-pca-components=" + fmt.Sprintf("%d", *pcaComponents),
+                       "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
                         "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
                         "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
                         "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
@@ -1207,7 +1208,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                 }
                         }
                         log.Print("fitting")
-                       transformer := nlp.NewPCA(*pcaComponents)
+                       transformer := nlp.NewPCA(cmd.pcaComponents)
                         transformer.Fit(mtxTrain.T())
                         log.Printf("transforming")
                         pca, err := transformer.Transform(mtxFull.T())
@@ -1603,7 +1604,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
                 }
                 var p float64
                 if len(cmd.samples[0].pcaComponents) > 0 {
-                       p = pvalueGLM(cmd.samples, obs[col])
+                       p = pvalueGLM(cmd.samples, obs[col], cmd.pcaComponents)
                 } else {
                         p = pvalue(obs[col], cmd.chi2Cases)
                 }
author	Tom Clegg <tom@curii.com>
	Tue, 29 Nov 2022 16:22:12 +0000 (11:22 -0500)
committer	Tom Clegg <tom@curii.com>
	Tue, 29 Nov 2022 16:22:12 +0000 (11:22 -0500)
glm.go		patch \| blob \| history
glm_test.go		patch \| blob \| history
slicenumpy.go		patch \| blob \| history