ConcurrentIRLS: 1000,
}
-func pvalueGLM(sampleInfo []sampleInfo, onehot []bool) float64 {
- nPCA := len(sampleInfo[0].pcaComponents)
+// Logistic regression.
+//
+// onehot is the observed outcome, in same order as sampleInfo, but
+// shorter because it only has entries for samples with
+// isTraining==true.
+func pvalueGLM(sampleInfo []sampleInfo, onehot []bool, nPCA int) float64 {
pcaNames := make([]string, 0, nPCA)
data := make([][]statmodel.Dtype, 0, nPCA)
for pca := 0; pca < nPCA; pca++ {
variant := make([]statmodel.Dtype, 0, len(sampleInfo))
outcome := make([]statmodel.Dtype, 0, len(sampleInfo))
- for row, si := range sampleInfo {
+ row := 0
+ for _, si := range sampleInfo {
if si.isTraining {
if onehot[row] {
variant = append(variant, 1)
} else {
outcome = append(outcome, 0)
}
+ row++
}
}
data = append(data, variant, outcome)
{id: "sample4", isCase: true, isTraining: true, pcaComponents: []float64{-4, 1.1, -2}},
}, []bool{
false, false, true, true,
- }), check.Equals, 0.09589096738494937)
+ }, 3), check.Equals, 0.09589096738494937)
c.Check(pvalueGLM([]sampleInfo{
{id: "sample1", isCase: false, isTraining: true, pcaComponents: []float64{1, 1.21, 2.37}},
{id: "sample9", isCase: true, isTraining: true, pcaComponents: []float64{3, 1.21, 2.31}},
}, []bool{
false, false, false, false, false, true, true, true, true,
- }), check.Equals, 0.001028375654911555)
+ }, 3), check.Equals, 0.001028375654911555)
c.Check(pvalueGLM([]sampleInfo{
{id: "sample1", isCase: false, isTraining: true, pcaComponents: []float64{1.001, -1.01, 2.39}},
{id: "sample18", isCase: true, isTraining: true, pcaComponents: []float64{2.009, 8.10, -2.10}},
{id: "sample19", isCase: true, isTraining: true, pcaComponents: []float64{2.010, 8.11, -2.11}},
}, []bool{
- false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true,
- }), check.Equals, 0.9999944849940106)
+ false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true,
+ }, 3), check.Equals, 0.9999944849940106)
}
var benchSamples, benchOnehot = func() ([]sampleInfo, []bool) {
func (s *glmSuite) BenchmarkPvalue(c *check.C) {
for i := 0; i < c.N; i++ {
- p := pvalueGLM(benchSamples, benchOnehot)
+ p := pvalueGLM(benchSamples, benchOnehot, len(benchSamples[0].pcaComponents))
c.Check(p, check.Equals, 0.0)
}
}
threads int
chi2Cases []bool
chi2PValue float64
+ pcaComponents int
minCoverage int
includeVariant1 bool
debugTag tagID
samplesFilename := flags.String("samples", "", "`samples.csv` file with training/validation and case/control groups (see 'lightning choose-samples')")
caseControlOnly := flags.Bool("case-control-only", false, "drop samples that are not in case/control groups")
onlyPCA := flags.Bool("pca", false, "run principal component analysis, write components to pca.npy and samples.csv")
- pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
+ flags.IntVar(&cmd.pcaComponents, "pca-components", 4, "number of PCA components to compute / use in logistic regression")
maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
"-samples=" + *samplesFilename,
"-case-control-only=" + fmt.Sprintf("%v", *caseControlOnly),
"-pca=" + fmt.Sprintf("%v", *onlyPCA),
- "-pca-components=" + fmt.Sprintf("%d", *pcaComponents),
+ "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
"-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
"-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
"-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
}
}
log.Print("fitting")
- transformer := nlp.NewPCA(*pcaComponents)
+ transformer := nlp.NewPCA(cmd.pcaComponents)
transformer.Fit(mtxTrain.T())
log.Printf("transforming")
pca, err := transformer.Transform(mtxFull.T())
}
var p float64
if len(cmd.samples[0].pcaComponents) > 0 {
- p = pvalueGLM(cmd.samples, obs[col])
+ p = pvalueGLM(cmd.samples, obs[col], cmd.pcaComponents)
} else {
p = pvalue(obs[col], cmd.chi2Cases)
}