// onehot is the observed outcome, in same order as sampleInfo, but
// shorter because it only has entries for samples with
// isTraining==true.
-func pvalueGLM(sampleInfo []sampleInfo, onehot []bool, nPCA int) (p float64) {
+func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float64 {
pcaNames := make([]string, 0, nPCA)
data := make([][]statmodel.Dtype, 0, nPCA)
for pca := 0; pca < nPCA; pca++ {
pcaNames = append(pcaNames, fmt.Sprintf("pca%d", pca))
}
- variant := make([]statmodel.Dtype, 0, len(sampleInfo))
outcome := make([]statmodel.Dtype, 0, len(sampleInfo))
constants := make([]statmodel.Dtype, 0, len(sampleInfo))
row := 0
for _, si := range sampleInfo {
if si.isTraining {
- if onehot[row] {
- variant = append(variant, 1)
- } else {
- variant = append(variant, 0)
- }
if si.isCase {
outcome = append(outcome, 1)
} else {
row++
}
}
- data = append(data, variant, outcome, constants)
- dataset := statmodel.NewDataset(data, append(pcaNames, "variant", "outcome", "constants"))
+ data = append([][]statmodel.Dtype{outcome, constants}, data...)
+ names := append([]string{"outcome", "constants"}, pcaNames...)
+ dataset := statmodel.NewDataset(data, names)
- defer func() {
- if recover() != nil {
- // typically "matrix singular or near-singular with condition number +Inf"
- p = math.NaN()
- }
- }()
- model, err := glm.NewGLM(dataset, "outcome", append([]string{"constants"}, pcaNames...), glmConfig)
+ model, err := glm.NewGLM(dataset, "outcome", names[1:], glmConfig)
if err != nil {
- return math.NaN()
+ log.Printf("%s", err)
+ return func([]bool) float64 { return math.NaN() }
}
resultCov := model.Fit()
logCov := resultCov.LogLike()
- model, err = glm.NewGLM(dataset, "outcome", append([]string{"constants", "variant"}, pcaNames...), glmConfig)
- if err != nil {
- return math.NaN()
+
+ return func(onehot []bool) (p float64) {
+ defer func() {
+ if recover() != nil {
+ // typically "matrix singular or near-singular with condition number +Inf"
+ p = math.NaN()
+ }
+ }()
+
+ variant := make([]statmodel.Dtype, 0, len(sampleInfo))
+ row := 0
+ for _, si := range sampleInfo {
+ if si.isTraining {
+ if onehot[row] {
+ variant = append(variant, 1)
+ } else {
+ variant = append(variant, 0)
+ }
+ row++
+ }
+ }
+
+ data := append([][]statmodel.Dtype{data[0], variant}, data[1:]...)
+ names := append([]string{"outcome", "variant"}, names[1:]...)
+ dataset := statmodel.NewDataset(data, names)
+
+ model, err := glm.NewGLM(dataset, "outcome", names[1:], glmConfig)
+ if err != nil {
+ return math.NaN()
+ }
+ resultComp := model.Fit()
+ logComp := resultComp.LogLike()
+ dist := distuv.ChiSquared{K: 1}
+ return dist.Survival(-2 * (logCov - logComp))
}
- resultComp := model.Fit()
- logComp := resultComp.LogLike()
- dist := distuv.ChiSquared{K: 1}
- return dist.Survival(-2 * (logCov - logComp))
}
}
}
- pGo := pvalueGLM(samples, onehot, nPCA)
+ pGo := glmPvalueFunc(samples, nPCA)(onehot)
c.Logf("pGo = %g", pGo)
var pydata bytes.Buffer
return
}
- c.Check(pvalueGLM(csv2test(`
+ samples, onehot, npca := csv2test(`
# case=1, onehot=1, pca1, pca2, pca3
0, 0, 1, 1.21, 2.37
0, 0, 2, 1.22, 2.38
1, 1, 1, 1.23, 2.36
1, 1, 2, 1.22, 2.32
1, 1, 3, 1.21, 2.31
-`)), check.Equals, 0.002789665435066107)
+`)
+ c.Check(glmPvalueFunc(samples, npca)(onehot), check.Equals, 0.002789665435066107)
}
var benchSamples, benchOnehot = func() ([]sampleInfo, []bool) {
func (s *glmSuite) BenchmarkPvalue(c *check.C) {
for i := 0; i < c.N; i++ {
- p := pvalueGLM(benchSamples, benchOnehot, len(benchSamples[0].pcaComponents))
+ p := glmPvalueFunc(benchSamples, len(benchSamples[0].pcaComponents))(benchOnehot)
c.Check(p, check.Equals, 0.0)
}
}
samples []sampleInfo
trainingSet []int // samples index => training set index, or -1 if not in training set
trainingSetSize int
+ pvalue func(onehot []bool) float64
}
func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
if err != nil {
return err
}
+ if len(cmd.samples[0].pcaComponents) > 0 {
+ cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
+ }
} else if *caseControlOnly {
return fmt.Errorf("-case-control-only does not make sense without -samples")
}
cmd.trainingSet[i] = -1
}
}
+ cmd.pvalue = func(onehot []bool) float64 {
+ return pvalue(onehot, cmd.chi2Cases)
+ }
}
if cmd.filter.MinCoverage == 1 {
// In the generic formula below, floating point
if col < 4 && !cmd.includeVariant1 {
continue
}
- var p float64
- if len(cmd.samples[0].pcaComponents) > 0 {
- p = pvalueGLM(cmd.samples, obs[col], cmd.pcaComponents)
- } else {
- p = pvalue(obs[col], cmd.chi2Cases)
- }
+ p := cmd.pvalue(obs[col])
if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
continue
}