X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/b1b80df910990e6e9428888ed3407741616bf27e..ed6e40e98ebd142cfc761a4c0fddc451875896bc:/glm.go diff --git a/glm.go b/glm.go index 6a829ca547..b68bdd1898 100644 --- a/glm.go +++ b/glm.go @@ -12,6 +12,7 @@ import ( "github.com/kshedden/statmodel/glm" "github.com/kshedden/statmodel/statmodel" + "gonum.org/v1/gonum/stat" "gonum.org/v1/gonum/stat/distuv" ) @@ -22,12 +23,19 @@ var glmConfig = &glm.Config{ Log: log.New(io.Discard, "", 0), } +func normalize(a []float64) { + mean, std := stat.MeanStdDev(a, nil) + for i, x := range a { + a[i] = (x - mean) / std + } +} + // Logistic regression. // // onehot is the observed outcome, in same order as sampleInfo, but // shorter because it only has entries for samples with // isTraining==true. -func pvalueGLM(sampleInfo []sampleInfo, onehot []bool, nPCA int) (p float64) { +func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float64 { pcaNames := make([]string, 0, nPCA) data := make([][]statmodel.Dtype, 0, nPCA) for pca := 0; pca < nPCA; pca++ { @@ -37,21 +45,16 @@ func pvalueGLM(sampleInfo []sampleInfo, onehot []bool, nPCA int) (p float64) { series = append(series, si.pcaComponents[pca]) } } + normalize(series) data = append(data, series) pcaNames = append(pcaNames, fmt.Sprintf("pca%d", pca)) } - variant := make([]statmodel.Dtype, 0, len(sampleInfo)) outcome := make([]statmodel.Dtype, 0, len(sampleInfo)) constants := make([]statmodel.Dtype, 0, len(sampleInfo)) row := 0 for _, si := range sampleInfo { if si.isTraining { - if onehot[row] { - variant = append(variant, 1) - } else { - variant = append(variant, 0) - } if si.isCase { outcome = append(outcome, 1) } else { @@ -61,27 +64,50 @@ func pvalueGLM(sampleInfo []sampleInfo, onehot []bool, nPCA int) (p float64) { row++ } } - data = append(data, variant, outcome, constants) - dataset := statmodel.NewDataset(data, append(pcaNames, "variant", "outcome", "constants")) + data = append([][]statmodel.Dtype{outcome, constants}, data...) + names := append([]string{"outcome", "constants"}, pcaNames...) + dataset := statmodel.NewDataset(data, names) - defer func() { - if recover() != nil { - // typically "matrix singular or near-singular with condition number +Inf" - p = math.NaN() - } - }() - model, err := glm.NewGLM(dataset, "outcome", append([]string{"constants"}, pcaNames...), glmConfig) + model, err := glm.NewGLM(dataset, "outcome", names[1:], glmConfig) if err != nil { - return math.NaN() + log.Printf("%s", err) + return func([]bool) float64 { return math.NaN() } } resultCov := model.Fit() logCov := resultCov.LogLike() - model, err = glm.NewGLM(dataset, "outcome", append([]string{"constants", "variant"}, pcaNames...), glmConfig) - if err != nil { - return math.NaN() + + return func(onehot []bool) (p float64) { + defer func() { + if recover() != nil { + // typically "matrix singular or near-singular with condition number +Inf" + p = math.NaN() + } + }() + + variant := make([]statmodel.Dtype, 0, len(sampleInfo)) + row := 0 + for _, si := range sampleInfo { + if si.isTraining { + if onehot[row] { + variant = append(variant, 1) + } else { + variant = append(variant, 0) + } + row++ + } + } + + data := append([][]statmodel.Dtype{data[0], variant}, data[1:]...) + names := append([]string{"outcome", "variant"}, names[1:]...) + dataset := statmodel.NewDataset(data, names) + + model, err := glm.NewGLM(dataset, "outcome", names[1:], glmConfig) + if err != nil { + return math.NaN() + } + resultComp := model.Fit() + logComp := resultComp.LogLike() + dist := distuv.ChiSquared{K: 1} + return dist.Survival(-2 * (logCov - logComp)) } - resultComp := model.Fit() - logComp := resultComp.LogLike() - dist := distuv.ChiSquared{K: 1} - return dist.Survival(-2 * (logCov - logComp)) }