--- /dev/null
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
+
+import (
+ "fmt"
+ "math"
+
+ "github.com/kshedden/statmodel/glm"
+ "github.com/kshedden/statmodel/statmodel"
+)
+
+var glmConfig = &glm.Config{
+ Family: glm.NewFamily(glm.BinomialFamily),
+ FitMethod: "IRLS",
+ ConcurrentIRLS: 1000,
+}
+
+func pvalueGLM(sampleInfo []sampleInfo, onehotPair [][]bool) float64 {
+ nPCA := len(sampleInfo[0].pcaComponents)
+ pcaNames := make([]string, 0, nPCA)
+ data := make([][]statmodel.Dtype, 0, nPCA)
+ for pca := 0; pca < nPCA; pca++ {
+ series := make([]statmodel.Dtype, 0, len(sampleInfo))
+ for _, si := range sampleInfo {
+ if si.isTraining {
+ series = append(series, si.pcaComponents[pca])
+ }
+ }
+ data = append(data, series)
+ pcaNames = append(pcaNames, fmt.Sprintf("pca%d", pca))
+ }
+
+ variant := make([]statmodel.Dtype, 0, len(sampleInfo))
+ outcome := make([]statmodel.Dtype, 0, len(sampleInfo))
+ for row, si := range sampleInfo {
+ if si.isTraining {
+ if onehotPair[0][row] {
+ variant = append(variant, 1)
+ } else {
+ variant = append(variant, 0)
+ }
+ if si.isCase {
+ outcome = append(outcome, 1)
+ } else {
+ outcome = append(outcome, 0)
+ }
+ }
+ }
+ data = append(data, variant, outcome)
+
+ dataset := statmodel.NewDataset(data, append(pcaNames, "variant", "outcome"))
+ model, err := glm.NewGLM(dataset, "outcome", pcaNames, glmConfig)
+ if err != nil {
+ return math.NaN()
+ }
+ resultCov := model.Fit()
+ logCov := resultCov.LogLike()
+ model, err = glm.NewGLM(dataset, "outcome", append([]string{"variant"}, pcaNames...), glmConfig)
+ if err != nil {
+ return math.NaN()
+ }
+ resultComp := model.Fit()
+ logComp := resultComp.LogLike()
+ return chisquared.Survival(-2 * (logCov - logComp))
+}
--- /dev/null
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
+
+import (
+ "fmt"
+ "math/rand"
+
+ "gopkg.in/check.v1"
+)
+
+type glmSuite struct{}
+
+var _ = check.Suite(&glmSuite{})
+
+func (s *glmSuite) TestPvalue(c *check.C) {
+ c.Check(pvalueGLM([]sampleInfo{
+ {id: "sample1", isCase: false, isTraining: true, pcaComponents: []float64{-4, 1.2, -3}},
+ {id: "sample2", isCase: false, isTraining: true, pcaComponents: []float64{7, -1.2, 2}},
+ {id: "sample3", isCase: true, isTraining: true, pcaComponents: []float64{7, -1.2, 2}},
+ {id: "sample4", isCase: true, isTraining: true, pcaComponents: []float64{-4, 1.1, -2}},
+ }, [][]bool{
+ {false, false, true, true},
+ {false, false, true, true},
+ }), check.Equals, 0.09589096738494937)
+
+ c.Check(pvalueGLM([]sampleInfo{
+ {id: "sample1", isCase: false, isTraining: true, pcaComponents: []float64{1, 1.21, 2.37}},
+ {id: "sample2", isCase: false, isTraining: true, pcaComponents: []float64{2, 1.22, 2.38}},
+ {id: "sample3", isCase: false, isTraining: true, pcaComponents: []float64{3, 1.23, 2.39}},
+ {id: "sample4", isCase: false, isTraining: true, pcaComponents: []float64{1, 1.24, 2.33}},
+ {id: "sample5", isCase: false, isTraining: true, pcaComponents: []float64{2, 1.25, 2.34}},
+ {id: "sample6", isCase: true, isTraining: true, pcaComponents: []float64{3, 1.26, 2.35}},
+ {id: "sample7", isCase: true, isTraining: true, pcaComponents: []float64{1, 1.23, 2.36}},
+ {id: "sample8", isCase: true, isTraining: true, pcaComponents: []float64{2, 1.22, 2.32}},
+ {id: "sample9", isCase: true, isTraining: true, pcaComponents: []float64{3, 1.21, 2.31}},
+ }, [][]bool{
+ {false, false, false, false, false, true, true, true, true},
+ {false, false, false, false, false, true, true, true, true},
+ }), check.Equals, 0.001028375654911555)
+
+ c.Check(pvalueGLM([]sampleInfo{
+ {id: "sample1", isCase: false, isTraining: true, pcaComponents: []float64{1.001, -1.01, 2.39}},
+ {id: "sample2", isCase: false, isTraining: true, pcaComponents: []float64{1.002, -1.02, 2.38}},
+ {id: "sample3", isCase: false, isTraining: true, pcaComponents: []float64{1.003, -1.03, 2.37}},
+ {id: "sample4", isCase: false, isTraining: true, pcaComponents: []float64{1.004, -1.04, 2.36}},
+ {id: "sample5", isCase: false, isTraining: true, pcaComponents: []float64{1.005, -1.05, 2.35}},
+ {id: "sample6", isCase: false, isTraining: true, pcaComponents: []float64{1.006, -1.06, 2.34}},
+ {id: "sample7", isCase: false, isTraining: true, pcaComponents: []float64{1.007, -1.07, 2.33}},
+ {id: "sample8", isCase: false, isTraining: true, pcaComponents: []float64{1.008, -1.08, 2.32}},
+ {id: "sample9", isCase: false, isTraining: false, pcaComponents: []float64{2.000, 8.01, -2.01}},
+ {id: "sample10", isCase: true, isTraining: true, pcaComponents: []float64{2.001, 8.02, -2.02}},
+ {id: "sample11", isCase: true, isTraining: true, pcaComponents: []float64{2.002, 8.03, -2.03}},
+ {id: "sample12", isCase: true, isTraining: true, pcaComponents: []float64{2.003, 8.04, -2.04}},
+ {id: "sample13", isCase: true, isTraining: true, pcaComponents: []float64{2.004, 8.05, -2.05}},
+ {id: "sample14", isCase: true, isTraining: true, pcaComponents: []float64{2.005, 8.06, -2.06}},
+ {id: "sample15", isCase: true, isTraining: true, pcaComponents: []float64{2.006, 8.07, -2.07}},
+ {id: "sample16", isCase: true, isTraining: true, pcaComponents: []float64{2.007, 8.08, -2.08}},
+ {id: "sample17", isCase: true, isTraining: true, pcaComponents: []float64{2.008, 8.09, -2.09}},
+ {id: "sample18", isCase: true, isTraining: true, pcaComponents: []float64{2.009, 8.10, -2.10}},
+ {id: "sample19", isCase: true, isTraining: true, pcaComponents: []float64{2.010, 8.11, -2.11}},
+ }, [][]bool{
+ {false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true},
+ {false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true},
+ }), check.Equals, 0.9999944849940106)
+}
+
+var benchSamples, benchOnehot = func() ([]sampleInfo, [][]bool) {
+ pcaComponents := 10
+ samples := []sampleInfo{}
+ onehot := make([][]bool, 2)
+ r := make([]float64, pcaComponents)
+ for j := 0; j < 10000; j++ {
+ for i := 0; i < len(r); i++ {
+ r[i] = rand.Float64()
+ }
+ samples = append(samples, sampleInfo{
+ id: fmt.Sprintf("sample%d", j),
+ isCase: j%2 == 0 && j > 200,
+ isControl: j%2 == 1 || j <= 200,
+ isTraining: true,
+ pcaComponents: append([]float64(nil), r...),
+ })
+ onehot[0] = append(onehot[0], j%2 == 0)
+ onehot[1] = append(onehot[1], j%2 == 0)
+ }
+ return samples, onehot
+}()
+
+func (s *glmSuite) BenchmarkPvalue(c *check.C) {
+ for i := 0; i < c.N; i++ {
+ p := pvalueGLM(benchSamples, benchOnehot)
+ c.Check(p, check.Equals, 0.0)
+ }
+}
onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk")
samplesFilename := flags.String("samples", "", "`samples.csv` file with training/validation and case/control groups (see 'lightning choose-samples')")
caseControlOnly := flags.Bool("case-control-only", false, "drop samples that are not in case/control groups")
- onlyPCA := flags.Bool("pca", false, "generate pca matrix")
+ onlyPCA := flags.Bool("pca", false, "run principal component analysis, write components to pca.npy and samples.csv")
pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
- flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test and omit columns with p-value above this threshold")
+ flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold")
flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix")
cmd.filter.Flags(flags)
err := flags.Parse(args)
if idx != len(si) {
return nil, fmt.Errorf("%s line %d: index %d out of order", samplesFilename, lineNum, idx)
}
+ var pcaComponents []float64
+ if len(split) > 4 {
+ for _, s := range split[4:] {
+ f, err := strconv.ParseFloat(s, 64)
+ if err != nil {
+ return nil, fmt.Errorf("%s line %d: cannot parse float %q: %s", samplesFilename, lineNum, s, err)
+ }
+ pcaComponents = append(pcaComponents, f)
+ }
+ }
si = append(si, sampleInfo{
- id: split[1],
- isCase: split[2] == "1",
- isControl: split[2] == "0",
- isTraining: split[3] == "1",
- isValidation: split[3] == "0",
+ id: split[1],
+ isCase: split[2] == "1",
+ isControl: split[2] == "0",
+ isTraining: split[3] == "1",
+ isValidation: split[3] == "0",
+ pcaComponents: pcaComponents,
})
}
return si, nil
if col < 4 && !cmd.includeVariant1 {
continue
}
- p := pvalue(obs[col], cmd.chi2Cases)
+ var p float64
+ if len(cmd.samples[0].pcaComponents) > 0 {
+ p = pvalueGLM(cmd.samples, obs[col:col+2])
+ } else {
+ p = pvalue(obs[col], cmd.chi2Cases)
+ }
if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
continue
}