Mounts map[string]map[string]interface{}
Priority int
KeepCache int // cache buffers per VCPU (0 for default)
+ Preemptible bool
}
func (runner *arvadosContainerRunner) Run() (string, error) {
"priority": runner.Priority,
"state": arvados.ContainerRequestStateCommitted,
"scheduling_parameters": arvados.SchedulingParameters{
- Preemptible: false,
+ Preemptible: runner.Preemptible,
Partitions: []string{},
},
"environment": map[string]string{
// onehot is the observed outcome, in same order as sampleInfo, but
// shorter because it only has entries for samples with
// isTraining==true.
-func pvalueGLM(sampleInfo []sampleInfo, onehot []bool, nPCA int) (p float64) {
+func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float64 {
pcaNames := make([]string, 0, nPCA)
data := make([][]statmodel.Dtype, 0, nPCA)
for pca := 0; pca < nPCA; pca++ {
pcaNames = append(pcaNames, fmt.Sprintf("pca%d", pca))
}
- variant := make([]statmodel.Dtype, 0, len(sampleInfo))
outcome := make([]statmodel.Dtype, 0, len(sampleInfo))
constants := make([]statmodel.Dtype, 0, len(sampleInfo))
row := 0
for _, si := range sampleInfo {
if si.isTraining {
- if onehot[row] {
- variant = append(variant, 1)
- } else {
- variant = append(variant, 0)
- }
if si.isCase {
outcome = append(outcome, 1)
} else {
row++
}
}
- data = append(data, variant, outcome, constants)
- dataset := statmodel.NewDataset(data, append(pcaNames, "variant", "outcome", "constants"))
+ data = append([][]statmodel.Dtype{outcome, constants}, data...)
+ names := append([]string{"outcome", "constants"}, pcaNames...)
+ dataset := statmodel.NewDataset(data, names)
- defer func() {
- if recover() != nil {
- // typically "matrix singular or near-singular with condition number +Inf"
- p = math.NaN()
- }
- }()
- model, err := glm.NewGLM(dataset, "outcome", append([]string{"constants"}, pcaNames...), glmConfig)
+ model, err := glm.NewGLM(dataset, "outcome", names[1:], glmConfig)
if err != nil {
- return math.NaN()
+ log.Printf("%s", err)
+ return func([]bool) float64 { return math.NaN() }
}
resultCov := model.Fit()
logCov := resultCov.LogLike()
- model, err = glm.NewGLM(dataset, "outcome", append([]string{"constants", "variant"}, pcaNames...), glmConfig)
- if err != nil {
- return math.NaN()
+
+ return func(onehot []bool) (p float64) {
+ defer func() {
+ if recover() != nil {
+ // typically "matrix singular or near-singular with condition number +Inf"
+ p = math.NaN()
+ }
+ }()
+
+ variant := make([]statmodel.Dtype, 0, len(sampleInfo))
+ row := 0
+ for _, si := range sampleInfo {
+ if si.isTraining {
+ if onehot[row] {
+ variant = append(variant, 1)
+ } else {
+ variant = append(variant, 0)
+ }
+ row++
+ }
+ }
+
+ data := append([][]statmodel.Dtype{data[0], variant}, data[1:]...)
+ names := append([]string{"outcome", "variant"}, names[1:]...)
+ dataset := statmodel.NewDataset(data, names)
+
+ model, err := glm.NewGLM(dataset, "outcome", names[1:], glmConfig)
+ if err != nil {
+ return math.NaN()
+ }
+ resultComp := model.Fit()
+ logComp := resultComp.LogLike()
+ dist := distuv.ChiSquared{K: 1}
+ return dist.Survival(-2 * (logCov - logComp))
}
- resultComp := model.Fit()
- logComp := resultComp.LogLike()
- dist := distuv.ChiSquared{K: 1}
- return dist.Survival(-2 * (logCov - logComp))
}
}
}
- pGo := pvalueGLM(samples, onehot, nPCA)
+ pGo := glmPvalueFunc(samples, nPCA)(onehot)
c.Logf("pGo = %g", pGo)
var pydata bytes.Buffer
return
}
- c.Check(pvalueGLM(csv2test(`
+ samples, onehot, npca := csv2test(`
# case=1, onehot=1, pca1, pca2, pca3
0, 0, 1, 1.21, 2.37
0, 0, 2, 1.22, 2.38
1, 1, 1, 1.23, 2.36
1, 1, 2, 1.22, 2.32
1, 1, 3, 1.21, 2.31
-`)), check.Equals, 0.002789665435066107)
+`)
+ c.Check(glmPvalueFunc(samples, npca)(onehot), check.Equals, 0.002789665435066107)
+
+ samples, onehot, npca = csv2test(`
+# case=1, onehot=1, pca1, pca2, pca3
+0, 1, 1, 1.21, 2.37
+0, 1, 2, 1.22, 2.38
+0, 1, 3, 1.23, 2.39
+0, 1, 1, 1.24, 2.33
+0, 1, 2, 1.25, 2.34
+1, 1, 3, 1.26, 2.35
+1, 1, 1, 1.23, 2.36
+1, 1, 2, 1.22, 2.32
+1, 1, 3, 1.21, 2.31
+`)
+ c.Check(math.IsNaN(glmPvalueFunc(samples, npca)(onehot)), check.Equals, true)
}
var benchSamples, benchOnehot = func() ([]sampleInfo, []bool) {
func (s *glmSuite) BenchmarkPvalue(c *check.C) {
for i := 0; i < c.N; i++ {
- p := pvalueGLM(benchSamples, benchOnehot, len(benchSamples[0].pcaComponents))
+ p := glmPvalueFunc(benchSamples, len(benchSamples[0].pcaComponents))(benchOnehot)
c.Check(p, check.Equals, 0.0)
}
}
tilepos[int(annotation[0])] = (annotation[4], int(annotation[5]))
series = {"#CHROM": [], "POS": [], "P": []}
-for tag, chrpos in sorted(tilepos.items(), key=lambda item: (item[1][0].lstrip('chr').zfill(2), item[1][1])):
+for tag, chrpos in sorted(tilepos.items(), key=lambda item: (item[1][0][-1] > '9', item[1][0].lstrip('chr').zfill(2), item[1][1])):
for p in pvalue.get(tag, []):
series["#CHROM"].append(chrpos[0])
series["POS"].append(chrpos[1])
series["P"].append(p)
-qmplot.manhattanplot(data=pandas.DataFrame(series))
-matplotlib.pyplot.savefig(output_path)
+qmplot.manhattanplot(data=pandas.DataFrame(series),
+ suggestiveline=2e-10, # Turn off suggestiveline
+ genomewideline=2e-11, # Turn off genomewidel
+ sign_line_cols=["#D62728", "#2CA02C"],
+ marker=".",
+ alpha = 0.6,
+ hline_kws={"linestyle": "--", "lw": 1.3},
+ title="Tile Variant Manhattan Plot",
+ # xtick_label_set=xtick,
+ xlabel="Chromosome",
+ ylabel=r"$-log_{10}{(P)}$",
+ xticklabel_kws={"rotation": "vertical"})
+matplotlib.pyplot.savefig(output_path, bbox_inches="tight")
runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
projectUUID := flags.String("project", "", "project `UUID` for output data")
priority := flags.Int("priority", 500, "container request priority")
+ preemptible := flags.Bool("preemptible", true, "request preemptible instance")
outputDir := flags.String("output-dir", "./out", "output `directory`")
tagsPerFile := flags.Int("tags-per-file", 50000, "tags per file (nfiles will be ~10M÷x)")
err = flags.Parse(args)
Priority: *priority,
KeepCache: 2,
APIAccess: true,
+ Preemptible: *preemptible,
}
for i := range inputDirs {
err = runner.TranslatePaths(&inputDirs[i])
"bufio"
"bytes"
"encoding/gob"
+ "encoding/json"
"errors"
"flag"
"fmt"
samples []sampleInfo
trainingSet []int // samples index => training set index, or -1 if not in training set
trainingSetSize int
+ pvalue func(onehot []bool) float64
+ pvalueCallCount int64
}
func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
arvadosVCPUs := flags.Int("arvados-vcpus", 96, "number of VCPUs to request for arvados container")
projectUUID := flags.String("project", "", "project `UUID` for output data")
priority := flags.Int("priority", 500, "container request priority")
+ preemptible := flags.Bool("preemptible", true, "request preemptible instance")
inputDir := flags.String("input-dir", "./in", "input `directory`")
outputDir := flags.String("output-dir", "./out", "output `directory`")
ref := flags.String("ref", "", "reference name (if blank, choose last one that appears in input)")
Priority: *priority,
KeepCache: 2,
APIAccess: true,
+ Preemptible: *preemptible,
}
err = runner.TranslatePaths(inputDir, regionsFilename, samplesFilename)
if err != nil {
if err != nil {
return err
}
+ if len(cmd.samples[0].pcaComponents) > 0 {
+ cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
+ // Unfortunately, statsmodel/glm lib logs
+ // stuff to os.Stdout when it panics on an
+ // unsolvable problem. We recover() from the
+ // panic in glm.go, but we also need to
+ // commandeer os.Stdout to avoid producing
+ // large quantities of logs.
+ stdoutWas := os.Stdout
+ defer func() { os.Stdout = stdoutWas }()
+ os.Stdout, err = os.Open(os.DevNull)
+ if err != nil {
+ return err
+ }
+ }
} else if *caseControlOnly {
return fmt.Errorf("-case-control-only does not make sense without -samples")
}
cmd.trainingSet[i] = -1
}
}
+ if cmd.pvalue == nil {
+ cmd.pvalue = func(onehot []bool) float64 {
+ return pvalue(onehot, cmd.chi2Cases)
+ }
+ }
}
if cmd.filter.MinCoverage == 1 {
// In the generic formula below, floating point
if err != nil {
return err
}
+ fnm = fmt.Sprintf("%s/stats.json", *outputDir)
+ j, err := json.Marshal(map[string]interface{}{
+ "pvalueCallCount": cmd.pvalueCallCount,
+ })
+ if err != nil {
+ return err
+ }
+ err = os.WriteFile(fnm, j, 0777)
+ if err != nil {
+ return err
+ }
}
if *onlyPCA {
cols := 0
if col < 4 && !cmd.includeVariant1 {
continue
}
- var p float64
- if len(cmd.samples[0].pcaComponents) > 0 {
- p = pvalueGLM(cmd.samples, obs[col], cmd.pcaComponents)
- } else {
- p = pvalue(obs[col], cmd.chi2Cases)
- }
+ atomic.AddInt64(&cmd.pvalueCallCount, 1)
+ p := cmd.pvalue(obs[col])
if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
continue
}