// onehot is the observed outcome, in same order as sampleInfo, but
// shorter because it only has entries for samples with
// isTraining==true.
-func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int, minFrequency float64) func(onehot []bool) float64 {
+func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float64 {
pcaNames := make([]string, 0, nPCA)
data := make([][]statmodel.Dtype, 0, nPCA)
for pca := 0; pca < nPCA; pca++ {
variant := make([]statmodel.Dtype, 0, len(sampleInfo))
row := 0
- ones := 0
for _, si := range sampleInfo {
if si.isTraining {
if onehot[row] {
variant = append(variant, 1)
- ones++
} else {
variant = append(variant, 0)
}
row++
}
}
- if float64(ones) < float64(len(variant))*minFrequency {
- return math.NaN()
- }
data := append([][]statmodel.Dtype{data[0], variant}, data[1:]...)
names := append([]string{"outcome", "variant"}, names[1:]...)
const annotationMaxTileSpan = 100
type sliceNumpy struct {
- filter filter
- threads int
- chi2Cases []bool
- chi2PValue float64
- glmMinFrequency float64
- pcaComponents int
- minCoverage int
- includeVariant1 bool
- debugTag tagID
+ filter filter
+ threads int
+ chi2Cases []bool
+ chi2PValue float64
+ pvalueMinFrequency float64
+ pcaComponents int
+ minCoverage int
+ includeVariant1 bool
+ debugTag tagID
cgnames []string
samples []sampleInfo
debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold")
- flags.Float64Var(&cmd.glmMinFrequency, "glm-min-frequency", 0.01, "skip GLM calculation on tile variants below this frequency in the training set")
+ flags.Float64Var(&cmd.pvalueMinFrequency, "pvalue-min-frequency", 0.01, "skip p-value calculation on tile variants below this frequency in the training set")
flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix")
cmd.filter.Flags(flags)
err := flags.Parse(args)
"-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
"-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
"-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
- "-glm-min-frequency=" + fmt.Sprintf("%f", cmd.glmMinFrequency),
+ "-pvalue-min-frequency=" + fmt.Sprintf("%f", cmd.pvalueMinFrequency),
"-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
"-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag),
}
}
if len(cmd.samples[0].pcaComponents) > 0 {
- cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents, cmd.glmMinFrequency)
+ cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
// Unfortunately, statsmodel/glm lib logs stuff to
// os.Stdout when it panics on an unsolvable
// problem. We recover() from the panic in glm.go, but
if col < 4 && !cmd.includeVariant1 {
continue
}
+ if col&1 == 0 && cmd.pvalueMinFrequency < 1 && homhet2maf(obs[col:col+2]) < cmd.pvalueMinFrequency {
+ // Skip both columns (hom and het) if allele
+ // frequency is below threshold
+ col++
+ continue
+ }
atomic.AddInt64(&cmd.pvalueCallCount, 1)
p := cmd.pvalue(obs[col])
if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
return onehot, xref
}
+func homhet2maf(onehot [][]bool) float64 {
+ if len(onehot[0]) == 0 {
+ return 0
+ }
+ n := 0
+ for i := range onehot[0] {
+ if onehot[0][i] {
+ // hom
+ n += 2
+ } else if onehot[1][i] {
+ // het
+ n += 1
+ }
+ }
+ return float64(n) / float64(len(onehot[0])*2)
+}
+
// convert a []onehotXref with length N to a numpy-style []int32
// matrix with N columns, one row per field of onehotXref struct.
//