From: Tom Clegg Date: Wed, 29 Mar 2023 15:56:42 +0000 (-0400) Subject: 19995: Use allele frequency, apply min freq cutoff to Χ² pvalue too. X-Git-Url: https://git.arvados.org/lightning.git/commitdiff_plain/12a961c3c7c1114e7f99c19b88e0f1f2149619ca 19995: Use allele frequency, apply min freq cutoff to Χ² pvalue too. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/glm.go b/glm.go index 20721a6d50..b68bdd1898 100644 --- a/glm.go +++ b/glm.go @@ -35,7 +35,7 @@ func normalize(a []float64) { // onehot is the observed outcome, in same order as sampleInfo, but // shorter because it only has entries for samples with // isTraining==true. -func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int, minFrequency float64) func(onehot []bool) float64 { +func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int) func(onehot []bool) float64 { pcaNames := make([]string, 0, nPCA) data := make([][]statmodel.Dtype, 0, nPCA) for pca := 0; pca < nPCA; pca++ { @@ -86,21 +86,16 @@ func glmPvalueFunc(sampleInfo []sampleInfo, nPCA int, minFrequency float64) func variant := make([]statmodel.Dtype, 0, len(sampleInfo)) row := 0 - ones := 0 for _, si := range sampleInfo { if si.isTraining { if onehot[row] { variant = append(variant, 1) - ones++ } else { variant = append(variant, 0) } row++ } } - if float64(ones) < float64(len(variant))*minFrequency { - return math.NaN() - } data := append([][]statmodel.Dtype{data[0], variant}, data[1:]...) names := append([]string{"outcome", "variant"}, names[1:]...) diff --git a/slicenumpy.go b/slicenumpy.go index c3d02a99bc..94bbb97849 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -40,15 +40,15 @@ import ( const annotationMaxTileSpan = 100 type sliceNumpy struct { - filter filter - threads int - chi2Cases []bool - chi2PValue float64 - glmMinFrequency float64 - pcaComponents int - minCoverage int - includeVariant1 bool - debugTag tagID + filter filter + threads int + chi2Cases []bool + chi2PValue float64 + pvalueMinFrequency float64 + pcaComponents int + minCoverage int + includeVariant1 bool + debugTag tagID cgnames []string samples []sampleInfo @@ -95,7 +95,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag") flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container") flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold") - flags.Float64Var(&cmd.glmMinFrequency, "glm-min-frequency", 0.01, "skip GLM calculation on tile variants below this frequency in the training set") + flags.Float64Var(&cmd.pvalueMinFrequency, "pvalue-min-frequency", 0.01, "skip p-value calculation on tile variants below this frequency in the training set") flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix") cmd.filter.Flags(flags) err := flags.Parse(args) @@ -153,7 +153,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents), "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles), "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue), - "-glm-min-frequency=" + fmt.Sprintf("%f", cmd.glmMinFrequency), + "-pvalue-min-frequency=" + fmt.Sprintf("%f", cmd.pvalueMinFrequency), "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1), "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag), } @@ -305,7 +305,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, } if len(cmd.samples[0].pcaComponents) > 0 { - cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents, cmd.glmMinFrequency) + cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents) // Unfortunately, statsmodel/glm lib logs stuff to // os.Stdout when it panics on an unsolvable // problem. We recover() from the panic in glm.go, but @@ -1642,6 +1642,12 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI if col < 4 && !cmd.includeVariant1 { continue } + if col&1 == 0 && cmd.pvalueMinFrequency < 1 && homhet2maf(obs[col:col+2]) < cmd.pvalueMinFrequency { + // Skip both columns (hom and het) if allele + // frequency is below threshold + col++ + continue + } atomic.AddInt64(&cmd.pvalueCallCount, 1) p := cmd.pvalue(obs[col]) if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) { @@ -1658,6 +1664,23 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI return onehot, xref } +func homhet2maf(onehot [][]bool) float64 { + if len(onehot[0]) == 0 { + return 0 + } + n := 0 + for i := range onehot[0] { + if onehot[0][i] { + // hom + n += 2 + } else if onehot[1][i] { + // het + n += 1 + } + } + return float64(n) / float64(len(onehot[0])*2) +} + // convert a []onehotXref with length N to a numpy-style []int32 // matrix with N columns, one row per field of onehotXref struct. //