X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/12a961c3c7c1114e7f99c19b88e0f1f2149619ca..3bd21cb6368569d0bda4a8d8735c010c524e1584:/slicenumpy.go diff --git a/slicenumpy.go b/slicenumpy.go index 94bbb97849..b557b9475c 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -45,6 +45,7 @@ type sliceNumpy struct { chi2Cases []bool chi2PValue float64 pvalueMinFrequency float64 + maxFrequency float64 pcaComponents int minCoverage int includeVariant1 bool @@ -96,6 +97,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container") flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold") flags.Float64Var(&cmd.pvalueMinFrequency, "pvalue-min-frequency", 0.01, "skip p-value calculation on tile variants below this frequency in the training set") + flags.Float64Var(&cmd.maxFrequency, "max-frequency", 1, "do not output variants above this frequency in the training set") flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix") cmd.filter.Flags(flags) err := flags.Parse(args) @@ -154,6 +156,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles), "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue), "-pvalue-min-frequency=" + fmt.Sprintf("%f", cmd.pvalueMinFrequency), + "-max-frequency=" + fmt.Sprintf("%f", cmd.maxFrequency), "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1), "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag), } @@ -1265,7 +1268,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, for i := range cmd.samples { cmd.samples[i].pcaComponents = make([]float64, outcols) for c := 0; c < outcols; c++ { - cmd.samples[i].pcaComponents[i] = pca.At(i, c) + cmd.samples[i].pcaComponents[c] = pca.At(i, c) } } log.Print("done") @@ -1564,6 +1567,7 @@ type onehotXref struct { variant tileVariantID hom bool pvalue float64 + maf float64 } const onehotXrefSize = unsafe.Sizeof(onehotXref{}) @@ -1635,6 +1639,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI } var onehot [][]int8 var xref []onehotXref + var maf float64 for col := 2; col < len(obs); col++ { // col 0,1 correspond to tile variant 0, i.e., // no-call; col 2,3 correspond to the most common @@ -1642,11 +1647,20 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI if col < 4 && !cmd.includeVariant1 { continue } - if col&1 == 0 && cmd.pvalueMinFrequency < 1 && homhet2maf(obs[col:col+2]) < cmd.pvalueMinFrequency { - // Skip both columns (hom and het) if allele - // frequency is below threshold - col++ - continue + if col&1 == 0 { + maf = homhet2maf(obs[col : col+2]) + if maf < cmd.pvalueMinFrequency { + // Skip both columns (hom and het) if + // allele frequency is below threshold + col++ + continue + } + if maf > cmd.maxFrequency { + // Skip both columns if allele + // frequency is above threshold + col++ + continue + } } atomic.AddInt64(&cmd.pvalueCallCount, 1) p := cmd.pvalue(obs[col]) @@ -1659,6 +1673,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI variant: tileVariantID(col >> 1), hom: col&1 == 0, pvalue: p, + maf: maf, }) } return onehot, xref @@ -1689,7 +1704,7 @@ func homhet2maf(onehot [][]bool) float64 { // P-value row contains 1000000x actual p-value. func onehotXref2int32(xrefs []onehotXref) []int32 { xcols := len(xrefs) - xdata := make([]int32, 5*xcols) + xdata := make([]int32, 6*xcols) for i, xref := range xrefs { xdata[i] = int32(xref.tag) xdata[xcols+i] = int32(xref.variant) @@ -1698,6 +1713,7 @@ func onehotXref2int32(xrefs []onehotXref) []int32 { } xdata[xcols*3+i] = int32(xref.pvalue * 1000000) xdata[xcols*4+i] = int32(-math.Log10(xref.pvalue) * 1000000) + xdata[xcols*5+i] = int32(xref.maf * 1000000) } return xdata }