X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/fa2af342408bac2cfbe173bc0e68561735f729c7..3bd21cb6368569d0bda4a8d8735c010c524e1584:/slicenumpy.go diff --git a/slicenumpy.go b/slicenumpy.go index 895c3c15fa..b557b9475c 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -40,14 +40,16 @@ import ( const annotationMaxTileSpan = 100 type sliceNumpy struct { - filter filter - threads int - chi2Cases []bool - chi2PValue float64 - pcaComponents int - minCoverage int - includeVariant1 bool - debugTag tagID + filter filter + threads int + chi2Cases []bool + chi2PValue float64 + pvalueMinFrequency float64 + maxFrequency float64 + pcaComponents int + minCoverage int + includeVariant1 bool + debugTag tagID cgnames []string samples []sampleInfo @@ -94,6 +96,8 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag") flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container") flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold") + flags.Float64Var(&cmd.pvalueMinFrequency, "pvalue-min-frequency", 0.01, "skip p-value calculation on tile variants below this frequency in the training set") + flags.Float64Var(&cmd.maxFrequency, "max-frequency", 1, "do not output variants above this frequency in the training set") flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix") cmd.filter.Flags(flags) err := flags.Parse(args) @@ -151,6 +155,8 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents), "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles), "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue), + "-pvalue-min-frequency=" + fmt.Sprintf("%f", cmd.pvalueMinFrequency), + "-max-frequency=" + fmt.Sprintf("%f", cmd.maxFrequency), "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1), "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag), } @@ -1262,7 +1268,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, for i := range cmd.samples { cmd.samples[i].pcaComponents = make([]float64, outcols) for c := 0; c < outcols; c++ { - cmd.samples[i].pcaComponents[i] = pca.At(i, c) + cmd.samples[i].pcaComponents[c] = pca.At(i, c) } } log.Print("done") @@ -1561,6 +1567,7 @@ type onehotXref struct { variant tileVariantID hom bool pvalue float64 + maf float64 } const onehotXrefSize = unsafe.Sizeof(onehotXref{}) @@ -1632,6 +1639,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI } var onehot [][]int8 var xref []onehotXref + var maf float64 for col := 2; col < len(obs); col++ { // col 0,1 correspond to tile variant 0, i.e., // no-call; col 2,3 correspond to the most common @@ -1639,6 +1647,21 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI if col < 4 && !cmd.includeVariant1 { continue } + if col&1 == 0 { + maf = homhet2maf(obs[col : col+2]) + if maf < cmd.pvalueMinFrequency { + // Skip both columns (hom and het) if + // allele frequency is below threshold + col++ + continue + } + if maf > cmd.maxFrequency { + // Skip both columns if allele + // frequency is above threshold + col++ + continue + } + } atomic.AddInt64(&cmd.pvalueCallCount, 1) p := cmd.pvalue(obs[col]) if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) { @@ -1650,11 +1673,29 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI variant: tileVariantID(col >> 1), hom: col&1 == 0, pvalue: p, + maf: maf, }) } return onehot, xref } +func homhet2maf(onehot [][]bool) float64 { + if len(onehot[0]) == 0 { + return 0 + } + n := 0 + for i := range onehot[0] { + if onehot[0][i] { + // hom + n += 2 + } else if onehot[1][i] { + // het + n += 1 + } + } + return float64(n) / float64(len(onehot[0])*2) +} + // convert a []onehotXref with length N to a numpy-style []int32 // matrix with N columns, one row per field of onehotXref struct. // @@ -1663,7 +1704,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI // P-value row contains 1000000x actual p-value. func onehotXref2int32(xrefs []onehotXref) []int32 { xcols := len(xrefs) - xdata := make([]int32, 5*xcols) + xdata := make([]int32, 6*xcols) for i, xref := range xrefs { xdata[i] = int32(xref.tag) xdata[xcols+i] = int32(xref.variant) @@ -1672,6 +1713,7 @@ func onehotXref2int32(xrefs []onehotXref) []int32 { } xdata[xcols*3+i] = int32(xref.pvalue * 1000000) xdata[xcols*4+i] = int32(-math.Log10(xref.pvalue) * 1000000) + xdata[xcols*5+i] = int32(xref.maf * 1000000) } return xdata }