X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/b2d63f86ef386c54afe3e84029c65373893311d8..566c65f83e443f3dfdbab2305e59db809fdb727b:/slicenumpy.go diff --git a/slicenumpy.go b/slicenumpy.go index c3d02a99bc..34cd777458 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -40,15 +40,17 @@ import ( const annotationMaxTileSpan = 100 type sliceNumpy struct { - filter filter - threads int - chi2Cases []bool - chi2PValue float64 - glmMinFrequency float64 - pcaComponents int - minCoverage int - includeVariant1 bool - debugTag tagID + filter filter + threads int + chi2Cases []bool + chi2PValue float64 + pvalueMinFrequency float64 + maxFrequency float64 + pcaComponents int + minCoverage int + minCoverageAll bool + includeVariant1 bool + debugTag tagID cgnames []string samples []sampleInfo @@ -93,9 +95,11 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, flags.IntVar(&cmd.pcaComponents, "pca-components", 4, "number of PCA components to compute / use in logistic regression") maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)") debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag") + flags.BoolVar(&cmd.minCoverageAll, "min-coverage-all", false, "apply -min-coverage filter based on all samples, not just training set") flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container") flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold") - flags.Float64Var(&cmd.glmMinFrequency, "glm-min-frequency", 0.01, "skip GLM calculation on tile variants below this frequency in the training set") + flags.Float64Var(&cmd.pvalueMinFrequency, "pvalue-min-frequency", 0.01, "skip p-value calculation on tile variants below this frequency in the training set") + flags.Float64Var(&cmd.maxFrequency, "max-frequency", 1, "do not output variants above this frequency in the training set") flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix") cmd.filter.Flags(flags) err := flags.Parse(args) @@ -149,11 +153,13 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, "-chunked-onehot=" + fmt.Sprintf("%v", *onehotChunked), "-samples=" + *samplesFilename, "-case-control-only=" + fmt.Sprintf("%v", *caseControlOnly), + "-min-coverage-all=" + fmt.Sprintf("%v", cmd.minCoverageAll), "-pca=" + fmt.Sprintf("%v", *onlyPCA), "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents), "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles), "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue), - "-glm-min-frequency=" + fmt.Sprintf("%f", cmd.glmMinFrequency), + "-pvalue-min-frequency=" + fmt.Sprintf("%f", cmd.pvalueMinFrequency), + "-max-frequency=" + fmt.Sprintf("%f", cmd.maxFrequency), "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1), "-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag), } @@ -294,18 +300,18 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, } } } - if cmd.filter.MinCoverage == 1 { - // In the generic formula below, floating point - // arithmetic can effectively push the coverage - // threshold above 1.0, which is impossible/useless. - // 1.0 needs to mean exactly 100% coverage. + + if cmd.minCoverageAll { cmd.minCoverage = len(cmd.cgnames) } else { - cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames)))) + cmd.minCoverage = cmd.trainingSetSize + } + if cmd.filter.MinCoverage < 1 { + cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(cmd.minCoverage))) } if len(cmd.samples[0].pcaComponents) > 0 { - cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents, cmd.glmMinFrequency) + cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents) // Unfortunately, statsmodel/glm lib logs stuff to // os.Stdout when it panics on an unsolvable // problem. We recover() from the panic in glm.go, but @@ -355,7 +361,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, return err } foundthistag := false - taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) { + taglib.FindAll(bufio.NewReader(bytes.NewReader(tiledata[:len(tiledata)-1])), nil, func(tagid tagID, offset, _ int) { if !foundthistag && tagid == libref.Tag { foundthistag = true return @@ -528,7 +534,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, if err == errSkip { return nil } else if err != nil { - return fmt.Errorf("%04d: DecodeLibrary(%s): err", infileIdx, infile) + return fmt.Errorf("%04d: DecodeLibrary(%s): %w", infileIdx, infile, err) } tagstart := cgs[cmd.cgnames[0]].StartTag tagend := cgs[cmd.cgnames[0]].EndTag @@ -550,7 +556,11 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, count[blake2b.Sum256(rt.tiledata)] = 0 } - for cgname, cg := range cgs { + for cgidx, cgname := range cmd.cgnames { + if !cmd.minCoverageAll && !cmd.samples[cgidx].isTraining { + continue + } + cg := cgs[cgname] idx := int(tag-tagstart) * 2 for allele := 0; allele < 2; allele++ { v := cg.Variants[idx+allele] @@ -862,7 +872,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, if cmd.filter.MaxTag >= 0 && tag > tagID(cmd.filter.MaxTag) { break } - if rt := reftile[tag]; rt == nil || rt.excluded { + if rt := reftile[tag]; mask != nil && (rt == nil || rt.excluded) { continue } if v == 0 { @@ -1265,7 +1275,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, for i := range cmd.samples { cmd.samples[i].pcaComponents = make([]float64, outcols) for c := 0; c < outcols; c++ { - cmd.samples[i].pcaComponents[i] = pca.At(i, c) + cmd.samples[i].pcaComponents[c] = pca.At(i, c) } } log.Print("done") @@ -1564,6 +1574,7 @@ type onehotXref struct { variant tileVariantID hom bool pvalue float64 + maf float64 } const onehotXrefSize = unsafe.Sizeof(onehotXref{}) @@ -1592,7 +1603,11 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI } tagoffset := tag - chunkstarttag coverage := 0 - for _, cg := range cgs { + for cgidx, cgname := range cmd.cgnames { + if !cmd.minCoverageAll && !cmd.samples[cgidx].isTraining { + continue + } + cg := cgs[cgname] alleles := 0 for _, v := range cg.Variants[tagoffset*2 : tagoffset*2+2] { if v > 0 && int(v) < len(seq[tag]) && len(seq[tag][v].Sequence) > 0 { @@ -1635,6 +1650,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI } var onehot [][]int8 var xref []onehotXref + var maf float64 for col := 2; col < len(obs); col++ { // col 0,1 correspond to tile variant 0, i.e., // no-call; col 2,3 correspond to the most common @@ -1642,6 +1658,21 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI if col < 4 && !cmd.includeVariant1 { continue } + if col&1 == 0 { + maf = homhet2maf(obs[col : col+2]) + if maf < cmd.pvalueMinFrequency { + // Skip both columns (hom and het) if + // allele frequency is below threshold + col++ + continue + } + if maf > cmd.maxFrequency { + // Skip both columns if allele + // frequency is above threshold + col++ + continue + } + } atomic.AddInt64(&cmd.pvalueCallCount, 1) p := cmd.pvalue(obs[col]) if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) { @@ -1653,11 +1684,29 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI variant: tileVariantID(col >> 1), hom: col&1 == 0, pvalue: p, + maf: maf, }) } return onehot, xref } +func homhet2maf(onehot [][]bool) float64 { + if len(onehot[0]) == 0 { + return 0 + } + n := 0 + for i := range onehot[0] { + if onehot[0][i] { + // hom + n += 2 + } else if onehot[1][i] { + // het + n += 1 + } + } + return float64(n) / float64(len(onehot[0])*2) +} + // convert a []onehotXref with length N to a numpy-style []int32 // matrix with N columns, one row per field of onehotXref struct. // @@ -1666,7 +1715,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI // P-value row contains 1000000x actual p-value. func onehotXref2int32(xrefs []onehotXref) []int32 { xcols := len(xrefs) - xdata := make([]int32, 5*xcols) + xdata := make([]int32, 6*xcols) for i, xref := range xrefs { xdata[i] = int32(xref.tag) xdata[xcols+i] = int32(xref.variant) @@ -1675,6 +1724,7 @@ func onehotXref2int32(xrefs []onehotXref) []int32 { } xdata[xcols*3+i] = int32(xref.pvalue * 1000000) xdata[xcols*4+i] = int32(-math.Log10(xref.pvalue) * 1000000) + xdata[xcols*5+i] = int32(xref.maf * 1000000) } return xdata }