X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/b2d63f86ef386c54afe3e84029c65373893311d8..566c65f83e443f3dfdbab2305e59db809fdb727b:/slicenumpy.go

diff --git a/slicenumpy.go b/slicenumpy.go
index c3d02a99bc..34cd777458 100644
--- a/slicenumpy.go
+++ b/slicenumpy.go
@@ -40,15 +40,17 @@ import (
 const annotationMaxTileSpan = 100
 
 type sliceNumpy struct {
-	filter          filter
-	threads         int
-	chi2Cases       []bool
-	chi2PValue      float64
-	glmMinFrequency float64
-	pcaComponents   int
-	minCoverage     int
-	includeVariant1 bool
-	debugTag        tagID
+	filter             filter
+	threads            int
+	chi2Cases          []bool
+	chi2PValue         float64
+	pvalueMinFrequency float64
+	maxFrequency       float64
+	pcaComponents      int
+	minCoverage        int
+	minCoverageAll     bool
+	includeVariant1    bool
+	debugTag           tagID
 
 	cgnames         []string
 	samples         []sampleInfo
@@ -93,9 +95,11 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 	flags.IntVar(&cmd.pcaComponents, "pca-components", 4, "number of PCA components to compute / use in logistic regression")
 	maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
 	debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
+	flags.BoolVar(&cmd.minCoverageAll, "min-coverage-all", false, "apply -min-coverage filter based on all samples, not just training set")
 	flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
 	flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Î§Â² test (or logistic regression if -samples file has PCA components) and omit columns with p-value above this threshold")
-	flags.Float64Var(&cmd.glmMinFrequency, "glm-min-frequency", 0.01, "skip GLM calculation on tile variants below this frequency in the training set")
+	flags.Float64Var(&cmd.pvalueMinFrequency, "pvalue-min-frequency", 0.01, "skip p-value calculation on tile variants below this frequency in the training set")
+	flags.Float64Var(&cmd.maxFrequency, "max-frequency", 1, "do not output variants above this frequency in the training set")
 	flags.BoolVar(&cmd.includeVariant1, "include-variant-1", false, "include most common variant when building one-hot matrix")
 	cmd.filter.Flags(flags)
 	err := flags.Parse(args)
@@ -149,11 +153,13 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			"-chunked-onehot=" + fmt.Sprintf("%v", *onehotChunked),
 			"-samples=" + *samplesFilename,
 			"-case-control-only=" + fmt.Sprintf("%v", *caseControlOnly),
+			"-min-coverage-all=" + fmt.Sprintf("%v", cmd.minCoverageAll),
 			"-pca=" + fmt.Sprintf("%v", *onlyPCA),
 			"-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
 			"-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
 			"-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
-			"-glm-min-frequency=" + fmt.Sprintf("%f", cmd.glmMinFrequency),
+			"-pvalue-min-frequency=" + fmt.Sprintf("%f", cmd.pvalueMinFrequency),
+			"-max-frequency=" + fmt.Sprintf("%f", cmd.maxFrequency),
 			"-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
 			"-debug-tag=" + fmt.Sprintf("%d", cmd.debugTag),
 		}
@@ -294,18 +300,18 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			}
 		}
 	}
-	if cmd.filter.MinCoverage == 1 {
-		// In the generic formula below, floating point
-		// arithmetic can effectively push the coverage
-		// threshold above 1.0, which is impossible/useless.
-		// 1.0 needs to mean exactly 100% coverage.
+
+	if cmd.minCoverageAll {
 		cmd.minCoverage = len(cmd.cgnames)
 	} else {
-		cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames))))
+		cmd.minCoverage = cmd.trainingSetSize
+	}
+	if cmd.filter.MinCoverage < 1 {
+		cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(cmd.minCoverage)))
 	}
 
 	if len(cmd.samples[0].pcaComponents) > 0 {
-		cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents, cmd.glmMinFrequency)
+		cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
 		// Unfortunately, statsmodel/glm lib logs stuff to
 		// os.Stdout when it panics on an unsolvable
 		// problem. We recover() from the panic in glm.go, but
@@ -355,7 +361,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 				return err
 			}
 			foundthistag := false
-			taglib.FindAll(tiledata[:len(tiledata)-1], func(tagid tagID, offset, _ int) {
+			taglib.FindAll(bufio.NewReader(bytes.NewReader(tiledata[:len(tiledata)-1])), nil, func(tagid tagID, offset, _ int) {
 				if !foundthistag && tagid == libref.Tag {
 					foundthistag = true
 					return
@@ -528,7 +534,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			if err == errSkip {
 				return nil
 			} else if err != nil {
-				return fmt.Errorf("%04d: DecodeLibrary(%s): err", infileIdx, infile)
+				return fmt.Errorf("%04d: DecodeLibrary(%s): %w", infileIdx, infile, err)
 			}
 			tagstart := cgs[cmd.cgnames[0]].StartTag
 			tagend := cgs[cmd.cgnames[0]].EndTag
@@ -550,7 +556,11 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 						count[blake2b.Sum256(rt.tiledata)] = 0
 					}
 
-					for cgname, cg := range cgs {
+					for cgidx, cgname := range cmd.cgnames {
+						if !cmd.minCoverageAll && !cmd.samples[cgidx].isTraining {
+							continue
+						}
+						cg := cgs[cgname]
 						idx := int(tag-tagstart) * 2
 						for allele := 0; allele < 2; allele++ {
 							v := cg.Variants[idx+allele]
@@ -862,7 +872,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 						if cmd.filter.MaxTag >= 0 && tag > tagID(cmd.filter.MaxTag) {
 							break
 						}
-						if rt := reftile[tag]; rt == nil || rt.excluded {
+						if rt := reftile[tag]; mask != nil && (rt == nil || rt.excluded) {
 							continue
 						}
 						if v == 0 {
@@ -1265,7 +1275,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			for i := range cmd.samples {
 				cmd.samples[i].pcaComponents = make([]float64, outcols)
 				for c := 0; c < outcols; c++ {
-					cmd.samples[i].pcaComponents[i] = pca.At(i, c)
+					cmd.samples[i].pcaComponents[c] = pca.At(i, c)
 				}
 			}
 			log.Print("done")
@@ -1564,6 +1574,7 @@ type onehotXref struct {
 	variant tileVariantID
 	hom     bool
 	pvalue  float64
+	maf     float64
 }
 
 const onehotXrefSize = unsafe.Sizeof(onehotXref{})
@@ -1592,7 +1603,11 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
 	}
 	tagoffset := tag - chunkstarttag
 	coverage := 0
-	for _, cg := range cgs {
+	for cgidx, cgname := range cmd.cgnames {
+		if !cmd.minCoverageAll && !cmd.samples[cgidx].isTraining {
+			continue
+		}
+		cg := cgs[cgname]
 		alleles := 0
 		for _, v := range cg.Variants[tagoffset*2 : tagoffset*2+2] {
 			if v > 0 && int(v) < len(seq[tag]) && len(seq[tag][v].Sequence) > 0 {
@@ -1635,6 +1650,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
 	}
 	var onehot [][]int8
 	var xref []onehotXref
+	var maf float64
 	for col := 2; col < len(obs); col++ {
 		// col 0,1 correspond to tile variant 0, i.e.,
 		// no-call; col 2,3 correspond to the most common
@@ -1642,6 +1658,21 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
 		if col < 4 && !cmd.includeVariant1 {
 			continue
 		}
+		if col&1 == 0 {
+			maf = homhet2maf(obs[col : col+2])
+			if maf < cmd.pvalueMinFrequency {
+				// Skip both columns (hom and het) if
+				// allele frequency is below threshold
+				col++
+				continue
+			}
+			if maf > cmd.maxFrequency {
+				// Skip both columns if allele
+				// frequency is above threshold
+				col++
+				continue
+			}
+		}
 		atomic.AddInt64(&cmd.pvalueCallCount, 1)
 		p := cmd.pvalue(obs[col])
 		if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
@@ -1653,11 +1684,29 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
 			variant: tileVariantID(col >> 1),
 			hom:     col&1 == 0,
 			pvalue:  p,
+			maf:     maf,
 		})
 	}
 	return onehot, xref
 }
 
+func homhet2maf(onehot [][]bool) float64 {
+	if len(onehot[0]) == 0 {
+		return 0
+	}
+	n := 0
+	for i := range onehot[0] {
+		if onehot[0][i] {
+			// hom
+			n += 2
+		} else if onehot[1][i] {
+			// het
+			n += 1
+		}
+	}
+	return float64(n) / float64(len(onehot[0])*2)
+}
+
 // convert a []onehotXref with length N to a numpy-style []int32
 // matrix with N columns, one row per field of onehotXref struct.
 //
@@ -1666,7 +1715,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
 // P-value row contains 1000000x actual p-value.
 func onehotXref2int32(xrefs []onehotXref) []int32 {
 	xcols := len(xrefs)
-	xdata := make([]int32, 5*xcols)
+	xdata := make([]int32, 6*xcols)
 	for i, xref := range xrefs {
 		xdata[i] = int32(xref.tag)
 		xdata[xcols+i] = int32(xref.variant)
@@ -1675,6 +1724,7 @@ func onehotXref2int32(xrefs []onehotXref) []int32 {
 		}
 		xdata[xcols*3+i] = int32(xref.pvalue * 1000000)
 		xdata[xcols*4+i] = int32(-math.Log10(xref.pvalue) * 1000000)
+		xdata[xcols*5+i] = int32(xref.maf * 1000000)
 	}
 	return xdata
 }