X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/0f214db02343c250f7a245966e31f19b39043938..ed6e40e98ebd142cfc761a4c0fddc451875896bc:/slicenumpy.go?ds=sidebyside

diff --git a/slicenumpy.go b/slicenumpy.go
index 25fc466c11..1c64744134 100644
--- a/slicenumpy.go
+++ b/slicenumpy.go
@@ -8,6 +8,7 @@ import (
 	"bufio"
 	"bytes"
 	"encoding/gob"
+	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
@@ -43,6 +44,7 @@ type sliceNumpy struct {
 	threads         int
 	chi2Cases       []bool
 	chi2PValue      float64
+	pcaComponents   int
 	minCoverage     int
 	includeVariant1 bool
 	debugTag        tagID
@@ -51,6 +53,8 @@ type sliceNumpy struct {
 	samples         []sampleInfo
 	trainingSet     []int // samples index => training set index, or -1 if not in training set
 	trainingSetSize int
+	pvalue          func(onehot []bool) float64
+	pvalueCallCount int64
 }
 
 func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -71,6 +75,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 	arvadosVCPUs := flags.Int("arvados-vcpus", 96, "number of VCPUs to request for arvados container")
 	projectUUID := flags.String("project", "", "project `UUID` for output data")
 	priority := flags.Int("priority", 500, "container request priority")
+	preemptible := flags.Bool("preemptible", true, "request preemptible instance")
 	inputDir := flags.String("input-dir", "./in", "input `directory`")
 	outputDir := flags.String("output-dir", "./out", "output `directory`")
 	ref := flags.String("ref", "", "reference name (if blank, choose last one that appears in input)")
@@ -84,7 +89,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 	samplesFilename := flags.String("samples", "", "`samples.csv` file with training/validation and case/control groups (see 'lightning choose-samples')")
 	caseControlOnly := flags.Bool("case-control-only", false, "drop samples that are not in case/control groups")
 	onlyPCA := flags.Bool("pca", false, "run principal component analysis, write components to pca.npy and samples.csv")
-	pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
+	flags.IntVar(&cmd.pcaComponents, "pca-components", 4, "number of PCA components to compute / use in logistic regression")
 	maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
 	debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
 	flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
@@ -122,6 +127,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			Priority:    *priority,
 			KeepCache:   2,
 			APIAccess:   true,
+			Preemptible: *preemptible,
 		}
 		err = runner.TranslatePaths(inputDir, regionsFilename, samplesFilename)
 		if err != nil {
@@ -142,7 +148,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			"-samples=" + *samplesFilename,
 			"-case-control-only=" + fmt.Sprintf("%v", *caseControlOnly),
 			"-pca=" + fmt.Sprintf("%v", *onlyPCA),
-			"-pca-components=" + fmt.Sprintf("%d", *pcaComponents),
+			"-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
 			"-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
 			"-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
 			"-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
@@ -182,7 +188,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 	}
 
 	if *samplesFilename != "" {
-		cmd.samples, err = cmd.loadSampleInfo(*samplesFilename)
+		cmd.samples, err = loadSampleInfo(*samplesFilename)
 		if err != nil {
 			return err
 		}
@@ -279,6 +285,11 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 				cmd.trainingSet[i] = -1
 			}
 		}
+		if cmd.pvalue == nil {
+			cmd.pvalue = func(onehot []bool) float64 {
+				return pvalue(onehot, cmd.chi2Cases)
+			}
+		}
 	}
 	if cmd.filter.MinCoverage == 1 {
 		// In the generic formula below, floating point
@@ -290,6 +301,28 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 		cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames))))
 	}
 
+	if len(cmd.samples[0].pcaComponents) > 0 {
+		cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
+		// Unfortunately, statsmodel/glm lib logs stuff to
+		// os.Stdout when it panics on an unsolvable
+		// problem. We recover() from the panic in glm.go, but
+		// we also need to commandeer os.Stdout to avoid
+		// producing large quantities of logs.
+		stdoutWas := os.Stdout
+		defer func() { os.Stdout = stdoutWas }()
+		os.Stdout, err = os.Open(os.DevNull)
+		if err != nil {
+			return err
+		}
+	}
+
+	// cgnamemap[name]==true for samples that we are including in
+	// output
+	cgnamemap := map[string]bool{}
+	for _, name := range cmd.cgnames {
+		cgnamemap[name] = true
+	}
+
 	{
 		samplesOutFilename := *outputDir + "/samples.csv"
 		log.Infof("writing sample metadata to %s", samplesOutFilename)
@@ -308,7 +341,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			}
 			if si.isTraining {
 				tv = "1"
-			} else {
+			} else if si.isValidation {
 				tv = "0"
 			}
 			_, err = fmt.Fprintf(f, "%d,%s,%s,%s\n", i, si.id, cc, tv)
@@ -506,7 +539,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 				if cmd.filter.MaxTag >= 0 && cg.StartTag > tagID(cmd.filter.MaxTag) {
 					return errSkip
 				}
-				if !matchGenome.MatchString(cg.Name) {
+				if !cgnamemap[cg.Name] {
 					continue
 				}
 				// pad to full slice size
@@ -1173,6 +1206,17 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 		if err != nil {
 			return err
 		}
+		fnm = fmt.Sprintf("%s/stats.json", *outputDir)
+		j, err := json.Marshal(map[string]interface{}{
+			"pvalueCallCount": cmd.pvalueCallCount,
+		})
+		if err != nil {
+			return err
+		}
+		err = os.WriteFile(fnm, j, 0777)
+		if err != nil {
+			return err
+		}
 	}
 	if *onlyPCA {
 		cols := 0
@@ -1207,7 +1251,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			}
 		}
 		log.Print("fitting")
-		transformer := nlp.NewPCA(*pcaComponents)
+		transformer := nlp.NewPCA(cmd.pcaComponents)
 		transformer.Fit(mtxTrain.T())
 		log.Printf("transforming")
 		pca, err := transformer.Transform(mtxFull.T())
@@ -1252,6 +1296,14 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			return err
 		}
 		defer f.Close()
+		pcaLabels := ""
+		for i := 0; i < outcols; i++ {
+			pcaLabels += fmt.Sprintf(",PCA%d", i)
+		}
+		_, err = fmt.Fprintf(f, "Index,SampleID,CaseControl,TrainingValidation%s\n", pcaLabels)
+		if err != nil {
+			return err
+		}
 		for i, si := range cmd.samples {
 			var cc, tv string
 			if si.isCase {
@@ -1261,7 +1313,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
 			}
 			if si.isTraining {
 				tv = "1"
-			} else {
+			} else if si.isValidation {
 				tv = "0"
 			}
 			var pcavals string
@@ -1319,7 +1371,7 @@ type sampleInfo struct {
 
 // Read samples.csv file with case/control and training/validation
 // flags.
-func (cmd *sliceNumpy) loadSampleInfo(samplesFilename string) ([]sampleInfo, error) {
+func loadSampleInfo(samplesFilename string) ([]sampleInfo, error) {
 	var si []sampleInfo
 	f, err := open(samplesFilename)
 	if err != nil {
@@ -1601,12 +1653,8 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
 		if col < 4 && !cmd.includeVariant1 {
 			continue
 		}
-		var p float64
-		if len(cmd.samples[0].pcaComponents) > 0 {
-			p = pvalueGLM(cmd.samples, obs[col])
-		} else {
-			p = pvalue(obs[col], cmd.chi2Cases)
-		}
+		atomic.AddInt64(&cmd.pvalueCallCount, 1)
+		p := cmd.pvalue(obs[col])
 		if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
 			continue
 		}
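
The core of this change is a pluggable p-value function: run() picks one implementation up front (a chi-square test against the case/control vector by default, glmPvalueFunc when per-sample PCA components are present), and tv2homhet only ever calls cmd.pvalue() while bumping an atomic call counter that is later written to stats.json. The standalone sketch below illustrates that pattern only; the type names and the trivial chi2Pvalue / glmPvalueFunc stand-ins are hypothetical placeholders, not lightning's actual implementations.

// Standalone sketch of the "pluggable p-value function" pattern in this
// diff: select the statistical test once at setup time, store it in a
// function-valued field, and keep the hot path down to one indirect call
// plus an atomic counter. The stand-in test functions return a fixed value.
package main

import (
	"fmt"
	"sync/atomic"
)

type sampleInfo struct {
	pcaComponents []float64
}

type cmdState struct {
	samples         []sampleInfo
	chi2Cases       []bool
	pvalue          func(onehot []bool) float64
	pvalueCallCount int64
}

// chi2Pvalue is a placeholder for lightning's pvalue(onehot, cases) helper.
func chi2Pvalue(onehot, cases []bool) float64 { return 0.5 }

// glmPvalueFunc is a placeholder for lightning's glmPvalueFunc(samples, n),
// which closes over the per-sample covariates and returns a per-column test.
func glmPvalueFunc(samples []sampleInfo, nComponents int) func(onehot []bool) float64 {
	return func(onehot []bool) float64 { return 0.5 }
}

func (cmd *cmdState) setup(nComponents int) {
	// Default: chi-square test against the case/control vector.
	if cmd.pvalue == nil {
		cmd.pvalue = func(onehot []bool) float64 {
			return chi2Pvalue(onehot, cmd.chi2Cases)
		}
	}
	// If samples carry PCA components, switch to the GLM-based test.
	if len(cmd.samples) > 0 && len(cmd.samples[0].pcaComponents) > 0 {
		cmd.pvalue = glmPvalueFunc(cmd.samples, nComponents)
	}
}

// scoreColumn mirrors the hot-path shape in tv2homhet: count the call, then
// apply whichever test was selected at setup time.
func (cmd *cmdState) scoreColumn(onehot []bool) float64 {
	atomic.AddInt64(&cmd.pvalueCallCount, 1)
	return cmd.pvalue(onehot)
}

func main() {
	cmd := &cmdState{
		samples:   []sampleInfo{{}, {}},
		chi2Cases: []bool{true, false},
	}
	cmd.setup(4)
	p := cmd.scoreColumn([]bool{true, false})
	fmt.Printf("p=%v calls=%d\n", p, cmd.pvalueCallCount)
}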