Fix validation=0 in samples.csv (TrainingValidation should be left empty for samples that are not in a case/control group, not written as 0).
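To illustrate the intended behavior, here is a minimal standalone sketch (made-up sample IDs, and assuming CaseControl is written as 1 for cases and 0 for controls, which is not shown in the hunks below): TrainingValidation becomes 1 for training samples, 0 for validation samples, and blank for everything else, instead of defaulting to 0.

package main

import "fmt"

// sampleInfo here is a cut-down stand-in for the real struct; the
// sample data below is entirely hypothetical.
type sampleInfo struct {
	id                       string
	isCase, isControl        bool
	isTraining, isValidation bool
}

func main() {
	samples := []sampleInfo{
		{id: "sampleA", isCase: true, isTraining: true},
		{id: "sampleB", isControl: true, isValidation: true},
		{id: "sampleC"}, // not in a case/control group
	}
	fmt.Println("Index,SampleID,CaseControl,TrainingValidation")
	for i, si := range samples {
		var cc, tv string
		if si.isCase {
			cc = "1"
		} else if si.isControl {
			cc = "0"
		}
		if si.isTraining {
			tv = "1"
		} else if si.isValidation { // previously a bare "else", which wrote 0 for every non-training sample
			tv = "0"
		}
		fmt.Printf("%d,%s,%s,%s\n", i, si.id, cc, tv)
	}
}

With the fix, the sample outside the case/control groups prints as "2,sampleC,," (both columns blank), where the old code printed "2,sampleC,,0".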
[lightning.git] / slicenumpy.go
index 25fc466c116c4b9cc0a9508378ff33db91247c0b..1c64744134ca1122173a2952a0754cddce8088a5 100644 (file)
@@ -8,6 +8,7 @@ import (
        "bufio"
        "bytes"
        "encoding/gob"
+       "encoding/json"
        "errors"
        "flag"
        "fmt"
@@ -43,6 +44,7 @@ type sliceNumpy struct {
        threads         int
        chi2Cases       []bool
        chi2PValue      float64
+       pcaComponents   int
        minCoverage     int
        includeVariant1 bool
        debugTag        tagID
@@ -51,6 +53,8 @@ type sliceNumpy struct {
        samples         []sampleInfo
        trainingSet     []int // samples index => training set index, or -1 if not in training set
        trainingSetSize int
+       pvalue          func(onehot []bool) float64
+       pvalueCallCount int64
 }
 
 func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -71,6 +75,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
        arvadosVCPUs := flags.Int("arvados-vcpus", 96, "number of VCPUs to request for arvados container")
        projectUUID := flags.String("project", "", "project `UUID` for output data")
        priority := flags.Int("priority", 500, "container request priority")
+       preemptible := flags.Bool("preemptible", true, "request preemptible instance")
        inputDir := flags.String("input-dir", "./in", "input `directory`")
        outputDir := flags.String("output-dir", "./out", "output `directory`")
        ref := flags.String("ref", "", "reference name (if blank, choose last one that appears in input)")
@@ -84,7 +89,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
        samplesFilename := flags.String("samples", "", "`samples.csv` file with training/validation and case/control groups (see 'lightning choose-samples')")
        caseControlOnly := flags.Bool("case-control-only", false, "drop samples that are not in case/control groups")
        onlyPCA := flags.Bool("pca", false, "run principal component analysis, write components to pca.npy and samples.csv")
-       pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
+       flags.IntVar(&cmd.pcaComponents, "pca-components", 4, "number of PCA components to compute / use in logistic regression")
        maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd column pair until below max)")
        debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
        flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
@@ -122,6 +127,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        Priority:    *priority,
                        KeepCache:   2,
                        APIAccess:   true,
+                       Preemptible: *preemptible,
                }
                err = runner.TranslatePaths(inputDir, regionsFilename, samplesFilename)
                if err != nil {
@@ -142,7 +148,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        "-samples=" + *samplesFilename,
                        "-case-control-only=" + fmt.Sprintf("%v", *caseControlOnly),
                        "-pca=" + fmt.Sprintf("%v", *onlyPCA),
-                       "-pca-components=" + fmt.Sprintf("%d", *pcaComponents),
+                       "-pca-components=" + fmt.Sprintf("%d", cmd.pcaComponents),
                        "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
                        "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
                        "-include-variant-1=" + fmt.Sprintf("%v", cmd.includeVariant1),
@@ -182,7 +188,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
        }
 
        if *samplesFilename != "" {
-               cmd.samples, err = cmd.loadSampleInfo(*samplesFilename)
+               cmd.samples, err = loadSampleInfo(*samplesFilename)
                if err != nil {
                        return err
                }
@@ -279,6 +285,11 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                cmd.trainingSet[i] = -1
                        }
                }
+               if cmd.pvalue == nil {
+                       cmd.pvalue = func(onehot []bool) float64 {
+                               return pvalue(onehot, cmd.chi2Cases)
+                       }
+               }
        }
        if cmd.filter.MinCoverage == 1 {
                // In the generic formula below, floating point
@@ -290,6 +301,28 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames))))
        }
 
+       if len(cmd.samples[0].pcaComponents) > 0 {
+               cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
+               // Unfortunately, statsmodel/glm lib logs stuff to
+               // os.Stdout when it panics on an unsolvable
+               // problem. We recover() from the panic in glm.go, but
+               // we also need to commandeer os.Stdout to avoid
+               // producing large quantities of logs.
+               stdoutWas := os.Stdout
+               defer func() { os.Stdout = stdoutWas }()
+               os.Stdout, err = os.Open(os.DevNull)
+               if err != nil {
+                       return err
+               }
+       }
+
+       // cgnamemap[name]==true for samples that we are including in
+       // output
+       cgnamemap := map[string]bool{}
+       for _, name := range cmd.cgnames {
+               cgnamemap[name] = true
+       }
+
        {
                samplesOutFilename := *outputDir + "/samples.csv"
                log.Infof("writing sample metadata to %s", samplesOutFilename)
@@ -308,7 +341,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        }
                        if si.isTraining {
                                tv = "1"
-                       } else {
+                       } else if si.isValidation {
                                tv = "0"
                        }
                        _, err = fmt.Fprintf(f, "%d,%s,%s,%s\n", i, si.id, cc, tv)
@@ -506,7 +539,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                        if cmd.filter.MaxTag >= 0 && cg.StartTag > tagID(cmd.filter.MaxTag) {
                                                return errSkip
                                        }
-                                       if !matchGenome.MatchString(cg.Name) {
+                                       if !cgnamemap[cg.Name] {
                                                continue
                                        }
                                        // pad to full slice size
@@ -1173,6 +1206,17 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        if err != nil {
                                return err
                        }
+                       fnm = fmt.Sprintf("%s/stats.json", *outputDir)
+                       j, err := json.Marshal(map[string]interface{}{
+                               "pvalueCallCount": cmd.pvalueCallCount,
+                       })
+                       if err != nil {
+                               return err
+                       }
+                       err = os.WriteFile(fnm, j, 0777)
+                       if err != nil {
+                               return err
+                       }
                }
                if *onlyPCA {
                        cols := 0
@@ -1207,7 +1251,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                }
                        }
                        log.Print("fitting")
-                       transformer := nlp.NewPCA(*pcaComponents)
+                       transformer := nlp.NewPCA(cmd.pcaComponents)
                        transformer.Fit(mtxTrain.T())
                        log.Printf("transforming")
                        pca, err := transformer.Transform(mtxFull.T())
@@ -1252,6 +1296,14 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                return err
                        }
                        defer f.Close()
+                       pcaLabels := ""
+                       for i := 0; i < outcols; i++ {
+                               pcaLabels += fmt.Sprintf(",PCA%d", i)
+                       }
+                       _, err = fmt.Fprintf(f, "Index,SampleID,CaseControl,TrainingValidation%s\n", pcaLabels)
+                       if err != nil {
+                               return err
+                       }
                        for i, si := range cmd.samples {
                                var cc, tv string
                                if si.isCase {
@@ -1261,7 +1313,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                }
                                if si.isTraining {
                                        tv = "1"
-                               } else {
+                               } else if si.isValidation {
                                        tv = "0"
                                }
                                var pcavals string
@@ -1319,7 +1371,7 @@ type sampleInfo struct {
 
 // Read samples.csv file with case/control and training/validation
 // flags.
-func (cmd *sliceNumpy) loadSampleInfo(samplesFilename string) ([]sampleInfo, error) {
+func loadSampleInfo(samplesFilename string) ([]sampleInfo, error) {
        var si []sampleInfo
        f, err := open(samplesFilename)
        if err != nil {
@@ -1601,12 +1653,8 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
                if col < 4 && !cmd.includeVariant1 {
                        continue
                }
-               var p float64
-               if len(cmd.samples[0].pcaComponents) > 0 {
-                       p = pvalueGLM(cmd.samples, obs[col])
-               } else {
-                       p = pvalue(obs[col], cmd.chi2Cases)
-               }
+               atomic.AddInt64(&cmd.pvalueCallCount, 1)
+               p := cmd.pvalue(obs[col])
                if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
                        continue
                }