Fix validation=0 in samples.csv (should be empty for non-c/c).
[lightning.git] / slicenumpy.go
index 1dbe328686f7d84deca7c9f6fe1e0a21678f0251..1c64744134ca1122173a2952a0754cddce8088a5 100644 (file)
@@ -8,6 +8,7 @@ import (
        "bufio"
        "bytes"
        "encoding/gob"
+       "encoding/json"
        "errors"
        "flag"
        "fmt"
@@ -53,6 +54,7 @@ type sliceNumpy struct {
        trainingSet     []int // samples index => training set index, or -1 if not in training set
        trainingSetSize int
        pvalue          func(onehot []bool) float64
+       pvalueCallCount int64
 }
 
 func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -73,6 +75,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
        arvadosVCPUs := flags.Int("arvados-vcpus", 96, "number of VCPUs to request for arvados container")
        projectUUID := flags.String("project", "", "project `UUID` for output data")
        priority := flags.Int("priority", 500, "container request priority")
+       preemptible := flags.Bool("preemptible", true, "request preemptible instance")
        inputDir := flags.String("input-dir", "./in", "input `directory`")
        outputDir := flags.String("output-dir", "./out", "output `directory`")
        ref := flags.String("ref", "", "reference name (if blank, choose last one that appears in input)")
@@ -124,6 +127,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        Priority:    *priority,
                        KeepCache:   2,
                        APIAccess:   true,
+                       Preemptible: *preemptible,
                }
                err = runner.TranslatePaths(inputDir, regionsFilename, samplesFilename)
                if err != nil {
@@ -188,9 +192,6 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                if err != nil {
                        return err
                }
-               if len(cmd.samples[0].pcaComponents) > 0 {
-                       cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
-               }
        } else if *caseControlOnly {
                return fmt.Errorf("-case-control-only does not make sense without -samples")
        }
@@ -284,8 +285,10 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                cmd.trainingSet[i] = -1
                        }
                }
-               cmd.pvalue = func(onehot []bool) float64 {
-                       return pvalue(onehot, cmd.chi2Cases)
+               if cmd.pvalue == nil {
+                       cmd.pvalue = func(onehot []bool) float64 {
+                               return pvalue(onehot, cmd.chi2Cases)
+                       }
                }
        }
        if cmd.filter.MinCoverage == 1 {
@@ -298,6 +301,28 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames))))
        }
 
+       if len(cmd.samples[0].pcaComponents) > 0 {
+               cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
+               // Unfortunately, statsmodel/glm lib logs stuff to
+               // os.Stdout when it panics on an unsolvable
+               // problem. We recover() from the panic in glm.go, but
+               // we also need to commandeer os.Stdout to avoid
+               // producing large quantities of logs.
+               stdoutWas := os.Stdout
+               defer func() { os.Stdout = stdoutWas }()
+               os.Stdout, err = os.Open(os.DevNull)
+               if err != nil {
+                       return err
+               }
+       }
+
+       // cgnamemap[name]==true for samples that we are including in
+       // output
+       cgnamemap := map[string]bool{}
+       for _, name := range cmd.cgnames {
+               cgnamemap[name] = true
+       }
+
        {
                samplesOutFilename := *outputDir + "/samples.csv"
                log.Infof("writing sample metadata to %s", samplesOutFilename)
@@ -316,7 +341,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        }
                        if si.isTraining {
                                tv = "1"
-                       } else {
+                       } else if si.isValidation {
                                tv = "0"
                        }
                        _, err = fmt.Fprintf(f, "%d,%s,%s,%s\n", i, si.id, cc, tv)
@@ -514,7 +539,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                        if cmd.filter.MaxTag >= 0 && cg.StartTag > tagID(cmd.filter.MaxTag) {
                                                return errSkip
                                        }
-                                       if !matchGenome.MatchString(cg.Name) {
+                                       if !cgnamemap[cg.Name] {
                                                continue
                                        }
                                        // pad to full slice size
@@ -1181,6 +1206,17 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        if err != nil {
                                return err
                        }
+                       fnm = fmt.Sprintf("%s/stats.json", *outputDir)
+                       j, err := json.Marshal(map[string]interface{}{
+                               "pvalueCallCount": cmd.pvalueCallCount,
+                       })
+                       if err != nil {
+                               return err
+                       }
+                       err = os.WriteFile(fnm, j, 0777)
+                       if err != nil {
+                               return err
+                       }
                }
                if *onlyPCA {
                        cols := 0
@@ -1260,6 +1296,14 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                return err
                        }
                        defer f.Close()
+                       pcaLabels := ""
+                       for i := 0; i < outcols; i++ {
+                               pcaLabels += fmt.Sprintf(",PCA%d", i)
+                       }
+                       _, err = fmt.Fprintf(f, "Index,SampleID,CaseControl,TrainingValidation%s\n", pcaLabels)
+                       if err != nil {
+                               return err
+                       }
                        for i, si := range cmd.samples {
                                var cc, tv string
                                if si.isCase {
@@ -1269,7 +1313,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                }
                                if si.isTraining {
                                        tv = "1"
-                               } else {
+                               } else if si.isValidation {
                                        tv = "0"
                                }
                                var pcavals string
@@ -1609,6 +1653,7 @@ func (cmd *sliceNumpy) tv2homhet(cgs map[string]CompactGenome, maxv tileVariantI
                if col < 4 && !cmd.includeVariant1 {
                        continue
                }
+               atomic.AddInt64(&cmd.pvalueCallCount, 1)
                p := cmd.pvalue(obs[col])
                if cmd.chi2PValue < 1 && !(p < cmd.chi2PValue) {
                        continue