Fix validation=0 in samples.csv (should be empty for non-case/control samples).
[lightning.git] / slicenumpy.go
index bd0fc171f9613c74ce44539947bd970c677f3de2..1c64744134ca1122173a2952a0754cddce8088a5 100644 (file)
@@ -192,21 +192,6 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                if err != nil {
                        return err
                }
-               if len(cmd.samples[0].pcaComponents) > 0 {
-                       cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
-                       // Unfortunately, statsmodel/glm lib logs
-                       // stuff to os.Stdout when it panics on an
-                       // unsolvable problem. We recover() from the
-                       // panic in glm.go, but we also need to
-                       // commandeer os.Stdout to avoid producing
-                       // large quantities of logs.
-                       stdoutWas := os.Stdout
-                       defer func() { os.Stdout = stdoutWas }()
-                       os.Stdout, err = os.Open(os.DevNull)
-                       if err != nil {
-                               return err
-                       }
-               }
        } else if *caseControlOnly {
                return fmt.Errorf("-case-control-only does not make sense without -samples")
        }
@@ -316,6 +301,28 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames))))
        }
 
+       if len(cmd.samples[0].pcaComponents) > 0 {
+               cmd.pvalue = glmPvalueFunc(cmd.samples, cmd.pcaComponents)
+               // Unfortunately, statsmodel/glm lib logs stuff to
+               // os.Stdout when it panics on an unsolvable
+               // problem. We recover() from the panic in glm.go, but
+               // we also need to commandeer os.Stdout to avoid
+               // producing large quantities of logs.
+               stdoutWas := os.Stdout
+               defer func() { os.Stdout = stdoutWas }()
+               os.Stdout, err = os.Open(os.DevNull)
+               if err != nil {
+                       return err
+               }
+       }
+
+       // cgnamemap[name]==true for samples that we are including in
+       // output
+       cgnamemap := map[string]bool{}
+       for _, name := range cmd.cgnames {
+               cgnamemap[name] = true
+       }
+
        {
                samplesOutFilename := *outputDir + "/samples.csv"
                log.Infof("writing sample metadata to %s", samplesOutFilename)
@@ -334,7 +341,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                        }
                        if si.isTraining {
                                tv = "1"
-                       } else {
+                       } else if si.isValidation {
                                tv = "0"
                        }
                        _, err = fmt.Fprintf(f, "%d,%s,%s,%s\n", i, si.id, cc, tv)
@@ -532,7 +539,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                        if cmd.filter.MaxTag >= 0 && cg.StartTag > tagID(cmd.filter.MaxTag) {
                                                return errSkip
                                        }
-                                       if !matchGenome.MatchString(cg.Name) {
+                                       if !cgnamemap[cg.Name] {
                                                continue
                                        }
                                        // pad to full slice size
@@ -1289,6 +1296,14 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                return err
                        }
                        defer f.Close()
+                       pcaLabels := ""
+                       for i := 0; i < outcols; i++ {
+                               pcaLabels += fmt.Sprintf(",PCA%d", i)
+                       }
+                       _, err = fmt.Fprintf(f, "Index,SampleID,CaseControl,TrainingValidation%s\n", pcaLabels)
+                       if err != nil {
+                               return err
+                       }
                        for i, si := range cmd.samples {
                                var cc, tv string
                                if si.isCase {
@@ -1298,7 +1313,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                                }
                                if si.isTraining {
                                        tv = "1"
-                               } else {
+                               } else if si.isValidation {
                                        tv = "0"
                                }
                                var pcavals string