Merge branch '19524-pca'

author Tom Clegg <tom@curii.com>

Mon, 31 Oct 2022 15:53:26 +0000 (11:53 -0400)

committer Tom Clegg <tom@curii.com>

Mon, 31 Oct 2022 15:53:26 +0000 (11:53 -0400)
author Tom Clegg <tom@curii.com>
Mon, 31 Oct 2022 15:53:26 +0000 (11:53 -0400)
committer Tom Clegg <tom@curii.com>
Mon, 31 Oct 2022 15:53:26 +0000 (11:53 -0400)
diff --git a/plot.go b/plot.go

index 9959b5067c09d42ee671162a8f04729702278ffe..98b7eece27ae48e5c15166ddae70c662f6c7cf3c 100644 (file)
--- a/plot.go
+++ b/plot.go
@@ -35,8 +35,10 @@ func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, s
         outputFilename := flags.String("o", "", "output `filename` (e.g., './plot.png')")
         sampleListFilename := flags.String("samples", "", "use second column of `samples.csv` as complete list of sample IDs")
         phenotypeFilename := flags.String("phenotype", "", "use `phenotype.csv` as id->phenotype mapping (column 0 is sample id)")
-       phenotypeCategoryColumn := flags.Int("phenotype-category-column", -1, "0-based column `index` of 2nd category in phenotype.csv file")
-       phenotypeColumn := flags.Int("phenotype-column", 1, "0-based column `index` of phenotype in phenotype.csv file")
+       cat1Column := flags.Int("phenotype-cat1-column", 1, "0-based column `index` of 1st category in phenotype.csv file")
+       cat2Column := flags.Int("phenotype-cat2-column", -1, "0-based column `index` of 2nd category in phenotype.csv file")
+       xComponent := flags.Int("x", 1, "1-based PCA component to plot on x axis")
+       yComponent := flags.Int("y", 2, "1-based PCA component to plot on y axis")
         priority := flags.Int("priority", 500, "container request priority")
         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
         err = flags.Parse(args)
@@ -68,7 +70,16 @@ func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, s
                 }
                 *outputFilename = "/mnt/output/plot.png"
         }
-       args = []string{*inputFilename, *sampleListFilename, *phenotypeFilename, fmt.Sprintf("%d", *phenotypeCategoryColumn), fmt.Sprintf("%d", *phenotypeColumn), *outputFilename}
+       args = []string{
+               *inputFilename,
+               fmt.Sprintf("%d", *xComponent),
+               fmt.Sprintf("%d", *yComponent),
+               *sampleListFilename,
+               *phenotypeFilename,
+               fmt.Sprintf("%d", *cat1Column),
+               fmt.Sprintf("%d", *cat2Column),
+               *outputFilename,
+       }
         if *runlocal {
                 if *outputFilename == "" {
                         fmt.Fprintln(stderr, "error: must specify -o filename.png in local mode (or try -help)")
diff --git a/plot.py b/plot.py

index cd5f0707a9d2334f6791d35f6ae237a997ef084a..da84cc0df242cbc903031a6bae37fbf86c2978fe 100644 (file)
--- a/plot.py
+++ b/plot.py
@@ -9,36 +9,46 @@ import os.path
  import scipy
  import sys
  
-infile = sys.argv[1]
-X = numpy.load(infile)
+(_,
+ input_path,
+ x_component,
+ y_component,
+ samples_file,
+ phenotype_path,
+ phenotype_cat1_column,
+ phenotype_cat2_column,
+ output_path,
+ ) = sys.argv
+X = numpy.load(input_path)
  
  colors = None
  category = {}
  samples = []
-if sys.argv[2]:
+if samples_file:
      labels = {}
-    with open(sys.argv[2], 'rt', newline='') as samplelist:
+    with open(samples_file, 'rt', newline='') as samplelist:
          for row in csv.reader(samplelist):
              sampleid = row[1]
              samples.append(sampleid)
-    phenotype_category_column = int(sys.argv[4])
-    phenotype_column = int(sys.argv[5])
-    if os.path.isdir(sys.argv[3]):
-        phenotype_files = os.scandir(sys.argv[3])
+    phenotype_cat2_column = int(phenotype_cat2_column)
+    phenotype_cat1_column = int(phenotype_cat1_column)
+    if os.path.isdir(phenotype_path):
+        phenotype_files = os.scandir(phenotype_path)
      else:
-        phenotype_files = [sys.argv[3]]
+        phenotype_files = [phenotype_path]
      for phenotype_file in phenotype_files:
          with open(phenotype_file, 'rt', newline='') as phenotype:
              dialect = csv.Sniffer().sniff(phenotype.read(1024))
              phenotype.seek(0)
              for row in csv.reader(phenotype, dialect):
                  tag = row[0]
-                label = row[phenotype_column]
+                label = row[phenotype_cat1_column]
                  for sampleid in samples:
                      if tag in sampleid:
                          labels[sampleid] = label
-                        if phenotype_category_column >= 0 and row[phenotype_category_column] != '0':
+                        if phenotype_cat2_column >= 0 and row[phenotype_cat2_column] != '0':
                              category[sampleid] = True
+    unknown_color = 'grey'
      colors = []
      labelcolors = {
          'PUR': 'firebrick',
@@ -51,7 +61,7 @@ if sys.argv[2]:
          'CEU': 'green',
          'GBR': 'green',
          'FIN': 'green',
-        '2': 'green',
+        '5': 'green',
          'LWK': 'coral',
          'MSL': 'coral',
          'GWD': 'coral',
@@ -59,26 +69,25 @@ if sys.argv[2]:
          'ESN': 'coral',
          'ACB': 'coral',
          'ASW': 'coral',
-        '3': 'coral',
+        '4': 'coral',
          'KHV': 'royalblue',
          'CDX': 'royalblue',
          'CHS': 'royalblue',
          'CHB': 'royalblue',
          'JPT': 'royalblue',
-        '4': 'royalblue',
+        '2': 'royalblue',
          'STU': 'blueviolet',
          'ITU': 'blueviolet',
          'BEB': 'blueviolet',
          'GIH': 'blueviolet',
          'PJL': 'blueviolet',
-        '5': 'blueviolet',
-        '6': 'black',           # unknown?
+        '3': 'navy',
      }
      for sampleid in samples:
          if (sampleid in labels) and (labels[sampleid] in labelcolors):
              colors.append(labelcolors[labels[sampleid]])
          else:
-            colors.append('black')
+            colors.append(unknown_color)
  
  from matplotlib.figure import Figure
  from matplotlib.patches import Polygon
@@ -90,17 +99,19 @@ for marker in ['o', 'x']:
      y = []
      if samples:
          c = []
-        for i, sampleid in enumerate(samples):
-            if category.get(sampleid, False) == (marker == 'x'):
-                x.append(X[i,0])
-                y.append(X[i,1])
-                c.append(colors[i])
+        for unknownfirst in [True, False]:
+            for i, sampleid in enumerate(samples):
+                if ((colors[i] == unknown_color) == unknownfirst and
+                    category.get(sampleid, False) == (marker == 'x')):
+                    x.append(X[i,int(x_component)-1])
+                    y.append(X[i,int(y_component)-1])
+                    c.append(colors[i])
      elif marker == 'x':
          continue
      else:
-        x = X[:,0]
-        y = X[:,1]
+        x = X[:,int(x_component)-1]
+        y = X[:,int(y_component)-1]
          c = None
      ax.scatter(x, y, c=c, s=60, marker=marker, alpha=0.5)
  canvas = FigureCanvasAgg(fig)
-canvas.print_figure(sys.argv[6], dpi=80)
+canvas.print_figure(output_path, dpi=80)
diff --git a/slicenumpy.go b/slicenumpy.go

index 41f9f5eb43fb2af737fcc4a64a64679f64b54094..bef164d72071092b0f07b3ec233e91806d3e59b5 100644 (file)
--- a/slicenumpy.go
+++ b/slicenumpy.go
@@ -64,6 +64,8 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
         flags.SetOutput(stderr)
         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       arvadosRAM := flags.Int("arvados-ram", 750000000000, "amount of memory to request for arvados container (`bytes`)")
+       arvadosVCPUs := flags.Int("arvados-vcpus", 96, "number of VCPUs to request for arvados container")
         projectUUID := flags.String("project", "", "project `UUID` for output data")
         priority := flags.Int("priority", 500, "container request priority")
         inputDir := flags.String("input-dir", "./in", "input `directory`")
@@ -78,8 +80,9 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
         onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk")
         onlyPCA := flags.Bool("pca", false, "generate pca matrix")
         pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
+       maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
         debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
-       flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads")
+       flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
         flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)")
         flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)")
         flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test and omit columns with p-value above this threshold")
@@ -109,8 +112,8 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                         Name:        "lightning slice-numpy",
                         Client:      arvados.NewClientFromEnv(),
                         ProjectUUID: *projectUUID,
-                       RAM:         750000000000,
-                       VCPUs:       96,
+                       RAM:         int64(*arvadosRAM),
+                       VCPUs:       *arvadosVCPUs,
                         Priority:    *priority,
                         KeepCache:   2,
                         APIAccess:   true,
@@ -133,6 +136,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                         "-chunked-onehot=" + fmt.Sprintf("%v", *onehotChunked),
                         "-pca=" + fmt.Sprintf("%v", *onlyPCA),
                         "-pca-components=" + fmt.Sprintf("%d", *pcaComponents),
+                       "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
                         "-chi2-case-control-file=" + cmd.chi2CaseControlFile,
                         "-chi2-case-control-column=" + cmd.chi2CaseControlColumn,
                         "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
@@ -1118,10 +1122,18 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout,
                         if cols == 0 {
                                 return fmt.Errorf("cannot do PCA: one-hot matrix is empty")
                         }
-                       log.Printf("creating matrix: %d rows, %d cols", len(cmd.cgnames), cols)
+                       log.Printf("have %d one-hot cols", cols)
+                       stride := 1
+                       for *maxPCATiles > 0 && cols > *maxPCATiles*2 {
+                               cols = (cols + 1) / 2
+                               stride = stride * 2
+                       }
+                       log.Printf("creating matrix: %d rows, %d cols, stride %d", len(cmd.cgnames), cols, stride)
                         mtx := mat.NewDense(len(cmd.cgnames), cols, nil)
                         for i, c := range onehot[nzCount:] {
-                               mtx.Set(int(onehot[i]), int(c), 1)
+                               if int(c/2)%stride == 0 {
+                                       mtx.Set(int(onehot[i]), int(c/2)/stride*2+int(c)%2, 1)
+                               }
                         }
                         log.Print("fitting")
                         transformer := nlp.NewPCA(*pcaComponents)
author	Tom Clegg <tom@curii.com>
	Mon, 31 Oct 2022 15:53:26 +0000 (11:53 -0400)
committer	Tom Clegg <tom@curii.com>
	Mon, 31 Oct 2022 15:53:26 +0000 (11:53 -0400)
plot.go		patch \| blob \| history
plot.py		patch \| blob \| history
slicenumpy.go		patch \| blob \| history