19524: Use marker shape to indicate second category variable.

author Tom Clegg <tom@curii.com>

Thu, 13 Oct 2022 18:46:46 +0000 (14:46 -0400)

committer Tom Clegg <tom@curii.com>

Fri, 14 Oct 2022 13:20:36 +0000 (09:20 -0400)
author Tom Clegg <tom@curii.com>
Thu, 13 Oct 2022 18:46:46 +0000 (14:46 -0400)
committer Tom Clegg <tom@curii.com>
Fri, 14 Oct 2022 13:20:36 +0000 (09:20 -0400)
diff --git a/plot.go b/plot.go

index 51f5c8236c27e62a2134f706999ee8752a4ce731..9959b5067c09d42ee671162a8f04729702278ffe 100644 (file)
--- a/plot.go
+++ b/plot.go
@@ -10,6 +10,8 @@ import (
         "fmt"
         "io"
         _ "net/http/pprof"
+       "os/exec"
+       "strings"
  
         "git.arvados.org/arvados.git/sdk/go/arvados"
  )
@@ -30,10 +32,13 @@ func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, s
         flags.SetOutput(stderr)
         projectUUID := flags.String("project", "", "project `UUID` for output data")
         inputFilename := flags.String("i", "-", "input `file`")
+       outputFilename := flags.String("o", "", "output `filename` (e.g., './plot.png')")
         sampleListFilename := flags.String("samples", "", "use second column of `samples.csv` as complete list of sample IDs")
         phenotypeFilename := flags.String("phenotype", "", "use `phenotype.csv` as id->phenotype mapping (column 0 is sample id)")
+       phenotypeCategoryColumn := flags.Int("phenotype-category-column", -1, "0-based column `index` of 2nd category in phenotype.csv file")
         phenotypeColumn := flags.Int("phenotype-column", 1, "0-based column `index` of phenotype in phenotype.csv file")
         priority := flags.Int("priority", 500, "container request priority")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
         err = flags.Parse(args)
         if err == flag.ErrHelp {
                 err = nil
@@ -56,12 +61,31 @@ func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, s
                         },
                 },
         }
-       err = runner.TranslatePaths(inputFilename, sampleListFilename, phenotypeFilename)
-       if err != nil {
-               return 1
+       if !*runlocal {
+               err = runner.TranslatePaths(inputFilename, sampleListFilename, phenotypeFilename)
+               if err != nil {
+                       return 1
+               }
+               *outputFilename = "/mnt/output/plot.png"
+       }
+       args = []string{*inputFilename, *sampleListFilename, *phenotypeFilename, fmt.Sprintf("%d", *phenotypeCategoryColumn), fmt.Sprintf("%d", *phenotypeColumn), *outputFilename}
+       if *runlocal {
+               if *outputFilename == "" {
+                       fmt.Fprintln(stderr, "error: must specify -o filename.png in local mode (or try -help)")
+                       return 1
+               }
+               cmd := exec.Command("python3", append([]string{"-"}, args...)...)
+               cmd.Stdin = strings.NewReader(plotscript)
+               cmd.Stdout = stdout
+               cmd.Stderr = stderr
+               err = cmd.Run()
+               if err != nil {
+                       return 1
+               }
+               return 0
         }
         runner.Prog = "python3"
-       runner.Args = []string{"/plot.py", *inputFilename, *sampleListFilename, *phenotypeFilename, fmt.Sprintf("%d", *phenotypeColumn), "/mnt/output/plot.png"}
+       runner.Args = append([]string{"/plot.py"}, args...)
         var output string
         output, err = runner.Run()
         if err != nil {
diff --git a/plot.py b/plot.py

index c88ed719b260f5a41e9436c2b505a8eefda210b2..cd5f0707a9d2334f6791d35f6ae237a997ef084a 100644 (file)
--- a/plot.py
+++ b/plot.py
@@ -13,14 +13,16 @@ infile = sys.argv[1]
  X = numpy.load(infile)
  
  colors = None
+category = {}
+samples = []
  if sys.argv[2]:
-    samples = []
      labels = {}
      with open(sys.argv[2], 'rt', newline='') as samplelist:
          for row in csv.reader(samplelist):
              sampleid = row[1]
              samples.append(sampleid)
-    phenotype_column = int(sys.argv[4])
+    phenotype_category_column = int(sys.argv[4])
+    phenotype_column = int(sys.argv[5])
      if os.path.isdir(sys.argv[3]):
          phenotype_files = os.scandir(sys.argv[3])
      else:
@@ -35,6 +37,8 @@ if sys.argv[2]:
                  for sampleid in samples:
                      if tag in sampleid:
                          labels[sampleid] = label
+                        if phenotype_category_column >= 0 and row[phenotype_category_column] != '0':
+                            category[sampleid] = True
      colors = []
      labelcolors = {
          'PUR': 'firebrick',
@@ -68,7 +72,7 @@ if sys.argv[2]:
          'GIH': 'blueviolet',
          'PJL': 'blueviolet',
          '5': 'blueviolet',
-        '6': 'navy',
+        '6': 'black',           # unknown?
      }
      for sampleid in samples:
          if (sampleid in labels) and (labels[sampleid] in labelcolors):
@@ -81,6 +85,22 @@ from matplotlib.patches import Polygon
  from matplotlib.backends.backend_agg import FigureCanvasAgg
  fig = Figure()
  ax = fig.add_subplot(111)
-ax.scatter(X[:,0], X[:,1], c=colors, s=60, marker='o', alpha=0.5)
+for marker in ['o', 'x']:
+    x = []
+    y = []
+    if samples:
+        c = []
+        for i, sampleid in enumerate(samples):
+            if category.get(sampleid, False) == (marker == 'x'):
+                x.append(X[i,0])
+                y.append(X[i,1])
+                c.append(colors[i])
+    elif marker == 'x':
+        continue
+    else:
+        x = X[:,0]
+        y = X[:,1]
+        c = None
+    ax.scatter(x, y, c=c, s=60, marker=marker, alpha=0.5)
  canvas = FigureCanvasAgg(fig)
-canvas.print_figure(sys.argv[5], dpi=80)
+canvas.print_figure(sys.argv[6], dpi=80)
author	Tom Clegg <tom@curii.com>
	Thu, 13 Oct 2022 18:46:46 +0000 (14:46 -0400)
committer	Tom Clegg <tom@curii.com>
	Fri, 14 Oct 2022 13:20:36 +0000 (09:20 -0400)