Add plot subcommand.
author Tom Clegg <tom@tomclegg.ca>
Wed, 4 Mar 2020 20:19:39 +0000 (15:19 -0500)
committer Tom Clegg <tom@tomclegg.ca>
Wed, 4 Mar 2020 20:19:39 +0000 (15:19 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

arvados.go
cmd.go
plot.go [new file with mode: 0644]

index 89d23c7e312f7fdec8a7fccc0cca04610a57b22e..c1b54d98236661136a5112672950c17ed6605807 100644 (file)
@@ -22,17 +22,14 @@ type arvadosContainerRunner struct {
        RAM         int64
        Prog        string // if empty, run /proc/self/exe
        Args        []string
-       Mounts      map[string]string
+       Mounts      map[string]map[string]interface{}
 }
 
-var (
-       collectionInPathRe = regexp.MustCompile(`^(.*/)?([0-9a-f]{32}\+[0-9]+|[0-9a-z]{5}-[0-9a-z]{5}-[0-9a-z]{15})(/.*)?$`)
-)
-
 func (runner *arvadosContainerRunner) Run() error {
        if runner.ProjectUUID == "" {
                return errors.New("cannot run arvados container: ProjectUUID not provided")
        }
+
        mounts := map[string]map[string]interface{}{
                "/mnt/output": {
                        "kind":     "tmp",
@@ -40,6 +37,9 @@ func (runner *arvadosContainerRunner) Run() error {
                        "capacity": 100000000000,
                },
        }
+       for path, mnt := range runner.Mounts {
+               mounts[path] = mnt
+       }
 
        prog := runner.Prog
        if prog == "" {
@@ -55,12 +55,6 @@ func (runner *arvadosContainerRunner) Run() error {
        }
        command := append([]string{prog}, runner.Args...)
 
-       for uuid, mnt := range runner.Mounts {
-               mounts[mnt] = map[string]interface{}{
-                       "kind": "collection",
-                       "uuid": uuid,
-               }
-       }
        rc := arvados.RuntimeConstraints{
                VCPUs:        runner.VCPUs,
                RAM:          runner.RAM,
@@ -85,9 +79,11 @@ func (runner *arvadosContainerRunner) Run() error {
        return err
 }
 
+// collectionInPathRe matches a path containing an identifier that is
+// either 32 lowercase hex digits followed by "+" and a decimal number,
+// or three lowercase-alphanumeric groups of 5, 5 and 15 characters
+// joined by "-". Submatch 1 is the optional prefix ending in "/",
+// submatch 2 is the identifier (TranslatePaths uses it as the
+// collection uuid), and submatch 3 is the optional remainder starting
+// with "/".
+var collectionInPathRe = regexp.MustCompile(`^(.*/)?([0-9a-f]{32}\+[0-9]+|[0-9a-z]{5}-[0-9a-z]{5}-[0-9a-z]{15})(/.*)?$`)
+
 func (runner *arvadosContainerRunner) TranslatePaths(paths ...*string) error {
        if runner.Mounts == nil {
-               runner.Mounts = make(map[string]string)
+               runner.Mounts = make(map[string]map[string]interface{})
        }
        for _, path := range paths {
                if *path == "" {
@@ -98,12 +94,15 @@ func (runner *arvadosContainerRunner) TranslatePaths(paths ...*string) error {
                        return fmt.Errorf("cannot find uuid in path: %q", *path)
                }
                uuid := m[2]
-               mnt, ok := runner.Mounts[uuid]
+               mnt, ok := runner.Mounts["/mnt/"+uuid]
                if !ok {
-                       mnt = "/mnt/" + uuid
-                       runner.Mounts[uuid] = mnt
+                       mnt = map[string]interface{}{
+                               "kind": "collection",
+                               "uuid": uuid,
+                       }
+                       runner.Mounts["/mnt/"+uuid] = mnt
                }
-               *path = mnt + m[3]
+               *path = "/mnt/" + uuid + m[3]
        }
        return nil
 }
diff --git a/cmd.go b/cmd.go
index 24f4d5160ec3146e67abc650f88e2b84cb116893..769829d12af9e28e158dce04386e6d4b81019618 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -21,6 +21,7 @@ var (
                "filter":             &filterer{},
                "build-docker-image": &buildDockerImage{},
                "pca":                &pythonPCA{},
+               "plot":               &pythonPlot{},
        })
 )
 
@@ -41,7 +42,7 @@ func (cmd *buildDockerImage) RunCommand(prog string, args []string, stdin io.Rea
 RUN DEBIAN_FRONTEND=noninteractive \
   apt-get update && \
   apt-get dist-upgrade -y && \
-  apt-get install -y --no-install-recommends bcftools samtools python3-sklearn && \
+  apt-get install -y --no-install-recommends bcftools samtools python3-sklearn python3-matplotlib && \
   apt-get clean
 `), 0644)
        if err != nil {
diff --git a/plot.go b/plot.go
new file mode 100644 (file)
index 0000000..6705f14
--- /dev/null
+++ b/plot.go
@@ -0,0 +1,125 @@
+package main
+
+import (
+       "flag"
+       "fmt"
+       "io"
+       _ "net/http/pprof"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+)
+
+// pythonPlot is the handler registered for the "plot" subcommand. It
+// carries no state; all of its work happens in RunCommand.
+type pythonPlot struct{}
+
+// RunCommand parses the plot subcommand's flags, rewrites the
+// input/labels/fasta paths to their in-container mount points, and
+// runs plotscript under python3 in an Arvados container, with the
+// script itself mounted as /plot.py and the image written to
+// /mnt/output/plot.png.
+//
+// The return value is the exit code: 0 on success or when help was
+// requested, 2 when the flags cannot be parsed, and 1 when path
+// translation or the container run fails. Any error still pending at
+// return is written to stderr.
+func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       // Report whatever error is still set when we return.
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       var (
+               projectUUID        = flags.String("project", "", "project `UUID` for output data")
+               inputFilename      = flags.String("i", "-", "input `file`")
+               sampleCSVFilename  = flags.String("labels-csv", "", "use first two columns of `labels.csv` as id->color mapping")
+               sampleFastaDirname = flags.String("sample-fasta-dir", "", "`directory` containing fasta input files")
+       )
+       err = flags.Parse(args)
+       switch {
+       case err == flag.ErrHelp:
+               // A help request is not a failure: clear err so the
+               // deferred reporter stays quiet.
+               err = nil
+               return 0
+       case err != nil:
+               return 2
+       }
+
+       scriptMount := map[string]interface{}{
+               "kind":    "text",
+               "content": plotscript,
+       }
+       runner := arvadosContainerRunner{
+               Name:        "lightning plot",
+               Client:      arvados.NewClientFromEnv(),
+               ProjectUUID: *projectUUID,
+               RAM:         1 << 30,
+               VCPUs:       1,
+               Mounts:      map[string]map[string]interface{}{"/plot.py": scriptMount},
+       }
+       if err = runner.TranslatePaths(inputFilename, sampleCSVFilename, sampleFastaDirname); err != nil {
+               return 1
+       }
+       // The arguments are built only after TranslatePaths has
+       // rewritten the path variables in place.
+       runner.Prog = "python3"
+       runner.Args = []string{"/plot.py", *inputFilename, *sampleCSVFilename, *sampleFastaDirname, "/mnt/output/plot.png"}
+       if err = runner.Run(); err != nil {
+               return 1
+       }
+       return 0
+}
+
+var plotscript = `
+import csv
+import os
+import scipy
+import sys
+
+infile = sys.argv[1]
+X = scipy.load(infile)
+
+colors = None
+if sys.argv[2]:
+    labels = {}
+    for fnm in os.listdir(sys.argv[3]):
+        if '.2.fasta' not in fnm:
+            labels[fnm] = '---'
+    if len(labels) != len(X):
+        raise "len(inputdir) != len(inputarray)"
+    with open(sys.argv[2], 'rt') as csvfile:
+        for row in csv.reader(csvfile):
+            ident=row[0]
+            label=row[1]
+            for fnm in labels:
+                if row[0] in fnm:
+                    labels[fnm] = row[1]
+    colors = []
+    labelcolors = {
+        'PUR': 'firebrick',
+        'CLM': 'firebrick',
+        'MXL': 'firebrick',
+        'PEL': 'firebrick',
+        'TSI': 'green',
+        'IBS': 'green',
+        'CEU': 'green',
+        'GBR': 'green',
+        'FIN': 'green',
+        'LWK': 'coral',
+        'MSL': 'coral',
+        'GWD': 'coral',
+        'YRI': 'coral',
+        'ESN': 'coral',
+        'ACB': 'coral',
+        'ASW': 'coral',
+        'KHV': 'royalblue',
+        'CDX': 'royalblue',
+        'CHS': 'royalblue',
+        'CHB': 'royalblue',
+        'JPT': 'royalblue',
+        'STU': 'blueviolet',
+        'ITU': 'blueviolet',
+        'BEB': 'blueviolet',
+        'GIH': 'blueviolet',
+        'PJL': 'blueviolet',
+    }
+    for fnm in sorted(labels.keys()):
+        colors.append(labelcolors[labels[fnm]])
+
+from matplotlib.figure import Figure
+from matplotlib.patches import Polygon
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+fig = Figure()
+ax = fig.add_subplot(111)
+ax.scatter(X[:,0], X[:,1], c=colors, s=60, marker='o', alpha=0.5)
+canvas = FigureCanvasAgg(fig)
+canvas.print_figure(sys.argv[4], dpi=80)
+`