19526: Add manhattan plot.
authorTom Clegg <tom@curii.com>
Mon, 28 Nov 2022 18:34:48 +0000 (13:34 -0500)
committerTom Clegg <tom@curii.com>
Mon, 28 Nov 2022 18:34:48 +0000 (13:34 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

cmd.go
manhattan.go [new file with mode: 0644]
manhattan.py [new file with mode: 0644]
pca_plot.go [moved from plot.go with 92% similarity]
pca_plot.py [moved from plot.py with 100% similarity]

diff --git a/cmd.go b/cmd.go
index 2b34b0ccaa4e1434b8ebfb333e083ddf89debe2b..97bc8d30af38a2f5b74d220cef59cd352ae45db4 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -39,6 +39,8 @@ var (
                "filter":             &filtercmd{},
                "build-docker-image": &buildDockerImage{},
                "plot":               &pythonPlot{},
+               "pca-plot":           &pythonPlot{},
+               "manhattan-plot":     &manhattanPlot{},
                "diff-fasta":         &diffFasta{},
                "stats":              &statscmd{},
                "merge":              &merger{},
@@ -78,8 +80,9 @@ func (cmd *buildDockerImage) RunCommand(prog string, args []string, stdin io.Rea
 RUN DEBIAN_FRONTEND=noninteractive \
   apt-get update && \
   apt-get dist-upgrade -y && \
-  apt-get install -y --no-install-recommends bcftools bedtools samtools python2 python3-sklearn python3-matplotlib ca-certificates && \
-  apt-get clean
+  apt-get install -y --no-install-recommends bcftools bedtools samtools python2 python3-sklearn python3-matplotlib python3-pip ca-certificates && \
+  apt-get clean && \
+  pip3 install qmplot
 `), 0644)
        if err != nil {
                fmt.Fprint(stderr, err)
diff --git a/manhattan.go b/manhattan.go
new file mode 100644 (file)
index 0000000..4ae4479
--- /dev/null
@@ -0,0 +1,97 @@
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
+
+import (
+       _ "embed"
+       "flag"
+       "fmt"
+       "io"
+       "os/exec"
+       "strings"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+)
+
+type manhattanPlot struct{}
+
+//go:embed manhattan.py
+var manhattanPy string
+
+func (cmd *manhattanPlot) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       inputDirectory := flags.String("i", "-", "input `directory` (output of slice-numpy -single-onehot)")
+       outputFilename := flags.String("o", "", "output `filename` (e.g., './plot.png')")
+       priority := flags.Int("priority", 500, "container request priority")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       } else if flags.NArg() > 0 {
+               err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
+               return 2
+       }
+
+       runner := arvadosContainerRunner{
+               Name:        "lightning manhattan",
+               Client:      arvados.NewClientFromEnv(),
+               ProjectUUID: *projectUUID,
+               RAM:         4 << 30,
+               VCPUs:       1,
+               Priority:    *priority,
+               Mounts: map[string]map[string]interface{}{
+                       "/manhattan.py": map[string]interface{}{
+                               "kind":    "text",
+                               "content": manhattanPy,
+                       },
+               },
+       }
+       if !*runlocal {
+               err = runner.TranslatePaths(inputDirectory)
+               if err != nil {
+                       return 1
+               }
+               *outputFilename = "/mnt/output/plot.png"
+       }
+       args = []string{
+               *inputDirectory,
+               *outputFilename,
+       }
+       if *runlocal {
+               if *outputFilename == "" {
+                       fmt.Fprintln(stderr, "error: must specify -o filename.png in local mode (or try -help)")
+                       return 1
+               }
+               cmd := exec.Command("python3", append([]string{"-"}, args...)...)
+               cmd.Stdin = strings.NewReader(manhattanPy)
+               cmd.Stdout = stdout
+               cmd.Stderr = stderr
+               err = cmd.Run()
+               if err != nil {
+                       return 1
+               }
+               return 0
+       }
+       runner.Prog = "python3"
+       runner.Args = append([]string{"/manhattan.py"}, args...)
+       var output string
+       output, err = runner.Run()
+       if err != nil {
+               return 1
+       }
+       fmt.Fprintln(stdout, output+"/plot.png")
+       return 0
+}
diff --git a/manhattan.py b/manhattan.py
new file mode 100644 (file)
index 0000000..4736b8a
--- /dev/null
@@ -0,0 +1,46 @@
+# Copyright (C) The Lightning Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+import csv
+import os
+import sys
+
+import matplotlib
+import numpy
+import pandas
+import qmplot
+
+(_,
+ input_path,
+ output_path,
+ ) = sys.argv
+columns = numpy.load(os.path.join(input_path, 'onehot-columns.npy'))
+
+# pvalue maps tag# => [pvalue1, pvalue2, ...] (one het p-value and one hom p-value for each tile variant)
+pvalue = {}
+for i in range(columns.shape[1]):
+    tag = columns[0,i]
+    x = pvalue.get(tag, [])
+    x.append(pow(10, -columns[4,i] / 1000000))
+    pvalue[tag] = x
+
+# tilepos maps tag# => (chromosome, position)
+tilepos = {}
+for dirent in os.scandir(input_path):
+    if dirent.name.endswith('.annotations.csv'):
+        with open(dirent, 'rt', newline='') as annotations:
+            for annotation in csv.reader(annotations):
+                # 500000,0,2,=,chr1,160793649,,,
+                if annotation[3] == "=":
+                    tilepos[int(annotation[0])] = (annotation[4], int(annotation[5]))
+
+series = {"#CHROM": [], "POS": [], "P": []}
+for tag, chrpos in sorted(tilepos.items(), key=lambda item: (item[1][0].lstrip('chr').zfill(2), item[1][1])):
+    for p in pvalue.get(tag, []):
+        series["#CHROM"].append(chrpos[0])
+        series["POS"].append(chrpos[1])
+        series["P"].append(p)
+
+qmplot.manhattanplot(data=pandas.DataFrame(series))
+matplotlib.pyplot.savefig(output_path)
similarity index 92%
rename from plot.go
rename to pca_plot.go
index 6deaff6fc7d01057b0b4cfd1c49537cbacc7edb0..88ba06ce55539ba5dbdabd3ccf2de790dc8fc5da 100644 (file)
--- a/plot.go
@@ -9,7 +9,6 @@ import (
        "flag"
        "fmt"
        "io"
-       _ "net/http/pprof"
        "os/exec"
        "strings"
 
@@ -18,8 +17,8 @@ import (
 
 type pythonPlot struct{}
 
-//go:embed plot.py
-var plotscript string
+//go:embed pca_plot.py
+var pcaPlotPy string
 
 func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
        var err error
@@ -53,16 +52,16 @@ func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, s
        }
 
        runner := arvadosContainerRunner{
-               Name:        "lightning plot",
+               Name:        "lightning pca-plot",
                Client:      arvados.NewClientFromEnv(),
                ProjectUUID: *projectUUID,
                RAM:         4 << 30,
                VCPUs:       1,
                Priority:    *priority,
                Mounts: map[string]map[string]interface{}{
-                       "/plot.py": map[string]interface{}{
+                       "/pca_plot.py": map[string]interface{}{
                                "kind":    "text",
-                               "content": plotscript,
+                               "content": pcaPlotPy,
                        },
                },
        }
@@ -89,7 +88,7 @@ func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, s
                        return 1
                }
                cmd := exec.Command("python3", append([]string{"-"}, args...)...)
-               cmd.Stdin = strings.NewReader(plotscript)
+               cmd.Stdin = strings.NewReader(pcaPlotPy)
                cmd.Stdout = stdout
                cmd.Stderr = stderr
                err = cmd.Run()
@@ -99,7 +98,7 @@ func (cmd *pythonPlot) RunCommand(prog string, args []string, stdin io.Reader, s
                return 0
        }
        runner.Prog = "python3"
-       runner.Args = append([]string{"/plot.py"}, args...)
+       runner.Args = append([]string{"/pca_plot.py"}, args...)
        var output string
        output, err = runner.Run()
        if err != nil {
similarity index 100%
rename from plot.py
rename to pca_plot.py