Add numpy-common-variants sanity check.
authorTom Clegg <tom@tomclegg.ca>
Thu, 5 Nov 2020 18:29:49 +0000 (13:29 -0500)
committerTom Clegg <tom@tomclegg.ca>
Thu, 5 Nov 2020 18:29:49 +0000 (13:29 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

cmd.go
example-su92l-1kg.sh
numpycomvar.go [new file with mode: 0644]

diff --git a/cmd.go b/cmd.go
index 040c5d0cea68b6aef24345f037abeb66508c1f16..92bd5a29d7017d4a9ecf36e7aa4e1a7006aa6d1e 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -25,6 +25,7 @@ var (
                "annotate":           &annotatecmd{},
                "export":             &exporter{},
                "export-numpy":       &exportNumpy{},
+               "numpy-comvar":       &numpyComVar{},
                "filter":             &filtercmd{},
                "build-docker-image": &buildDockerImage{},
                "pca-go":             &goPCA{},
index a167334d4c52be5e309b0bf56605bd7c391b0efc..5de4a4603b50a14398fda298627143739fe83c71 100755 (executable)
@@ -24,6 +24,7 @@ ref37_lib=$(lightning  import       -project ${project} -priority ${priority} -t
 # 539s
 
 ref38_lib=$(lightning  import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref_fa}) ; echo ref38_lib=${ref38_lib}
+# ref38_lib=su92l-4zz18-swebknshfwsvys6/library.gob
 
 unfiltered=$(lightning import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true ${fasta})       ; echo unfiltered=${unfiltered}
 # unfiltered=su92l-4zz18-mz3546bib6oj1gg/library.gob
@@ -47,4 +48,13 @@ plot=$(lightning       plot         -project ${project} -priority ${priority} -i
 echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}"
 echo ${plot%%/*}
 
-numpy=$(lightning      export-numpy -project ${project} -priority ${priority} -i ${merged} -one-hot -max-variants "30" -min-coverage "0.9")
+
+merged38=$(lightning   merge        -project ${project} -priority ${priority} ${unfiltered} ${ref38_lib})                                              ; echo merged38=${merged38}
+# merged38=su92l-4zz18-xq17gtaltjxbm3n/library.gob
+# 1602s
+
+numpy=$(lightning      export-numpy -project ${project} -priority ${priority} -i ${merged38} -max-variants "30" -min-coverage "0.9")                   ; echo numpy=${numpy}
+# numpy=su92l-4zz18-w3dx5k79mtbz6qt/matrix.npy
+# 6155s
+# pcapy=$(lightning      pca          -project ${project} -priority ${priority} -i ${numpy})                                                             ; echo pcapy=${pcapy}
+comvar=$(lightning     numpy-comvar -project ${project} -priority ${priority} -i ${numpy} -annotations ${numpy%/matrix.npy}/annotations.tsv)           ; echo comvar=${comvar}
diff --git a/numpycomvar.go b/numpycomvar.go
new file mode 100644 (file)
index 0000000..7f99168
--- /dev/null
@@ -0,0 +1,108 @@
+package main
+
+import (
+       "flag"
+       "fmt"
+       "io"
+       _ "net/http/pprof"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+)
+
+type numpyComVar struct{}
+
+func (cmd *numpyComVar) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       inputFilename := flags.String("i", "-", "numpy matrix `file`")
+       priority := flags.Int("priority", 500, "container request priority")
+       annotationsFilename := flags.String("annotations", "", "annotations tsv `file`")
+       maxResults := flags.Int("max-results", 256, "maximum number of tile variants to output")
+       minFrequency := flags.Float64("min-frequency", 0.4, "minimum allele frequency")
+       maxFrequency := flags.Float64("max-frequency", 0.6, "maximum allele frequency")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       }
+
+       runner := arvadosContainerRunner{
+               Name:        "lightning numpy-comvar",
+               Client:      arvados.NewClientFromEnv(),
+               ProjectUUID: *projectUUID,
+               RAM:         120000000000,
+               VCPUs:       2,
+               Priority:    *priority,
+       }
+       err = runner.TranslatePaths(inputFilename, annotationsFilename)
+       if err != nil {
+               return 1
+       }
+       runner.Prog = "python3"
+       runner.Args = []string{"-c", `import sys
+import scipy
+import sys
+import csv
+
+numpyFile = sys.argv[1]
+annotationsFile = sys.argv[2]
+outputFile = sys.argv[3]
+maxResults = int(sys.argv[4])
+minFrequency = float(sys.argv[5])
+maxFrequency = float(sys.argv[6])
+
+out = open(outputFile, 'w')
+
+m = scipy.load(numpyFile)
+
+commonvariants = {}
+mincount = m.shape[0] * 2 * minFrequency
+maxcount = m.shape[0] * 2 * maxFrequency
+for tag in range(m.shape[1] // 2):
+  example = {}
+  counter = [0, 0, 0, 0, 0]
+  for genome in range(m.shape[0]):
+    for phase in range(2):
+      variant = m[genome][tag*2+phase]
+      if variant > 0 and variant < len(counter):
+        counter[variant] += 1
+        example[variant] = genome
+  for variant, count in enumerate(counter):
+    if count >= mincount and count <= maxcount:
+      commonvariants[tag,variant] = example[variant]
+      # sys.stderr.write('tag {} variant {} count {} example {} have {} commonvariants\n'.format(tag, variant, count, example[variant], len(commonvariants)))
+  if len(commonvariants) >= maxResults:
+    break
+
+found = {}
+with open(annotationsFile, newline='') as tsvfile:
+  rdr = csv.reader(tsvfile, delimiter='\t', quotechar='"')
+  for row in rdr:
+    tag = int(row[0])
+    variant = int(row[1])
+    if (tag, variant) in commonvariants:
+      found[tag, variant] = True
+      out.write(','.join(row + [str(commonvariants[tag, variant])]) + '\n')
+    elif len(found) >= len(commonvariants):
+      sys.stderr.write('done\n')
+      break
+
+out.close()
+`, *inputFilename, *annotationsFilename, "/mnt/output/commonvariants.csv", fmt.Sprintf("%d", *maxResults), fmt.Sprintf("%f", *minFrequency), fmt.Sprintf("%f", *maxFrequency)}
+       var output string
+       output, err = runner.Run()
+       if err != nil {
+               return 1
+       }
+       fmt.Fprintln(stdout, output+"/commonvariants.csv")
+       return 0
+}