# 539s
ref38_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref_fa}) ; echo ref38_lib=${ref38_lib}
+# ref38_lib=su92l-4zz18-swebknshfwsvys6/library.gob
unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true ${fasta}) ; echo unfiltered=${unfiltered}
# unfiltered=su92l-4zz18-mz3546bib6oj1gg/library.gob
echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}"
echo ${plot%%/*}
-numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${merged} -one-hot -max-variants "30" -min-coverage "0.9")
+
+merged38=$(lightning merge -project ${project} -priority ${priority} ${unfiltered} ${ref38_lib}) ; echo merged38=${merged38}
+# merged38=su92l-4zz18-xq17gtaltjxbm3n/library.gob
+# 1602s
+
+numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${merged38} -max-variants "30" -min-coverage "0.9") ; echo numpy=${numpy}
+# numpy=su92l-4zz18-w3dx5k79mtbz6qt/matrix.npy
+# 6155s
+# pcapy=$(lightning pca -project ${project} -priority ${priority} -i ${numpy}) ; echo pcapy=${pcapy}
+comvar=$(lightning numpy-comvar -project ${project} -priority ${priority} -i ${numpy} -annotations ${numpy%/matrix.npy}/annotations.tsv) ; echo comvar=${comvar}
--- /dev/null
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io"
+ _ "net/http/pprof"
+
+ "git.arvados.org/arvados.git/sdk/go/arvados"
+)
+
+type numpyComVar struct{}
+
+func (cmd *numpyComVar) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+ var err error
+ defer func() {
+ if err != nil {
+ fmt.Fprintf(stderr, "%s\n", err)
+ }
+ }()
+ flags := flag.NewFlagSet("", flag.ContinueOnError)
+ flags.SetOutput(stderr)
+ projectUUID := flags.String("project", "", "project `UUID` for output data")
+ inputFilename := flags.String("i", "-", "numpy matrix `file`")
+ priority := flags.Int("priority", 500, "container request priority")
+ annotationsFilename := flags.String("annotations", "", "annotations tsv `file`")
+ maxResults := flags.Int("max-results", 256, "maximum number of tile variants to output")
+ minFrequency := flags.Float64("min-frequency", 0.4, "minimum allele frequency")
+ maxFrequency := flags.Float64("max-frequency", 0.6, "maximum allele frequency")
+ err = flags.Parse(args)
+ if err == flag.ErrHelp {
+ err = nil
+ return 0
+ } else if err != nil {
+ return 2
+ }
+
+ runner := arvadosContainerRunner{
+ Name: "lightning numpy-comvar",
+ Client: arvados.NewClientFromEnv(),
+ ProjectUUID: *projectUUID,
+ RAM: 120000000000,
+ VCPUs: 2,
+ Priority: *priority,
+ }
+ err = runner.TranslatePaths(inputFilename, annotationsFilename)
+ if err != nil {
+ return 1
+ }
+ runner.Prog = "python3"
+ runner.Args = []string{"-c", `import sys
+import scipy
+import sys
+import csv
+
+numpyFile = sys.argv[1]
+annotationsFile = sys.argv[2]
+outputFile = sys.argv[3]
+maxResults = int(sys.argv[4])
+minFrequency = float(sys.argv[5])
+maxFrequency = float(sys.argv[6])
+
+out = open(outputFile, 'w')
+
+m = scipy.load(numpyFile)
+
+commonvariants = {}
+mincount = m.shape[0] * 2 * minFrequency
+maxcount = m.shape[0] * 2 * maxFrequency
+for tag in range(m.shape[1] // 2):
+ example = {}
+ counter = [0, 0, 0, 0, 0]
+ for genome in range(m.shape[0]):
+ for phase in range(2):
+ variant = m[genome][tag*2+phase]
+ if variant > 0 and variant < len(counter):
+ counter[variant] += 1
+ example[variant] = genome
+ for variant, count in enumerate(counter):
+ if count >= mincount and count <= maxcount:
+ commonvariants[tag,variant] = example[variant]
+ # sys.stderr.write('tag {} variant {} count {} example {} have {} commonvariants\n'.format(tag, variant, count, example[variant], len(commonvariants)))
+ if len(commonvariants) >= maxResults:
+ break
+
+found = {}
+with open(annotationsFile, newline='') as tsvfile:
+ rdr = csv.reader(tsvfile, delimiter='\t', quotechar='"')
+ for row in rdr:
+ tag = int(row[0])
+ variant = int(row[1])
+ if (tag, variant) in commonvariants:
+ found[tag, variant] = True
+ out.write(','.join(row + [str(commonvariants[tag, variant])]) + '\n')
+ elif len(found) >= len(commonvariants):
+ sys.stderr.write('done\n')
+ break
+
+out.close()
+`, *inputFilename, *annotationsFilename, "/mnt/output/commonvariants.csv", fmt.Sprintf("%d", *maxResults), fmt.Sprintf("%f", *minFrequency), fmt.Sprintf("%f", *maxFrequency)}
+ var output string
+ output, err = runner.Run()
+ if err != nil {
+ return 1
+ }
+ fmt.Fprintln(stdout, output+"/commonvariants.csv")
+ return 0
+}