From 6fda0f1fca146e8a8f40da916d554f5255a01121 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 17 Sep 2020 00:41:13 -0400 Subject: [PATCH] Do PCA in Go. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- cmd.go | 3 +- example-su92l-1kg.sh | 5 +- exportnumpy.go | 44 ++++++------- exportnumpy_test.go | 8 +-- go.mod | 7 +++ go.sum | 30 +++++++++ pca.go | 143 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 212 insertions(+), 28 deletions(-) diff --git a/cmd.go b/cmd.go index c40c06369e..833dddb275 100644 --- a/cmd.go +++ b/cmd.go @@ -22,7 +22,8 @@ var ( "export-numpy": &exportNumpy{}, "filter": &filterer{}, "build-docker-image": &buildDockerImage{}, - "pca": &pythonPCA{}, + "pca-go": &goPCA{}, + "pca-py": &pythonPCA{}, "plot": &pythonPlot{}, "diff-fasta": &diffFasta{}, }) diff --git a/example-su92l-1kg.sh b/example-su92l-1kg.sh index 7c66d76575..d59b55cb5d 100755 --- a/example-su92l-1kg.sh +++ b/example-su92l-1kg.sh @@ -18,8 +18,9 @@ genome=$(lightning ref2genome -project ${project} -priority ${priority} -r fasta=$(lightning vcf2fasta -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf}) unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true ${fasta}) filtered=$(lightning filter -project ${project} -priority ${priority} -i ${unfiltered} -min-coverage "0.9" -max-variants "30") -numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot) -pca=$(lightning pca -project ${project} -priority ${priority} -i ${numpy}) +#numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot) +#pca=$(lightning pca-py -project ${project} -priority ${priority} -i ${numpy}) +pca=$(lightning pca-go -project ${project} -priority ${priority} -i ${filtered} -one-hot) plot=$(lightning plot -project ${project} -priority ${priority} -i ${pca} -labels-csv ${info}/sample_info.csv -sample-fasta-dir ${fasta}) echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}" echo ${plot%%/*} diff --git a/exportnumpy.go b/exportnumpy.go index 21928b54a0..fd198116b1 100644 --- a/exportnumpy.go +++ b/exportnumpy.go @@ -95,19 +95,8 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, return 1 } sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name }) - cols := 0 - for _, cg := range cgs { - if cols < len(cg.Variants) { - cols = len(cg.Variants) - } - } - rows := len(cgs) - out := make([]uint16, rows*cols) - for row, cg := range cgs { - for i, v := range cg.Variants { - out[row*cols+i] = uint16(v) - } - } + + out, rows, cols := cgs2array(cgs) var output io.WriteCloser if *outputFilename == "-" { @@ -125,13 +114,10 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, return 1 } if *onehot { - out, cols := recodeOnehot(out, cols) - npw.Shape = []int{rows, cols} - npw.WriteUint8(out) - } else { - npw.Shape = []int{rows, cols} - npw.WriteUint16(out) + out, cols = recodeOnehot(out, cols) } + npw.Shape = []int{rows, cols} + npw.WriteUint16(out) err = bufw.Flush() if err != nil { return 1 @@ -143,7 +129,23 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader, return 0 } -func recodeOnehot(in []uint16, incols int) ([]uint8, int) { +func cgs2array(cgs []CompactGenome) (data []uint16, rows, cols int) { + rows = len(cgs) + for _, cg := range cgs { + if cols < len(cg.Variants) { + cols = len(cg.Variants) + } + } + data = make([]uint16, rows*cols) + for row, cg := range cgs { + for i, v := range cg.Variants { + data[row*cols+i] = uint16(v) + } + } + return +} + +func recodeOnehot(in []uint16, incols int) ([]uint16, int) { rows := len(in) / incols maxvalue := make([]uint16, incols) for row := 0; row < rows; row++ { @@ -159,7 +161,7 @@ func recodeOnehot(in []uint16, incols int) ([]uint8, int) { outcol[incol] = outcols outcols += int(v) } - out := make([]uint8, rows*outcols) + out := make([]uint16, rows*outcols) for row := 0; row < rows; row++ { for col := 0; col < incols; col++ { if v := in[row*incols+col]; v > 0 { diff --git a/exportnumpy_test.go b/exportnumpy_test.go index 7f22fb01b6..96e04cc014 100644 --- a/exportnumpy_test.go +++ b/exportnumpy_test.go @@ -51,10 +51,10 @@ func (s *exportSuite) TestOnehot(c *check.C) { incols int in []uint16 outcols int - out []uint8 + out []uint16 }{ - {2, []uint16{1, 1, 1, 1}, 2, []uint8{1, 1, 1, 1}}, - {2, []uint16{1, 1, 1, 2}, 3, []uint8{1, 1, 0, 1, 0, 1}}, + {2, []uint16{1, 1, 1, 1}, 2, []uint16{1, 1, 1, 1}}, + {2, []uint16{1, 1, 1, 2}, 3, []uint16{1, 1, 0, 1, 0, 1}}, { // 2nd column => 3 one-hot columns // 4th column => 0 one-hot columns @@ -62,7 +62,7 @@ func (s *exportSuite) TestOnehot(c *check.C) { 1, 1, 0, 0, 1, 2, 1, 0, 1, 3, 0, 0, - }, 5, []uint8{ + }, 5, []uint16{ 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, diff --git a/go.mod b/go.mod index 8ec2a84966..951c7b0ead 100644 --- a/go.mod +++ b/go.mod @@ -4,16 +4,23 @@ go 1.13 require ( git.arvados.org/arvados.git v0.0.0-20200521152208-f98e61d49ee0 + github.com/armon/go-radix v1.0.0 // indirect github.com/gogo/protobuf v1.3.1 // indirect github.com/golang/protobuf v1.4.2 // indirect + github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82 // indirect + github.com/gonum/internal v0.0.0-20181124074243-f884aa714029 // indirect + github.com/james-bowman/nlp v0.0.0-20200417075118-1e2772e0e1e5 + github.com/james-bowman/sparse v0.0.0-20200514124614-ae250424e52d // indirect github.com/kshedden/gonpy v0.0.0-20190510000443-66c21fac4672 github.com/prometheus/client_golang v1.6.0 // indirect github.com/prometheus/common v0.10.0 // indirect github.com/sergi/go-diff v1.1.0 github.com/sirupsen/logrus v1.6.0 + github.com/spaolacci/murmur3 v1.1.0 // indirect golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37 golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2 golang.org/x/sys v0.0.0-20200519105757-fe76b779f299 // indirect + gonum.org/v1/gonum v0.8.1 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 gopkg.in/yaml.v2 v2.3.0 // indirect ) diff --git a/go.sum b/go.sum index 17cd0d6d9f..c888974f3e 100644 --- a/go.sum +++ b/go.sum @@ -8,12 +8,15 @@ github.com/Azure/azure-sdk-for-go v19.1.0+incompatible/go.mod h1:9XXNKU+eRnpl9mo github.com/Azure/go-autorest v10.15.2+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/Microsoft/go-winio v0.4.5/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7/go.mod h1:6zEj6s6u/ghQa61ZWa/C2Aw3RkjiTBOix7dkqa1VLIs= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= +github.com/armon/go-radix v1.0.0 h1:F4z6KzEeeQIMeLFa97iZU6vupzoecKdU5TX24SNppXI= +github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/arvados/cgofuse v1.2.0-arvados1/go.mod h1:79WFV98hrkRHK9XPhh2IGGOwpFSjocsWubgxAs2KhRc= github.com/aws/aws-sdk-go v1.25.30/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= @@ -40,6 +43,7 @@ github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1/go.mod h1:eEKB0N0r github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -55,6 +59,7 @@ github.com/gogo/protobuf v1.1.1 h1:72R+M5VuhED/KujmZVcIquuo8mBgX4oVda//DQb3PXo= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls= github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= @@ -69,6 +74,10 @@ github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:W github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82 h1:EvokxLQsaaQjcWVWSV38221VAK7qc2zhaO17bKys/18= +github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82/go.mod h1:PxC8OnwL11+aosOB5+iEPoV3picfs8tUpkVd0pDo+Kg= +github.com/gonum/internal v0.0.0-20181124074243-f884aa714029 h1:8jtTdc+Nfj9AR+0soOeia9UZSvYBvETVHZrugUowJ7M= +github.com/gonum/internal v0.0.0-20181124074243-f884aa714029/go.mod h1:Pu4dmpkhSyOzRwuXkOgAvijx4o+4YMUJJo9OvPYMkks= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= @@ -86,6 +95,10 @@ github.com/gorilla/mux v1.6.1-0.20180107155708-5bbbb5b2b572/go.mod h1:1lud6UwP+6 github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/imdario/mergo v0.3.8-0.20190415133143-5ef87b449ca7/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= +github.com/james-bowman/nlp v0.0.0-20200417075118-1e2772e0e1e5 h1:JM8y2xaOUC0SdmCGitjdzVuZcTATy+yk459nByJNK2o= +github.com/james-bowman/nlp v0.0.0-20200417075118-1e2772e0e1e5/go.mod h1:kixuaexEqWB+mHZNysgnb6mqgGIT25WvD1/tFRRt0J0= +github.com/james-bowman/sparse v0.0.0-20200514124614-ae250424e52d h1:sguZEtU6begqG4l2F4KInN7OteZdK+cqLlm/wwcppNc= +github.com/james-bowman/sparse v0.0.0-20200514124614-ae250424e52d/go.mod h1:G6EcQnwZKsWtItoaQHd+FHPPk6bDeYVJSeeSP9Sge+I= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= github.com/jmcvetta/randutil v0.0.0-20150817122601-2bb1b664bcff/go.mod h1:ddfPX8Z28YMjiqoaJhNBzWHapTHXejnB5cDCUWDwriw= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= @@ -94,6 +107,7 @@ github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/u github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/karalabe/xgo v0.0.0-20191115072854-c5ccff8648a7/go.mod h1:iYGcTYIPUvEWhFo6aKUuLchs+AV4ssYdyuBbQJZGcBk= github.com/kevinburke/ssh_config v0.0.0-20171013211458-802051befeb5/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= @@ -164,6 +178,8 @@ github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4 github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= +github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= +github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/src-d/gcfg v1.3.0/go.mod h1:p/UMsR43ujA89BJY9duynAwIpvqEujIH/jFlfL7jWoI= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -179,7 +195,13 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37 h1:cg5LA/zNPRzIXIWSCxQW10Rvpy94aQh3LT/ShoCpkHw= golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4 h1:c2HOrn5iMezYjSlGPncknSEr/8x5LELb/ilJbXi9DEA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2 h1:y102fOLFqhV41b+4GPiJoa0k/x+pJcEi2/HB1Y5T6fU= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -227,15 +249,22 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.8.1 h1:wGtP3yGpc5mCLOLeTeBdjeui9oZSz5De0eOjMLC/QuQ= +gonum.org/v1/gonum v0.8.1/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= @@ -277,3 +306,4 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= rsc.io/getopt v0.0.0-20170811000552-20be20937449/go.mod h1:dhCdeqAxkyt5u3/sKRkUXuHaMXUu1Pt13GTQAM2xnig= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/pca.go b/pca.go index f02cee7945..4b60327e84 100644 --- a/pca.go +++ b/pca.go @@ -1,12 +1,22 @@ package main import ( + "bufio" + "errors" "flag" "fmt" "io" + "io/ioutil" + "net/http" _ "net/http/pprof" + "os" + "sort" "git.arvados.org/arvados.git/sdk/go/arvados" + "github.com/james-bowman/nlp" + "github.com/kshedden/gonpy" + log "github.com/sirupsen/logrus" + "gonum.org/v1/gonum/mat" ) type pythonPCA struct{} @@ -56,3 +66,136 @@ scipy.save(sys.argv[2], PCA(n_components=4).fit_transform(scipy.load(sys.argv[1] fmt.Fprintln(stdout, output+"/pca.npy") return 0 } + +type goPCA struct{} + +func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { + var err error + defer func() { + if err != nil { + fmt.Fprintf(stderr, "%s\n", err) + } + }() + flags := flag.NewFlagSet("", flag.ContinueOnError) + flags.SetOutput(stderr) + pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") + runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)") + projectUUID := flags.String("project", "", "project `UUID` for output data") + priority := flags.Int("priority", 500, "container request priority") + inputFilename := flags.String("i", "-", "input `file`") + outputFilename := flags.String("o", "-", "output `file`") + components := flags.Int("components", 4, "number of components") + onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot") + err = flags.Parse(args) + if err == flag.ErrHelp { + err = nil + return 0 + } else if err != nil { + return 2 + } + + if *pprof != "" { + go func() { + log.Println(http.ListenAndServe(*pprof, nil)) + }() + } + + if !*runlocal { + if *outputFilename != "-" { + err = errors.New("cannot specify output file in container mode: not implemented") + return 1 + } + runner := arvadosContainerRunner{ + Name: "lightning pca-go", + Client: arvados.NewClientFromEnv(), + ProjectUUID: *projectUUID, + RAM: 432000000000, + VCPUs: 2, + Priority: *priority, + } + err = runner.TranslatePaths(inputFilename) + if err != nil { + return 1 + } + runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"} + var output string + output, err = runner.Run() + if err != nil { + return 1 + } + fmt.Fprintln(stdout, output+"/pca.npy") + return 0 + } + + var input io.ReadCloser + if *inputFilename == "-" { + input = ioutil.NopCloser(stdin) + } else { + input, err = os.Open(*inputFilename) + if err != nil { + return 1 + } + defer input.Close() + } + cgs, err := ReadCompactGenomes(input) + if err != nil { + return 1 + } + err = input.Close() + if err != nil { + return 1 + } + sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name }) + + data, rows, cols := cgs2array(cgs) + if *onehot { + data, cols = recodeOnehot(data, cols) + } + pca, err := nlp.NewPCA(*components).FitTransform(array2matrix(rows, cols, data)) + if err != nil { + return 1 + } + + rows, cols = pca.Dims() + out := make([]float64, rows*cols) + for i := 0; i < rows; i++ { + for j := 0; j < cols; j++ { + out[i*cols+j] = pca.At(i, j) + } + } + + var output io.WriteCloser + if *outputFilename == "-" { + output = nopCloser{stdout} + } else { + output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777) + if err != nil { + return 1 + } + defer output.Close() + } + bufw := bufio.NewWriter(output) + npw, err := gonpy.NewWriter(nopCloser{bufw}) + if err != nil { + return 1 + } + npw.Shape = []int{rows, cols} + npw.WriteFloat64(out) + err = bufw.Flush() + if err != nil { + return 1 + } + err = output.Close() + if err != nil { + return 1 + } + return 0 +} + +func array2matrix(rows, cols int, data []uint16) mat.Matrix { + floatdata := make([]float64, rows*cols) + for i, v := range data { + floatdata[i] = float64(v) + } + return mat.NewDense(rows, cols, floatdata) +} -- 2.30.2