Do PCA in Go.
authorTom Clegg <tom@tomclegg.ca>
Thu, 17 Sep 2020 04:41:13 +0000 (00:41 -0400)
committerTom Clegg <tom@tomclegg.ca>
Thu, 17 Sep 2020 04:41:13 +0000 (00:41 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

cmd.go
example-su92l-1kg.sh
exportnumpy.go
exportnumpy_test.go
go.mod
go.sum
pca.go

diff --git a/cmd.go b/cmd.go
index c40c06369e8fcd94a20aacc0bc2dce6abd55c965..833dddb275e9900c887832a2ae82b48a0bafd4a4 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -22,7 +22,8 @@ var (
                "export-numpy":       &exportNumpy{},
                "filter":             &filterer{},
                "build-docker-image": &buildDockerImage{},
-               "pca":                &pythonPCA{},
+               "pca-go":             &goPCA{},
+               "pca-py":             &pythonPCA{},
                "plot":               &pythonPlot{},
                "diff-fasta":         &diffFasta{},
        })
index 7c66d76575190c807741793cf81209ddaa7cf2e1..d59b55cb5d62d42d019b6c52ea0643f5c048f1f3 100755 (executable)
@@ -18,8 +18,9 @@ genome=$(lightning     ref2genome   -project ${project} -priority ${priority} -r
 fasta=$(lightning      vcf2fasta    -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf})
 unfiltered=$(lightning import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true ${fasta})
 filtered=$(lightning   filter       -project ${project} -priority ${priority} -i ${unfiltered} -min-coverage "0.9" -max-variants "30")
-numpy=$(lightning      export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot)
-pca=$(lightning        pca          -project ${project} -priority ${priority} -i ${numpy})
+#numpy=$(lightning     export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot)
+#pca=$(lightning       pca-py       -project ${project} -priority ${priority} -i ${numpy})
+pca=$(lightning        pca-go       -project ${project} -priority ${priority} -i ${filtered} -one-hot)
 plot=$(lightning       plot         -project ${project} -priority ${priority} -i ${pca} -labels-csv ${info}/sample_info.csv -sample-fasta-dir ${fasta})
 echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}"
 echo ${plot%%/*}
index 21928b54a0cac722e7f15949e8adfd23b2b7d6ae..fd198116b1512c01f6fb51db67bc82354edd40d5 100644 (file)
@@ -95,19 +95,8 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                return 1
        }
        sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
-       cols := 0
-       for _, cg := range cgs {
-               if cols < len(cg.Variants) {
-                       cols = len(cg.Variants)
-               }
-       }
-       rows := len(cgs)
-       out := make([]uint16, rows*cols)
-       for row, cg := range cgs {
-               for i, v := range cg.Variants {
-                       out[row*cols+i] = uint16(v)
-               }
-       }
+
+       out, rows, cols := cgs2array(cgs)
 
        var output io.WriteCloser
        if *outputFilename == "-" {
@@ -125,13 +114,10 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
                return 1
        }
        if *onehot {
-               out, cols := recodeOnehot(out, cols)
-               npw.Shape = []int{rows, cols}
-               npw.WriteUint8(out)
-       } else {
-               npw.Shape = []int{rows, cols}
-               npw.WriteUint16(out)
+               out, cols = recodeOnehot(out, cols)
        }
+       npw.Shape = []int{rows, cols}
+       npw.WriteUint16(out)
        err = bufw.Flush()
        if err != nil {
                return 1
@@ -143,7 +129,23 @@ func (cmd *exportNumpy) RunCommand(prog string, args []string, stdin io.Reader,
        return 0
 }
 
-func recodeOnehot(in []uint16, incols int) ([]uint8, int) {
+func cgs2array(cgs []CompactGenome) (data []uint16, rows, cols int) {
+       rows = len(cgs)
+       for _, cg := range cgs {
+               if cols < len(cg.Variants) {
+                       cols = len(cg.Variants)
+               }
+       }
+       data = make([]uint16, rows*cols)
+       for row, cg := range cgs {
+               for i, v := range cg.Variants {
+                       data[row*cols+i] = uint16(v)
+               }
+       }
+       return
+}
+
+func recodeOnehot(in []uint16, incols int) ([]uint16, int) {
        rows := len(in) / incols
        maxvalue := make([]uint16, incols)
        for row := 0; row < rows; row++ {
@@ -159,7 +161,7 @@ func recodeOnehot(in []uint16, incols int) ([]uint8, int) {
                outcol[incol] = outcols
                outcols += int(v)
        }
-       out := make([]uint8, rows*outcols)
+       out := make([]uint16, rows*outcols)
        for row := 0; row < rows; row++ {
                for col := 0; col < incols; col++ {
                        if v := in[row*incols+col]; v > 0 {
index 7f22fb01b66535d6377a76be24fbc460cf0638ee..96e04cc014a00e15181f4f7c92076b329a9d1f27 100644 (file)
@@ -51,10 +51,10 @@ func (s *exportSuite) TestOnehot(c *check.C) {
                incols  int
                in      []uint16
                outcols int
-               out     []uint8
+               out     []uint16
        }{
-               {2, []uint16{1, 1, 1, 1}, 2, []uint8{1, 1, 1, 1}},
-               {2, []uint16{1, 1, 1, 2}, 3, []uint8{1, 1, 0, 1, 0, 1}},
+               {2, []uint16{1, 1, 1, 1}, 2, []uint16{1, 1, 1, 1}},
+               {2, []uint16{1, 1, 1, 2}, 3, []uint16{1, 1, 0, 1, 0, 1}},
                {
                        // 2nd column => 3 one-hot columns
                        // 4th column => 0 one-hot columns
@@ -62,7 +62,7 @@ func (s *exportSuite) TestOnehot(c *check.C) {
                                1, 1, 0, 0,
                                1, 2, 1, 0,
                                1, 3, 0, 0,
-                       }, 5, []uint8{
+                       }, 5, []uint16{
                                1, 1, 0, 0, 0,
                                1, 0, 1, 0, 1,
                                1, 0, 0, 1, 0,
diff --git a/go.mod b/go.mod
index 8ec2a84966cb5bd0cb6db8f34b86021a00f38f9f..951c7b0eadf17a43a1e573e636a8b92ac8ae49a7 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -4,16 +4,23 @@ go 1.13
 
 require (
        git.arvados.org/arvados.git v0.0.0-20200521152208-f98e61d49ee0
+       github.com/armon/go-radix v1.0.0 // indirect
        github.com/gogo/protobuf v1.3.1 // indirect
        github.com/golang/protobuf v1.4.2 // indirect
+       github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82 // indirect
+       github.com/gonum/internal v0.0.0-20181124074243-f884aa714029 // indirect
+       github.com/james-bowman/nlp v0.0.0-20200417075118-1e2772e0e1e5
+       github.com/james-bowman/sparse v0.0.0-20200514124614-ae250424e52d // indirect
        github.com/kshedden/gonpy v0.0.0-20190510000443-66c21fac4672
        github.com/prometheus/client_golang v1.6.0 // indirect
        github.com/prometheus/common v0.10.0 // indirect
        github.com/sergi/go-diff v1.1.0
        github.com/sirupsen/logrus v1.6.0
+       github.com/spaolacci/murmur3 v1.1.0 // indirect
        golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37
        golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2
        golang.org/x/sys v0.0.0-20200519105757-fe76b779f299 // indirect
+       gonum.org/v1/gonum v0.8.1
        gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15
        gopkg.in/yaml.v2 v2.3.0 // indirect
 )
diff --git a/go.sum b/go.sum
index 17cd0d6d9f6c1915c0474f650a93fe1a4c103f39..c888974f3edb625163f4644452d147ecd22aecf2 100644 (file)
--- a/go.sum
+++ b/go.sum
@@ -8,12 +8,15 @@ github.com/Azure/azure-sdk-for-go v19.1.0+incompatible/go.mod h1:9XXNKU+eRnpl9mo
 github.com/Azure/go-autorest v10.15.2+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24=
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/Microsoft/go-winio v0.4.5/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA=
+github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
 github.com/alcortesm/tgz v0.0.0-20161220082320-9c5fe88206d7/go.mod h1:6zEj6s6u/ghQa61ZWa/C2Aw3RkjiTBOix7dkqa1VLIs=
 github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
 github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
 github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c=
+github.com/armon/go-radix v1.0.0 h1:F4z6KzEeeQIMeLFa97iZU6vupzoecKdU5TX24SNppXI=
+github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
 github.com/arvados/cgofuse v1.2.0-arvados1/go.mod h1:79WFV98hrkRHK9XPhh2IGGOwpFSjocsWubgxAs2KhRc=
 github.com/aws/aws-sdk-go v1.25.30/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
 github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
@@ -40,6 +43,7 @@ github.com/docker/docker v1.4.2-0.20180109013817-94b8a116fbf1/go.mod h1:eEKB0N0r
 github.com/docker/go-connections v0.3.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
 github.com/docker/go-units v0.3.3-0.20171221200356-d59758554a3d/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
 github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc=
+github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
 github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
 github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
 github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
@@ -55,6 +59,7 @@ github.com/gogo/protobuf v1.1.1 h1:72R+M5VuhED/KujmZVcIquuo8mBgX4oVda//DQb3PXo=
 github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
 github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls=
 github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
 github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
 github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
@@ -69,6 +74,10 @@ github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:W
 github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
 github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
 github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
+github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82 h1:EvokxLQsaaQjcWVWSV38221VAK7qc2zhaO17bKys/18=
+github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82/go.mod h1:PxC8OnwL11+aosOB5+iEPoV3picfs8tUpkVd0pDo+Kg=
+github.com/gonum/internal v0.0.0-20181124074243-f884aa714029 h1:8jtTdc+Nfj9AR+0soOeia9UZSvYBvETVHZrugUowJ7M=
+github.com/gonum/internal v0.0.0-20181124074243-f884aa714029/go.mod h1:Pu4dmpkhSyOzRwuXkOgAvijx4o+4YMUJJo9OvPYMkks=
 github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
 github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY=
@@ -86,6 +95,10 @@ github.com/gorilla/mux v1.6.1-0.20180107155708-5bbbb5b2b572/go.mod h1:1lud6UwP+6
 github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/imdario/mergo v0.3.8-0.20190415133143-5ef87b449ca7/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
+github.com/james-bowman/nlp v0.0.0-20200417075118-1e2772e0e1e5 h1:JM8y2xaOUC0SdmCGitjdzVuZcTATy+yk459nByJNK2o=
+github.com/james-bowman/nlp v0.0.0-20200417075118-1e2772e0e1e5/go.mod h1:kixuaexEqWB+mHZNysgnb6mqgGIT25WvD1/tFRRt0J0=
+github.com/james-bowman/sparse v0.0.0-20200514124614-ae250424e52d h1:sguZEtU6begqG4l2F4KInN7OteZdK+cqLlm/wwcppNc=
+github.com/james-bowman/sparse v0.0.0-20200514124614-ae250424e52d/go.mod h1:G6EcQnwZKsWtItoaQHd+FHPPk6bDeYVJSeeSP9Sge+I=
 github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo=
 github.com/jmcvetta/randutil v0.0.0-20150817122601-2bb1b664bcff/go.mod h1:ddfPX8Z28YMjiqoaJhNBzWHapTHXejnB5cDCUWDwriw=
 github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k=
@@ -94,6 +107,7 @@ github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/u
 github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
 github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
 github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
+github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
 github.com/karalabe/xgo v0.0.0-20191115072854-c5ccff8648a7/go.mod h1:iYGcTYIPUvEWhFo6aKUuLchs+AV4ssYdyuBbQJZGcBk=
 github.com/kevinburke/ssh_config v0.0.0-20171013211458-802051befeb5/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM=
 github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
@@ -164,6 +178,8 @@ github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4
 github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
 github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
 github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
+github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
+github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
 github.com/src-d/gcfg v1.3.0/go.mod h1:p/UMsR43ujA89BJY9duynAwIpvqEujIH/jFlfL7jWoI=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@@ -179,7 +195,13 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37 h1:cg5LA/zNPRzIXIWSCxQW10Rvpy94aQh3LT/ShoCpkHw=
 golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20190121172915-509febef88a4 h1:c2HOrn5iMezYjSlGPncknSEr/8x5LELb/ilJbXi9DEA=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2 h1:y102fOLFqhV41b+4GPiJoa0k/x+pJcEi2/HB1Y5T6fU=
+golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
 golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
@@ -227,15 +249,22 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
 golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
 golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
 golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
 golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
+gonum.org/v1/gonum v0.8.1 h1:wGtP3yGpc5mCLOLeTeBdjeui9oZSz5De0eOjMLC/QuQ=
+gonum.org/v1/gonum v0.8.1/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
+gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
+gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
 google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
 google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI=
 google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
@@ -277,3 +306,4 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh
 honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
 honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
 rsc.io/getopt v0.0.0-20170811000552-20be20937449/go.mod h1:dhCdeqAxkyt5u3/sKRkUXuHaMXUu1Pt13GTQAM2xnig=
+rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
diff --git a/pca.go b/pca.go
index f02cee79457330575acd98cfd9f6a8f83685d588..4b60327e84e7e2c108204989f54223db5f4f8914 100644 (file)
--- a/pca.go
+++ b/pca.go
@@ -1,12 +1,22 @@
 package main
 
 import (
+       "bufio"
+       "errors"
        "flag"
        "fmt"
        "io"
+       "io/ioutil"
+       "net/http"
        _ "net/http/pprof"
+       "os"
+       "sort"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/james-bowman/nlp"
+       "github.com/kshedden/gonpy"
+       log "github.com/sirupsen/logrus"
+       "gonum.org/v1/gonum/mat"
 )
 
 type pythonPCA struct{}
@@ -56,3 +66,136 @@ scipy.save(sys.argv[2], PCA(n_components=4).fit_transform(scipy.load(sys.argv[1]
        fmt.Fprintln(stdout, output+"/pca.npy")
        return 0
 }
+
+type goPCA struct{}
+
+func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       priority := flags.Int("priority", 500, "container request priority")
+       inputFilename := flags.String("i", "-", "input `file`")
+       outputFilename := flags.String("o", "-", "output `file`")
+       components := flags.Int("components", 4, "number of components")
+       onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       }
+
+       if *pprof != "" {
+               go func() {
+                       log.Println(http.ListenAndServe(*pprof, nil))
+               }()
+       }
+
+       if !*runlocal {
+               if *outputFilename != "-" {
+                       err = errors.New("cannot specify output file in container mode: not implemented")
+                       return 1
+               }
+               runner := arvadosContainerRunner{
+                       Name:        "lightning pca-go",
+                       Client:      arvados.NewClientFromEnv(),
+                       ProjectUUID: *projectUUID,
+                       RAM:         432000000000,
+                       VCPUs:       2,
+                       Priority:    *priority,
+               }
+               err = runner.TranslatePaths(inputFilename)
+               if err != nil {
+                       return 1
+               }
+               runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"}
+               var output string
+               output, err = runner.Run()
+               if err != nil {
+                       return 1
+               }
+               fmt.Fprintln(stdout, output+"/pca.npy")
+               return 0
+       }
+
+       var input io.ReadCloser
+       if *inputFilename == "-" {
+               input = ioutil.NopCloser(stdin)
+       } else {
+               input, err = os.Open(*inputFilename)
+               if err != nil {
+                       return 1
+               }
+               defer input.Close()
+       }
+       cgs, err := ReadCompactGenomes(input)
+       if err != nil {
+               return 1
+       }
+       err = input.Close()
+       if err != nil {
+               return 1
+       }
+       sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
+
+       data, rows, cols := cgs2array(cgs)
+       if *onehot {
+               data, cols = recodeOnehot(data, cols)
+       }
+       pca, err := nlp.NewPCA(*components).FitTransform(array2matrix(rows, cols, data))
+       if err != nil {
+               return 1
+       }
+
+       rows, cols = pca.Dims()
+       out := make([]float64, rows*cols)
+       for i := 0; i < rows; i++ {
+               for j := 0; j < cols; j++ {
+                       out[i*cols+j] = pca.At(i, j)
+               }
+       }
+
+       var output io.WriteCloser
+       if *outputFilename == "-" {
+               output = nopCloser{stdout}
+       } else {
+               output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
+               if err != nil {
+                       return 1
+               }
+               defer output.Close()
+       }
+       bufw := bufio.NewWriter(output)
+       npw, err := gonpy.NewWriter(nopCloser{bufw})
+       if err != nil {
+               return 1
+       }
+       npw.Shape = []int{rows, cols}
+       npw.WriteFloat64(out)
+       err = bufw.Flush()
+       if err != nil {
+               return 1
+       }
+       err = output.Close()
+       if err != nil {
+               return 1
+       }
+       return 0
+}
+
+func array2matrix(rows, cols int, data []uint16) mat.Matrix {
+       floatdata := make([]float64, rows*cols)
+       for i, v := range data {
+               floatdata[i] = float64(v)
+       }
+       return mat.NewDense(rows, cols, floatdata)
+}