1 // Copyright (C) The Lightning Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
20 "git.arvados.org/arvados.git/sdk/go/arvados"
21 "github.com/james-bowman/nlp"
22 "github.com/kshedden/gonpy"
23 log "github.com/sirupsen/logrus"
24 "gonum.org/v1/gonum/mat"
27 type pythonPCA struct{}
29 func (cmd *pythonPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
33 fmt.Fprintf(stderr, "%s\n", err)
36 flags := flag.NewFlagSet("", flag.ContinueOnError)
37 flags.SetOutput(stderr)
38 projectUUID := flags.String("project", "", "project `UUID` for output data")
39 inputFilename := flags.String("i", "-", "input `file`")
40 priority := flags.Int("priority", 500, "container request priority")
41 err = flags.Parse(args)
42 if err == flag.ErrHelp {
45 } else if err != nil {
49 runner := arvadosContainerRunner{
50 Name: "lightning pca",
51 Client: arvados.NewClientFromEnv(),
52 ProjectUUID: *projectUUID,
57 err = runner.TranslatePaths(inputFilename)
61 runner.Prog = "python3"
62 runner.Args = []string{"-c", `import sys
64 from sklearn.decomposition import PCA
65 scipy.save(sys.argv[2], PCA(n_components=4).fit_transform(scipy.load(sys.argv[1])))`, *inputFilename, "/mnt/output/pca.npy"}
67 output, err = runner.Run()
71 fmt.Fprintln(stdout, output+"/pca.npy")
79 func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
83 fmt.Fprintf(stderr, "%s\n", err)
86 flags := flag.NewFlagSet("", flag.ContinueOnError)
87 flags.SetOutput(stderr)
88 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
89 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
90 projectUUID := flags.String("project", "", "project `UUID` for output data")
91 priority := flags.Int("priority", 500, "container request priority")
92 inputFilename := flags.String("i", "-", "input `file`")
93 outputFilename := flags.String("o", "-", "output `file`")
94 components := flags.Int("components", 4, "number of components")
95 onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
96 cmd.filter.Flags(flags)
97 err = flags.Parse(args)
98 if err == flag.ErrHelp {
101 } else if err != nil {
107 log.Println(http.ListenAndServe(*pprof, nil))
112 if *outputFilename != "-" {
113 err = errors.New("cannot specify output file in container mode: not implemented")
116 runner := arvadosContainerRunner{
117 Name: "lightning pca-go",
118 Client: arvados.NewClientFromEnv(),
119 ProjectUUID: *projectUUID,
120 RAM: 300000000000, // maybe 10x input size?
124 err = runner.TranslatePaths(inputFilename)
128 runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"}
129 runner.Args = append(runner.Args, cmd.filter.Args()...)
131 output, err = runner.Run()
135 fmt.Fprintln(stdout, output+"/pca.npy")
139 var input io.ReadCloser
140 if *inputFilename == "-" {
141 input = ioutil.NopCloser(stdin)
143 input, err = os.Open(*inputFilename)
150 tilelib := &tileLibrary{
152 compactGenomes: map[string][]tileVariantID{},
154 err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"))
163 log.Info("filtering")
164 cmd.filter.Apply(tilelib)
168 log.Print("converting cgs to array")
169 data, rows, cols := cgs2array(tilelib, cgnames(tilelib), lowqual(tilelib), nil, 0, len(tilelib.variant))
171 log.Printf("recode one-hot: %d rows, %d cols", rows, cols)
172 data, _, cols = recodeOnehot(data, cols)
176 log.Printf("creating matrix backed by array: %d rows, %d cols", rows, cols)
177 mtx := array2matrix(rows, cols, data).T()
180 transformer := nlp.NewPCA(*components)
182 log.Printf("transforming")
183 mtx, err = transformer.Transform(mtx)
189 rows, cols = mtx.Dims()
190 log.Printf("copying result to numpy output array: %d rows, %d cols", rows, cols)
191 out := make([]float64, rows*cols)
192 for i := 0; i < rows; i++ {
193 for j := 0; j < cols; j++ {
194 out[i*cols+j] = mtx.At(i, j)
198 var output io.WriteCloser
199 if *outputFilename == "-" {
200 output = nopCloser{stdout}
202 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
208 bufw := bufio.NewWriter(output)
209 npw, err := gonpy.NewWriter(nopCloser{bufw})
213 npw.Shape = []int{rows, cols}
214 log.Printf("writing numpy: %d rows, %d cols", rows, cols)
215 npw.WriteFloat64(out)
228 func array2matrix(rows, cols int, data []int16) mat.Matrix {
229 floatdata := make([]float64, rows*cols)
230 for i, v := range data {
231 floatdata[i] = float64(v)
233 return mat.NewDense(rows, cols, floatdata)