15 "git.arvados.org/arvados.git/sdk/go/arvados"
16 "github.com/james-bowman/nlp"
17 "github.com/kshedden/gonpy"
18 log "github.com/sirupsen/logrus"
19 "gonum.org/v1/gonum/mat"
22 type pythonPCA struct{}
24 func (cmd *pythonPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
28 fmt.Fprintf(stderr, "%s\n", err)
31 flags := flag.NewFlagSet("", flag.ContinueOnError)
32 flags.SetOutput(stderr)
33 projectUUID := flags.String("project", "", "project `UUID` for output data")
34 inputFilename := flags.String("i", "-", "input `file`")
35 priority := flags.Int("priority", 500, "container request priority")
36 err = flags.Parse(args)
37 if err == flag.ErrHelp {
40 } else if err != nil {
44 runner := arvadosContainerRunner{
45 Name: "lightning pca",
46 Client: arvados.NewClientFromEnv(),
47 ProjectUUID: *projectUUID,
52 err = runner.TranslatePaths(inputFilename)
56 runner.Prog = "python3"
57 runner.Args = []string{"-c", `import sys
59 from sklearn.decomposition import PCA
60 scipy.save(sys.argv[2], PCA(n_components=4).fit_transform(scipy.load(sys.argv[1])))`, *inputFilename, "/mnt/output/pca.npy"}
62 output, err = runner.Run()
66 fmt.Fprintln(stdout, output+"/pca.npy")
72 func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
76 fmt.Fprintf(stderr, "%s\n", err)
79 flags := flag.NewFlagSet("", flag.ContinueOnError)
80 flags.SetOutput(stderr)
81 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
82 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
83 projectUUID := flags.String("project", "", "project `UUID` for output data")
84 priority := flags.Int("priority", 500, "container request priority")
85 inputFilename := flags.String("i", "-", "input `file`")
86 outputFilename := flags.String("o", "-", "output `file`")
87 components := flags.Int("components", 4, "number of components")
88 onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
89 err = flags.Parse(args)
90 if err == flag.ErrHelp {
93 } else if err != nil {
99 log.Println(http.ListenAndServe(*pprof, nil))
104 if *outputFilename != "-" {
105 err = errors.New("cannot specify output file in container mode: not implemented")
108 runner := arvadosContainerRunner{
109 Name: "lightning pca-go",
110 Client: arvados.NewClientFromEnv(),
111 ProjectUUID: *projectUUID,
116 err = runner.TranslatePaths(inputFilename)
120 runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"}
122 output, err = runner.Run()
126 fmt.Fprintln(stdout, output+"/pca.npy")
130 var input io.ReadCloser
131 if *inputFilename == "-" {
132 input = ioutil.NopCloser(stdin)
134 input, err = os.Open(*inputFilename)
141 cgs, err := ReadCompactGenomes(input)
150 sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
152 log.Print("converting cgs to array")
153 data, rows, cols := cgs2array(cgs)
155 data, cols = recodeOnehot(data, cols)
157 log.Print("running fit+transform")
158 pca, err := nlp.NewPCA(*components).FitTransform(array2matrix(rows, cols, data).T())
163 log.Print("transposing result")
165 log.Print("copying result to numpy output array")
166 rows, cols = pca.Dims()
167 out := make([]float64, rows*cols)
168 for i := 0; i < rows; i++ {
169 for j := 0; j < cols; j++ {
170 out[i*cols+j] = pca.At(i, j)
174 var output io.WriteCloser
175 if *outputFilename == "-" {
176 output = nopCloser{stdout}
178 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
184 bufw := bufio.NewWriter(output)
185 npw, err := gonpy.NewWriter(nopCloser{bufw})
189 npw.Shape = []int{rows, cols}
190 log.Print("writing numpy")
191 npw.WriteFloat64(out)
204 func array2matrix(rows, cols int, data []uint16) mat.Matrix {
205 floatdata := make([]float64, rows*cols)
206 for i, v := range data {
207 floatdata[i] = float64(v)
209 return mat.NewDense(rows, cols, floatdata)