-package main
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
import (
"bufio"
+ "context"
"errors"
"flag"
"fmt"
"net/http"
_ "net/http/pprof"
"os"
- "sort"
+ "strings"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/james-bowman/nlp"
return 0
}
-type goPCA struct{}
+type goPCA struct {
+ filter filter
+}
func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
var err error
outputFilename := flags.String("o", "-", "output `file`")
components := flags.Int("components", 4, "number of components")
onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
+ cmd.filter.Flags(flags)
err = flags.Parse(args)
if err == flag.ErrHelp {
err = nil
Name: "lightning pca-go",
Client: arvados.NewClientFromEnv(),
ProjectUUID: *projectUUID,
- RAM: 432000000000,
- VCPUs: 2,
+ RAM: 300000000000, // maybe 10x input size?
+ VCPUs: 16,
Priority: *priority,
}
err = runner.TranslatePaths(inputFilename)
return 1
}
runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"}
+ runner.Args = append(runner.Args, cmd.filter.Args()...)
var output string
output, err = runner.Run()
if err != nil {
defer input.Close()
}
log.Print("reading")
- cgs, err := ReadCompactGenomes(input)
+ tilelib := &tileLibrary{
+ retainNoCalls: true,
+ compactGenomes: map[string][]tileVariantID{},
+ }
+ err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"))
if err != nil {
return 1
}
if err != nil {
return 1
}
- log.Print("sorting")
- sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
+
+ log.Info("filtering")
+ cmd.filter.Apply(tilelib)
+ log.Info("tidying")
+ tilelib.Tidy()
log.Print("converting cgs to array")
- data, rows, cols := cgs2array(cgs)
+ data, rows, cols := cgs2array(tilelib, cgnames(tilelib), lowqual(tilelib), nil, 0, len(tilelib.variant))
if *onehot {
log.Printf("recode one-hot: %d rows, %d cols", rows, cols)
- data, cols = recodeOnehot(data, cols)
+ data, _, cols = recodeOnehot(data, cols)
}
- log.Printf("running fit+transform: %d rows, %d cols", rows, cols)
- pca, err := nlp.NewPCA(*components).FitTransform(array2matrix(rows, cols, data).T())
+ tilelib = nil
+
+ log.Printf("creating matrix backed by array: %d rows, %d cols", rows, cols)
+ mtx := array2matrix(rows, cols, data).T()
+
+ log.Print("fitting")
+ transformer := nlp.NewPCA(*components)
+ transformer.Fit(mtx)
+ log.Printf("transforming")
+ mtx, err = transformer.Transform(mtx)
if err != nil {
return 1
}
+ mtx = mtx.T()
- log.Print("transposing result")
- pca = pca.T()
- rows, cols = pca.Dims()
+ rows, cols = mtx.Dims()
log.Printf("copying result to numpy output array: %d rows, %d cols", rows, cols)
out := make([]float64, rows*cols)
for i := 0; i < rows; i++ {
for j := 0; j < cols; j++ {
- out[i*cols+j] = pca.At(i, j)
+ out[i*cols+j] = mtx.At(i, j)
}
}
return 0
}
-func array2matrix(rows, cols int, data []uint16) mat.Matrix {
+func array2matrix(rows, cols int, data []int16) mat.Matrix {
floatdata := make([]float64, rows*cols)
for i, v := range data {
floatdata[i] = float64(v)