-package main
+package lightning
import (
"bufio"
+ "context"
"errors"
"flag"
"fmt"
"net/http"
_ "net/http/pprof"
"os"
- "sort"
+ "strings"
"git.arvados.org/arvados.git/sdk/go/arvados"
"github.com/james-bowman/nlp"
return 0
}
-type goPCA struct{}
+type goPCA struct {
+ filter filter
+}
func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
var err error
outputFilename := flags.String("o", "-", "output `file`")
components := flags.Int("components", 4, "number of components")
onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot")
+ cmd.filter.Flags(flags)
err = flags.Parse(args)
if err == flag.ErrHelp {
err = nil
Name: "lightning pca-go",
Client: arvados.NewClientFromEnv(),
ProjectUUID: *projectUUID,
- RAM: 432000000000,
- VCPUs: 2,
+ RAM: 300000000000, // maybe 10x input size?
+ VCPUs: 16,
Priority: *priority,
}
err = runner.TranslatePaths(inputFilename)
return 1
}
runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"}
+ runner.Args = append(runner.Args, cmd.filter.Args()...)
var output string
output, err = runner.Run()
if err != nil {
defer input.Close()
}
log.Print("reading")
- cgs, err := ReadCompactGenomes(input)
+ tilelib := &tileLibrary{
+ retainNoCalls: true,
+ compactGenomes: map[string][]tileVariantID{},
+ }
+ err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), nil)
if err != nil {
return 1
}
if err != nil {
return 1
}
- log.Print("sorting")
- sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
+
+ log.Info("filtering")
+ cmd.filter.Apply(tilelib)
+ log.Info("tidying")
+ tilelib.Tidy()
log.Print("converting cgs to array")
- data, rows, cols := cgs2array(cgs)
+ data, rows, cols := cgs2array(tilelib, cgnames(tilelib), lowqual(tilelib), nil, 0, len(tilelib.variant))
if *onehot {
log.Printf("recode one-hot: %d rows, %d cols", rows, cols)
- data, cols = recodeOnehot(data, cols)
+ data, _, cols = recodeOnehot(data, cols)
}
- cgs = nil
+ tilelib = nil
log.Printf("creating matrix backed by array: %d rows, %d cols", rows, cols)
mtx := array2matrix(rows, cols, data).T()
return 0
}
-func array2matrix(rows, cols int, data []uint16) mat.Matrix {
+func array2matrix(rows, cols int, data []int16) mat.Matrix {
floatdata := make([]float64, rows*cols)
for i, v := range data {
floatdata[i] = float64(v)