X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/a7108a8ddd8f53b47540c9c02767509f7582d200..67852718f3bad4936e271adcce9e59c03bdf9344:/pca.go diff --git a/pca.go b/pca.go index bebd6765b7..41e3a7877f 100644 --- a/pca.go +++ b/pca.go @@ -2,6 +2,7 @@ package main import ( "bufio" + "context" "errors" "flag" "fmt" @@ -10,7 +11,7 @@ import ( "net/http" _ "net/http/pprof" "os" - "sort" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/james-bowman/nlp" @@ -67,7 +68,9 @@ scipy.save(sys.argv[2], PCA(n_components=4).fit_transform(scipy.load(sys.argv[1] return 0 } -type goPCA struct{} +type goPCA struct { + filter filter +} func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var err error @@ -86,6 +89,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout outputFilename := flags.String("o", "-", "output `file`") components := flags.Int("components", 4, "number of components") onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot") + cmd.filter.Flags(flags) err = flags.Parse(args) if err == flag.ErrHelp { err = nil @@ -109,8 +113,8 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout Name: "lightning pca-go", Client: arvados.NewClientFromEnv(), ProjectUUID: *projectUUID, - RAM: 100000000000, // maybe 10x input size? - VCPUs: 2, + RAM: 300000000000, // maybe 10x input size? + VCPUs: 16, Priority: *priority, } err = runner.TranslatePaths(inputFilename) @@ -118,6 +122,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout return 1 } runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"} + runner.Args = append(runner.Args, cmd.filter.Args()...) var output string output, err = runner.Run() if err != nil { @@ -138,7 +143,11 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout defer input.Close() } log.Print("reading") - cgs, err := ReadCompactGenomes(input) + tilelib := &tileLibrary{ + retainNoCalls: true, + compactGenomes: map[string][]tileVariantID{}, + } + err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"), nil) if err != nil { return 1 } @@ -146,16 +155,19 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout if err != nil { return 1 } - log.Print("sorting") - sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name }) + + log.Info("filtering") + cmd.filter.Apply(tilelib) + log.Info("tidying") + tilelib.Tidy() log.Print("converting cgs to array") - data, rows, cols := cgs2array(cgs) + data, rows, cols, _ := cgs2array(tilelib) if *onehot { log.Printf("recode one-hot: %d rows, %d cols", rows, cols) - data, cols = recodeOnehot(data, cols) + data, _, cols = recodeOnehot(data, cols) } - cgs = nil + tilelib = nil log.Printf("creating matrix backed by array: %d rows, %d cols", rows, cols) mtx := array2matrix(rows, cols, data).T() @@ -209,7 +221,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout return 0 } -func array2matrix(rows, cols int, data []uint16) mat.Matrix { +func array2matrix(rows, cols int, data []int16) mat.Matrix { floatdata := make([]float64, rows*cols) for i, v := range data { floatdata[i] = float64(v)