X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/347b4125594ed65d53df5c05d2d2aff7aa249b78..194c9e16c7cebf25d5830129f8f27d79a2a47ce5:/pca.go diff --git a/pca.go b/pca.go index 2bb8471d1b..121925d4c5 100644 --- a/pca.go +++ b/pca.go @@ -1,7 +1,12 @@ -package main +// Copyright (C) The Lightning Authors. All rights reserved. +// +// SPDX-License-Identifier: AGPL-3.0 + +package lightning import ( "bufio" + "context" "errors" "flag" "fmt" @@ -10,7 +15,7 @@ import ( "net/http" _ "net/http/pprof" "os" - "sort" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/james-bowman/nlp" @@ -67,7 +72,9 @@ scipy.save(sys.argv[2], PCA(n_components=4).fit_transform(scipy.load(sys.argv[1] return 0 } -type goPCA struct{} +type goPCA struct { + filter filter +} func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var err error @@ -86,6 +93,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout outputFilename := flags.String("o", "-", "output `file`") components := flags.Int("components", 4, "number of components") onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot") + cmd.filter.Flags(flags) err = flags.Parse(args) if err == flag.ErrHelp { err = nil @@ -109,8 +117,8 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout Name: "lightning pca-go", Client: arvados.NewClientFromEnv(), ProjectUUID: *projectUUID, - RAM: 432000000000, - VCPUs: 2, + RAM: 300000000000, // maybe 10x input size? + VCPUs: 16, Priority: *priority, } err = runner.TranslatePaths(inputFilename) @@ -118,6 +126,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout return 1 } runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"} + runner.Args = append(runner.Args, cmd.filter.Args()...) var output string output, err = runner.Run() if err != nil { @@ -137,7 +146,12 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout } defer input.Close() } - cgs, err := ReadCompactGenomes(input) + log.Print("reading") + tilelib := &tileLibrary{ + retainNoCalls: true, + compactGenomes: map[string][]tileVariantID{}, + } + err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz")) if err != nil { return 1 } @@ -145,23 +159,39 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout if err != nil { return 1 } - sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name }) - data, rows, cols := cgs2array(cgs) + log.Info("filtering") + cmd.filter.Apply(tilelib) + log.Info("tidying") + tilelib.Tidy() + + log.Print("converting cgs to array") + data, rows, cols := cgs2array(tilelib, cgnames(tilelib), lowqual(tilelib), nil, 0, len(tilelib.variant)) if *onehot { - data, cols = recodeOnehot(data, cols) + log.Printf("recode one-hot: %d rows, %d cols", rows, cols) + data, _, cols = recodeOnehot(data, cols) } - pca, err := nlp.NewPCA(*components).FitTransform(array2matrix(rows, cols, data).T()) + tilelib = nil + + log.Printf("creating matrix backed by array: %d rows, %d cols", rows, cols) + mtx := array2matrix(rows, cols, data).T() + + log.Print("fitting") + transformer := nlp.NewPCA(*components) + transformer.Fit(mtx) + log.Printf("transforming") + mtx, err = transformer.Transform(mtx) if err != nil { return 1 } + mtx = mtx.T() - pca = pca.T() - rows, cols = pca.Dims() + rows, cols = mtx.Dims() + log.Printf("copying result to numpy output array: %d rows, %d cols", rows, cols) out := make([]float64, rows*cols) for i := 0; i < rows; i++ { for j := 0; j < cols; j++ { - out[i*cols+j] = pca.At(i, j) + out[i*cols+j] = mtx.At(i, j) } } @@ -181,6 +211,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout return 1 } npw.Shape = []int{rows, cols} + log.Printf("writing numpy: %d rows, %d cols", rows, cols) npw.WriteFloat64(out) err = bufw.Flush() if err != nil { @@ -190,10 +221,11 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout if err != nil { return 1 } + log.Print("done") return 0 } -func array2matrix(rows, cols int, data []uint16) mat.Matrix { +func array2matrix(rows, cols int, data []int16) mat.Matrix { floatdata := make([]float64, rows*cols) for i, v := range data { floatdata[i] = float64(v)