X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/16af77bb805d4641c33cc8920ff1c48d1471aed7..74ae10fa596b295fdd0904cf0568a3b3d7c5e0a9:/pca.go diff --git a/pca.go b/pca.go index 8c19e5623c..121925d4c5 100644 --- a/pca.go +++ b/pca.go @@ -1,4 +1,8 @@ -package main +// Copyright (C) The Lightning Authors. All rights reserved. +// +// SPDX-License-Identifier: AGPL-3.0 + +package lightning import ( "bufio" @@ -11,6 +15,7 @@ import ( "net/http" _ "net/http/pprof" "os" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" "github.com/james-bowman/nlp" @@ -67,7 +72,9 @@ scipy.save(sys.argv[2], PCA(n_components=4).fit_transform(scipy.load(sys.argv[1] return 0 } -type goPCA struct{} +type goPCA struct { + filter filter +} func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var err error @@ -86,6 +93,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout outputFilename := flags.String("o", "-", "output `file`") components := flags.Int("components", 4, "number of components") onehot := flags.Bool("one-hot", false, "recode tile variants as one-hot") + cmd.filter.Flags(flags) err = flags.Parse(args) if err == flag.ErrHelp { err = nil @@ -109,8 +117,8 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout Name: "lightning pca-go", Client: arvados.NewClientFromEnv(), ProjectUUID: *projectUUID, - RAM: 100000000000, // maybe 10x input size? - VCPUs: 2, + RAM: 300000000000, // maybe 10x input size? + VCPUs: 16, Priority: *priority, } err = runner.TranslatePaths(inputFilename) @@ -118,6 +126,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout return 1 } runner.Args = []string{"pca-go", "-local=true", fmt.Sprintf("-one-hot=%v", *onehot), "-i", *inputFilename, "-o", "/mnt/output/pca.npy"} + runner.Args = append(runner.Args, cmd.filter.Args()...) var output string output, err = runner.Run() if err != nil { @@ -138,11 +147,11 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout defer input.Close() } log.Print("reading") - tilelib := tileLibrary{ - includeNoCalls: true, + tilelib := &tileLibrary{ + retainNoCalls: true, compactGenomes: map[string][]tileVariantID{}, } - err = tilelib.LoadGob(context.Background(), input, nil) + err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz")) if err != nil { return 1 } @@ -151,12 +160,18 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout return 1 } + log.Info("filtering") + cmd.filter.Apply(tilelib) + log.Info("tidying") + tilelib.Tidy() + log.Print("converting cgs to array") - data, rows, cols := cgs2array(tilelib.compactGenomes) + data, rows, cols := cgs2array(tilelib, cgnames(tilelib), lowqual(tilelib), nil, 0, len(tilelib.variant)) if *onehot { log.Printf("recode one-hot: %d rows, %d cols", rows, cols) data, _, cols = recodeOnehot(data, cols) } + tilelib = nil log.Printf("creating matrix backed by array: %d rows, %d cols", rows, cols) mtx := array2matrix(rows, cols, data).T() @@ -210,7 +225,7 @@ func (cmd *goPCA) RunCommand(prog string, args []string, stdin io.Reader, stdout return 0 } -func array2matrix(rows, cols int, data []uint16) mat.Matrix { +func array2matrix(rows, cols int, data []int16) mat.Matrix { floatdata := make([]float64, rows*cols) for i, v := range data { floatdata[i] = float64(v)