From efa255a60877d6193f84fd2244ec017a83b4f31c Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 19 Oct 2022 16:17:36 -0400 Subject: [PATCH] 19524: Limit size of PCA input matrix. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- slicenumpy.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/slicenumpy.go b/slicenumpy.go index feab8b0cd7..62982f5407 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -80,6 +80,7 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk") onlyPCA := flags.Bool("pca", false, "generate pca matrix") pcaComponents := flags.Int("pca-components", 4, "number of PCA components") + maxPCATiles := flags.Int("max-pca-tiles", 100000, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)") debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag") flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container") flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)") @@ -1120,10 +1121,18 @@ func (cmd *sliceNumpy) run(prog string, args []string, stdin io.Reader, stdout, if cols == 0 { return fmt.Errorf("cannot do PCA: one-hot matrix is empty") } - log.Printf("creating matrix: %d rows, %d cols", len(cmd.cgnames), cols) + log.Printf("have %d one-hot cols", cols) + stride := 1 + for cols > *maxPCATiles*2 { + cols = cols / 2 + stride = stride * 2 + } + log.Printf("creating matrix: %d rows, %d cols, stride %d", len(cmd.cgnames), cols, stride) mtx := mat.NewDense(len(cmd.cgnames), cols, nil) for i, c := range onehot[nzCount:] { - mtx.Set(int(onehot[i]), int(c), 1) + if int(c/2)%stride == 0 { + mtx.Set(int(onehot[i]), int(c/2)/stride*2+int(c)%2, 1) + } } log.Print("fitting") transformer := nlp.NewPCA(*pcaComponents) -- 2.30.2