onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk")
onlyPCA := flags.Bool("pca", false, "generate pca matrix")
pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
+ maxPCATiles := flags.Int("max-pca-tiles", 100000, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)")
if cols == 0 {
return fmt.Errorf("cannot do PCA: one-hot matrix is empty")
}
- log.Printf("creating matrix: %d rows, %d cols", len(cmd.cgnames), cols)
+ log.Printf("have %d one-hot cols", cols)
+ stride := 1
+ for cols > *maxPCATiles*2 {
+ cols = cols / 2
+ stride = stride * 2
+ }
+ log.Printf("creating matrix: %d rows, %d cols, stride %d", len(cmd.cgnames), cols, stride)
mtx := mat.NewDense(len(cmd.cgnames), cols, nil)
for i, c := range onehot[nzCount:] {
- mtx.Set(int(onehot[i]), int(c), 1)
+ if int(c/2)%stride == 0 {
+ mtx.Set(int(onehot[i]), int(c/2)/stride*2+int(c)%2, 1)
+ }
}
log.Print("fitting")
transformer := nlp.NewPCA(*pcaComponents)