outputFilename := flags.String("o", "", "output `filename` (e.g., './plot.png')")
sampleListFilename := flags.String("samples", "", "use second column of `samples.csv` as complete list of sample IDs")
phenotypeFilename := flags.String("phenotype", "", "use `phenotype.csv` as id->phenotype mapping (column 0 is sample id)")
- phenotypeCategoryColumn := flags.Int("phenotype-category-column", -1, "0-based column `index` of 2nd category in phenotype.csv file")
- phenotypeColumn := flags.Int("phenotype-column", 1, "0-based column `index` of phenotype in phenotype.csv file")
+ cat1Column := flags.Int("phenotype-cat1-column", 1, "0-based column `index` of 1st category in phenotype.csv file")
+ cat2Column := flags.Int("phenotype-cat2-column", -1, "0-based column `index` of 2nd category in phenotype.csv file")
+ xComponent := flags.Int("x", 1, "1-based PCA component to plot on x axis")
+ yComponent := flags.Int("y", 2, "1-based PCA component to plot on y axis")
priority := flags.Int("priority", 500, "container request priority")
runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
err = flags.Parse(args)
}
*outputFilename = "/mnt/output/plot.png"
}
- args = []string{*inputFilename, *sampleListFilename, *phenotypeFilename, fmt.Sprintf("%d", *phenotypeCategoryColumn), fmt.Sprintf("%d", *phenotypeColumn), *outputFilename}
+ args = []string{
+ *inputFilename,
+ fmt.Sprintf("%d", *xComponent),
+ fmt.Sprintf("%d", *yComponent),
+ *sampleListFilename,
+ *phenotypeFilename,
+ fmt.Sprintf("%d", *cat1Column),
+ fmt.Sprintf("%d", *cat2Column),
+ *outputFilename,
+ }
if *runlocal {
if *outputFilename == "" {
fmt.Fprintln(stderr, "error: must specify -o filename.png in local mode (or try -help)")
import scipy
import sys
-infile = sys.argv[1]
-X = numpy.load(infile)
+(_,
+ input_path,
+ x_component,
+ y_component,
+ samples_file,
+ phenotype_path,
+ phenotype_cat1_column,
+ phenotype_cat2_column,
+ output_path,
+ ) = sys.argv
+X = numpy.load(input_path)
colors = None
category = {}
samples = []
-if sys.argv[2]:
+if samples_file:
labels = {}
- with open(sys.argv[2], 'rt', newline='') as samplelist:
+ with open(samples_file, 'rt', newline='') as samplelist:
for row in csv.reader(samplelist):
sampleid = row[1]
samples.append(sampleid)
- phenotype_category_column = int(sys.argv[4])
- phenotype_column = int(sys.argv[5])
- if os.path.isdir(sys.argv[3]):
- phenotype_files = os.scandir(sys.argv[3])
+ phenotype_cat2_column = int(phenotype_cat2_column)
+ phenotype_cat1_column = int(phenotype_cat1_column)
+ if os.path.isdir(phenotype_path):
+ phenotype_files = os.scandir(phenotype_path)
else:
- phenotype_files = [sys.argv[3]]
+ phenotype_files = [phenotype_path]
for phenotype_file in phenotype_files:
with open(phenotype_file, 'rt', newline='') as phenotype:
dialect = csv.Sniffer().sniff(phenotype.read(1024))
phenotype.seek(0)
for row in csv.reader(phenotype, dialect):
tag = row[0]
- label = row[phenotype_column]
+ label = row[phenotype_cat1_column]
for sampleid in samples:
if tag in sampleid:
labels[sampleid] = label
- if phenotype_category_column >= 0 and row[phenotype_category_column] != '0':
+ if phenotype_cat2_column >= 0 and row[phenotype_cat2_column] != '0':
category[sampleid] = True
+ unknown_color = 'grey'
colors = []
labelcolors = {
'PUR': 'firebrick',
'CEU': 'green',
'GBR': 'green',
'FIN': 'green',
- '2': 'green',
+ '5': 'green',
'LWK': 'coral',
'MSL': 'coral',
'GWD': 'coral',
'ESN': 'coral',
'ACB': 'coral',
'ASW': 'coral',
- '3': 'coral',
+ '4': 'coral',
'KHV': 'royalblue',
'CDX': 'royalblue',
'CHS': 'royalblue',
'CHB': 'royalblue',
'JPT': 'royalblue',
- '4': 'royalblue',
+ '2': 'royalblue',
'STU': 'blueviolet',
'ITU': 'blueviolet',
'BEB': 'blueviolet',
'GIH': 'blueviolet',
'PJL': 'blueviolet',
- '5': 'blueviolet',
- '6': 'black', # unknown?
+ '3': 'navy',
}
for sampleid in samples:
if (sampleid in labels) and (labels[sampleid] in labelcolors):
colors.append(labelcolors[labels[sampleid]])
else:
- colors.append('black')
+ colors.append(unknown_color)
from matplotlib.figure import Figure
from matplotlib.patches import Polygon
y = []
if samples:
c = []
- for i, sampleid in enumerate(samples):
- if category.get(sampleid, False) == (marker == 'x'):
- x.append(X[i,0])
- y.append(X[i,1])
- c.append(colors[i])
+ for unknownfirst in [True, False]:
+ for i, sampleid in enumerate(samples):
+ if ((colors[i] == unknown_color) == unknownfirst and
+ category.get(sampleid, False) == (marker == 'x')):
+ x.append(X[i,int(x_component)-1])
+ y.append(X[i,int(y_component)-1])
+ c.append(colors[i])
elif marker == 'x':
continue
else:
- x = X[:,0]
- y = X[:,1]
+ x = X[:,int(x_component)-1]
+ y = X[:,int(y_component)-1]
c = None
ax.scatter(x, y, c=c, s=60, marker=marker, alpha=0.5)
canvas = FigureCanvasAgg(fig)
-canvas.print_figure(sys.argv[6], dpi=80)
+canvas.print_figure(output_path, dpi=80)
flags.SetOutput(stderr)
pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+ arvadosRAM := flags.Int("arvados-ram", 750000000000, "amount of memory to request for arvados container (`bytes`)")
+ arvadosVCPUs := flags.Int("arvados-vcpus", 96, "number of VCPUs to request for arvados container")
projectUUID := flags.String("project", "", "project `UUID` for output data")
priority := flags.Int("priority", 500, "container request priority")
inputDir := flags.String("input-dir", "./in", "input `directory`")
onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix per input chunk")
onlyPCA := flags.Bool("pca", false, "generate pca matrix")
pcaComponents := flags.Int("pca-components", 4, "number of PCA components")
+ maxPCATiles := flags.Int("max-pca-tiles", 0, "maximum tiles to use as PCA input (filter, then drop every 2nd colum pair until below max)")
debugTag := flags.Int("debug-tag", -1, "log debugging details about specified tag")
- flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads")
+ flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads, and number of VCPUs to request for arvados container")
flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)")
flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)")
flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test and omit columns with p-value above this threshold")
Name: "lightning slice-numpy",
Client: arvados.NewClientFromEnv(),
ProjectUUID: *projectUUID,
- RAM: 750000000000,
- VCPUs: 96,
+ RAM: int64(*arvadosRAM),
+ VCPUs: *arvadosVCPUs,
Priority: *priority,
KeepCache: 2,
APIAccess: true,
"-chunked-onehot=" + fmt.Sprintf("%v", *onehotChunked),
"-pca=" + fmt.Sprintf("%v", *onlyPCA),
"-pca-components=" + fmt.Sprintf("%d", *pcaComponents),
+ "-max-pca-tiles=" + fmt.Sprintf("%d", *maxPCATiles),
"-chi2-case-control-file=" + cmd.chi2CaseControlFile,
"-chi2-case-control-column=" + cmd.chi2CaseControlColumn,
"-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue),
if cols == 0 {
return fmt.Errorf("cannot do PCA: one-hot matrix is empty")
}
- log.Printf("creating matrix: %d rows, %d cols", len(cmd.cgnames), cols)
+ log.Printf("have %d one-hot cols", cols)
+ stride := 1
+ for *maxPCATiles > 0 && cols > *maxPCATiles*2 {
+ cols = (cols + 1) / 2
+ stride = stride * 2
+ }
+ log.Printf("creating matrix: %d rows, %d cols, stride %d", len(cmd.cgnames), cols, stride)
mtx := mat.NewDense(len(cmd.cgnames), cols, nil)
for i, c := range onehot[nzCount:] {
- mtx.Set(int(onehot[i]), int(c), 1)
+ if int(c/2)%stride == 0 {
+ mtx.Set(int(onehot[i]), int(c/2)/stride*2+int(c)%2, 1)
+ }
}
log.Print("fitting")
transformer := nlp.NewPCA(*pcaComponents)