1 // Copyright (C) The Lightning Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
21 "git.arvados.org/arvados.git/sdk/go/arvados"
22 log "github.com/sirupsen/logrus"
25 type chooseSamples struct {
29 func (cmd *chooseSamples) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
30 err := cmd.run(prog, args, stdin, stdout, stderr)
32 fmt.Fprintf(stderr, "%s\n", err)
38 func (cmd *chooseSamples) run(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) error {
39 flags := flag.NewFlagSet("", flag.ContinueOnError)
40 flags.SetOutput(stderr)
41 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
42 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
43 projectUUID := flags.String("project", "", "project `UUID` for output data")
44 priority := flags.Int("priority", 500, "container request priority")
45 inputDir := flags.String("input-dir", "./in", "input `directory`")
46 outputDir := flags.String("output-dir", "./out", "output `directory`")
47 trainingSetSize := flags.Float64("training-set-size", 0.8, "number (or proportion, if <=1) of eligible samples to assign to the training set")
48 caseControlFilename := flags.String("case-control-file", "", "tsv file or directory indicating cases and controls (if directory, all .tsv files will be read)")
49 caseControlColumn := flags.String("case-control-column", "", "name of case/control column in case-control files (value must be 0 for control, 1 for case)")
50 randSeed := flags.Int64("random-seed", 0, "PRNG seed")
51 cmd.filter.Flags(flags)
52 err := flags.Parse(args)
53 if err == flag.ErrHelp {
55 } else if err != nil {
57 } else if flags.NArg() > 0 {
58 return fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
60 if (*caseControlFilename == "") != (*caseControlColumn == "") {
61 return errors.New("must provide both -case-control-file and -case-control-column, or neither")
66 log.Println(http.ListenAndServe(*pprof, nil))
71 runner := arvadosContainerRunner{
72 Name: "lightning choose-samples",
73 Client: arvados.NewClientFromEnv(),
74 ProjectUUID: *projectUUID,
81 err = runner.TranslatePaths(inputDir, caseControlFilename)
85 runner.Args = []string{"choose-samples", "-local=true",
87 "-input-dir=" + *inputDir,
88 "-output-dir=/mnt/output",
89 "-case-control-file=" + *caseControlFilename,
90 "-case-control-column=" + *caseControlColumn,
91 "-training-set-size=" + fmt.Sprintf("%f", *trainingSetSize),
92 "-random-seed=" + fmt.Sprintf("%d", *randSeed),
94 runner.Args = append(runner.Args, cmd.filter.Args()...)
96 output, err = runner.Run()
100 fmt.Fprintln(stdout, output)
104 infiles, err := allFiles(*inputDir, matchGobFile)
108 if len(infiles) == 0 {
109 err = fmt.Errorf("no input files found in %s", *inputDir)
112 sort.Strings(infiles)
114 in0, err := open(infiles[0])
119 matchGenome, err := regexp.Compile(cmd.filter.MatchGenome)
121 err = fmt.Errorf("-match-genome: invalid regexp: %q", cmd.filter.MatchGenome)
125 var sampleIDs []string
126 err = DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
127 for _, cg := range ent.CompactGenomes {
128 if matchGenome.MatchString(cg.Name) {
129 sampleIDs = append(sampleIDs, cg.Name)
139 if len(sampleIDs) == 0 {
140 err = fmt.Errorf("no genomes found matching regexp %q", cmd.filter.MatchGenome)
143 sort.Strings(sampleIDs)
144 caseControl, err := cmd.loadCaseControlFiles(*caseControlFilename, *caseControlColumn, sampleIDs)
148 if len(caseControl) == 0 {
149 err = fmt.Errorf("fatal: 0 cases, 0 controls, nothing to do")
153 var trainingSet, validationSet []int
154 for i := range caseControl {
155 trainingSet = append(trainingSet, i)
157 sort.Ints(trainingSet)
158 wantlen := int(*trainingSetSize)
159 if *trainingSetSize <= 1 {
160 wantlen = int(*trainingSetSize * float64(len(trainingSet)))
162 randsrc := rand.NewSource(*randSeed)
163 for tslen := len(trainingSet); tslen > wantlen; {
164 i := int(randsrc.Int63()) % tslen
165 validationSet = append(validationSet, trainingSet[i])
167 trainingSet[i] = trainingSet[tslen]
168 trainingSet = trainingSet[:tslen]
170 sort.Ints(trainingSet)
171 sort.Ints(validationSet)
173 samplesFilename := *outputDir + "/samples.csv"
174 log.Infof("writing sample metadata to %s", samplesFilename)
176 f, err = os.Create(samplesFilename)
181 _, err = fmt.Fprint(f, "Index,SampleID,CaseControl,TrainingValidation\n")
185 tsi := 0 // next idx in training set
186 vsi := 0 // next idx in validation set
187 for i, name := range sampleIDs {
189 if len(trainingSet) > tsi && trainingSet[tsi] == i {
197 } else if len(validationSet) > vsi && validationSet[vsi] == i {
206 _, err = fmt.Fprintf(f, "%d,%s,%s,%s\n", i, trimFilenameForLabel(name), cc, tv)
208 err = fmt.Errorf("write %s: %w", samplesFilename, err)
214 err = fmt.Errorf("close %s: %w", samplesFilename, err)
220 // Read case/control file(s). Returned map m has m[i]==true if
221 // sampleIDs[i] is case, m[i]==false if sampleIDs[i] is control.
222 func (cmd *chooseSamples) loadCaseControlFiles(path, colname string, sampleIDs []string) (map[int]bool, error) {
224 // all samples are control group
225 cc := make(map[int]bool, len(sampleIDs))
226 for i := range sampleIDs {
231 infiles, err := allFiles(path, nil)
235 // index in sampleIDs => case(true) / control(false)
237 // index in sampleIDs => true if matched by multiple patterns in case/control files
238 dup := map[int]bool{}
239 for _, infile := range infiles {
240 f, err := open(infile)
244 buf, err := io.ReadAll(f)
250 for _, tsv := range bytes.Split(buf, []byte{'\n'}) {
254 split := strings.Split(string(tsv), "\t")
257 for col, name := range split {
264 return nil, fmt.Errorf("%s: no column named %q in header row %q", infile, colname, tsv)
268 if len(split) <= ccCol {
273 for i, name := range sampleIDs {
274 if strings.Contains(name, pattern) {
276 log.Warnf("pattern %q in %s matches multiple sample IDs (%q, %q)", pattern, infile, sampleIDs[found], name)
280 } else if _, ok := cc[i]; ok {
281 log.Warnf("multiple patterns match sample ID %q, omitting from cases/controls", name)
287 if split[ccCol] == "0" {
290 if split[ccCol] == "1" {
296 log.Warnf("pattern %q in %s does not match any genome IDs", pattern, infile)