From 1b027f311ba07d95b1367e946baab261ea2844d4 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Fri, 14 Jan 2022 15:24:11 -0500 Subject: [PATCH] =?utf8?q?Load=20case/control/neither=20from=20csv=20colum?= =?utf8?q?n,=20fix=20=CE=A7=C2=B2=20filter.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit refs #18581 Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- chisquare.go | 3 + slice.go | 2 +- slice_test.go | 20 ++++-- slicenumpy.go | 165 +++++++++++++++++++++++++++++++++----------------- tilelib.go | 11 ++-- 5 files changed, 136 insertions(+), 65 deletions(-) diff --git a/chisquare.go b/chisquare.go index e1c8bfff1b..31142a1710 100644 --- a/chisquare.go +++ b/chisquare.go @@ -32,6 +32,9 @@ func pvalue(a, b []bool) float64 { for bi := 0; bi < 2; bi++ { rowtotal := tab[ai*2] + tab[ai*2+1] coltotal := tab[bi] + tab[2+bi] + if rowtotal == 0 || coltotal == 0 { + return 1 + } exp := float64(rowtotal) * float64(coltotal) / float64(len(a)) obs := tab[ai*2+bi] d := float64(obs) - exp diff --git a/slice.go b/slice.go index 4dbe6ab652..a4899836be 100644 --- a/slice.go +++ b/slice.go @@ -103,7 +103,7 @@ func (cmd *slicecmd) RunCommand(prog string, args []string, stdin io.Reader, std func Slice(tagsPerFile int, dstdir string, srcdirs []string) error { var infiles []string for _, srcdir := range srcdirs { - files, err := allGobFiles(srcdir) + files, err := allFiles(srcdir, matchGobFile) if err != nil { return err } diff --git a/slice_test.go b/slice_test.go index 62c0c9181f..10202a4f0a 100644 --- a/slice_test.go +++ b/slice_test.go @@ -214,13 +214,19 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) { c.Log("=== slice-numpy + chunked hgvs matrix ===") { - err = ioutil.WriteFile(tmpdir+"/cases.txt", []byte("pipeline1/input1\npipeline1dup/input1\n"), 0600) + err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID CC +pipeline1/input1 1 +pipeline1/input2 0 +pipeline1dup/input1 1 +pipeline1dup/input2 0 +`), 0600) c.Assert(err, check.IsNil) npydir := c.MkDir() exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{ "-local=true", "-chunked-hgvs-matrix=true", - "-chi2-cases-file=" + tmpdir + "/cases.txt", + "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv", + "-chi2-case-control-column=CC", "-chi2-p-value=0.05", "-min-coverage=0.75", "-input-dir=" + slicedir, @@ -240,13 +246,19 @@ func (s *sliceSuite) TestImportAndSlice(c *check.C) { c.Log("=== slice-numpy + onehot ===") { - err = ioutil.WriteFile(tmpdir+"/cases.txt", []byte("pipeline1/input1\npipeline1dup/input1\n"), 0600) + err = ioutil.WriteFile(tmpdir+"/casecontrol.tsv", []byte(`SampleID CC +pipeline1/input1 1 +pipeline1/input2 0 +pipeline1dup/input1 1 +pipeline1dup/input2 0 +`), 0600) c.Assert(err, check.IsNil) npydir := c.MkDir() exited := (&sliceNumpy{}).RunCommand("slice-numpy", []string{ "-local=true", "-chunked-onehot=true", - "-chi2-cases-file=" + tmpdir + "/cases.txt", + "-chi2-case-control-file=" + tmpdir + "/casecontrol.tsv", + "-chi2-case-control-column=CC", "-chi2-p-value=0.05", "-min-coverage=0.75", "-input-dir=" + slicedir, diff --git a/slicenumpy.go b/slicenumpy.go index c05f1509c0..25224b2617 100644 --- a/slicenumpy.go +++ b/slicenumpy.go @@ -32,13 +32,14 @@ import ( ) type sliceNumpy struct { - filter filter - threads int - chi2CasesFile string - chi2Cases []bool - chi2PValue float64 - minCoverage int - cgnames []string + filter filter + threads int + chi2CaseControlColumn string + chi2CaseControlFile string + chi2Cases []bool + chi2PValue float64 + minCoverage int + cgnames []string } func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { @@ -64,7 +65,8 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s hgvsChunked := flags.Bool("chunked-hgvs-matrix", false, "also generate hgvs-based matrix per chromosome") onehotChunked := flags.Bool("chunked-onehot", false, "generate one-hot tile-based matrix") flags.IntVar(&cmd.threads, "threads", 16, "number of memory-hungry assembly threads") - flags.StringVar(&cmd.chi2CasesFile, "chi2-cases-file", "", "text file indicating positive cases (for Χ² test)") + flags.StringVar(&cmd.chi2CaseControlFile, "chi2-case-control-file", "", "tsv file or directory indicating cases and controls for Χ² test (if directory, all .tsv files will be read)") + flags.StringVar(&cmd.chi2CaseControlColumn, "chi2-case-control-column", "", "name of case/control column in case-control files for Χ² test (value must be 0 for control, 1 for case)") flags.Float64Var(&cmd.chi2PValue, "chi2-p-value", 1, "do Χ² test and omit columns with p-value above this threshold") cmd.filter.Flags(flags) err = flags.Parse(args) @@ -81,8 +83,8 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s }() } - if cmd.chi2CasesFile == "" && cmd.chi2PValue != 1 { - log.Errorf("cannot use provided -chi2-p-value=%f because -chi2-cases-file= value is empty", cmd.chi2PValue) + if cmd.chi2PValue != 1 && (cmd.chi2CaseControlFile == "" || cmd.chi2CaseControlColumn == "") { + log.Errorf("cannot use provided -chi2-p-value=%f because -chi2-case-control-file= or -chi2-case-control-column= value is empty", cmd.chi2PValue) return 2 } @@ -97,7 +99,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s KeepCache: 2, APIAccess: true, } - err = runner.TranslatePaths(inputDir, regionsFilename, &cmd.chi2CasesFile) + err = runner.TranslatePaths(inputDir, regionsFilename, &cmd.chi2CaseControlFile) if err != nil { return 1 } @@ -112,7 +114,8 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s "-single-hgvs-matrix=" + fmt.Sprintf("%v", *hgvsSingle), "-chunked-hgvs-matrix=" + fmt.Sprintf("%v", *hgvsChunked), "-chunked-onehot=" + fmt.Sprintf("%v", *onehotChunked), - "-chi2-cases-file=" + cmd.chi2CasesFile, + "-chi2-case-control-file=" + cmd.chi2CaseControlFile, + "-chi2-case-control-column=" + cmd.chi2CaseControlColumn, "-chi2-p-value=" + fmt.Sprintf("%f", cmd.chi2PValue), } runner.Args = append(runner.Args, cmd.filter.Args()...) @@ -125,7 +128,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s return 0 } - infiles, err := allGobFiles(*inputDir) + infiles, err := allFiles(*inputDir, matchGobFile) if err != nil { return 1 } @@ -188,51 +191,14 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s return 1 } sort.Strings(cmd.cgnames) - - cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames)))) - - if cmd.chi2CasesFile != "" { - f, err2 := open(cmd.chi2CasesFile) - if err2 != nil { - err = err2 - return 1 - } - buf, err2 := io.ReadAll(f) - f.Close() - if err2 != nil { - err = err2 - return 1 - } - cmd.chi2Cases = make([]bool, len(cmd.cgnames)) - ncases := 0 - for _, pattern := range bytes.Split(buf, []byte{'\n'}) { - if len(pattern) == 0 { - continue - } - pattern := string(pattern) - idx := -1 - for i, name := range cmd.cgnames { - if !strings.Contains(name, pattern) { - continue - } - cmd.chi2Cases[i] = true - ncases++ - if idx >= 0 { - log.Warnf("pattern %q in cases file matches multiple genome IDs: %q, %q", pattern, cmd.cgnames[idx], name) - } else { - idx = i - } - } - if idx < 0 { - log.Warnf("pattern %q in cases file does not match any genome IDs", pattern) - continue - } - } - log.Printf("%d cases, %d controls", ncases, len(cmd.cgnames)-ncases) + err = cmd.useCaseControlFiles() + if err != nil { + return 1 } + cmd.minCoverage = int(math.Ceil(cmd.filter.MinCoverage * float64(len(cmd.cgnames)))) { - labelsFilename := *outputDir + "/labels.csv" + labelsFilename := *outputDir + "/samples.csv" log.Infof("writing labels to %s", labelsFilename) var f *os.File f, err = os.Create(labelsFilename) @@ -241,7 +207,11 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s } defer f.Close() for i, name := range cmd.cgnames { - _, err = fmt.Fprintf(f, "%d,%q\n", i, trimFilenameForLabel(name)) + cc := 0 + if cmd.chi2Cases != nil && cmd.chi2Cases[i] { + cc = 1 + } + _, err = fmt.Fprintf(f, "%d,%q,%d\n", i, trimFilenameForLabel(name), cc) if err != nil { err = fmt.Errorf("write %s: %w", labelsFilename, err) return 1 @@ -926,6 +896,89 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s return 0 } +// Read case/control files, remove non-case/control entries from +// cmd.cgnames, and build cmd.chi2Cases. +func (cmd *sliceNumpy) useCaseControlFiles() error { + if cmd.chi2CaseControlFile == "" { + return nil + } + infiles, err := allFiles(cmd.chi2CaseControlFile, nil) + if err != nil { + return err + } + // index in cmd.cgnames => case(true) / control(false) + cc := map[int]bool{} + for _, infile := range infiles { + f, err := open(infile) + if err != nil { + return err + } + buf, err := io.ReadAll(f) + f.Close() + if err != nil { + return err + } + ccCol := -1 + for _, tsv := range bytes.Split(buf, []byte{'\n'}) { + if len(tsv) == 0 { + continue + } + split := strings.Split(string(tsv), "\t") + if ccCol < 0 { + // header row + for col, name := range split { + if name == cmd.chi2CaseControlColumn { + ccCol = col + break + } + } + if ccCol < 0 { + return fmt.Errorf("%s: no column named %q in header row %q", infile, cmd.chi2CaseControlColumn, tsv) + } + continue + } + if len(split) <= ccCol { + continue + } + pattern := split[0] + found := -1 + for i, name := range cmd.cgnames { + if strings.Contains(name, pattern) { + if found >= 0 { + log.Warnf("pattern %q in %s matches multiple genome IDs (%qs, %q)", pattern, infile, cmd.cgnames[found], name) + } + found = i + } + } + if found < 0 { + log.Warnf("pattern %q in %s does not match any genome IDs", pattern, infile) + continue + } + if split[ccCol] == "0" { + cc[found] = false + } + if split[ccCol] == "1" { + cc[found] = true + } + } + } + allnames := cmd.cgnames + cmd.cgnames = nil + cmd.chi2Cases = nil + ncases := 0 + for i, name := range allnames { + if cc, ok := cc[i]; ok { + cmd.cgnames = append(cmd.cgnames, name) + cmd.chi2Cases = append(cmd.chi2Cases, cc) + if cc { + ncases++ + } + } + } + log.Printf("%d cases, %d controls, %d neither (dropped)", ncases, len(cmd.cgnames)-ncases, len(allnames)-len(cmd.cgnames)) + return nil +} + func (cmd *sliceNumpy) filterHGVScolpair(colpair [2][]int8) bool { if cmd.chi2PValue >= 1 { return true diff --git a/tilelib.go b/tilelib.go index 6acd8d6a26..550563342c 100644 --- a/tilelib.go +++ b/tilelib.go @@ -236,7 +236,7 @@ func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, varian return nil } -func allGobFiles(path string) ([]string, error) { +func allFiles(path string, re *regexp.Regexp) ([]string, error) { var files []string f, err := open(path) if err != nil { @@ -251,21 +251,24 @@ func allGobFiles(path string) ([]string, error) { if fi.Name() == "." || fi.Name() == ".." { continue } else if child := path + "/" + fi.Name(); fi.IsDir() { - add, err := allGobFiles(child) + add, err := allFiles(child, re) if err != nil { return nil, err } files = append(files, add...) - } else if strings.HasSuffix(child, ".gob") || strings.HasSuffix(child, ".gob.gz") { + } else if re == nil || re.MatchString(child) { files = append(files, child) } } + sort.Strings(files) return files, nil } +var matchGobFile = regexp.MustCompile(`\.gob(\.gz)?$`) + func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string) error { log.Infof("LoadDir: walk dir %s", path) - files, err := allGobFiles(path) + files, err := allFiles(path, matchGobFile) if err != nil { return err } -- 2.30.2