From 4a3899f5808423646643b6a2c1a68dd4837d1372 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Wed, 18 Nov 2020 20:39:46 -0500 Subject: [PATCH] Configurable chromosome name pattern. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- import.go | 11 +++++++++-- tilelib.go | 7 ++++--- tilelib_test.go | 19 +++++++++++-------- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/import.go b/import.go index 629fccac2f..f7e099b4e6 100644 --- a/import.go +++ b/import.go @@ -41,6 +41,7 @@ type importer struct { outputTiles bool saveIncompleteTiles bool outputStats string + matchChromosome *regexp.Regexp encoder *gob.Encoder } @@ -62,6 +63,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file") flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles") flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)") + matchChromosome := flags.String("match-chromosome", "^(chr)?([0-9]+|X|Y|MT?)$", "import chromosomes that match the given `regexp`") priority := flags.Int("priority", 500, "container request priority") pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") loglevel := flags.String("loglevel", "info", "logging threshold (trace, debug, info, warn, error, fatal, or panic)") @@ -91,6 +93,11 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std } log.SetLevel(lvl) + cmd.matchChromosome, err = regexp.Compile(*matchChromosome) + if err != nil { + return 1 + } + if !cmd.runLocal { runner := arvadosContainerRunner{ Name: "lightning import", @@ -214,7 +221,7 @@ func (cmd *importer) tileFasta(tilelib *tileLibrary, infile string) (tileSeq, [] } defer input.Close() } - return tilelib.TileFasta(infile, input) + return tilelib.TileFasta(infile, input, cmd.matchChromosome) } func (cmd *importer) loadTagLibrary() (*tagLibrary, error) { @@ -480,7 +487,7 @@ func (cmd *importer) tileGVCF(tilelib *tileLibrary, infile string, phase int) (t return } defer consensus.Wait() - tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout) + tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout, cmd.matchChromosome) if err != nil { return } diff --git a/tilelib.go b/tilelib.go index 5ad12e7228..12facfdb5f 100644 --- a/tilelib.go +++ b/tilelib.go @@ -7,6 +7,7 @@ import ( "encoding/gob" "fmt" "io" + "regexp" "runtime" "sort" "strings" @@ -255,7 +256,7 @@ type importStats struct { DroppedOutOfOrderTiles int } -func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, []importStats, error) { +func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChromosome *regexp.Regexp) (tileSeq, []importStats, error) { ret := tileSeq{} type jobT struct { label string @@ -293,7 +294,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, for job := range todo { if len(job.fasta) == 0 { continue - } else if strings.Contains(job.label, "_") { + } else if !matchChromosome.MatchString(job.label) { skippedSequences++ continue } @@ -373,7 +374,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, totalPathLen += len(path) } - log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen) + log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences that did not match chromosome regexp, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen) return ret, stats, scanner.Err() } diff --git a/tilelib_test.go b/tilelib_test.go index 6d6ff5abb8..947d456ac8 100644 --- a/tilelib_test.go +++ b/tilelib_test.go @@ -2,6 +2,7 @@ package main import ( "bytes" + "regexp" "strings" "gopkg.in/check.v1" @@ -32,6 +33,8 @@ gttattaataataacttatcatca } func (s *tilelibSuite) TestSkipOOO(c *check.C) { + matchAllChromosomes := regexp.MustCompile(".") + // tags appear in seq: 4, 0, 2 (but skipOOO is false) tilelib := &tileLibrary{taglib: &s.taglib, skipOOO: false} tseq, _, err := tilelib.TileFasta("test-label", bytes.NewBufferString(">test-seq\n"+ @@ -40,7 +43,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[0]+ "cccccccccccccccccccc\n"+ s.tag[2]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{4, 1}, {0, 1}, {2, 1}}}) @@ -52,7 +55,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[1]+ "ggggggggggggggggggggggg\n"+ s.tag[2]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}}) @@ -64,7 +67,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[3]+ "ggggggggggggggggggggggg\n"+ s.tag[4]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{2, 1}, {3, 1}, {4, 1}}}) @@ -76,7 +79,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[0]+ "ggggggggggggggggggggggg\n"+ s.tag[2]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {2, 1}}}) @@ -88,7 +91,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[2]+ "ggggggggggggggggggggggg\n"+ s.tag[1]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}}) @@ -102,7 +105,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[1]+ "ggggggggggggggggggggggg\n"+ s.tag[2]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}}) @@ -118,7 +121,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[0]+ "ggggggggggggggggggggggg\n"+ s.tag[4]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}, {4, 1}}}) @@ -130,7 +133,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) { s.tag[1]+ "ggggggggggggggggggggggg\n"+ s.tag[3]+ - "\n")) + "\n"), matchAllChromosomes) c.Assert(err, check.IsNil) c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}}}) } -- 2.30.2