Configurable chromosome name pattern.
author: Tom Clegg <tom@tomclegg.ca>
Thu, 19 Nov 2020 01:39:46 +0000 (20:39 -0500)
committer: Tom Clegg <tom@tomclegg.ca>
Thu, 19 Nov 2020 01:39:46 +0000 (20:39 -0500)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

import.go
tilelib.go
tilelib_test.go

index 629fccac2fc049646a58c23fbbbd11a14292ef52..f7e099b4e6718509514f28dfec911157a5c5ddbe 100644 (file)
--- a/import.go
+++ b/import.go
@@ -41,6 +41,7 @@ type importer struct {
        outputTiles         bool
        saveIncompleteTiles bool
        outputStats         string
+       matchChromosome     *regexp.Regexp
        encoder             *gob.Encoder
 }
 
@@ -62,6 +63,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file")
        flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles")
        flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)")
+       matchChromosome := flags.String("match-chromosome", "^(chr)?([0-9]+|X|Y|MT?)$", "import chromosomes that match the given `regexp`")
        priority := flags.Int("priority", 500, "container request priority")
        pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
        loglevel := flags.String("loglevel", "info", "logging threshold (trace, debug, info, warn, error, fatal, or panic)")
@@ -91,6 +93,11 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        }
        log.SetLevel(lvl)
 
+       cmd.matchChromosome, err = regexp.Compile(*matchChromosome)
+       if err != nil {
+               return 1
+       }
+
        if !cmd.runLocal {
                runner := arvadosContainerRunner{
                        Name:        "lightning import",
@@ -214,7 +221,7 @@ func (cmd *importer) tileFasta(tilelib *tileLibrary, infile string) (tileSeq, []
                }
                defer input.Close()
        }
-       return tilelib.TileFasta(infile, input)
+       return tilelib.TileFasta(infile, input, cmd.matchChromosome)
 }
 
 func (cmd *importer) loadTagLibrary() (*tagLibrary, error) {
@@ -480,7 +487,7 @@ func (cmd *importer) tileGVCF(tilelib *tileLibrary, infile string, phase int) (t
                return
        }
        defer consensus.Wait()
-       tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout)
+       tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout, cmd.matchChromosome)
        if err != nil {
                return
        }
index 5ad12e7228667024e562e12b8a6030f7fefac05a..12facfdb5f665226f5d1de0952d30ea5c7bb5181 100644 (file)
@@ -7,6 +7,7 @@ import (
        "encoding/gob"
        "fmt"
        "io"
+       "regexp"
        "runtime"
        "sort"
        "strings"
@@ -255,7 +256,7 @@ type importStats struct {
        DroppedOutOfOrderTiles int
 }
 
-func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, []importStats, error) {
+func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChromosome *regexp.Regexp) (tileSeq, []importStats, error) {
        ret := tileSeq{}
        type jobT struct {
                label string
@@ -293,7 +294,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
        for job := range todo {
                if len(job.fasta) == 0 {
                        continue
-               } else if strings.Contains(job.label, "_") {
+               } else if !matchChromosome.MatchString(job.label) {
                        skippedSequences++
                        continue
                }
@@ -373,7 +374,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
 
                totalPathLen += len(path)
        }
-       log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
+       log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences that did not match chromosome regexp, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
        return ret, stats, scanner.Err()
 }
 
index 6d6ff5abb8245c8315e110070c7302983ee9e593..947d456ac8726cab4f74cfafbbb88e991afe543b 100644 (file)
@@ -2,6 +2,7 @@ package main
 
 import (
        "bytes"
+       "regexp"
        "strings"
 
        "gopkg.in/check.v1"
@@ -32,6 +33,8 @@ gttattaataataacttatcatca
 }
 
 func (s *tilelibSuite) TestSkipOOO(c *check.C) {
+       matchAllChromosomes := regexp.MustCompile(".")
+
        // tags appear in seq: 4, 0, 2 (but skipOOO is false)
        tilelib := &tileLibrary{taglib: &s.taglib, skipOOO: false}
        tseq, _, err := tilelib.TileFasta("test-label", bytes.NewBufferString(">test-seq\n"+
@@ -40,7 +43,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[0]+
                "cccccccccccccccccccc\n"+
                s.tag[2]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{4, 1}, {0, 1}, {2, 1}}})
 
@@ -52,7 +55,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[1]+
                "ggggggggggggggggggggggg\n"+
                s.tag[2]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
 
@@ -64,7 +67,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[3]+
                "ggggggggggggggggggggggg\n"+
                s.tag[4]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{2, 1}, {3, 1}, {4, 1}}})
 
@@ -76,7 +79,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[0]+
                "ggggggggggggggggggggggg\n"+
                s.tag[2]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {2, 1}}})
 
@@ -88,7 +91,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[2]+
                "ggggggggggggggggggggggg\n"+
                s.tag[1]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}})
 
@@ -102,7 +105,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[1]+
                "ggggggggggggggggggggggg\n"+
                s.tag[2]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
 
@@ -118,7 +121,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[0]+
                "ggggggggggggggggggggggg\n"+
                s.tag[4]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}, {4, 1}}})
 
@@ -130,7 +133,7 @@ func (s *tilelibSuite) TestSkipOOO(c *check.C) {
                s.tag[1]+
                "ggggggggggggggggggggggg\n"+
                s.tag[3]+
-               "\n"))
+               "\n"), matchAllChromosomes)
        c.Assert(err, check.IsNil)
        c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}}})
 }