outputTiles bool
saveIncompleteTiles bool
outputStats string
+ matchChromosome *regexp.Regexp
encoder *gob.Encoder
}
flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file")
flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles")
flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)")
+ matchChromosome := flags.String("match-chromosome", "^(chr)?([0-9]+|X|Y|MT?)$", "import chromosomes that match the given `regexp`")
priority := flags.Int("priority", 500, "container request priority")
pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
loglevel := flags.String("loglevel", "info", "logging threshold (trace, debug, info, warn, error, fatal, or panic)")
}
log.SetLevel(lvl)
+ cmd.matchChromosome, err = regexp.Compile(*matchChromosome)
+ if err != nil {
+ return 1
+ }
+
if !cmd.runLocal {
runner := arvadosContainerRunner{
Name: "lightning import",
}
defer input.Close()
}
- return tilelib.TileFasta(infile, input)
+ return tilelib.TileFasta(infile, input, cmd.matchChromosome)
}
func (cmd *importer) loadTagLibrary() (*tagLibrary, error) {
return
}
defer consensus.Wait()
- tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout)
+ tileseq, stats, err = tilelib.TileFasta(fmt.Sprintf("%s phase %d", infile, phase+1), stdout, cmd.matchChromosome)
if err != nil {
return
}
"encoding/gob"
"fmt"
"io"
+ "regexp"
"runtime"
"sort"
"strings"
DroppedOutOfOrderTiles int
}
-func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, []importStats, error) {
+func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChromosome *regexp.Regexp) (tileSeq, []importStats, error) {
ret := tileSeq{}
type jobT struct {
label string
for job := range todo {
if len(job.fasta) == 0 {
continue
- } else if strings.Contains(job.label, "_") {
+ } else if !matchChromosome.MatchString(job.label) {
skippedSequences++
continue
}
totalPathLen += len(path)
}
- log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
+ log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences that did not match chromosome regexp, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
return ret, stats, scanner.Err()
}
import (
"bytes"
+ "regexp"
"strings"
"gopkg.in/check.v1"
}
func (s *tilelibSuite) TestSkipOOO(c *check.C) {
+ matchAllChromosomes := regexp.MustCompile(".")
+
// tags appear in seq: 4, 0, 2 (but skipOOO is false)
tilelib := &tileLibrary{taglib: &s.taglib, skipOOO: false}
tseq, _, err := tilelib.TileFasta("test-label", bytes.NewBufferString(">test-seq\n"+
s.tag[0]+
"cccccccccccccccccccc\n"+
s.tag[2]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{4, 1}, {0, 1}, {2, 1}}})
s.tag[1]+
"ggggggggggggggggggggggg\n"+
s.tag[2]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
s.tag[3]+
"ggggggggggggggggggggggg\n"+
s.tag[4]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{2, 1}, {3, 1}, {4, 1}}})
s.tag[0]+
"ggggggggggggggggggggggg\n"+
s.tag[2]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {2, 1}}})
s.tag[2]+
"ggggggggggggggggggggggg\n"+
s.tag[1]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}})
s.tag[1]+
"ggggggggggggggggggggggg\n"+
s.tag[2]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
s.tag[0]+
"ggggggggggggggggggggggg\n"+
s.tag[4]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}, {4, 1}}})
s.tag[1]+
"ggggggggggggggggggggggg\n"+
s.tag[3]+
- "\n"))
+ "\n"), matchAllChromosomes)
c.Assert(err, check.IsNil)
c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}}})
}