From: Tom Clegg Date: Mon, 6 Apr 2020 14:09:46 +0000 (-0400) Subject: Option to skip/ignore out-of-order tags. X-Git-Url: https://git.arvados.org/lightning.git/commitdiff_plain/d62d55adee66142b030b84436a0805d28c961342 Option to skip/ignore out-of-order tags. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/import.go b/import.go index e6b86afd81..9c933da3de 100644 --- a/import.go +++ b/import.go @@ -31,6 +31,7 @@ type importer struct { outputFile string projectUUID string runLocal bool + skipOOO bool encoder *gob.Encoder } @@ -48,6 +49,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std flags.StringVar(&cmd.outputFile, "o", "-", "output `file`") flags.StringVar(&cmd.projectUUID, "project", "", "project `UUID` for output data") flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)") + flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags") priority := flags.Int("priority", 500, "container request priority") pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") err = flags.Parse(args) @@ -193,7 +195,7 @@ func (cmd *importer) loadTileLibrary() (*tileLibrary, error) { return nil, fmt.Errorf("cannot tile: tag library is empty") } log.Printf("tag library %s load done", cmd.tagLibraryFile) - return &tileLibrary{taglib: &taglib}, nil + return &tileLibrary{taglib: &taglib, skipOOO: cmd.skipOOO}, nil } func listInputFiles(paths []string) (files []string, err error) { @@ -293,8 +295,11 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { } flat := make([]tileVariantID, ntags*2) for i := 0; i < ntags; i++ { - flat[i*2] = variants[0][i] - flat[i*2+1] = variants[1][i] + for hap := 0; hap < 2; hap++ { + if i < len(variants[hap]) { + flat[i*2+hap] = variants[hap][i] + } + } } err := cmd.encoder.Encode(LibraryEntry{ CompactGenomes: []CompactGenome{{Name: infile, Variants: flat}}, diff --git a/tilelib.go b/tilelib.go index 7b90810242..339bc215e3 100644 --- a/tilelib.go +++ b/tilelib.go @@ -39,6 +39,7 @@ func (tseq tileSeq) Variants() []tileVariantID { } type tileLibrary struct { + skipOOO bool taglib *tagLibrary variant [][][blake2b.Size256]byte // count [][]int @@ -72,6 +73,12 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, } todo <- jobT{seqlabel, fasta} }() + type foundtag struct { + pos int + tagid tagID + taglen int + } + found := make([]foundtag, 2000000) path := make([]tileLibRef, 2000000) totalPathLen := 0 skippedSequences := 0 @@ -83,19 +90,33 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, continue } log.Debugf("%s %s tiling", filelabel, job.label) + + found = found[:0] + tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) { + found = append(found, foundtag{pos: pos, tagid: tagid, taglen: taglen}) + }) path = path[:0] - tilestart := -1 // position in fasta of tile that ends here - tiletagid := tagID(-1) // tag id starting tile that ends here - tilelib.taglib.FindAll(job.fasta, func(id tagID, pos, taglen int) { - if tilestart >= 0 { - path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:pos+taglen])) + last := foundtag{tagid: -1} + for i, f := range found { + if tilelib.skipOOO { + if f.tagid < last.tagid+1 { + // e.g., last=B, this=A + continue + } + if f.tagid > last.tagid+1 && i+1 < len(found) && found[i+1].tagid <= f.tagid { + // e.g., last=A, this=C, next=B + continue + } } - tilestart = pos - tiletagid = id - }) - if tiletagid >= 0 { - path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:])) + if last.taglen > 0 { + path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen])) + } + last = f + } + if last.taglen > 0 { + path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:])) } + pathcopy := make([]tileLibRef, len(path)) copy(pathcopy, path) ret[job.label] = pathcopy diff --git a/tilelib_test.go b/tilelib_test.go new file mode 100644 index 0000000000..4024b83029 --- /dev/null +++ b/tilelib_test.go @@ -0,0 +1,109 @@ +package main + +import ( + "bytes" + + "gopkg.in/check.v1" +) + +type tilelibSuite struct{} + +var _ = check.Suite(&tilelibSuite{}) + +func (s *tilelibSuite) TestSkipOOO(c *check.C) { + var taglib tagLibrary + err := taglib.Load(bytes.NewBufferString(`>0000.00 +ggagaactgtgctccgccttcaga +acacatgctagcgcgtcggggtgg +gactctagcagagtggccagccac +cctcccgagccgagccacccgtca +gttattaataataacttatcatca +`)) + c.Assert(err, check.IsNil) + + // tags appear in seq: 4, 0, 2 (but skipOOO is false) + tilelib := &tileLibrary{taglib: &taglib, skipOOO: false} + tseq, err := tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq +gttattaataataacttatcatca +ggggggggggggggggggggggg +ggagaactgtgctccgccttcaga +cccccccccccccccccccc +gactctagcagagtggccagccac +`)) + c.Assert(err, check.IsNil) + c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{4, 1}, {0, 1}, {2, 1}}}) + + // tags appear in seq: 0, 1, 2 -> don't skip + tilelib = &tileLibrary{taglib: &taglib, skipOOO: true} + tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq +ggagaactgtgctccgccttcaga +cccccccccccccccccccc +acacatgctagcgcgtcggggtgg +ggggggggggggggggggggggg +gactctagcagagtggccagccac +`)) + c.Assert(err, check.IsNil) + c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}}) + + // tags appear in seq: 2, 3, 4 -> don't skip + tilelib = &tileLibrary{taglib: &taglib, skipOOO: true} + tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq +gactctagcagagtggccagccac +cccccccccccccccccccc +cctcccgagccgagccacccgtca +ggggggggggggggggggggggg +gttattaataataacttatcatca +`)) + c.Assert(err, check.IsNil) + c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{2, 1}, {3, 1}, {4, 1}}}) + + // tags appear in seq: 4, 0, 2 -> skip 4 + tilelib = &tileLibrary{taglib: &taglib, skipOOO: true} + tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq +gttattaataataacttatcatca +cccccccccccccccccccc +ggagaactgtgctccgccttcaga +ggggggggggggggggggggggg +gactctagcagagtggccagccac +`)) + c.Assert(err, check.IsNil) + c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {2, 1}}}) + + // tags appear in seq: 0, 2, 1 -> skip 2 + tilelib = &tileLibrary{taglib: &taglib, skipOOO: true} + tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq +ggagaactgtgctccgccttcaga +cccccccccccccccccccc +gactctagcagagtggccagccac +ggggggggggggggggggggggg +acacatgctagcgcgtcggggtgg +`)) + c.Assert(err, check.IsNil) + c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}}) + + // tags appear in seq: 0, 1, 1, 2 -> skip second tag1 + tilelib = &tileLibrary{taglib: &taglib, skipOOO: true} + tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq +ggagaactgtgctccgccttcaga +cccccccccccccccccccc +acacatgctagcgcgtcggggtgg +ggggggggggggggggggggggg +acacatgctagcgcgtcggggtgg +ggggggggggggggggggggggg +gactctagcagagtggccagccac +`)) + c.Assert(err, check.IsNil) + c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}}) + + // tags appear in seq: 0, 1, 3 -> don't skip + tilelib = &tileLibrary{taglib: &taglib, skipOOO: true} + tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq +ggagaactgtgctccgccttcaga +cccccccccccccccccccc +acacatgctagcgcgtcggggtgg +ggggggggggggggggggggggg +cctcccgagccgagccacccgtca +`)) + c.Assert(err, check.IsNil) + c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}}}) +}