outputFile string
projectUUID string
runLocal bool
+ skipOOO bool
encoder *gob.Encoder
}
flags.StringVar(&cmd.outputFile, "o", "-", "output `file`")
flags.StringVar(&cmd.projectUUID, "project", "", "project `UUID` for output data")
flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
+ flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags")
priority := flags.Int("priority", 500, "container request priority")
pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
err = flags.Parse(args)
return nil, fmt.Errorf("cannot tile: tag library is empty")
}
log.Printf("tag library %s load done", cmd.tagLibraryFile)
- return &tileLibrary{taglib: &taglib}, nil
+ return &tileLibrary{taglib: &taglib, skipOOO: cmd.skipOOO}, nil
}
func listInputFiles(paths []string) (files []string, err error) {
}
flat := make([]tileVariantID, ntags*2)
for i := 0; i < ntags; i++ {
- flat[i*2] = variants[0][i]
- flat[i*2+1] = variants[1][i]
+ for hap := 0; hap < 2; hap++ {
+ if i < len(variants[hap]) {
+ flat[i*2+hap] = variants[hap][i]
+ }
+ }
}
err := cmd.encoder.Encode(LibraryEntry{
CompactGenomes: []CompactGenome{{Name: infile, Variants: flat}},
}
type tileLibrary struct {
+ skipOOO bool // when true, the tiling loop drops tags that are found out of order; set from the -skip-ooo flag
taglib *tagLibrary // tag set searched with FindAll when tiling a sequence
variant [][][blake2b.Size256]byte // presumably the known variant hashes for each tag — TODO confirm against getRef
// count [][]int
}
todo <- jobT{seqlabel, fasta}
}()
+ type foundtag struct {
+ pos int
+ tagid tagID
+ taglen int
+ }
+ found := make([]foundtag, 2000000)
path := make([]tileLibRef, 2000000)
totalPathLen := 0
skippedSequences := 0
continue
}
log.Debugf("%s %s tiling", filelabel, job.label)
+
+ found = found[:0]
+ tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
+ found = append(found, foundtag{pos: pos, tagid: tagid, taglen: taglen})
+ })
path = path[:0]
- tilestart := -1 // position in fasta of tile that ends here
- tiletagid := tagID(-1) // tag id starting tile that ends here
- tilelib.taglib.FindAll(job.fasta, func(id tagID, pos, taglen int) {
- if tilestart >= 0 {
- path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:pos+taglen]))
+ last := foundtag{tagid: -1}
+ for i, f := range found {
+ if tilelib.skipOOO {
+ if f.tagid < last.tagid+1 {
+ // e.g., last=B, this=A
+ continue
+ }
+ if f.tagid > last.tagid+1 && i+1 < len(found) && found[i+1].tagid <= f.tagid {
+ // e.g., last=A, this=C, next=B
+ continue
+ }
}
- tilestart = pos
- tiletagid = id
- })
- if tiletagid >= 0 {
- path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:]))
+ if last.taglen > 0 {
+ path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
+ }
+ last = f
+ }
+ if last.taglen > 0 {
+ path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:]))
}
+
pathcopy := make([]tileLibRef, len(path))
copy(pathcopy, path)
ret[job.label] = pathcopy
--- /dev/null
+package main
+
+import (
+ "bytes"
+
+ "gopkg.in/check.v1"
+)
+
+type tilelibSuite struct{} // stateless gocheck suite holding the tileLibrary tests
+
+var _ = check.Suite(&tilelibSuite{}) // register the suite with gocheck
+
+func (s *tilelibSuite) TestSkipOOO(c *check.C) { // checks the tile path returned by TileFasta for several tag orderings, with skipOOO off and on
+ var taglib tagLibrary // shared by every case below; the fixture lists five tag sequences, referred to as tags 0-4
+ err := taglib.Load(bytes.NewBufferString(`>0000.00
+ggagaactgtgctccgccttcaga
+acacatgctagcgcgtcggggtgg
+gactctagcagagtggccagccac
+cctcccgagccgagccacccgtca
+gttattaataataacttatcatca
+`))
+ c.Assert(err, check.IsNil)
+
+ // baseline with skipOOO disabled: tags appear in seq as 4, 0, 2 and all three are expected in the path, in that order
+ tilelib := &tileLibrary{taglib: &taglib, skipOOO: false}
+ tseq, err := tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+gttattaataataacttatcatca
+ggggggggggggggggggggggg
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+gactctagcagagtggccagccac
+`))
+ c.Assert(err, check.IsNil)
+ c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{4, 1}, {0, 1}, {2, 1}}})
+
+ // tags appear in seq: 0, 1, 2 -> consecutive and ascending, so nothing is skipped
+ tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+ tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+gactctagcagagtggccagccac
+`))
+ c.Assert(err, check.IsNil)
+ c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
+
+ // tags appear in seq: 2, 3, 4 -> nothing is skipped; an ascending run need not start at tag 0
+ tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+ tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+gactctagcagagtggccagccac
+cccccccccccccccccccc
+cctcccgagccgagccacccgtca
+ggggggggggggggggggggggg
+gttattaataataacttatcatca
+`))
+ c.Assert(err, check.IsNil)
+ c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{2, 1}, {3, 1}, {4, 1}}})
+
+ // tags appear in seq: 4, 0, 2 -> skip 4 (it is followed by a lower tag); 0 and 2 are kept
+ tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+ tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+gttattaataataacttatcatca
+cccccccccccccccccccc
+ggagaactgtgctccgccttcaga
+ggggggggggggggggggggggg
+gactctagcagagtggccagccac
+`))
+ c.Assert(err, check.IsNil)
+ c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {2, 1}}})
+
+ // tags appear in seq: 0, 2, 1 -> skip 2 (it jumps ahead and is followed by the lower tag 1)
+ tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+ tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+gactctagcagagtggccagccac
+ggggggggggggggggggggggg
+acacatgctagcgcgtcggggtgg
+`))
+ c.Assert(err, check.IsNil)
+ c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}})
+
+ // tags appear in seq: 0, 1, 1, 2 -> skip the second tag 1 (a repeat is not greater than the previous tag)
+ tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+ tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+gactctagcagagtggccagccac
+`))
+ c.Assert(err, check.IsNil)
+ c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
+
+ // tags appear in seq: 0, 1, 3 -> nothing is skipped; a gap alone (missing tag 2) is not out of order
+ tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+ tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+cctcccgagccgagccacccgtca
+`))
+ c.Assert(err, check.IsNil)
+ c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}}})
+}