Option to skip/ignore out-of-order tags.
authorTom Clegg <tom@tomclegg.ca>
Mon, 6 Apr 2020 14:09:46 +0000 (10:09 -0400)
committerTom Clegg <tom@tomclegg.ca>
Mon, 6 Apr 2020 14:09:46 +0000 (10:09 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

import.go
tilelib.go
tilelib_test.go [new file with mode: 0644]

index e6b86afd81ca22038c025c669e39a6fd3a0c3b88..9c933da3dec02c2cbf359d730acfe4872972b37c 100644 (file)
--- a/import.go
+++ b/import.go
@@ -31,6 +31,7 @@ type importer struct {
        outputFile     string
        projectUUID    string
        runLocal       bool
+       skipOOO        bool
        encoder        *gob.Encoder
 }
 
@@ -48,6 +49,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std
        flags.StringVar(&cmd.outputFile, "o", "-", "output `file`")
        flags.StringVar(&cmd.projectUUID, "project", "", "project `UUID` for output data")
        flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
+       flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags")
        priority := flags.Int("priority", 500, "container request priority")
        pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
        err = flags.Parse(args)
@@ -193,7 +195,7 @@ func (cmd *importer) loadTileLibrary() (*tileLibrary, error) {
                return nil, fmt.Errorf("cannot tile: tag library is empty")
        }
        log.Printf("tag library %s load done", cmd.tagLibraryFile)
-       return &tileLibrary{taglib: &taglib}, nil
+       return &tileLibrary{taglib: &taglib, skipOOO: cmd.skipOOO}, nil
 }
 
 func listInputFiles(paths []string) (files []string, err error) {
@@ -293,8 +295,11 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                        }
                        flat := make([]tileVariantID, ntags*2)
                        for i := 0; i < ntags; i++ {
-                               flat[i*2] = variants[0][i]
-                               flat[i*2+1] = variants[1][i]
+                               for hap := 0; hap < 2; hap++ {
+                                       if i < len(variants[hap]) {
+                                               flat[i*2+hap] = variants[hap][i]
+                                       }
+                               }
                        }
                        err := cmd.encoder.Encode(LibraryEntry{
                                CompactGenomes: []CompactGenome{{Name: infile, Variants: flat}},
index 7b90810242335fd6ac7ff0a906b9f923d262b26d..339bc215e3e5a82704b03ba81f93d0f8377e70d7 100644 (file)
@@ -39,6 +39,7 @@ func (tseq tileSeq) Variants() []tileVariantID {
 }
 
 type tileLibrary struct {
+       skipOOO bool
        taglib  *tagLibrary
        variant [][][blake2b.Size256]byte
        // count [][]int
@@ -72,6 +73,12 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
                }
                todo <- jobT{seqlabel, fasta}
        }()
+       type foundtag struct {
+               pos    int
+               tagid  tagID
+               taglen int
+       }
+       found := make([]foundtag, 2000000)
        path := make([]tileLibRef, 2000000)
        totalPathLen := 0
        skippedSequences := 0
@@ -83,19 +90,33 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
                        continue
                }
                log.Debugf("%s %s tiling", filelabel, job.label)
+
+               found = found[:0]
+               tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
+                       found = append(found, foundtag{pos: pos, tagid: tagid, taglen: taglen})
+               })
                path = path[:0]
-               tilestart := -1        // position in fasta of tile that ends here
-               tiletagid := tagID(-1) // tag id starting tile that ends here
-               tilelib.taglib.FindAll(job.fasta, func(id tagID, pos, taglen int) {
-                       if tilestart >= 0 {
-                               path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:pos+taglen]))
+               last := foundtag{tagid: -1}
+               for i, f := range found {
+                       if tilelib.skipOOO {
+                               if f.tagid < last.tagid+1 {
+                                       // e.g., last=B, this=A
+                                       continue
+                               }
+                               if f.tagid > last.tagid+1 && i+1 < len(found) && found[i+1].tagid <= f.tagid {
+                                       // e.g., last=A, this=C, next=B
+                                       continue
+                               }
                        }
-                       tilestart = pos
-                       tiletagid = id
-               })
-               if tiletagid >= 0 {
-                       path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:]))
+                       if last.taglen > 0 {
+                               path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
+                       }
+                       last = f
+               }
+               if last.taglen > 0 {
+                       path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:]))
                }
+
                pathcopy := make([]tileLibRef, len(path))
                copy(pathcopy, path)
                ret[job.label] = pathcopy
diff --git a/tilelib_test.go b/tilelib_test.go
new file mode 100644 (file)
index 0000000..4024b83
--- /dev/null
@@ -0,0 +1,109 @@
+package main
+
+import (
+       "bytes"
+
+       "gopkg.in/check.v1"
+)
+
+type tilelibSuite struct{}
+
+var _ = check.Suite(&tilelibSuite{})
+
+func (s *tilelibSuite) TestSkipOOO(c *check.C) {
+       var taglib tagLibrary
+       err := taglib.Load(bytes.NewBufferString(`>0000.00
+ggagaactgtgctccgccttcaga
+acacatgctagcgcgtcggggtgg
+gactctagcagagtggccagccac
+cctcccgagccgagccacccgtca
+gttattaataataacttatcatca
+`))
+       c.Assert(err, check.IsNil)
+
+       // tags appear in seq: 4, 0, 2 (but skipOOO is false)
+       tilelib := &tileLibrary{taglib: &taglib, skipOOO: false}
+       tseq, err := tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+gttattaataataacttatcatca
+ggggggggggggggggggggggg
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+gactctagcagagtggccagccac
+`))
+       c.Assert(err, check.IsNil)
+       c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{4, 1}, {0, 1}, {2, 1}}})
+
+       // tags appear in seq: 0, 1, 2 -> don't skip
+       tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+       tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+gactctagcagagtggccagccac
+`))
+       c.Assert(err, check.IsNil)
+       c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
+
+       // tags appear in seq: 2, 3, 4 -> don't skip
+       tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+       tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+gactctagcagagtggccagccac
+cccccccccccccccccccc
+cctcccgagccgagccacccgtca
+ggggggggggggggggggggggg
+gttattaataataacttatcatca
+`))
+       c.Assert(err, check.IsNil)
+       c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{2, 1}, {3, 1}, {4, 1}}})
+
+       // tags appear in seq: 4, 0, 2 -> skip 4
+       tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+       tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+gttattaataataacttatcatca
+cccccccccccccccccccc
+ggagaactgtgctccgccttcaga
+ggggggggggggggggggggggg
+gactctagcagagtggccagccac
+`))
+       c.Assert(err, check.IsNil)
+       c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {2, 1}}})
+
+       // tags appear in seq: 0, 2, 1 -> skip 2
+       tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+       tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+gactctagcagagtggccagccac
+ggggggggggggggggggggggg
+acacatgctagcgcgtcggggtgg
+`))
+       c.Assert(err, check.IsNil)
+       c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}}})
+
+       // tags appear in seq: 0, 1, 1, 2 -> skip second tag1
+       tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+       tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+gactctagcagagtggccagccac
+`))
+       c.Assert(err, check.IsNil)
+       c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {2, 1}}})
+
+       // tags appear in seq: 0, 1, 3 -> don't skip
+       tilelib = &tileLibrary{taglib: &taglib, skipOOO: true}
+       tseq, err = tilelib.TileFasta("test-label", bytes.NewBufferString(`>test-seq
+ggagaactgtgctccgccttcaga
+cccccccccccccccccccc
+acacatgctagcgcgtcggggtgg
+ggggggggggggggggggggggg
+cctcccgagccgagccacccgtca
+`))
+       c.Assert(err, check.IsNil)
+       c.Check(tseq, check.DeepEquals, tileSeq{"test-seq": []tileLibRef{{0, 1}, {1, 1}, {3, 1}}})
+}