Log out-of-order tags if -loglevel=debug.
[lightning.git] / tilelib.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "io"
7         "strings"
8         "sync"
9
10         log "github.com/sirupsen/logrus"
11         "golang.org/x/crypto/blake2b"
12 )
13
14 type tileVariantID uint16 // 1-based
15
16 type tileLibRef struct {
17         tag     tagID
18         variant tileVariantID
19 }
20
21 type tileSeq map[string][]tileLibRef
22
23 func (tseq tileSeq) Variants() []tileVariantID {
24         maxtag := 0
25         for _, refs := range tseq {
26                 for _, ref := range refs {
27                         if maxtag < int(ref.tag) {
28                                 maxtag = int(ref.tag)
29                         }
30                 }
31         }
32         vars := make([]tileVariantID, maxtag+1)
33         for _, refs := range tseq {
34                 for _, ref := range refs {
35                         vars[int(ref.tag)] = ref.variant
36                 }
37         }
38         return vars
39 }
40
41 type tileLibrary struct {
42         skipOOO bool
43         taglib  *tagLibrary
44         variant [][][blake2b.Size256]byte
45         // count [][]int
46         // seq map[[blake2b.Size]byte][]byte
47         variants int
48
49         mtx sync.Mutex
50 }
51
52 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
53         ret := tileSeq{}
54         type jobT struct {
55                 label string
56                 fasta []byte
57         }
58         todo := make(chan jobT)
59         scanner := bufio.NewScanner(rdr)
60         go func() {
61                 defer close(todo)
62                 var fasta []byte
63                 var seqlabel string
64                 for scanner.Scan() {
65                         buf := scanner.Bytes()
66                         if len(buf) == 0 || buf[0] == '>' {
67                                 todo <- jobT{seqlabel, fasta}
68                                 seqlabel, fasta = string(buf[1:]), nil
69                                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
70                         } else {
71                                 fasta = append(fasta, bytes.ToLower(buf)...)
72                         }
73                 }
74                 todo <- jobT{seqlabel, fasta}
75         }()
76         type foundtag struct {
77                 pos    int
78                 tagid  tagID
79                 taglen int
80         }
81         found := make([]foundtag, 2000000)
82         path := make([]tileLibRef, 2000000)
83         totalPathLen := 0
84         skippedSequences := 0
85         for job := range todo {
86                 if len(job.fasta) == 0 {
87                         continue
88                 } else if strings.Contains(job.label, "_") {
89                         skippedSequences++
90                         continue
91                 }
92                 log.Debugf("%s %s tiling", filelabel, job.label)
93
94                 found = found[:0]
95                 tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
96                         found = append(found, foundtag{pos: pos, tagid: tagid, taglen: taglen})
97                 })
98                 path = path[:0]
99                 last := foundtag{tagid: -1}
100                 for i, f := range found {
101                         log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f)
102                         if tilelib.skipOOO {
103                                 if f.tagid < last.tagid+1 {
104                                         // e.g., last=B, this=A
105                                         log.Debugf("%s %s skipped out-of-order tag %d (found at %d) because it follows tag %d (found at %d)", filelabel, job.label, f.tagid, f.pos, last.tagid, last.pos)
106                                         continue
107                                 }
108                                 if f.tagid > last.tagid+1 && i+1 < len(found) && found[i+1].tagid <= f.tagid {
109                                         // e.g., last=A, this=C, next=B
110                                         log.Debugf("%s %s skipped out-of-order tag %d (found at %d) because it appears between tag %d (found at %d) and %d (found at %d)", filelabel, job.label, f.tagid, f.pos, last.tagid, last.pos, found[i+1].tagid, found[i+1].pos)
111                                         continue
112                                 }
113                         }
114                         if last.taglen > 0 {
115                                 path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
116                         }
117                         last = f
118                 }
119                 if last.taglen > 0 {
120                         path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:]))
121                 }
122
123                 pathcopy := make([]tileLibRef, len(path))
124                 copy(pathcopy, path)
125                 ret[job.label] = pathcopy
126                 log.Debugf("%s %s tiled with path len %d", filelabel, job.label, len(path))
127                 totalPathLen += len(path)
128         }
129         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name)", filelabel, totalPathLen, len(ret), skippedSequences)
130         return ret, scanner.Err()
131 }
132
133 func (tilelib *tileLibrary) Len() int {
134         tilelib.mtx.Lock()
135         defer tilelib.mtx.Unlock()
136         return tilelib.variants
137 }
138
139 // Return a tileLibRef for a tile with the given tag and sequence,
140 // adding the sequence to the library if needed.
141 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
142         for _, b := range seq {
143                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
144                         // return "tile not found" if seq has any
145                         // no-calls
146                         return tileLibRef{tag: tag}
147                 }
148         }
149         tilelib.mtx.Lock()
150         defer tilelib.mtx.Unlock()
151         // if tilelib.seq == nil {
152         //      tilelib.seq = map[[blake2b.Size]byte][]byte{}
153         // }
154         if tilelib.variant == nil {
155                 tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
156         }
157         seqhash := blake2b.Sum256(seq)
158         for i, varhash := range tilelib.variant[tag] {
159                 if varhash == seqhash {
160                         return tileLibRef{tag: tag, variant: tileVariantID(i + 1)}
161                 }
162         }
163         tilelib.variants++
164         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
165         // tilelib.seq[seqhash] = append([]byte(nil), seq...)
166         return tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))}
167 }