Option to treat tiles with no-calls as regular tiles.
[lightning.git] / tilelib.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "encoding/gob"
7         "io"
8         "strings"
9         "sync"
10
11         log "github.com/sirupsen/logrus"
12         "golang.org/x/crypto/blake2b"
13 )
14
15 type tileVariantID uint16 // 1-based
16
17 type tileLibRef struct {
18         tag     tagID
19         variant tileVariantID
20 }
21
22 type tileSeq map[string][]tileLibRef
23
24 func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
25         maxtag := 0
26         for _, refs := range tseq {
27                 for _, ref := range refs {
28                         if maxtag < int(ref.tag) {
29                                 maxtag = int(ref.tag)
30                         }
31                 }
32         }
33         vars := make([]tileVariantID, maxtag+1)
34         var kept, dropped int
35         for _, refs := range tseq {
36                 for _, ref := range refs {
37                         if vars[int(ref.tag)] != 0 {
38                                 dropped++
39                         } else {
40                                 kept++
41                         }
42                         vars[int(ref.tag)] = ref.variant
43                 }
44         }
45         return vars, kept, dropped
46 }
47
48 type tileLibrary struct {
49         includeNoCalls bool
50         skipOOO        bool
51         taglib         *tagLibrary
52         variant        [][][blake2b.Size256]byte
53         // count [][]int
54         // seq map[[blake2b.Size]byte][]byte
55         variants int
56         // if non-nil, write out any tile variants added while tiling
57         encoder *gob.Encoder
58
59         mtx sync.Mutex
60 }
61
62 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
63         ret := tileSeq{}
64         type jobT struct {
65                 label string
66                 fasta []byte
67         }
68         todo := make(chan jobT)
69         scanner := bufio.NewScanner(rdr)
70         go func() {
71                 defer close(todo)
72                 var fasta []byte
73                 var seqlabel string
74                 for scanner.Scan() {
75                         buf := scanner.Bytes()
76                         if len(buf) > 0 && buf[0] == '>' {
77                                 todo <- jobT{seqlabel, fasta}
78                                 seqlabel, fasta = string(buf[1:]), nil
79                                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
80                         } else {
81                                 fasta = append(fasta, bytes.ToLower(buf)...)
82                         }
83                 }
84                 todo <- jobT{seqlabel, fasta}
85         }()
86         type foundtag struct {
87                 pos    int
88                 tagid  tagID
89                 taglen int
90         }
91         found := make([]foundtag, 2000000)
92         path := make([]tileLibRef, 2000000)
93         totalFoundTags := 0
94         totalPathLen := 0
95         skippedSequences := 0
96         for job := range todo {
97                 if len(job.fasta) == 0 {
98                         continue
99                 } else if strings.Contains(job.label, "_") {
100                         skippedSequences++
101                         continue
102                 }
103                 log.Debugf("%s %s tiling", filelabel, job.label)
104
105                 found = found[:0]
106                 tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
107                         found = append(found, foundtag{pos: pos, tagid: tagid, taglen: taglen})
108                 })
109                 totalFoundTags += len(found)
110
111                 skipped := 0
112                 path = path[:0]
113                 last := foundtag{tagid: -1}
114                 if tilelib.skipOOO {
115                         keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
116                         for i, x := range keep {
117                                 found[i] = found[x]
118                         }
119                         skipped = len(found) - len(keep)
120                         found = found[:len(keep)]
121                 }
122                 for i, f := range found {
123                         log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f)
124                         if last.taglen > 0 {
125                                 path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
126                         }
127                         last = f
128                 }
129                 if last.taglen > 0 {
130                         path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:]))
131                 }
132
133                 pathcopy := make([]tileLibRef, len(path))
134                 copy(pathcopy, path)
135                 ret[job.label] = pathcopy
136                 log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), skipped)
137                 totalPathLen += len(path)
138         }
139         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
140         return ret, scanner.Err()
141 }
142
143 func (tilelib *tileLibrary) Len() int {
144         tilelib.mtx.Lock()
145         defer tilelib.mtx.Unlock()
146         return tilelib.variants
147 }
148
149 // Return a tileLibRef for a tile with the given tag and sequence,
150 // adding the sequence to the library if needed.
151 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
152         if !tilelib.includeNoCalls {
153                 for _, b := range seq {
154                         if b != 'a' && b != 'c' && b != 'g' && b != 't' {
155                                 // return "tile not found" if seq has any
156                                 // no-calls
157                                 return tileLibRef{tag: tag}
158                         }
159                 }
160         }
161         tilelib.mtx.Lock()
162         // if tilelib.seq == nil {
163         //      tilelib.seq = map[[blake2b.Size]byte][]byte{}
164         // }
165         if tilelib.variant == nil {
166                 tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
167         }
168         seqhash := blake2b.Sum256(seq)
169         for i, varhash := range tilelib.variant[tag] {
170                 if varhash == seqhash {
171                         tilelib.mtx.Unlock()
172                         return tileLibRef{tag: tag, variant: tileVariantID(i + 1)}
173                 }
174         }
175         tilelib.variants++
176         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
177         // tilelib.seq[seqhash] = append([]byte(nil), seq...)
178         ret := tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))}
179         tilelib.mtx.Unlock()
180
181         if tilelib.encoder != nil {
182                 tilelib.encoder.Encode(LibraryEntry{
183                         TileVariants: []TileVariant{{
184                                 Tag:      tag,
185                                 Blake2b:  seqhash,
186                                 Sequence: seq,
187                         }},
188                 })
189         }
190         return ret
191 }