ca3857566f2500e85823dcc4eecd06370e9389a8
[lightning.git] / tilelib.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "encoding/gob"
7         "io"
8         "strings"
9         "sync"
10
11         log "github.com/sirupsen/logrus"
12         "golang.org/x/crypto/blake2b"
13 )
14
15 type tileVariantID uint16 // 1-based
16
17 type tileLibRef struct {
18         tag     tagID
19         variant tileVariantID
20 }
21
22 type tileSeq map[string][]tileLibRef
23
24 func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
25         maxtag := 0
26         for _, refs := range tseq {
27                 for _, ref := range refs {
28                         if maxtag < int(ref.tag) {
29                                 maxtag = int(ref.tag)
30                         }
31                 }
32         }
33         vars := make([]tileVariantID, maxtag+1)
34         var kept, dropped int
35         for _, refs := range tseq {
36                 for _, ref := range refs {
37                         if vars[int(ref.tag)] != 0 {
38                                 dropped++
39                         } else {
40                                 kept++
41                         }
42                         vars[int(ref.tag)] = ref.variant
43                 }
44         }
45         return vars, kept, dropped
46 }
47
48 type tileLibrary struct {
49         skipOOO bool
50         taglib  *tagLibrary
51         variant [][][blake2b.Size256]byte
52         // count [][]int
53         // seq map[[blake2b.Size]byte][]byte
54         variants int
55         // if non-nil, write out any tile variants added while tiling
56         encoder *gob.Encoder
57
58         mtx sync.Mutex
59 }
60
61 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
62         ret := tileSeq{}
63         type jobT struct {
64                 label string
65                 fasta []byte
66         }
67         todo := make(chan jobT)
68         scanner := bufio.NewScanner(rdr)
69         go func() {
70                 defer close(todo)
71                 var fasta []byte
72                 var seqlabel string
73                 for scanner.Scan() {
74                         buf := scanner.Bytes()
75                         if len(buf) > 0 && buf[0] == '>' {
76                                 todo <- jobT{seqlabel, fasta}
77                                 seqlabel, fasta = string(buf[1:]), nil
78                                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
79                         } else {
80                                 fasta = append(fasta, bytes.ToLower(buf)...)
81                         }
82                 }
83                 todo <- jobT{seqlabel, fasta}
84         }()
85         type foundtag struct {
86                 pos    int
87                 tagid  tagID
88                 taglen int
89         }
90         found := make([]foundtag, 2000000)
91         path := make([]tileLibRef, 2000000)
92         totalFoundTags := 0
93         totalPathLen := 0
94         skippedSequences := 0
95         for job := range todo {
96                 if len(job.fasta) == 0 {
97                         continue
98                 } else if strings.Contains(job.label, "_") {
99                         skippedSequences++
100                         continue
101                 }
102                 log.Debugf("%s %s tiling", filelabel, job.label)
103
104                 found = found[:0]
105                 tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
106                         found = append(found, foundtag{pos: pos, tagid: tagid, taglen: taglen})
107                 })
108                 totalFoundTags += len(found)
109
110                 skipped := 0
111                 path = path[:0]
112                 last := foundtag{tagid: -1}
113                 if tilelib.skipOOO {
114                         keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
115                         for i, x := range keep {
116                                 found[i] = found[x]
117                         }
118                         skipped = len(found) - len(keep)
119                         found = found[:len(keep)]
120                 }
121                 for i, f := range found {
122                         log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f)
123                         if last.taglen > 0 {
124                                 path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
125                         }
126                         last = f
127                 }
128                 if last.taglen > 0 {
129                         path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:]))
130                 }
131
132                 pathcopy := make([]tileLibRef, len(path))
133                 copy(pathcopy, path)
134                 ret[job.label] = pathcopy
135                 log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), skipped)
136                 totalPathLen += len(path)
137         }
138         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
139         return ret, scanner.Err()
140 }
141
142 func (tilelib *tileLibrary) Len() int {
143         tilelib.mtx.Lock()
144         defer tilelib.mtx.Unlock()
145         return tilelib.variants
146 }
147
148 // Return a tileLibRef for a tile with the given tag and sequence,
149 // adding the sequence to the library if needed.
150 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
151         for _, b := range seq {
152                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
153                         // return "tile not found" if seq has any
154                         // no-calls
155                         return tileLibRef{tag: tag}
156                 }
157         }
158         tilelib.mtx.Lock()
159         // if tilelib.seq == nil {
160         //      tilelib.seq = map[[blake2b.Size]byte][]byte{}
161         // }
162         if tilelib.variant == nil {
163                 tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
164         }
165         seqhash := blake2b.Sum256(seq)
166         for i, varhash := range tilelib.variant[tag] {
167                 if varhash == seqhash {
168                         tilelib.mtx.Unlock()
169                         return tileLibRef{tag: tag, variant: tileVariantID(i + 1)}
170                 }
171         }
172         tilelib.variants++
173         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
174         // tilelib.seq[seqhash] = append([]byte(nil), seq...)
175         ret := tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))}
176         tilelib.mtx.Unlock()
177
178         if tilelib.encoder != nil {
179                 tilelib.encoder.Encode(LibraryEntry{
180                         TileVariants: []TileVariant{{
181                                 Tag:      tag,
182                                 Blake2b:  seqhash,
183                                 Sequence: seq,
184                         }},
185                 })
186         }
187         return ret
188 }