More memory for export-numpy.
[lightning.git] / tilelib.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "io"
7         "strings"
8         "sync"
9
10         log "github.com/sirupsen/logrus"
11         "golang.org/x/crypto/blake2b"
12 )
13
14 type tileVariantID uint16 // 1-based
15
16 type tileLibRef struct {
17         tag     tagID
18         variant tileVariantID
19 }
20
21 type tileSeq map[string][]tileLibRef
22
23 func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
24         maxtag := 0
25         for _, refs := range tseq {
26                 for _, ref := range refs {
27                         if maxtag < int(ref.tag) {
28                                 maxtag = int(ref.tag)
29                         }
30                 }
31         }
32         vars := make([]tileVariantID, maxtag+1)
33         var kept, dropped int
34         for _, refs := range tseq {
35                 for _, ref := range refs {
36                         if vars[int(ref.tag)] != 0 {
37                                 dropped++
38                         } else {
39                                 kept++
40                         }
41                         vars[int(ref.tag)] = ref.variant
42                 }
43         }
44         return vars, kept, dropped
45 }
46
47 type tileLibrary struct {
48         skipOOO bool
49         taglib  *tagLibrary
50         variant [][][blake2b.Size256]byte
51         // count [][]int
52         // seq map[[blake2b.Size]byte][]byte
53         variants int
54
55         mtx sync.Mutex
56 }
57
58 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
59         ret := tileSeq{}
60         type jobT struct {
61                 label string
62                 fasta []byte
63         }
64         todo := make(chan jobT)
65         scanner := bufio.NewScanner(rdr)
66         go func() {
67                 defer close(todo)
68                 var fasta []byte
69                 var seqlabel string
70                 for scanner.Scan() {
71                         buf := scanner.Bytes()
72                         if len(buf) > 0 && buf[0] == '>' {
73                                 todo <- jobT{seqlabel, fasta}
74                                 seqlabel, fasta = string(buf[1:]), nil
75                                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
76                         } else {
77                                 fasta = append(fasta, bytes.ToLower(buf)...)
78                         }
79                 }
80                 todo <- jobT{seqlabel, fasta}
81         }()
82         type foundtag struct {
83                 pos    int
84                 tagid  tagID
85                 taglen int
86         }
87         found := make([]foundtag, 2000000)
88         path := make([]tileLibRef, 2000000)
89         totalFoundTags := 0
90         totalPathLen := 0
91         skippedSequences := 0
92         for job := range todo {
93                 if len(job.fasta) == 0 {
94                         continue
95                 } else if strings.Contains(job.label, "_") {
96                         skippedSequences++
97                         continue
98                 }
99                 log.Debugf("%s %s tiling", filelabel, job.label)
100
101                 found = found[:0]
102                 tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
103                         found = append(found, foundtag{pos: pos, tagid: tagid, taglen: taglen})
104                 })
105                 totalFoundTags += len(found)
106
107                 skipped := 0
108                 path = path[:0]
109                 last := foundtag{tagid: -1}
110                 if tilelib.skipOOO {
111                         keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
112                         for i, x := range keep {
113                                 found[i] = found[x]
114                         }
115                         skipped = len(found) - len(keep)
116                         found = found[:len(keep)]
117                 }
118                 for i, f := range found {
119                         log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f)
120                         if last.taglen > 0 {
121                                 path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
122                         }
123                         last = f
124                 }
125                 if last.taglen > 0 {
126                         path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:]))
127                 }
128
129                 pathcopy := make([]tileLibRef, len(path))
130                 copy(pathcopy, path)
131                 ret[job.label] = pathcopy
132                 log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), skipped)
133                 totalPathLen += len(path)
134         }
135         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
136         return ret, scanner.Err()
137 }
138
139 func (tilelib *tileLibrary) Len() int {
140         tilelib.mtx.Lock()
141         defer tilelib.mtx.Unlock()
142         return tilelib.variants
143 }
144
145 // Return a tileLibRef for a tile with the given tag and sequence,
146 // adding the sequence to the library if needed.
147 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
148         for _, b := range seq {
149                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
150                         // return "tile not found" if seq has any
151                         // no-calls
152                         return tileLibRef{tag: tag}
153                 }
154         }
155         tilelib.mtx.Lock()
156         defer tilelib.mtx.Unlock()
157         // if tilelib.seq == nil {
158         //      tilelib.seq = map[[blake2b.Size]byte][]byte{}
159         // }
160         if tilelib.variant == nil {
161                 tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
162         }
163         seqhash := blake2b.Sum256(seq)
164         for i, varhash := range tilelib.variant[tag] {
165                 if varhash == seqhash {
166                         return tileLibRef{tag: tag, variant: tileVariantID(i + 1)}
167                 }
168         }
169         tilelib.variants++
170         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
171         // tilelib.seq[seqhash] = append([]byte(nil), seq...)
172         return tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))}
173 }