Reduce allocs while tiling.
[lightning.git] / tilelib.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "io"
7         "log"
8         "sync"
9
10         "golang.org/x/crypto/blake2b"
11 )
12
13 type tileVariantID int32 // 1-based
14
15 type tileLibRef struct {
16         tag     tagID
17         variant tileVariantID
18 }
19
20 type tileSeq map[string][]tileLibRef
21
22 type tileLibrary struct {
23         taglib  *tagLibrary
24         variant [][][blake2b.Size256]byte
25         // count [][]int
26         // seq map[[blake2b.Size]byte][]byte
27         variants int
28
29         mtx sync.Mutex
30 }
31
32 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
33         ret := tileSeq{}
34         type jobT struct {
35                 label string
36                 fasta []byte
37         }
38         todo := make(chan jobT)
39         scanner := bufio.NewScanner(rdr)
40         go func() {
41                 defer close(todo)
42                 var fasta []byte
43                 var seqlabel string
44                 for scanner.Scan() {
45                         buf := scanner.Bytes()
46                         if len(buf) == 0 || buf[0] == '>' {
47                                 todo <- jobT{seqlabel, fasta}
48                                 seqlabel, fasta = string(buf[1:]), nil
49                                 log.Printf("%s %s reading fasta", filelabel, seqlabel)
50                         } else {
51                                 fasta = append(fasta, bytes.ToLower(buf)...)
52                         }
53                 }
54                 todo <- jobT{seqlabel, fasta}
55         }()
56         path := make([]tileLibRef, 2000000)
57         for job := range todo {
58                 if len(job.fasta) == 0 {
59                         continue
60                 }
61                 log.Printf("%s %s tiling", filelabel, job.label)
62                 path = path[:0]
63                 tilestart := -1        // position in fasta of tile that ends here
64                 tiletagid := tagID(-1) // tag id starting tile that ends here
65                 tilelib.taglib.FindAll(job.fasta, func(id tagID, pos, taglen int) {
66                         if tilestart >= 0 {
67                                 path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:pos+taglen]))
68                         }
69                         tilestart = pos
70                         tiletagid = id
71                 })
72                 if tiletagid >= 0 {
73                         path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:]))
74                 }
75                 pathcopy := make([]tileLibRef, len(path))
76                 copy(pathcopy, path)
77                 ret[job.label] = pathcopy
78                 log.Printf("%s %s tiled with path len %d", filelabel, job.label, len(path))
79         }
80         return ret, scanner.Err()
81 }
82
83 func (tilelib *tileLibrary) Len() int {
84         tilelib.mtx.Lock()
85         defer tilelib.mtx.Unlock()
86         return tilelib.variants
87 }
88
89 // Return a tileLibRef for a tile with the given tag and sequence,
90 // adding the sequence to the library if needed.
91 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
92         for _, b := range seq {
93                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
94                         // return "tile not found" if seq has any
95                         // no-calls
96                         return tileLibRef{tag: tag}
97                 }
98         }
99         tilelib.mtx.Lock()
100         defer tilelib.mtx.Unlock()
101         // if tilelib.seq == nil {
102         //      tilelib.seq = map[[blake2b.Size]byte][]byte{}
103         // }
104         if tilelib.variant == nil {
105                 tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
106         }
107         seqhash := blake2b.Sum256(seq)
108         for i, varhash := range tilelib.variant[tag] {
109                 if varhash == seqhash {
110                         return tileLibRef{tag: tag, variant: tileVariantID(i + 1)}
111                 }
112         }
113         tilelib.variants++
114         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
115         // tilelib.seq[seqhash] = append([]byte(nil), seq...)
116         return tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))}
117 }