Less verbose logging.
[lightning.git] / tilelib.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "io"
7         "strings"
8         "sync"
9
10         log "github.com/sirupsen/logrus"
11         "golang.org/x/crypto/blake2b"
12 )
13
// tileVariantID numbers the variants of a tile within a single tag.
// IDs are 1-based; the zero value is used to mean "no variant" (for
// example, getRef returns it for sequences containing no-calls).
type tileVariantID uint16
15
16 type tileLibRef struct {
17         tag     tagID
18         variant tileVariantID
19 }
20
21 type tileSeq map[string][]tileLibRef
22
23 func (tseq tileSeq) Variants() []tileVariantID {
24         maxtag := 0
25         for _, refs := range tseq {
26                 for _, ref := range refs {
27                         if maxtag < int(ref.tag) {
28                                 maxtag = int(ref.tag)
29                         }
30                 }
31         }
32         vars := make([]tileVariantID, maxtag+1)
33         for _, refs := range tseq {
34                 for _, ref := range refs {
35                         vars[int(ref.tag)] = ref.variant
36                 }
37         }
38         return vars
39 }
40
41 type tileLibrary struct {
42         taglib  *tagLibrary
43         variant [][][blake2b.Size256]byte
44         // count [][]int
45         // seq map[[blake2b.Size]byte][]byte
46         variants int
47
48         mtx sync.Mutex
49 }
50
51 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
52         ret := tileSeq{}
53         type jobT struct {
54                 label string
55                 fasta []byte
56         }
57         todo := make(chan jobT)
58         scanner := bufio.NewScanner(rdr)
59         go func() {
60                 defer close(todo)
61                 var fasta []byte
62                 var seqlabel string
63                 for scanner.Scan() {
64                         buf := scanner.Bytes()
65                         if len(buf) == 0 || buf[0] == '>' {
66                                 todo <- jobT{seqlabel, fasta}
67                                 seqlabel, fasta = string(buf[1:]), nil
68                                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
69                         } else {
70                                 fasta = append(fasta, bytes.ToLower(buf)...)
71                         }
72                 }
73                 todo <- jobT{seqlabel, fasta}
74         }()
75         path := make([]tileLibRef, 2000000)
76         totalPathLen := 0
77         skippedSequences := 0
78         for job := range todo {
79                 if len(job.fasta) == 0 {
80                         continue
81                 } else if strings.Contains(job.label, "_") {
82                         skippedSequences++
83                         continue
84                 }
85                 log.Debugf("%s %s tiling", filelabel, job.label)
86                 path = path[:0]
87                 tilestart := -1        // position in fasta of tile that ends here
88                 tiletagid := tagID(-1) // tag id starting tile that ends here
89                 tilelib.taglib.FindAll(job.fasta, func(id tagID, pos, taglen int) {
90                         if tilestart >= 0 {
91                                 path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:pos+taglen]))
92                         }
93                         tilestart = pos
94                         tiletagid = id
95                 })
96                 if tiletagid >= 0 {
97                         path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:]))
98                 }
99                 pathcopy := make([]tileLibRef, len(path))
100                 copy(pathcopy, path)
101                 ret[job.label] = pathcopy
102                 log.Debugf("%s %s tiled with path len %d", filelabel, job.label, len(path))
103                 totalPathLen += len(path)
104         }
105         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name)", filelabel, totalPathLen, len(ret), skippedSequences)
106         return ret, scanner.Err()
107 }
108
109 func (tilelib *tileLibrary) Len() int {
110         tilelib.mtx.Lock()
111         defer tilelib.mtx.Unlock()
112         return tilelib.variants
113 }
114
115 // Return a tileLibRef for a tile with the given tag and sequence,
116 // adding the sequence to the library if needed.
117 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
118         for _, b := range seq {
119                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
120                         // return "tile not found" if seq has any
121                         // no-calls
122                         return tileLibRef{tag: tag}
123                 }
124         }
125         tilelib.mtx.Lock()
126         defer tilelib.mtx.Unlock()
127         // if tilelib.seq == nil {
128         //      tilelib.seq = map[[blake2b.Size]byte][]byte{}
129         // }
130         if tilelib.variant == nil {
131                 tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
132         }
133         seqhash := blake2b.Sum256(seq)
134         for i, varhash := range tilelib.variant[tag] {
135                 if varhash == seqhash {
136                         return tileLibRef{tag: tag, variant: tileVariantID(i + 1)}
137                 }
138         }
139         tilelib.variants++
140         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
141         // tilelib.seq[seqhash] = append([]byte(nil), seq...)
142         return tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))}
143 }