Skip chr*_*.
[lightning.git] / tilelib.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "io"
7         "log"
8         "strings"
9         "sync"
10
11         "golang.org/x/crypto/blake2b"
12 )
13
// tileVariantID identifies one variant of a tile among the variants
// recorded for a tag. IDs start at 1 (getRef returns slice index + 1);
// the zero value is what getRef returns when a sequence contains a
// no-call and therefore is not added to the library.
type tileVariantID int32 // 1-based
15
// tileLibRef refers to one tile variant in a tileLibrary: tag is the
// ID of the tag that starts the tile, and variant is the ID getRef
// assigned to the tile's sequence (0 if the sequence was not added to
// the library).
type tileLibRef struct {
	tag     tagID
	variant tileVariantID
}
20
// tileSeq maps a sequence label (the text following '>' on a FASTA
// header line) to that sequence's path: the tileLibRefs of its tiles,
// in the order TileFasta appended them.
type tileSeq map[string][]tileLibRef
22
// tileLibrary records the distinct tile variants seen for each tag.
//
// Fields:
//   - taglib: the tag library used to locate tags in a sequence
//     (TileFasta calls its FindAll; getRef uses its Len to size
//     variant on first use).
//   - variant: indexed by tag ID; variant[tag][i] is the BLAKE2b-256
//     hash of the sequence of the variant whose tileVariantID is i+1.
//   - variants: total number of hashes appended across all tags.
//   - mtx: held by getRef and Len while they read or modify variant
//     and variants.
//
// The struct contains a sync.Mutex, so it must be used via pointer and
// not copied.
type tileLibrary struct {
	taglib  *tagLibrary
	variant [][][blake2b.Size256]byte
	// count [][]int
	// seq map[[blake2b.Size]byte][]byte
	variants int

	mtx sync.Mutex
}
32
33 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
34         ret := tileSeq{}
35         type jobT struct {
36                 label string
37                 fasta []byte
38         }
39         todo := make(chan jobT)
40         scanner := bufio.NewScanner(rdr)
41         go func() {
42                 defer close(todo)
43                 var fasta []byte
44                 var seqlabel string
45                 for scanner.Scan() {
46                         buf := scanner.Bytes()
47                         if len(buf) == 0 || buf[0] == '>' {
48                                 todo <- jobT{seqlabel, fasta}
49                                 seqlabel, fasta = string(buf[1:]), nil
50                                 log.Printf("%s %s reading fasta", filelabel, seqlabel)
51                         } else {
52                                 fasta = append(fasta, bytes.ToLower(buf)...)
53                         }
54                 }
55                 todo <- jobT{seqlabel, fasta}
56         }()
57         path := make([]tileLibRef, 2000000)
58         for job := range todo {
59                 if len(job.fasta) == 0 || strings.Contains(job.label, "_") {
60                         continue
61                 }
62                 log.Printf("%s %s tiling", filelabel, job.label)
63                 path = path[:0]
64                 tilestart := -1        // position in fasta of tile that ends here
65                 tiletagid := tagID(-1) // tag id starting tile that ends here
66                 tilelib.taglib.FindAll(job.fasta, func(id tagID, pos, taglen int) {
67                         if tilestart >= 0 {
68                                 path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:pos+taglen]))
69                         }
70                         tilestart = pos
71                         tiletagid = id
72                 })
73                 if tiletagid >= 0 {
74                         path = append(path, tilelib.getRef(tiletagid, job.fasta[tilestart:]))
75                 }
76                 pathcopy := make([]tileLibRef, len(path))
77                 copy(pathcopy, path)
78                 ret[job.label] = pathcopy
79                 log.Printf("%s %s tiled with path len %d", filelabel, job.label, len(path))
80         }
81         return ret, scanner.Err()
82 }
83
84 func (tilelib *tileLibrary) Len() int {
85         tilelib.mtx.Lock()
86         defer tilelib.mtx.Unlock()
87         return tilelib.variants
88 }
89
90 // Return a tileLibRef for a tile with the given tag and sequence,
91 // adding the sequence to the library if needed.
92 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
93         for _, b := range seq {
94                 if b != 'a' && b != 'c' && b != 'g' && b != 't' {
95                         // return "tile not found" if seq has any
96                         // no-calls
97                         return tileLibRef{tag: tag}
98                 }
99         }
100         tilelib.mtx.Lock()
101         defer tilelib.mtx.Unlock()
102         // if tilelib.seq == nil {
103         //      tilelib.seq = map[[blake2b.Size]byte][]byte{}
104         // }
105         if tilelib.variant == nil {
106                 tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
107         }
108         seqhash := blake2b.Sum256(seq)
109         for i, varhash := range tilelib.variant[tag] {
110                 if varhash == seqhash {
111                         return tileLibRef{tag: tag, variant: tileVariantID(i + 1)}
112                 }
113         }
114         tilelib.variants++
115         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
116         // tilelib.seq[seqhash] = append([]byte(nil), seq...)
117         return tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))}
118 }