Fix unchecked error.
[lightning.git] / tilelib.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "bufio"
9         "bytes"
10         "context"
11         "encoding/gob"
12         "fmt"
13         "io"
14         "os"
15         "regexp"
16         "runtime"
17         "sort"
18         "strings"
19         "sync"
20         "sync/atomic"
21
22         "github.com/klauspost/pgzip"
23         log "github.com/sirupsen/logrus"
24         "golang.org/x/crypto/blake2b"
25 )
26
27 type tileVariantID uint16 // 1-based
28
29 type tileLibRef struct {
30         Tag     tagID
31         Variant tileVariantID
32 }
33
34 type tileSeq map[string][]tileLibRef
35
36 func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
37         maxtag := 0
38         for _, refs := range tseq {
39                 for _, ref := range refs {
40                         if maxtag < int(ref.Tag) {
41                                 maxtag = int(ref.Tag)
42                         }
43                 }
44         }
45         vars := make([]tileVariantID, maxtag+1)
46         var kept, dropped int
47         for _, refs := range tseq {
48                 for _, ref := range refs {
49                         if vars[int(ref.Tag)] != 0 {
50                                 dropped++
51                         } else {
52                                 kept++
53                         }
54                         vars[int(ref.Tag)] = ref.Variant
55                 }
56         }
57         return vars, kept, dropped
58 }
59
60 type tileLibrary struct {
61         retainNoCalls       bool
62         skipOOO             bool
63         retainTileSequences bool
64
65         taglib         *tagLibrary
66         variant        [][][blake2b.Size256]byte
67         refseqs        map[string]map[string][]tileLibRef
68         compactGenomes map[string][]tileVariantID
69         seq2           map[[2]byte]map[[blake2b.Size256]byte][]byte
70         seq2lock       map[[2]byte]sync.Locker
71         variants       int64
72         // if non-nil, write out any tile variants added while tiling
73         encoder *gob.Encoder
74
75         mtx   sync.RWMutex
76         vlock []sync.Locker
77 }
78
79 func (tilelib *tileLibrary) loadTagSet(newtagset [][]byte) error {
80         // Loading a tagset means either passing it through to the
81         // output (if it's the first one we've seen), or just ensuring
82         // it doesn't disagree with what we already have.
83         if len(newtagset) == 0 {
84                 return nil
85         }
86         tilelib.mtx.Lock()
87         defer tilelib.mtx.Unlock()
88         if tilelib.taglib == nil || tilelib.taglib.Len() == 0 {
89                 tilelib.taglib = &tagLibrary{}
90                 err := tilelib.taglib.setTags(newtagset)
91                 if err != nil {
92                         return err
93                 }
94                 if tilelib.encoder != nil {
95                         err = tilelib.encoder.Encode(LibraryEntry{
96                                 TagSet: newtagset,
97                         })
98                         if err != nil {
99                                 return err
100                         }
101                 }
102         } else if tilelib.taglib.Len() != len(newtagset) {
103                 return fmt.Errorf("cannot merge libraries with differing tagsets")
104         } else {
105                 current := tilelib.taglib.Tags()
106                 for i := range newtagset {
107                         if !bytes.Equal(newtagset[i], current[i]) {
108                                 return fmt.Errorf("cannot merge libraries with differing tagsets")
109                         }
110                 }
111         }
112         return nil
113 }
114
115 func (tilelib *tileLibrary) loadTileVariants(tvs []TileVariant, variantmap map[tileLibRef]tileVariantID) error {
116         for _, tv := range tvs {
117                 // Assign a new variant ID (unique across all inputs)
118                 // for each input variant.
119                 variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).Variant
120         }
121         return nil
122 }
123
124 func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap map[tileLibRef]tileVariantID, onLoadGenome func(CompactGenome)) error {
125         log.Debugf("loadCompactGenomes: %d", len(cgs))
126         var wg sync.WaitGroup
127         errs := make(chan error, 1)
128         for _, cg := range cgs {
129                 wg.Add(1)
130                 cg := cg
131                 go func() {
132                         defer wg.Done()
133                         for i, variant := range cg.Variants {
134                                 if len(errs) > 0 {
135                                         return
136                                 }
137                                 if variant == 0 {
138                                         continue
139                                 }
140                                 tag := tagID(i / 2)
141                                 newvariant, ok := variantmap[tileLibRef{Tag: tag, Variant: variant}]
142                                 if !ok {
143                                         err := fmt.Errorf("oops: genome %q has variant %d for tag %d, but that variant was not in its library", cg.Name, variant, tag)
144                                         select {
145                                         case errs <- err:
146                                         default:
147                                         }
148                                         return
149                                 }
150                                 // log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
151                                 cg.Variants[i] = newvariant
152                         }
153                         if onLoadGenome != nil {
154                                 onLoadGenome(cg)
155                         }
156                         if tilelib.encoder != nil {
157                                 err := tilelib.encoder.Encode(LibraryEntry{
158                                         CompactGenomes: []CompactGenome{cg},
159                                 })
160                                 if err != nil {
161                                         select {
162                                         case errs <- err:
163                                         default:
164                                         }
165                                         return
166                                 }
167                         }
168                         if tilelib.compactGenomes != nil {
169                                 tilelib.mtx.Lock()
170                                 defer tilelib.mtx.Unlock()
171                                 tilelib.compactGenomes[cg.Name] = cg.Variants
172                         }
173                 }()
174         }
175         wg.Wait()
176         go close(errs)
177         return <-errs
178 }
179
180 func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, variantmap map[tileLibRef]tileVariantID) error {
181         log.Infof("loadCompactSequences: %d todo", len(cseqs))
182         for _, cseq := range cseqs {
183                 log.Infof("loadCompactSequences: checking %s", cseq.Name)
184                 for _, tseq := range cseq.TileSequences {
185                         for i, libref := range tseq {
186                                 if libref.Variant == 0 {
187                                         // No variant (e.g., import
188                                         // dropped tiles with
189                                         // no-calls) = no translation.
190                                         continue
191                                 }
192                                 v, ok := variantmap[libref]
193                                 if !ok {
194                                         return fmt.Errorf("oops: CompactSequence %q has variant %d for tag %d, but that variant was not in its library", cseq.Name, libref.Variant, libref.Tag)
195                                 }
196                                 tseq[i].Variant = v
197                         }
198                 }
199                 if tilelib.encoder != nil {
200                         if err := tilelib.encoder.Encode(LibraryEntry{
201                                 CompactSequences: []CompactSequence{cseq},
202                         }); err != nil {
203                                 return err
204                         }
205                 }
206                 log.Infof("loadCompactSequences: checking %s done", cseq.Name)
207         }
208         tilelib.mtx.Lock()
209         defer tilelib.mtx.Unlock()
210         if tilelib.refseqs == nil {
211                 tilelib.refseqs = map[string]map[string][]tileLibRef{}
212         }
213         for _, cseq := range cseqs {
214                 tilelib.refseqs[cseq.Name] = cseq.TileSequences
215         }
216         log.Info("loadCompactSequences: done")
217         return nil
218 }
219
220 func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGenome func(CompactGenome)) error {
221         var files []string
222         var walk func(string) error
223         walk = func(path string) error {
224                 f, err := open(path)
225                 if err != nil {
226                         return err
227                 }
228                 defer f.Close()
229                 fis, err := f.Readdir(-1)
230                 if err != nil {
231                         files = append(files, path)
232                         return nil
233                 }
234                 for _, fi := range fis {
235                         if fi.Name() == "." || fi.Name() == ".." {
236                                 continue
237                         } else if child := path + "/" + fi.Name(); fi.IsDir() {
238                                 err = walk(child)
239                                 if err != nil {
240                                         return err
241                                 }
242                         } else if strings.HasSuffix(child, ".gob") || strings.HasSuffix(child, ".gob.gz") {
243                                 files = append(files, child)
244                         }
245                 }
246                 return nil
247         }
248         log.Infof("LoadDir: walk dir %s", path)
249         err := walk(path)
250         if err != nil {
251                 return err
252         }
253         ctx, cancel := context.WithCancel(ctx)
254         defer cancel()
255         var mtx sync.Mutex
256         allcgs := make([][]CompactGenome, len(files))
257         allcseqs := make([][]CompactSequence, len(files))
258         allvariantmap := map[tileLibRef]tileVariantID{}
259         errs := make(chan error, len(files))
260         log.Infof("LoadDir: read %d files", len(files))
261         for fileno, path := range files {
262                 fileno, path := fileno, path
263                 go func() {
264                         f, err := open(path)
265                         if err != nil {
266                                 errs <- err
267                                 return
268                         }
269                         defer f.Close()
270                         defer log.Infof("LoadDir: finished reading %s", path)
271
272                         var variantmap = map[tileLibRef]tileVariantID{}
273                         var cgs []CompactGenome
274                         var cseqs []CompactSequence
275                         err = DecodeLibrary(f, strings.HasSuffix(path, ".gz"), func(ent *LibraryEntry) error {
276                                 if ctx.Err() != nil {
277                                         return ctx.Err()
278                                 }
279                                 if len(ent.TagSet) > 0 {
280                                         mtx.Lock()
281                                         if tilelib.taglib == nil || tilelib.taglib.Len() != len(ent.TagSet) {
282                                                 // load first set of tags, or
283                                                 // report mismatch if 2 sets
284                                                 // have different #tags.
285                                                 if err := tilelib.loadTagSet(ent.TagSet); err != nil {
286                                                         mtx.Unlock()
287                                                         return err
288                                                 }
289                                         }
290                                         mtx.Unlock()
291                                 }
292                                 for _, tv := range ent.TileVariants {
293                                         variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).Variant
294                                 }
295                                 cgs = append(cgs, ent.CompactGenomes...)
296                                 cseqs = append(cseqs, ent.CompactSequences...)
297                                 return nil
298                         })
299                         allcgs[fileno] = cgs
300                         allcseqs[fileno] = cseqs
301                         mtx.Lock()
302                         defer mtx.Unlock()
303                         for k, v := range variantmap {
304                                 allvariantmap[k] = v
305                         }
306                         errs <- err
307                 }()
308         }
309         for range files {
310                 err := <-errs
311                 if err != nil {
312                         return err
313                 }
314         }
315
316         log.Info("LoadDir: loadCompactGenomes")
317         var flatcgs []CompactGenome
318         for _, cgs := range allcgs {
319                 flatcgs = append(flatcgs, cgs...)
320         }
321         err = tilelib.loadCompactGenomes(flatcgs, allvariantmap, onLoadGenome)
322         if err != nil {
323                 return err
324         }
325
326         log.Info("LoadDir: loadCompactSequences")
327         var flatcseqs []CompactSequence
328         for _, cseqs := range allcseqs {
329                 flatcseqs = append(flatcseqs, cseqs...)
330         }
331         err = tilelib.loadCompactSequences(flatcseqs, allvariantmap)
332         if err != nil {
333                 return err
334         }
335
336         log.Info("LoadDir done")
337         return nil
338 }
339
340 func (tilelib *tileLibrary) WriteDir(dir string) error {
341         ntilefiles := 128
342         nfiles := ntilefiles + len(tilelib.refseqs)
343         files := make([]*os.File, nfiles)
344         for i := range files {
345                 f, err := os.OpenFile(fmt.Sprintf("%s/library.%04d.gob.gz", dir, i), os.O_CREATE|os.O_WRONLY, 0666)
346                 if err != nil {
347                         return err
348                 }
349                 defer f.Close()
350                 files[i] = f
351         }
352         bufws := make([]*bufio.Writer, nfiles)
353         for i := range bufws {
354                 bufws[i] = bufio.NewWriterSize(files[i], 1<<26)
355         }
356         zws := make([]*pgzip.Writer, nfiles)
357         for i := range zws {
358                 zws[i] = pgzip.NewWriter(bufws[i])
359                 defer zws[i].Close()
360         }
361         encoders := make([]*gob.Encoder, nfiles)
362         for i := range encoders {
363                 encoders[i] = gob.NewEncoder(zws[i])
364         }
365
366         cgnames := make([]string, 0, len(tilelib.compactGenomes))
367         for name := range tilelib.compactGenomes {
368                 cgnames = append(cgnames, name)
369         }
370         sort.Strings(cgnames)
371
372         refnames := make([]string, 0, len(tilelib.refseqs))
373         for name := range tilelib.refseqs {
374                 refnames = append(refnames, name)
375         }
376         sort.Strings(refnames)
377
378         log.Infof("WriteDir: writing %d files", nfiles)
379         ctx, cancel := context.WithCancel(context.Background())
380         defer cancel()
381         errs := make(chan error, nfiles)
382         for start := range files {
383                 start := start
384                 go func() {
385                         err := encoders[start].Encode(LibraryEntry{TagSet: tilelib.taglib.Tags()})
386                         if err != nil {
387                                 errs <- err
388                                 return
389                         }
390                         if refidx := start - ntilefiles; refidx >= 0 {
391                                 // write each ref to its own file
392                                 // (they seem to load very slowly)
393                                 name := refnames[refidx]
394                                 errs <- encoders[start].Encode(LibraryEntry{CompactSequences: []CompactSequence{{
395                                         Name:          name,
396                                         TileSequences: tilelib.refseqs[name],
397                                 }}})
398                                 return
399                         }
400                         for i := start; i < len(cgnames); i += ntilefiles {
401                                 err := encoders[start].Encode(LibraryEntry{CompactGenomes: []CompactGenome{{
402                                         Name:     cgnames[i],
403                                         Variants: tilelib.compactGenomes[cgnames[i]],
404                                 }}})
405                                 if err != nil {
406                                         errs <- err
407                                         return
408                                 }
409                         }
410                         tvs := []TileVariant{}
411                         for tag := start; tag < len(tilelib.variant) && ctx.Err() == nil; tag += ntilefiles {
412                                 tvs = tvs[:0]
413                                 for idx, hash := range tilelib.variant[tag] {
414                                         tvs = append(tvs, TileVariant{
415                                                 Tag:      tagID(tag),
416                                                 Variant:  tileVariantID(idx + 1),
417                                                 Blake2b:  hash,
418                                                 Sequence: tilelib.hashSequence(hash),
419                                         })
420                                 }
421                                 err := encoders[start].Encode(LibraryEntry{TileVariants: tvs})
422                                 if err != nil {
423                                         errs <- err
424                                         return
425                                 }
426                         }
427                         errs <- nil
428                 }()
429         }
430         for range files {
431                 err := <-errs
432                 if err != nil {
433                         return err
434                 }
435         }
436         log.Info("WriteDir: flushing")
437         for i := range zws {
438                 err := zws[i].Close()
439                 if err != nil {
440                         return err
441                 }
442                 err = bufws[i].Flush()
443                 if err != nil {
444                         return err
445                 }
446                 err = files[i].Close()
447                 if err != nil {
448                         return err
449                 }
450         }
451         log.Info("WriteDir: done")
452         return nil
453 }
454
455 // Load library data from rdr. Tile variants might be renumbered in
456 // the process; in that case, genomes variants will be renumbered to
457 // match.
458 //
459 // If onLoadGenome is non-nil, call it on each CompactGenome entry.
460 func (tilelib *tileLibrary) LoadGob(ctx context.Context, rdr io.Reader, gz bool, onLoadGenome func(CompactGenome)) error {
461         cgs := []CompactGenome{}
462         cseqs := []CompactSequence{}
463         variantmap := map[tileLibRef]tileVariantID{}
464         err := DecodeLibrary(rdr, gz, func(ent *LibraryEntry) error {
465                 if ctx.Err() != nil {
466                         return ctx.Err()
467                 }
468                 if err := tilelib.loadTagSet(ent.TagSet); err != nil {
469                         return err
470                 }
471                 if err := tilelib.loadTileVariants(ent.TileVariants, variantmap); err != nil {
472                         return err
473                 }
474                 cgs = append(cgs, ent.CompactGenomes...)
475                 cseqs = append(cseqs, ent.CompactSequences...)
476                 return nil
477         })
478         if err != nil {
479                 return err
480         }
481         if ctx.Err() != nil {
482                 return ctx.Err()
483         }
484         err = tilelib.loadCompactGenomes(cgs, variantmap, onLoadGenome)
485         if err != nil {
486                 return err
487         }
488         err = tilelib.loadCompactSequences(cseqs, variantmap)
489         if err != nil {
490                 return err
491         }
492         return nil
493 }
494
495 func (tilelib *tileLibrary) dump(out io.Writer) {
496         printTV := func(tag int, variant tileVariantID) {
497                 if variant < 1 {
498                         fmt.Fprintf(out, " -")
499                 } else if tag >= len(tilelib.variant) {
500                         fmt.Fprintf(out, " (!tag=%d)", tag)
501                 } else if int(variant) > len(tilelib.variant[tag]) {
502                         fmt.Fprintf(out, " (tag=%d,!variant=%d)", tag, variant)
503                 } else {
504                         fmt.Fprintf(out, " %x", tilelib.variant[tag][variant-1][:8])
505                 }
506         }
507         for refname, refseqs := range tilelib.refseqs {
508                 for seqname, seq := range refseqs {
509                         fmt.Fprintf(out, "ref %s %s", refname, seqname)
510                         for _, libref := range seq {
511                                 printTV(int(libref.Tag), libref.Variant)
512                         }
513                         fmt.Fprintf(out, "\n")
514                 }
515         }
516         for name, cg := range tilelib.compactGenomes {
517                 fmt.Fprintf(out, "cg %s", name)
518                 for tag, variant := range cg {
519                         printTV(tag/2, variant)
520                 }
521                 fmt.Fprintf(out, "\n")
522         }
523 }
524
525 type importStats struct {
526         InputFile              string
527         InputLabel             string
528         InputLength            int
529         InputCoverage          int
530         PathLength             int
531         DroppedOutOfOrderTiles int
532 }
533
534 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChromosome *regexp.Regexp) (tileSeq, []importStats, error) {
535         ret := tileSeq{}
536         type jobT struct {
537                 label string
538                 fasta []byte
539         }
540         todo := make(chan jobT, 1)
541         scanner := bufio.NewScanner(rdr)
542         go func() {
543                 defer close(todo)
544                 var fasta []byte
545                 var seqlabel string
546                 for scanner.Scan() {
547                         buf := scanner.Bytes()
548                         if len(buf) > 0 && buf[0] == '>' {
549                                 todo <- jobT{seqlabel, append([]byte(nil), fasta...)}
550                                 seqlabel, fasta = strings.SplitN(string(buf[1:]), " ", 2)[0], fasta[:0]
551                                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
552                         } else {
553                                 fasta = append(fasta, bytes.ToLower(buf)...)
554                         }
555                 }
556                 todo <- jobT{seqlabel, fasta}
557         }()
558         type foundtag struct {
559                 pos   int
560                 tagid tagID
561         }
562         found := make([]foundtag, 2000000)
563         path := make([]tileLibRef, 2000000)
564         totalFoundTags := 0
565         totalPathLen := 0
566         skippedSequences := 0
567         taglen := tilelib.taglib.TagLen()
568         var stats []importStats
569         for job := range todo {
570                 if len(job.fasta) == 0 {
571                         continue
572                 } else if !matchChromosome.MatchString(job.label) {
573                         skippedSequences++
574                         continue
575                 }
576                 log.Debugf("%s %s tiling", filelabel, job.label)
577
578                 found = found[:0]
579                 tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
580                         found = append(found, foundtag{pos: pos, tagid: tagid})
581                 })
582                 totalFoundTags += len(found)
583                 if len(found) == 0 {
584                         log.Warnf("%s %s no tags found", filelabel, job.label)
585                 }
586
587                 skipped := 0
588                 if tilelib.skipOOO {
589                         log.Infof("%s %s keeping longest increasing subsequence", filelabel, job.label)
590                         keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
591                         for i, x := range keep {
592                                 found[i] = found[x]
593                         }
594                         skipped = len(found) - len(keep)
595                         found = found[:len(keep)]
596                 }
597
598                 log.Infof("%s %s getting %d librefs", filelabel, job.label, len(found))
599                 throttle := &throttle{Max: runtime.NumCPU()}
600                 path = path[:len(found)]
601                 var lowquality int64
602                 for i, f := range found {
603                         i, f := i, f
604                         throttle.Acquire()
605                         go func() {
606                                 defer throttle.Release()
607                                 var startpos, endpos int
608                                 if i == 0 {
609                                         startpos = 0
610                                 } else {
611                                         startpos = f.pos
612                                 }
613                                 if i == len(found)-1 {
614                                         endpos = len(job.fasta)
615                                 } else {
616                                         endpos = found[i+1].pos + taglen
617                                 }
618                                 path[i] = tilelib.getRef(f.tagid, job.fasta[startpos:endpos])
619                                 if countBases(job.fasta[startpos:endpos]) != endpos-startpos {
620                                         atomic.AddInt64(&lowquality, 1)
621                                 }
622                         }()
623                 }
624                 throttle.Wait()
625
626                 log.Infof("%s %s copying path", filelabel, job.label)
627
628                 pathcopy := make([]tileLibRef, len(path))
629                 copy(pathcopy, path)
630                 ret[job.label] = pathcopy
631
632                 basesIn := countBases(job.fasta)
633                 log.Infof("%s %s fasta in %d coverage in %d path len %d low-quality %d skipped-out-of-order %d", filelabel, job.label, len(job.fasta), basesIn, len(path), lowquality, skipped)
634                 stats = append(stats, importStats{
635                         InputFile:              filelabel,
636                         InputLabel:             job.label,
637                         InputLength:            len(job.fasta),
638                         InputCoverage:          basesIn,
639                         PathLength:             len(path),
640                         DroppedOutOfOrderTiles: skipped,
641                 })
642
643                 totalPathLen += len(path)
644         }
645         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences that did not match chromosome regexp, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
646         return ret, stats, scanner.Err()
647 }
648
649 func (tilelib *tileLibrary) Len() int64 {
650         return atomic.LoadInt64(&tilelib.variants)
651 }
652
653 // Return a tileLibRef for a tile with the given tag and sequence,
654 // adding the sequence to the library if needed.
655 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
656         dropSeq := false
657         if !tilelib.retainNoCalls {
658                 for _, b := range seq {
659                         if b != 'a' && b != 'c' && b != 'g' && b != 't' {
660                                 dropSeq = true
661                                 break
662                         }
663                 }
664         }
665         seqhash := blake2b.Sum256(seq)
666         var vlock sync.Locker
667
668         tilelib.mtx.RLock()
669         if len(tilelib.vlock) > int(tag) {
670                 vlock = tilelib.vlock[tag]
671         }
672         tilelib.mtx.RUnlock()
673
674         if vlock != nil {
675                 vlock.Lock()
676                 for i, varhash := range tilelib.variant[tag] {
677                         if varhash == seqhash {
678                                 vlock.Unlock()
679                                 return tileLibRef{Tag: tag, Variant: tileVariantID(i + 1)}
680                         }
681                 }
682                 vlock.Unlock()
683         } else {
684                 tilelib.mtx.Lock()
685                 if tilelib.variant == nil && tilelib.taglib != nil {
686                         tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
687                         tilelib.vlock = make([]sync.Locker, tilelib.taglib.Len())
688                         for i := range tilelib.vlock {
689                                 tilelib.vlock[i] = new(sync.Mutex)
690                         }
691                 }
692                 if int(tag) >= len(tilelib.variant) {
693                         oldlen := len(tilelib.vlock)
694                         for i := 0; i < oldlen; i++ {
695                                 tilelib.vlock[i].Lock()
696                         }
697                         // If we haven't seen the tag library yet (as
698                         // in a merge), tilelib.taglib.Len() is
699                         // zero. We can still behave correctly, we
700                         // just need to expand the tilelib.variant and
701                         // tilelib.vlock slices as needed.
702                         if int(tag) >= cap(tilelib.variant) {
703                                 // Allocate 2x capacity.
704                                 newslice := make([][][blake2b.Size256]byte, int(tag)+1, (int(tag)+1)*2)
705                                 copy(newslice, tilelib.variant)
706                                 tilelib.variant = newslice[:int(tag)+1]
707                                 newvlock := make([]sync.Locker, int(tag)+1, (int(tag)+1)*2)
708                                 copy(newvlock, tilelib.vlock)
709                                 tilelib.vlock = newvlock[:int(tag)+1]
710                         } else {
711                                 // Use previously allocated capacity,
712                                 // avoiding copy.
713                                 tilelib.variant = tilelib.variant[:int(tag)+1]
714                                 tilelib.vlock = tilelib.vlock[:int(tag)+1]
715                         }
716                         for i := oldlen; i < len(tilelib.vlock); i++ {
717                                 tilelib.vlock[i] = new(sync.Mutex)
718                         }
719                         for i := 0; i < oldlen; i++ {
720                                 tilelib.vlock[i].Unlock()
721                         }
722                 }
723                 vlock = tilelib.vlock[tag]
724                 tilelib.mtx.Unlock()
725         }
726
727         vlock.Lock()
728         for i, varhash := range tilelib.variant[tag] {
729                 if varhash == seqhash {
730                         vlock.Unlock()
731                         return tileLibRef{Tag: tag, Variant: tileVariantID(i + 1)}
732                 }
733         }
734         atomic.AddInt64(&tilelib.variants, 1)
735         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
736         variant := tileVariantID(len(tilelib.variant[tag]))
737         vlock.Unlock()
738
739         if tilelib.retainTileSequences && !dropSeq {
740                 seqCopy := append([]byte(nil), seq...)
741                 if tilelib.seq2 == nil {
742                         tilelib.mtx.Lock()
743                         if tilelib.seq2 == nil {
744                                 tilelib.seq2lock = map[[2]byte]sync.Locker{}
745                                 m := map[[2]byte]map[[blake2b.Size256]byte][]byte{}
746                                 var k [2]byte
747                                 for i := 0; i < 256; i++ {
748                                         k[0] = byte(i)
749                                         for j := 0; j < 256; j++ {
750                                                 k[1] = byte(j)
751                                                 m[k] = map[[blake2b.Size256]byte][]byte{}
752                                                 tilelib.seq2lock[k] = &sync.Mutex{}
753                                         }
754                                 }
755                                 tilelib.seq2 = m
756                         }
757                         tilelib.mtx.Unlock()
758                 }
759                 var k [2]byte
760                 copy(k[:], seqhash[:])
761                 locker := tilelib.seq2lock[k]
762                 locker.Lock()
763                 tilelib.seq2[k][seqhash] = seqCopy
764                 locker.Unlock()
765         }
766
767         if tilelib.encoder != nil {
768                 saveSeq := seq
769                 if dropSeq {
770                         // Save the hash, but not the sequence
771                         saveSeq = nil
772                 }
773                 tilelib.encoder.Encode(LibraryEntry{
774                         TileVariants: []TileVariant{{
775                                 Tag:      tag,
776                                 Variant:  variant,
777                                 Blake2b:  seqhash,
778                                 Sequence: saveSeq,
779                         }},
780                 })
781         }
782         return tileLibRef{Tag: tag, Variant: variant}
783 }
784
785 func (tilelib *tileLibrary) hashSequence(hash [blake2b.Size256]byte) []byte {
786         var partition [2]byte
787         copy(partition[:], hash[:])
788         return tilelib.seq2[partition][hash]
789 }
790
791 func (tilelib *tileLibrary) TileVariantSequence(libref tileLibRef) []byte {
792         if libref.Variant == 0 || len(tilelib.variant) <= int(libref.Tag) || len(tilelib.variant[libref.Tag]) < int(libref.Variant) {
793                 return nil
794         }
795         return tilelib.hashSequence(tilelib.variant[libref.Tag][libref.Variant-1])
796 }
797
798 // Tidy deletes unreferenced tile variants and renumbers variants so
799 // more common variants have smaller IDs.
800 func (tilelib *tileLibrary) Tidy() {
801         log.Print("Tidy: compute inref")
802         inref := map[tileLibRef]bool{}
803         for _, refseq := range tilelib.refseqs {
804                 for _, librefs := range refseq {
805                         for _, libref := range librefs {
806                                 inref[libref] = true
807                         }
808                 }
809         }
810         log.Print("Tidy: compute remap")
811         remap := make([][]tileVariantID, len(tilelib.variant))
812         throttle := throttle{Max: runtime.NumCPU() + 1}
813         for tag, oldvariants := range tilelib.variant {
814                 tag, oldvariants := tagID(tag), oldvariants
815                 if tag%1000000 == 0 {
816                         log.Printf("Tidy: tag %d", tag)
817                 }
818                 throttle.Acquire()
819                 go func() {
820                         defer throttle.Release()
821                         uses := make([]int, len(oldvariants))
822                         for _, cg := range tilelib.compactGenomes {
823                                 for phase := 0; phase < 2; phase++ {
824                                         cgi := int(tag)*2 + phase
825                                         if cgi < len(cg) && cg[cgi] > 0 {
826                                                 uses[cg[cgi]-1]++
827                                         }
828                                 }
829                         }
830
831                         // Compute desired order of variants:
832                         // neworder[x] == index in oldvariants that
833                         // should move to position x.
834                         neworder := make([]int, len(oldvariants))
835                         for i := range neworder {
836                                 neworder[i] = i
837                         }
838                         sort.Slice(neworder, func(i, j int) bool {
839                                 if cmp := uses[neworder[i]] - uses[neworder[j]]; cmp != 0 {
840                                         return cmp > 0
841                                 } else {
842                                         return bytes.Compare(oldvariants[neworder[i]][:], oldvariants[neworder[j]][:]) < 0
843                                 }
844                         })
845
846                         // Replace tilelib.variant[tag] with a new
847                         // re-ordered slice of hashes, and make a
848                         // mapping from old to new variant IDs.
849                         remaptag := make([]tileVariantID, len(oldvariants)+1)
850                         newvariants := make([][blake2b.Size256]byte, 0, len(neworder))
851                         for _, oldi := range neworder {
852                                 if uses[oldi] > 0 || inref[tileLibRef{Tag: tag, Variant: tileVariantID(oldi + 1)}] {
853                                         newvariants = append(newvariants, oldvariants[oldi])
854                                         remaptag[oldi+1] = tileVariantID(len(newvariants))
855                                 }
856                         }
857                         tilelib.variant[tag] = newvariants
858                         remap[tag] = remaptag
859                 }()
860         }
861         throttle.Wait()
862
863         // Apply remap to genomes and reference sequences, so they
864         // refer to the same tile variants using the changed IDs.
865         log.Print("Tidy: apply remap")
866         var wg sync.WaitGroup
867         for _, cg := range tilelib.compactGenomes {
868                 cg := cg
869                 wg.Add(1)
870                 go func() {
871                         defer wg.Done()
872                         for idx, variant := range cg {
873                                 cg[idx] = remap[tagID(idx/2)][variant]
874                         }
875                 }()
876         }
877         for _, refcs := range tilelib.refseqs {
878                 for _, refseq := range refcs {
879                         refseq := refseq
880                         wg.Add(1)
881                         go func() {
882                                 defer wg.Done()
883                                 for i, tv := range refseq {
884                                         refseq[i].Variant = remap[tv.Tag][tv.Variant]
885                                 }
886                         }()
887                 }
888         }
889         wg.Wait()
890         log.Print("Tidy: done")
891 }
892
893 func countBases(seq []byte) int {
894         n := 0
895         for _, c := range seq {
896                 if isbase[c] {
897                         n++
898                 }
899         }
900         return n
901 }