Fix some tests.
[lightning.git] / tilelib.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "bufio"
9         "bytes"
10         "context"
11         "encoding/gob"
12         "fmt"
13         "io"
14         "math/rand"
15         "os"
16         "regexp"
17         "runtime"
18         "sort"
19         "strings"
20         "sync"
21         "sync/atomic"
22
23         "github.com/klauspost/pgzip"
24         log "github.com/sirupsen/logrus"
25         "golang.org/x/crypto/blake2b"
26 )
27
28 type tileVariantID uint16 // 1-based
29
30 type tileLibRef struct {
31         Tag     tagID
32         Variant tileVariantID
33 }
34
35 type tileSeq map[string][]tileLibRef
36
37 func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
38         maxtag := 0
39         for _, refs := range tseq {
40                 for _, ref := range refs {
41                         if maxtag < int(ref.Tag) {
42                                 maxtag = int(ref.Tag)
43                         }
44                 }
45         }
46         vars := make([]tileVariantID, maxtag+1)
47         var kept, dropped int
48         for _, refs := range tseq {
49                 for _, ref := range refs {
50                         if vars[int(ref.Tag)] != 0 {
51                                 dropped++
52                         } else {
53                                 kept++
54                         }
55                         vars[int(ref.Tag)] = ref.Variant
56                 }
57         }
58         return vars, kept, dropped
59 }
60
61 type tileLibrary struct {
62         retainNoCalls       bool
63         skipOOO             bool
64         retainTileSequences bool
65         useDups             bool
66
67         taglib         *tagLibrary
68         variant        [][][blake2b.Size256]byte
69         refseqs        map[string]map[string][]tileLibRef
70         compactGenomes map[string][]tileVariantID
71         seq2           map[[2]byte]map[[blake2b.Size256]byte][]byte
72         seq2lock       map[[2]byte]sync.Locker
73         variants       int64
74         // if non-nil, write out any tile variants added while tiling
75         encoder *gob.Encoder
76         // set Ref flag when writing new variants to encoder
77         encodeRef bool
78
79         onAddTileVariant func(libref tileLibRef, hash [blake2b.Size256]byte, seq []byte) error
80         onAddGenome      func(CompactGenome) error
81         onAddRefseq      func(CompactSequence) error
82
83         mtx   sync.RWMutex
84         vlock []sync.Locker
85 }
86
87 func (tilelib *tileLibrary) loadTagSet(newtagset [][]byte) error {
88         // Loading a tagset means either passing it through to the
89         // output (if it's the first one we've seen), or just ensuring
90         // it doesn't disagree with what we already have.
91         if len(newtagset) == 0 {
92                 return nil
93         }
94         tilelib.mtx.Lock()
95         defer tilelib.mtx.Unlock()
96         if tilelib.taglib == nil || tilelib.taglib.Len() == 0 {
97                 tilelib.taglib = &tagLibrary{}
98                 err := tilelib.taglib.setTags(newtagset)
99                 if err != nil {
100                         return err
101                 }
102                 if tilelib.encoder != nil {
103                         err = tilelib.encoder.Encode(LibraryEntry{
104                                 TagSet: newtagset,
105                         })
106                         if err != nil {
107                                 return err
108                         }
109                 }
110         } else if tilelib.taglib.Len() != len(newtagset) {
111                 return fmt.Errorf("cannot merge libraries with differing tagsets")
112         } else {
113                 current := tilelib.taglib.Tags()
114                 for i := range newtagset {
115                         if !bytes.Equal(newtagset[i], current[i]) {
116                                 return fmt.Errorf("cannot merge libraries with differing tagsets")
117                         }
118                 }
119         }
120         return nil
121 }
122
123 func (tilelib *tileLibrary) loadTileVariants(tvs []TileVariant, variantmap map[tileLibRef]tileVariantID) error {
124         for _, tv := range tvs {
125                 // Assign a new variant ID (unique across all inputs)
126                 // for each input variant.
127                 variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence, tv.Ref).Variant
128         }
129         return nil
130 }
131
132 func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap map[tileLibRef]tileVariantID) error {
133         log.Debugf("loadCompactGenomes: %d", len(cgs))
134         var wg sync.WaitGroup
135         errs := make(chan error, 1)
136         for _, cg := range cgs {
137                 wg.Add(1)
138                 cg := cg
139                 go func() {
140                         defer wg.Done()
141                         for i, variant := range cg.Variants {
142                                 if len(errs) > 0 {
143                                         return
144                                 }
145                                 if variant == 0 {
146                                         continue
147                                 }
148                                 tag := tagID(i / 2)
149                                 newvariant, ok := variantmap[tileLibRef{Tag: tag, Variant: variant}]
150                                 if !ok {
151                                         err := fmt.Errorf("oops: genome %q has variant %d for tag %d, but that variant was not in its library", cg.Name, variant, tag)
152                                         select {
153                                         case errs <- err:
154                                         default:
155                                         }
156                                         return
157                                 }
158                                 // log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
159                                 cg.Variants[i] = newvariant
160                         }
161                         if tilelib.onAddGenome != nil {
162                                 err := tilelib.onAddGenome(cg)
163                                 if err != nil {
164                                         select {
165                                         case errs <- err:
166                                         default:
167                                         }
168                                         return
169                                 }
170                         }
171                         if tilelib.encoder != nil {
172                                 err := tilelib.encoder.Encode(LibraryEntry{
173                                         CompactGenomes: []CompactGenome{cg},
174                                 })
175                                 if err != nil {
176                                         select {
177                                         case errs <- err:
178                                         default:
179                                         }
180                                         return
181                                 }
182                         }
183                         if tilelib.compactGenomes != nil {
184                                 tilelib.mtx.Lock()
185                                 defer tilelib.mtx.Unlock()
186                                 tilelib.compactGenomes[cg.Name] = cg.Variants
187                         }
188                 }()
189         }
190         wg.Wait()
191         go close(errs)
192         return <-errs
193 }
194
195 func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, variantmap map[tileLibRef]tileVariantID) error {
196         log.Infof("loadCompactSequences: %d todo", len(cseqs))
197         for _, cseq := range cseqs {
198                 log.Infof("loadCompactSequences: checking %s", cseq.Name)
199                 for _, tseq := range cseq.TileSequences {
200                         for i, libref := range tseq {
201                                 if libref.Variant == 0 {
202                                         // No variant (e.g., import
203                                         // dropped tiles with
204                                         // no-calls) = no translation.
205                                         continue
206                                 }
207                                 v, ok := variantmap[libref]
208                                 if !ok {
209                                         return fmt.Errorf("oops: CompactSequence %q has variant %d for tag %d, but that variant was not in its library", cseq.Name, libref.Variant, libref.Tag)
210                                 }
211                                 tseq[i].Variant = v
212                         }
213                 }
214                 if tilelib.encoder != nil {
215                         if err := tilelib.encoder.Encode(LibraryEntry{
216                                 CompactSequences: []CompactSequence{cseq},
217                         }); err != nil {
218                                 return err
219                         }
220                 }
221                 if tilelib.onAddRefseq != nil {
222                         err := tilelib.onAddRefseq(cseq)
223                         if err != nil {
224                                 return err
225                         }
226                 }
227                 log.Infof("loadCompactSequences: checking %s done", cseq.Name)
228         }
229         tilelib.mtx.Lock()
230         defer tilelib.mtx.Unlock()
231         if tilelib.refseqs == nil {
232                 tilelib.refseqs = map[string]map[string][]tileLibRef{}
233         }
234         for _, cseq := range cseqs {
235                 tilelib.refseqs[cseq.Name] = cseq.TileSequences
236         }
237         log.Info("loadCompactSequences: done")
238         return nil
239 }
240
241 func allFiles(path string, re *regexp.Regexp) ([]string, error) {
242         var files []string
243         f, err := open(path)
244         if err != nil {
245                 return nil, err
246         }
247         defer f.Close()
248         fis, err := f.Readdir(-1)
249         if err != nil {
250                 return []string{path}, nil
251         }
252         for _, fi := range fis {
253                 if fi.Name() == "." || fi.Name() == ".." {
254                         continue
255                 } else if child := path + "/" + fi.Name(); fi.IsDir() {
256                         add, err := allFiles(child, re)
257                         if err != nil {
258                                 return nil, err
259                         }
260                         files = append(files, add...)
261                 } else if re == nil || re.MatchString(child) {
262                         files = append(files, child)
263                 }
264         }
265         sort.Strings(files)
266         return files, nil
267 }
268
269 var matchGobFile = regexp.MustCompile(`\.gob(\.gz)?$`)
270
271 func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string) error {
272         log.Infof("LoadDir: walk dir %s", path)
273         files, err := allFiles(path, matchGobFile)
274         if err != nil {
275                 return err
276         }
277         ctx, cancel := context.WithCancel(ctx)
278         defer cancel()
279         var mtx sync.Mutex
280         allcgs := make([][]CompactGenome, len(files))
281         allcseqs := make([][]CompactSequence, len(files))
282         allvariantmap := map[tileLibRef]tileVariantID{}
283         errs := make(chan error, len(files))
284         log.Infof("LoadDir: read %d files", len(files))
285         for fileno, path := range files {
286                 fileno, path := fileno, path
287                 go func() {
288                         f, err := open(path)
289                         if err != nil {
290                                 errs <- err
291                                 return
292                         }
293                         defer f.Close()
294                         defer log.Infof("LoadDir: finished reading %s", path)
295
296                         var variantmap = map[tileLibRef]tileVariantID{}
297                         var cgs []CompactGenome
298                         var cseqs []CompactSequence
299                         err = DecodeLibrary(f, strings.HasSuffix(path, ".gz"), func(ent *LibraryEntry) error {
300                                 if ctx.Err() != nil {
301                                         return ctx.Err()
302                                 }
303                                 if len(ent.TagSet) > 0 {
304                                         mtx.Lock()
305                                         if tilelib.taglib == nil || tilelib.taglib.Len() != len(ent.TagSet) {
306                                                 // load first set of tags, or
307                                                 // report mismatch if 2 sets
308                                                 // have different #tags.
309                                                 if err := tilelib.loadTagSet(ent.TagSet); err != nil {
310                                                         mtx.Unlock()
311                                                         return err
312                                                 }
313                                         }
314                                         mtx.Unlock()
315                                 }
316                                 for _, tv := range ent.TileVariants {
317                                         variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence, tv.Ref).Variant
318                                 }
319                                 cgs = append(cgs, ent.CompactGenomes...)
320                                 cseqs = append(cseqs, ent.CompactSequences...)
321                                 return nil
322                         })
323                         allcgs[fileno] = cgs
324                         allcseqs[fileno] = cseqs
325                         mtx.Lock()
326                         defer mtx.Unlock()
327                         for k, v := range variantmap {
328                                 allvariantmap[k] = v
329                         }
330                         errs <- err
331                 }()
332         }
333         for range files {
334                 err := <-errs
335                 if err != nil {
336                         return err
337                 }
338         }
339
340         log.Info("LoadDir: loadCompactGenomes")
341         var flatcgs []CompactGenome
342         for _, cgs := range allcgs {
343                 flatcgs = append(flatcgs, cgs...)
344         }
345         err = tilelib.loadCompactGenomes(flatcgs, allvariantmap)
346         if err != nil {
347                 return err
348         }
349
350         log.Info("LoadDir: loadCompactSequences")
351         var flatcseqs []CompactSequence
352         for _, cseqs := range allcseqs {
353                 flatcseqs = append(flatcseqs, cseqs...)
354         }
355         err = tilelib.loadCompactSequences(flatcseqs, allvariantmap)
356         if err != nil {
357                 return err
358         }
359
360         log.Info("LoadDir done")
361         return nil
362 }
363
364 func (tilelib *tileLibrary) WriteDir(dir string) error {
365         ntilefiles := 128
366         nfiles := ntilefiles + len(tilelib.refseqs)
367         files := make([]*os.File, nfiles)
368         for i := range files {
369                 f, err := os.OpenFile(fmt.Sprintf("%s/library.%04d.gob.gz", dir, i), os.O_CREATE|os.O_WRONLY, 0666)
370                 if err != nil {
371                         return err
372                 }
373                 defer f.Close()
374                 files[i] = f
375         }
376         bufws := make([]*bufio.Writer, nfiles)
377         for i := range bufws {
378                 bufws[i] = bufio.NewWriterSize(files[i], 1<<26)
379         }
380         zws := make([]*pgzip.Writer, nfiles)
381         for i := range zws {
382                 zws[i] = pgzip.NewWriter(bufws[i])
383                 defer zws[i].Close()
384         }
385         encoders := make([]*gob.Encoder, nfiles)
386         for i := range encoders {
387                 encoders[i] = gob.NewEncoder(zws[i])
388         }
389
390         cgnames := make([]string, 0, len(tilelib.compactGenomes))
391         for name := range tilelib.compactGenomes {
392                 cgnames = append(cgnames, name)
393         }
394         sort.Strings(cgnames)
395
396         refnames := make([]string, 0, len(tilelib.refseqs))
397         for name := range tilelib.refseqs {
398                 refnames = append(refnames, name)
399         }
400         sort.Strings(refnames)
401
402         log.Infof("WriteDir: writing %d files", nfiles)
403         ctx, cancel := context.WithCancel(context.Background())
404         defer cancel()
405         errs := make(chan error, nfiles)
406         for start := range files {
407                 start := start
408                 go func() {
409                         err := encoders[start].Encode(LibraryEntry{TagSet: tilelib.taglib.Tags()})
410                         if err != nil {
411                                 errs <- err
412                                 return
413                         }
414                         if refidx := start - ntilefiles; refidx >= 0 {
415                                 // write each ref to its own file
416                                 // (they seem to load very slowly)
417                                 name := refnames[refidx]
418                                 errs <- encoders[start].Encode(LibraryEntry{CompactSequences: []CompactSequence{{
419                                         Name:          name,
420                                         TileSequences: tilelib.refseqs[name],
421                                 }}})
422                                 return
423                         }
424                         for i := start; i < len(cgnames); i += ntilefiles {
425                                 err := encoders[start].Encode(LibraryEntry{CompactGenomes: []CompactGenome{{
426                                         Name:     cgnames[i],
427                                         Variants: tilelib.compactGenomes[cgnames[i]],
428                                 }}})
429                                 if err != nil {
430                                         errs <- err
431                                         return
432                                 }
433                         }
434                         tvs := []TileVariant{}
435                         for tag := start; tag < len(tilelib.variant) && ctx.Err() == nil; tag += ntilefiles {
436                                 tvs = tvs[:0]
437                                 for idx, hash := range tilelib.variant[tag] {
438                                         tvs = append(tvs, TileVariant{
439                                                 Tag:      tagID(tag),
440                                                 Variant:  tileVariantID(idx + 1),
441                                                 Blake2b:  hash,
442                                                 Sequence: tilelib.hashSequence(hash),
443                                         })
444                                 }
445                                 err := encoders[start].Encode(LibraryEntry{TileVariants: tvs})
446                                 if err != nil {
447                                         errs <- err
448                                         return
449                                 }
450                         }
451                         errs <- nil
452                 }()
453         }
454         for range files {
455                 err := <-errs
456                 if err != nil {
457                         return err
458                 }
459         }
460         log.Info("WriteDir: flushing")
461         for i := range zws {
462                 err := zws[i].Close()
463                 if err != nil {
464                         return err
465                 }
466                 err = bufws[i].Flush()
467                 if err != nil {
468                         return err
469                 }
470                 err = files[i].Close()
471                 if err != nil {
472                         return err
473                 }
474         }
475         log.Info("WriteDir: done")
476         return nil
477 }
478
479 // Load library data from rdr. Tile variants might be renumbered in
480 // the process; in that case, genomes variants will be renumbered to
481 // match.
482 func (tilelib *tileLibrary) LoadGob(ctx context.Context, rdr io.Reader, gz bool) error {
483         cgs := []CompactGenome{}
484         cseqs := []CompactSequence{}
485         variantmap := map[tileLibRef]tileVariantID{}
486         err := DecodeLibrary(rdr, gz, func(ent *LibraryEntry) error {
487                 if ctx.Err() != nil {
488                         return ctx.Err()
489                 }
490                 if err := tilelib.loadTagSet(ent.TagSet); err != nil {
491                         return err
492                 }
493                 if err := tilelib.loadTileVariants(ent.TileVariants, variantmap); err != nil {
494                         return err
495                 }
496                 cgs = append(cgs, ent.CompactGenomes...)
497                 cseqs = append(cseqs, ent.CompactSequences...)
498                 return nil
499         })
500         if err != nil {
501                 return err
502         }
503         if ctx.Err() != nil {
504                 return ctx.Err()
505         }
506         err = tilelib.loadCompactGenomes(cgs, variantmap)
507         if err != nil {
508                 return err
509         }
510         err = tilelib.loadCompactSequences(cseqs, variantmap)
511         if err != nil {
512                 return err
513         }
514         return nil
515 }
516
517 func (tilelib *tileLibrary) dump(out io.Writer) {
518         printTV := func(tag int, variant tileVariantID) {
519                 if variant < 1 {
520                         fmt.Fprintf(out, " -")
521                 } else if tag >= len(tilelib.variant) {
522                         fmt.Fprintf(out, " (!tag=%d)", tag)
523                 } else if int(variant) > len(tilelib.variant[tag]) {
524                         fmt.Fprintf(out, " (tag=%d,!variant=%d)", tag, variant)
525                 } else {
526                         fmt.Fprintf(out, " %x", tilelib.variant[tag][variant-1][:8])
527                 }
528         }
529         for refname, refseqs := range tilelib.refseqs {
530                 for seqname, seq := range refseqs {
531                         fmt.Fprintf(out, "ref %s %s", refname, seqname)
532                         for _, libref := range seq {
533                                 printTV(int(libref.Tag), libref.Variant)
534                         }
535                         fmt.Fprintf(out, "\n")
536                 }
537         }
538         for name, cg := range tilelib.compactGenomes {
539                 fmt.Fprintf(out, "cg %s", name)
540                 for tag, variant := range cg {
541                         printTV(tag/2, variant)
542                 }
543                 fmt.Fprintf(out, "\n")
544         }
545 }
546
547 type importStats struct {
548         InputFile             string
549         InputLabel            string
550         InputLength           int
551         InputCoverage         int
552         PathLength            int
553         DroppedRepeatedTags   int
554         DroppedOutOfOrderTags int
555 }
556
557 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChromosome *regexp.Regexp, isRef bool) (tileSeq, []importStats, error) {
558         ret := tileSeq{}
559         type foundtag struct {
560                 pos   int
561                 tagid tagID
562         }
563         found := make([]foundtag, 2000000)
564         path := make([]tileLibRef, 2000000)
565         totalFoundTags := 0
566         totalPathLen := 0
567         skippedSequences := 0
568         taglen := tilelib.taglib.TagLen()
569         var stats []importStats
570
571         in := bufio.NewReader(rdr)
572 readall:
573         for {
574                 var seqlabel string
575                 // Advance to next '>', then
576                 // read seqlabel up to \r?\n
577         readseqlabel:
578                 for seqlabelStarted := false; ; {
579                         rune, _, err := in.ReadRune()
580                         if err == io.EOF {
581                                 break readall
582                         } else if err != nil {
583                                 return nil, nil, err
584                         }
585                         switch {
586                         case rune == '\r':
587                         case seqlabelStarted && rune == '\n':
588                                 break readseqlabel
589                         case seqlabelStarted:
590                                 seqlabel += string(rune)
591                         case rune == '>':
592                                 seqlabelStarted = true
593                         default:
594                         }
595                 }
596                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
597                 if !matchChromosome.MatchString(seqlabel) {
598                         skippedSequences++
599                         continue
600                 }
601                 log.Debugf("%s %s tiling", filelabel, seqlabel)
602
603                 fasta := bytes.NewBuffer(nil)
604                 found = found[:0]
605                 err := tilelib.taglib.FindAll(in, fasta, func(tagid tagID, pos, taglen int) {
606                         found = append(found, foundtag{pos: pos, tagid: tagid})
607                 })
608                 if err != nil {
609                         return nil, nil, err
610                 }
611                 totalFoundTags += len(found)
612                 if len(found) == 0 {
613                         log.Warnf("%s %s no tags found", filelabel, seqlabel)
614                 }
615
616                 droppedDup := 0
617                 if !tilelib.useDups {
618                         // Remove any tags that appeared more than once
619                         dup := map[tagID]bool{}
620                         for _, ft := range found {
621                                 _, dup[ft.tagid] = dup[ft.tagid]
622                         }
623                         dst := 0
624                         for _, ft := range found {
625                                 if !dup[ft.tagid] {
626                                         found[dst] = ft
627                                         dst++
628                                 }
629                         }
630                         droppedDup = len(found) - dst
631                         log.Infof("%s %s dropping %d non-unique tags", filelabel, seqlabel, droppedDup)
632                         found = found[:dst]
633                 }
634
635                 droppedOOO := 0
636                 if tilelib.skipOOO {
637                         keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
638                         for i, x := range keep {
639                                 found[i] = found[x]
640                         }
641                         droppedOOO = len(found) - len(keep)
642                         log.Infof("%s %s dropping %d out-of-order tags", filelabel, seqlabel, droppedOOO)
643                         found = found[:len(keep)]
644                 }
645
646                 log.Infof("%s %s getting %d librefs", filelabel, seqlabel, len(found))
647                 path = path[:len(found)]
648                 var lowquality int64
649                 // Visit each element of found, but start at a random
650                 // index, to reduce the likelihood of lock contention
651                 // when importing many samples concurrently.
652                 startpoint := rand.Int() % len(found)
653                 for offset := range found {
654                         i := startpoint + offset
655                         if i >= len(found) {
656                                 i -= len(found)
657                         }
658                         f := found[i]
659                         var startpos, endpos int
660                         if i == 0 {
661                                 startpos = 0
662                         } else {
663                                 startpos = f.pos
664                         }
665                         if i == len(found)-1 {
666                                 endpos = fasta.Len()
667                         } else {
668                                 endpos = found[i+1].pos + taglen
669                         }
670                         path[i] = tilelib.getRef(f.tagid, fasta.Bytes()[startpos:endpos], isRef)
671                         if countBases(fasta.Bytes()[startpos:endpos]) != endpos-startpos {
672                                 lowquality++
673                         }
674                 }
675
676                 log.Infof("%s %s copying path", filelabel, seqlabel)
677
678                 pathcopy := make([]tileLibRef, len(path))
679                 copy(pathcopy, path)
680                 ret[seqlabel] = pathcopy
681
682                 basesIn := countBases(fasta.Bytes())
683                 log.Infof("%s %s fasta in %d coverage in %d path len %d low-quality %d", filelabel, seqlabel, fasta.Len(), basesIn, len(path), lowquality)
684                 stats = append(stats, importStats{
685                         InputFile:             filelabel,
686                         InputLabel:            seqlabel,
687                         InputLength:           fasta.Len(),
688                         InputCoverage:         basesIn,
689                         PathLength:            len(path),
690                         DroppedOutOfOrderTags: droppedOOO,
691                         DroppedRepeatedTags:   droppedDup,
692                 })
693
694                 totalPathLen += len(path)
695         }
696         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences that did not match chromosome regexp, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
697         return ret, stats, nil
698 }
699
700 func (tilelib *tileLibrary) Len() int64 {
701         return atomic.LoadInt64(&tilelib.variants)
702 }
703
704 // Return a tileLibRef for a tile with the given tag and sequence,
705 // adding the sequence to the library if needed.
706 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte, usedByRef bool) tileLibRef {
707         dropSeq := false
708         if !tilelib.retainNoCalls {
709                 for _, b := range seq {
710                         if b != 'a' && b != 'c' && b != 'g' && b != 't' {
711                                 dropSeq = true
712                                 break
713                         }
714                 }
715         }
716         seqhash := blake2b.Sum256(seq)
717         var vlock sync.Locker
718
719         tilelib.mtx.RLock()
720         if len(tilelib.vlock) > int(tag) {
721                 vlock = tilelib.vlock[tag]
722         }
723         tilelib.mtx.RUnlock()
724
725         if vlock != nil {
726                 vlock.Lock()
727                 for i, varhash := range tilelib.variant[tag] {
728                         if varhash == seqhash {
729                                 vlock.Unlock()
730                                 return tileLibRef{Tag: tag, Variant: tileVariantID(i + 1)}
731                         }
732                 }
733                 vlock.Unlock()
734         } else {
735                 tilelib.mtx.Lock()
736                 if tilelib.variant == nil && tilelib.taglib != nil {
737                         tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
738                         tilelib.vlock = make([]sync.Locker, tilelib.taglib.Len())
739                         for i := range tilelib.vlock {
740                                 tilelib.vlock[i] = new(sync.Mutex)
741                         }
742                 }
743                 if int(tag) >= len(tilelib.variant) {
744                         oldlen := len(tilelib.vlock)
745                         for i := 0; i < oldlen; i++ {
746                                 tilelib.vlock[i].Lock()
747                         }
748                         // If we haven't seen the tag library yet (as
749                         // in a merge), tilelib.taglib.Len() is
750                         // zero. We can still behave correctly, we
751                         // just need to expand the tilelib.variant and
752                         // tilelib.vlock slices as needed.
753                         if int(tag) >= cap(tilelib.variant) {
754                                 // Allocate 2x capacity.
755                                 newslice := make([][][blake2b.Size256]byte, int(tag)+1, (int(tag)+1)*2)
756                                 copy(newslice, tilelib.variant)
757                                 tilelib.variant = newslice[:int(tag)+1]
758                                 newvlock := make([]sync.Locker, int(tag)+1, (int(tag)+1)*2)
759                                 copy(newvlock, tilelib.vlock)
760                                 tilelib.vlock = newvlock[:int(tag)+1]
761                         } else {
762                                 // Use previously allocated capacity,
763                                 // avoiding copy.
764                                 tilelib.variant = tilelib.variant[:int(tag)+1]
765                                 tilelib.vlock = tilelib.vlock[:int(tag)+1]
766                         }
767                         for i := oldlen; i < len(tilelib.vlock); i++ {
768                                 tilelib.vlock[i] = new(sync.Mutex)
769                         }
770                         for i := 0; i < oldlen; i++ {
771                                 tilelib.vlock[i].Unlock()
772                         }
773                 }
774                 vlock = tilelib.vlock[tag]
775                 tilelib.mtx.Unlock()
776         }
777
778         vlock.Lock()
779         for i, varhash := range tilelib.variant[tag] {
780                 if varhash == seqhash {
781                         vlock.Unlock()
782                         return tileLibRef{Tag: tag, Variant: tileVariantID(i + 1)}
783                 }
784         }
785         atomic.AddInt64(&tilelib.variants, 1)
786         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
787         variant := tileVariantID(len(tilelib.variant[tag]))
788         vlock.Unlock()
789
790         if tilelib.retainTileSequences && !dropSeq {
791                 seqCopy := append([]byte(nil), seq...)
792                 if tilelib.seq2 == nil {
793                         tilelib.mtx.Lock()
794                         if tilelib.seq2 == nil {
795                                 tilelib.seq2lock = map[[2]byte]sync.Locker{}
796                                 m := map[[2]byte]map[[blake2b.Size256]byte][]byte{}
797                                 var k [2]byte
798                                 for i := 0; i < 256; i++ {
799                                         k[0] = byte(i)
800                                         for j := 0; j < 256; j++ {
801                                                 k[1] = byte(j)
802                                                 m[k] = map[[blake2b.Size256]byte][]byte{}
803                                                 tilelib.seq2lock[k] = &sync.Mutex{}
804                                         }
805                                 }
806                                 tilelib.seq2 = m
807                         }
808                         tilelib.mtx.Unlock()
809                 }
810                 var k [2]byte
811                 copy(k[:], seqhash[:])
812                 locker := tilelib.seq2lock[k]
813                 locker.Lock()
814                 tilelib.seq2[k][seqhash] = seqCopy
815                 locker.Unlock()
816         }
817
818         saveSeq := seq
819         if dropSeq {
820                 // Save the hash, but not the sequence
821                 saveSeq = nil
822         }
823         if tilelib.encoder != nil {
824                 tilelib.encoder.Encode(LibraryEntry{
825                         TileVariants: []TileVariant{{
826                                 Tag:      tag,
827                                 Ref:      usedByRef,
828                                 Variant:  variant,
829                                 Blake2b:  seqhash,
830                                 Sequence: saveSeq,
831                         }},
832                 })
833         }
834         if tilelib.onAddTileVariant != nil {
835                 tilelib.onAddTileVariant(tileLibRef{tag, variant}, seqhash, saveSeq)
836         }
837         return tileLibRef{Tag: tag, Variant: variant}
838 }
839
840 func (tilelib *tileLibrary) hashSequence(hash [blake2b.Size256]byte) []byte {
841         var partition [2]byte
842         copy(partition[:], hash[:])
843         return tilelib.seq2[partition][hash]
844 }
845
846 func (tilelib *tileLibrary) TileVariantSequence(libref tileLibRef) []byte {
847         if libref.Variant == 0 || len(tilelib.variant) <= int(libref.Tag) || len(tilelib.variant[libref.Tag]) < int(libref.Variant) {
848                 return nil
849         }
850         return tilelib.hashSequence(tilelib.variant[libref.Tag][libref.Variant-1])
851 }
852
853 // Tidy deletes unreferenced tile variants and renumbers variants so
854 // more common variants have smaller IDs.
855 func (tilelib *tileLibrary) Tidy() {
856         log.Print("Tidy: compute inref")
857         inref := map[tileLibRef]bool{}
858         for _, refseq := range tilelib.refseqs {
859                 for _, librefs := range refseq {
860                         for _, libref := range librefs {
861                                 inref[libref] = true
862                         }
863                 }
864         }
865         log.Print("Tidy: compute remap")
866         remap := make([][]tileVariantID, len(tilelib.variant))
867         throttle := throttle{Max: runtime.NumCPU() + 1}
868         for tag, oldvariants := range tilelib.variant {
869                 tag, oldvariants := tagID(tag), oldvariants
870                 if tag%1000000 == 0 {
871                         log.Printf("Tidy: tag %d", tag)
872                 }
873                 throttle.Acquire()
874                 go func() {
875                         defer throttle.Release()
876                         uses := make([]int, len(oldvariants))
877                         for _, cg := range tilelib.compactGenomes {
878                                 for phase := 0; phase < 2; phase++ {
879                                         cgi := int(tag)*2 + phase
880                                         if cgi < len(cg) && cg[cgi] > 0 {
881                                                 uses[cg[cgi]-1]++
882                                         }
883                                 }
884                         }
885
886                         // Compute desired order of variants:
887                         // neworder[x] == index in oldvariants that
888                         // should move to position x.
889                         neworder := make([]int, len(oldvariants))
890                         for i := range neworder {
891                                 neworder[i] = i
892                         }
893                         sort.Slice(neworder, func(i, j int) bool {
894                                 if cmp := uses[neworder[i]] - uses[neworder[j]]; cmp != 0 {
895                                         return cmp > 0
896                                 } else {
897                                         return bytes.Compare(oldvariants[neworder[i]][:], oldvariants[neworder[j]][:]) < 0
898                                 }
899                         })
900
901                         // Replace tilelib.variant[tag] with a new
902                         // re-ordered slice of hashes, and make a
903                         // mapping from old to new variant IDs.
904                         remaptag := make([]tileVariantID, len(oldvariants)+1)
905                         newvariants := make([][blake2b.Size256]byte, 0, len(neworder))
906                         for _, oldi := range neworder {
907                                 if uses[oldi] > 0 || inref[tileLibRef{Tag: tag, Variant: tileVariantID(oldi + 1)}] {
908                                         newvariants = append(newvariants, oldvariants[oldi])
909                                         remaptag[oldi+1] = tileVariantID(len(newvariants))
910                                 }
911                         }
912                         tilelib.variant[tag] = newvariants
913                         remap[tag] = remaptag
914                 }()
915         }
916         throttle.Wait()
917
918         // Apply remap to genomes and reference sequences, so they
919         // refer to the same tile variants using the changed IDs.
920         log.Print("Tidy: apply remap")
921         var wg sync.WaitGroup
922         for _, cg := range tilelib.compactGenomes {
923                 cg := cg
924                 wg.Add(1)
925                 go func() {
926                         defer wg.Done()
927                         for idx, variant := range cg {
928                                 cg[idx] = remap[tagID(idx/2)][variant]
929                         }
930                 }()
931         }
932         for _, refcs := range tilelib.refseqs {
933                 for _, refseq := range refcs {
934                         refseq := refseq
935                         wg.Add(1)
936                         go func() {
937                                 defer wg.Done()
938                                 for i, tv := range refseq {
939                                         refseq[i].Variant = remap[tv.Tag][tv.Variant]
940                                 }
941                         }()
942                 }
943         }
944         wg.Wait()
945         log.Print("Tidy: done")
946 }
947
948 func countBases(seq []byte) int {
949         n := 0
950         for _, c := range seq {
951                 if isbase[c] {
952                         n++
953                 }
954         }
955         return n
956 }