Use native client to read annotations.csv.
[lightning.git] / tilelib.go
1 // Copyright (C) The Lightning Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package lightning
6
7 import (
8         "bufio"
9         "bytes"
10         "context"
11         "encoding/gob"
12         "fmt"
13         "io"
14         "os"
15         "regexp"
16         "runtime"
17         "sort"
18         "strings"
19         "sync"
20         "sync/atomic"
21
22         "github.com/klauspost/pgzip"
23         log "github.com/sirupsen/logrus"
24         "golang.org/x/crypto/blake2b"
25 )
26
27 type tileVariantID uint16 // 1-based
28
29 type tileLibRef struct {
30         Tag     tagID
31         Variant tileVariantID
32 }
33
34 type tileSeq map[string][]tileLibRef
35
36 func (tseq tileSeq) Variants() ([]tileVariantID, int, int) {
37         maxtag := 0
38         for _, refs := range tseq {
39                 for _, ref := range refs {
40                         if maxtag < int(ref.Tag) {
41                                 maxtag = int(ref.Tag)
42                         }
43                 }
44         }
45         vars := make([]tileVariantID, maxtag+1)
46         var kept, dropped int
47         for _, refs := range tseq {
48                 for _, ref := range refs {
49                         if vars[int(ref.Tag)] != 0 {
50                                 dropped++
51                         } else {
52                                 kept++
53                         }
54                         vars[int(ref.Tag)] = ref.Variant
55                 }
56         }
57         return vars, kept, dropped
58 }
59
60 type tileLibrary struct {
61         retainNoCalls       bool
62         skipOOO             bool
63         retainTileSequences bool
64
65         taglib         *tagLibrary
66         variant        [][][blake2b.Size256]byte
67         refseqs        map[string]map[string][]tileLibRef
68         compactGenomes map[string][]tileVariantID
69         seq2           map[[2]byte]map[[blake2b.Size256]byte][]byte
70         seq2lock       map[[2]byte]sync.Locker
71         variants       int64
72         // if non-nil, write out any tile variants added while tiling
73         encoder *gob.Encoder
74         // set Ref flag when writing new variants to encoder
75         encodeRef bool
76
77         onAddTileVariant func(libref tileLibRef, hash [blake2b.Size256]byte, seq []byte) error
78         onAddGenome      func(CompactGenome) error
79         onAddRefseq      func(CompactSequence) error
80
81         mtx   sync.RWMutex
82         vlock []sync.Locker
83 }
84
85 func (tilelib *tileLibrary) loadTagSet(newtagset [][]byte) error {
86         // Loading a tagset means either passing it through to the
87         // output (if it's the first one we've seen), or just ensuring
88         // it doesn't disagree with what we already have.
89         if len(newtagset) == 0 {
90                 return nil
91         }
92         tilelib.mtx.Lock()
93         defer tilelib.mtx.Unlock()
94         if tilelib.taglib == nil || tilelib.taglib.Len() == 0 {
95                 tilelib.taglib = &tagLibrary{}
96                 err := tilelib.taglib.setTags(newtagset)
97                 if err != nil {
98                         return err
99                 }
100                 if tilelib.encoder != nil {
101                         err = tilelib.encoder.Encode(LibraryEntry{
102                                 TagSet: newtagset,
103                         })
104                         if err != nil {
105                                 return err
106                         }
107                 }
108         } else if tilelib.taglib.Len() != len(newtagset) {
109                 return fmt.Errorf("cannot merge libraries with differing tagsets")
110         } else {
111                 current := tilelib.taglib.Tags()
112                 for i := range newtagset {
113                         if !bytes.Equal(newtagset[i], current[i]) {
114                                 return fmt.Errorf("cannot merge libraries with differing tagsets")
115                         }
116                 }
117         }
118         return nil
119 }
120
121 func (tilelib *tileLibrary) loadTileVariants(tvs []TileVariant, variantmap map[tileLibRef]tileVariantID) error {
122         for _, tv := range tvs {
123                 // Assign a new variant ID (unique across all inputs)
124                 // for each input variant.
125                 variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence, tv.Ref).Variant
126         }
127         return nil
128 }
129
130 func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap map[tileLibRef]tileVariantID) error {
131         log.Debugf("loadCompactGenomes: %d", len(cgs))
132         var wg sync.WaitGroup
133         errs := make(chan error, 1)
134         for _, cg := range cgs {
135                 wg.Add(1)
136                 cg := cg
137                 go func() {
138                         defer wg.Done()
139                         for i, variant := range cg.Variants {
140                                 if len(errs) > 0 {
141                                         return
142                                 }
143                                 if variant == 0 {
144                                         continue
145                                 }
146                                 tag := tagID(i / 2)
147                                 newvariant, ok := variantmap[tileLibRef{Tag: tag, Variant: variant}]
148                                 if !ok {
149                                         err := fmt.Errorf("oops: genome %q has variant %d for tag %d, but that variant was not in its library", cg.Name, variant, tag)
150                                         select {
151                                         case errs <- err:
152                                         default:
153                                         }
154                                         return
155                                 }
156                                 // log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
157                                 cg.Variants[i] = newvariant
158                         }
159                         if tilelib.onAddGenome != nil {
160                                 err := tilelib.onAddGenome(cg)
161                                 if err != nil {
162                                         select {
163                                         case errs <- err:
164                                         default:
165                                         }
166                                         return
167                                 }
168                         }
169                         if tilelib.encoder != nil {
170                                 err := tilelib.encoder.Encode(LibraryEntry{
171                                         CompactGenomes: []CompactGenome{cg},
172                                 })
173                                 if err != nil {
174                                         select {
175                                         case errs <- err:
176                                         default:
177                                         }
178                                         return
179                                 }
180                         }
181                         if tilelib.compactGenomes != nil {
182                                 tilelib.mtx.Lock()
183                                 defer tilelib.mtx.Unlock()
184                                 tilelib.compactGenomes[cg.Name] = cg.Variants
185                         }
186                 }()
187         }
188         wg.Wait()
189         go close(errs)
190         return <-errs
191 }
192
193 func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, variantmap map[tileLibRef]tileVariantID) error {
194         log.Infof("loadCompactSequences: %d todo", len(cseqs))
195         for _, cseq := range cseqs {
196                 log.Infof("loadCompactSequences: checking %s", cseq.Name)
197                 for _, tseq := range cseq.TileSequences {
198                         for i, libref := range tseq {
199                                 if libref.Variant == 0 {
200                                         // No variant (e.g., import
201                                         // dropped tiles with
202                                         // no-calls) = no translation.
203                                         continue
204                                 }
205                                 v, ok := variantmap[libref]
206                                 if !ok {
207                                         return fmt.Errorf("oops: CompactSequence %q has variant %d for tag %d, but that variant was not in its library", cseq.Name, libref.Variant, libref.Tag)
208                                 }
209                                 tseq[i].Variant = v
210                         }
211                 }
212                 if tilelib.encoder != nil {
213                         if err := tilelib.encoder.Encode(LibraryEntry{
214                                 CompactSequences: []CompactSequence{cseq},
215                         }); err != nil {
216                                 return err
217                         }
218                 }
219                 if tilelib.onAddRefseq != nil {
220                         err := tilelib.onAddRefseq(cseq)
221                         if err != nil {
222                                 return err
223                         }
224                 }
225                 log.Infof("loadCompactSequences: checking %s done", cseq.Name)
226         }
227         tilelib.mtx.Lock()
228         defer tilelib.mtx.Unlock()
229         if tilelib.refseqs == nil {
230                 tilelib.refseqs = map[string]map[string][]tileLibRef{}
231         }
232         for _, cseq := range cseqs {
233                 tilelib.refseqs[cseq.Name] = cseq.TileSequences
234         }
235         log.Info("loadCompactSequences: done")
236         return nil
237 }
238
239 func allFiles(path string, re *regexp.Regexp) ([]string, error) {
240         var files []string
241         f, err := open(path)
242         if err != nil {
243                 return nil, err
244         }
245         defer f.Close()
246         fis, err := f.Readdir(-1)
247         if err != nil {
248                 return []string{path}, nil
249         }
250         for _, fi := range fis {
251                 if fi.Name() == "." || fi.Name() == ".." {
252                         continue
253                 } else if child := path + "/" + fi.Name(); fi.IsDir() {
254                         add, err := allFiles(child, re)
255                         if err != nil {
256                                 return nil, err
257                         }
258                         files = append(files, add...)
259                 } else if re == nil || re.MatchString(child) {
260                         files = append(files, child)
261                 }
262         }
263         sort.Strings(files)
264         return files, nil
265 }
266
267 var matchGobFile = regexp.MustCompile(`\.gob(\.gz)?$`)
268
269 func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string) error {
270         log.Infof("LoadDir: walk dir %s", path)
271         files, err := allFiles(path, matchGobFile)
272         if err != nil {
273                 return err
274         }
275         ctx, cancel := context.WithCancel(ctx)
276         defer cancel()
277         var mtx sync.Mutex
278         allcgs := make([][]CompactGenome, len(files))
279         allcseqs := make([][]CompactSequence, len(files))
280         allvariantmap := map[tileLibRef]tileVariantID{}
281         errs := make(chan error, len(files))
282         log.Infof("LoadDir: read %d files", len(files))
283         for fileno, path := range files {
284                 fileno, path := fileno, path
285                 go func() {
286                         f, err := open(path)
287                         if err != nil {
288                                 errs <- err
289                                 return
290                         }
291                         defer f.Close()
292                         defer log.Infof("LoadDir: finished reading %s", path)
293
294                         var variantmap = map[tileLibRef]tileVariantID{}
295                         var cgs []CompactGenome
296                         var cseqs []CompactSequence
297                         err = DecodeLibrary(f, strings.HasSuffix(path, ".gz"), func(ent *LibraryEntry) error {
298                                 if ctx.Err() != nil {
299                                         return ctx.Err()
300                                 }
301                                 if len(ent.TagSet) > 0 {
302                                         mtx.Lock()
303                                         if tilelib.taglib == nil || tilelib.taglib.Len() != len(ent.TagSet) {
304                                                 // load first set of tags, or
305                                                 // report mismatch if 2 sets
306                                                 // have different #tags.
307                                                 if err := tilelib.loadTagSet(ent.TagSet); err != nil {
308                                                         mtx.Unlock()
309                                                         return err
310                                                 }
311                                         }
312                                         mtx.Unlock()
313                                 }
314                                 for _, tv := range ent.TileVariants {
315                                         variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence, tv.Ref).Variant
316                                 }
317                                 cgs = append(cgs, ent.CompactGenomes...)
318                                 cseqs = append(cseqs, ent.CompactSequences...)
319                                 return nil
320                         })
321                         allcgs[fileno] = cgs
322                         allcseqs[fileno] = cseqs
323                         mtx.Lock()
324                         defer mtx.Unlock()
325                         for k, v := range variantmap {
326                                 allvariantmap[k] = v
327                         }
328                         errs <- err
329                 }()
330         }
331         for range files {
332                 err := <-errs
333                 if err != nil {
334                         return err
335                 }
336         }
337
338         log.Info("LoadDir: loadCompactGenomes")
339         var flatcgs []CompactGenome
340         for _, cgs := range allcgs {
341                 flatcgs = append(flatcgs, cgs...)
342         }
343         err = tilelib.loadCompactGenomes(flatcgs, allvariantmap)
344         if err != nil {
345                 return err
346         }
347
348         log.Info("LoadDir: loadCompactSequences")
349         var flatcseqs []CompactSequence
350         for _, cseqs := range allcseqs {
351                 flatcseqs = append(flatcseqs, cseqs...)
352         }
353         err = tilelib.loadCompactSequences(flatcseqs, allvariantmap)
354         if err != nil {
355                 return err
356         }
357
358         log.Info("LoadDir done")
359         return nil
360 }
361
362 func (tilelib *tileLibrary) WriteDir(dir string) error {
363         ntilefiles := 128
364         nfiles := ntilefiles + len(tilelib.refseqs)
365         files := make([]*os.File, nfiles)
366         for i := range files {
367                 f, err := os.OpenFile(fmt.Sprintf("%s/library.%04d.gob.gz", dir, i), os.O_CREATE|os.O_WRONLY, 0666)
368                 if err != nil {
369                         return err
370                 }
371                 defer f.Close()
372                 files[i] = f
373         }
374         bufws := make([]*bufio.Writer, nfiles)
375         for i := range bufws {
376                 bufws[i] = bufio.NewWriterSize(files[i], 1<<26)
377         }
378         zws := make([]*pgzip.Writer, nfiles)
379         for i := range zws {
380                 zws[i] = pgzip.NewWriter(bufws[i])
381                 defer zws[i].Close()
382         }
383         encoders := make([]*gob.Encoder, nfiles)
384         for i := range encoders {
385                 encoders[i] = gob.NewEncoder(zws[i])
386         }
387
388         cgnames := make([]string, 0, len(tilelib.compactGenomes))
389         for name := range tilelib.compactGenomes {
390                 cgnames = append(cgnames, name)
391         }
392         sort.Strings(cgnames)
393
394         refnames := make([]string, 0, len(tilelib.refseqs))
395         for name := range tilelib.refseqs {
396                 refnames = append(refnames, name)
397         }
398         sort.Strings(refnames)
399
400         log.Infof("WriteDir: writing %d files", nfiles)
401         ctx, cancel := context.WithCancel(context.Background())
402         defer cancel()
403         errs := make(chan error, nfiles)
404         for start := range files {
405                 start := start
406                 go func() {
407                         err := encoders[start].Encode(LibraryEntry{TagSet: tilelib.taglib.Tags()})
408                         if err != nil {
409                                 errs <- err
410                                 return
411                         }
412                         if refidx := start - ntilefiles; refidx >= 0 {
413                                 // write each ref to its own file
414                                 // (they seem to load very slowly)
415                                 name := refnames[refidx]
416                                 errs <- encoders[start].Encode(LibraryEntry{CompactSequences: []CompactSequence{{
417                                         Name:          name,
418                                         TileSequences: tilelib.refseqs[name],
419                                 }}})
420                                 return
421                         }
422                         for i := start; i < len(cgnames); i += ntilefiles {
423                                 err := encoders[start].Encode(LibraryEntry{CompactGenomes: []CompactGenome{{
424                                         Name:     cgnames[i],
425                                         Variants: tilelib.compactGenomes[cgnames[i]],
426                                 }}})
427                                 if err != nil {
428                                         errs <- err
429                                         return
430                                 }
431                         }
432                         tvs := []TileVariant{}
433                         for tag := start; tag < len(tilelib.variant) && ctx.Err() == nil; tag += ntilefiles {
434                                 tvs = tvs[:0]
435                                 for idx, hash := range tilelib.variant[tag] {
436                                         tvs = append(tvs, TileVariant{
437                                                 Tag:      tagID(tag),
438                                                 Variant:  tileVariantID(idx + 1),
439                                                 Blake2b:  hash,
440                                                 Sequence: tilelib.hashSequence(hash),
441                                         })
442                                 }
443                                 err := encoders[start].Encode(LibraryEntry{TileVariants: tvs})
444                                 if err != nil {
445                                         errs <- err
446                                         return
447                                 }
448                         }
449                         errs <- nil
450                 }()
451         }
452         for range files {
453                 err := <-errs
454                 if err != nil {
455                         return err
456                 }
457         }
458         log.Info("WriteDir: flushing")
459         for i := range zws {
460                 err := zws[i].Close()
461                 if err != nil {
462                         return err
463                 }
464                 err = bufws[i].Flush()
465                 if err != nil {
466                         return err
467                 }
468                 err = files[i].Close()
469                 if err != nil {
470                         return err
471                 }
472         }
473         log.Info("WriteDir: done")
474         return nil
475 }
476
477 // Load library data from rdr. Tile variants might be renumbered in
478 // the process; in that case, genomes variants will be renumbered to
479 // match.
480 func (tilelib *tileLibrary) LoadGob(ctx context.Context, rdr io.Reader, gz bool) error {
481         cgs := []CompactGenome{}
482         cseqs := []CompactSequence{}
483         variantmap := map[tileLibRef]tileVariantID{}
484         err := DecodeLibrary(rdr, gz, func(ent *LibraryEntry) error {
485                 if ctx.Err() != nil {
486                         return ctx.Err()
487                 }
488                 if err := tilelib.loadTagSet(ent.TagSet); err != nil {
489                         return err
490                 }
491                 if err := tilelib.loadTileVariants(ent.TileVariants, variantmap); err != nil {
492                         return err
493                 }
494                 cgs = append(cgs, ent.CompactGenomes...)
495                 cseqs = append(cseqs, ent.CompactSequences...)
496                 return nil
497         })
498         if err != nil {
499                 return err
500         }
501         if ctx.Err() != nil {
502                 return ctx.Err()
503         }
504         err = tilelib.loadCompactGenomes(cgs, variantmap)
505         if err != nil {
506                 return err
507         }
508         err = tilelib.loadCompactSequences(cseqs, variantmap)
509         if err != nil {
510                 return err
511         }
512         return nil
513 }
514
515 func (tilelib *tileLibrary) dump(out io.Writer) {
516         printTV := func(tag int, variant tileVariantID) {
517                 if variant < 1 {
518                         fmt.Fprintf(out, " -")
519                 } else if tag >= len(tilelib.variant) {
520                         fmt.Fprintf(out, " (!tag=%d)", tag)
521                 } else if int(variant) > len(tilelib.variant[tag]) {
522                         fmt.Fprintf(out, " (tag=%d,!variant=%d)", tag, variant)
523                 } else {
524                         fmt.Fprintf(out, " %x", tilelib.variant[tag][variant-1][:8])
525                 }
526         }
527         for refname, refseqs := range tilelib.refseqs {
528                 for seqname, seq := range refseqs {
529                         fmt.Fprintf(out, "ref %s %s", refname, seqname)
530                         for _, libref := range seq {
531                                 printTV(int(libref.Tag), libref.Variant)
532                         }
533                         fmt.Fprintf(out, "\n")
534                 }
535         }
536         for name, cg := range tilelib.compactGenomes {
537                 fmt.Fprintf(out, "cg %s", name)
538                 for tag, variant := range cg {
539                         printTV(tag/2, variant)
540                 }
541                 fmt.Fprintf(out, "\n")
542         }
543 }
544
545 type importStats struct {
546         InputFile              string
547         InputLabel             string
548         InputLength            int
549         InputCoverage          int
550         PathLength             int
551         DroppedOutOfOrderTiles int
552 }
553
554 func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader, matchChromosome *regexp.Regexp, isRef bool) (tileSeq, []importStats, error) {
555         ret := tileSeq{}
556         type jobT struct {
557                 label string
558                 fasta []byte
559         }
560         todo := make(chan jobT, 1)
561         scanner := bufio.NewScanner(rdr)
562         scanner.Buffer(make([]byte, 256), 1<<29) // 512 MiB, in case fasta does not have line breaks
563         go func() {
564                 defer close(todo)
565                 var fasta []byte
566                 var seqlabel string
567                 for scanner.Scan() {
568                         buf := scanner.Bytes()
569                         if len(buf) > 0 && buf[0] == '>' {
570                                 todo <- jobT{seqlabel, append([]byte(nil), fasta...)}
571                                 seqlabel, fasta = strings.SplitN(string(buf[1:]), " ", 2)[0], fasta[:0]
572                                 log.Debugf("%s %s reading fasta", filelabel, seqlabel)
573                         } else {
574                                 fasta = append(fasta, bytes.ToLower(buf)...)
575                         }
576                 }
577                 todo <- jobT{seqlabel, fasta}
578         }()
579         type foundtag struct {
580                 pos   int
581                 tagid tagID
582         }
583         found := make([]foundtag, 2000000)
584         path := make([]tileLibRef, 2000000)
585         totalFoundTags := 0
586         totalPathLen := 0
587         skippedSequences := 0
588         taglen := tilelib.taglib.TagLen()
589         var stats []importStats
590         for job := range todo {
591                 if len(job.fasta) == 0 {
592                         continue
593                 } else if !matchChromosome.MatchString(job.label) {
594                         skippedSequences++
595                         continue
596                 }
597                 log.Debugf("%s %s tiling", filelabel, job.label)
598
599                 found = found[:0]
600                 tilelib.taglib.FindAll(job.fasta, func(tagid tagID, pos, taglen int) {
601                         found = append(found, foundtag{pos: pos, tagid: tagid})
602                 })
603                 totalFoundTags += len(found)
604                 if len(found) == 0 {
605                         log.Warnf("%s %s no tags found", filelabel, job.label)
606                 }
607
608                 skipped := 0
609                 if tilelib.skipOOO {
610                         log.Infof("%s %s keeping longest increasing subsequence", filelabel, job.label)
611                         keep := longestIncreasingSubsequence(len(found), func(i int) int { return int(found[i].tagid) })
612                         for i, x := range keep {
613                                 found[i] = found[x]
614                         }
615                         skipped = len(found) - len(keep)
616                         found = found[:len(keep)]
617                 }
618
619                 log.Infof("%s %s getting %d librefs", filelabel, job.label, len(found))
620                 throttle := &throttle{Max: runtime.NumCPU()}
621                 path = path[:len(found)]
622                 var lowquality int64
623                 for i, f := range found {
624                         i, f := i, f
625                         throttle.Acquire()
626                         go func() {
627                                 defer throttle.Release()
628                                 var startpos, endpos int
629                                 if i == 0 {
630                                         startpos = 0
631                                 } else {
632                                         startpos = f.pos
633                                 }
634                                 if i == len(found)-1 {
635                                         endpos = len(job.fasta)
636                                 } else {
637                                         endpos = found[i+1].pos + taglen
638                                 }
639                                 path[i] = tilelib.getRef(f.tagid, job.fasta[startpos:endpos], isRef)
640                                 if countBases(job.fasta[startpos:endpos]) != endpos-startpos {
641                                         atomic.AddInt64(&lowquality, 1)
642                                 }
643                         }()
644                 }
645                 throttle.Wait()
646
647                 log.Infof("%s %s copying path", filelabel, job.label)
648
649                 pathcopy := make([]tileLibRef, len(path))
650                 copy(pathcopy, path)
651                 ret[job.label] = pathcopy
652
653                 basesIn := countBases(job.fasta)
654                 log.Infof("%s %s fasta in %d coverage in %d path len %d low-quality %d skipped-out-of-order %d", filelabel, job.label, len(job.fasta), basesIn, len(path), lowquality, skipped)
655                 stats = append(stats, importStats{
656                         InputFile:              filelabel,
657                         InputLabel:             job.label,
658                         InputLength:            len(job.fasta),
659                         InputCoverage:          basesIn,
660                         PathLength:             len(path),
661                         DroppedOutOfOrderTiles: skipped,
662                 })
663
664                 totalPathLen += len(path)
665         }
666         log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences that did not match chromosome regexp, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
667         return ret, stats, scanner.Err()
668 }
669
670 func (tilelib *tileLibrary) Len() int64 {
671         return atomic.LoadInt64(&tilelib.variants)
672 }
673
674 // Return a tileLibRef for a tile with the given tag and sequence,
675 // adding the sequence to the library if needed.
676 func (tilelib *tileLibrary) getRef(tag tagID, seq []byte, usedByRef bool) tileLibRef {
677         dropSeq := false
678         if !tilelib.retainNoCalls {
679                 for _, b := range seq {
680                         if b != 'a' && b != 'c' && b != 'g' && b != 't' {
681                                 dropSeq = true
682                                 break
683                         }
684                 }
685         }
686         seqhash := blake2b.Sum256(seq)
687         var vlock sync.Locker
688
689         tilelib.mtx.RLock()
690         if len(tilelib.vlock) > int(tag) {
691                 vlock = tilelib.vlock[tag]
692         }
693         tilelib.mtx.RUnlock()
694
695         if vlock != nil {
696                 vlock.Lock()
697                 for i, varhash := range tilelib.variant[tag] {
698                         if varhash == seqhash {
699                                 vlock.Unlock()
700                                 return tileLibRef{Tag: tag, Variant: tileVariantID(i + 1)}
701                         }
702                 }
703                 vlock.Unlock()
704         } else {
705                 tilelib.mtx.Lock()
706                 if tilelib.variant == nil && tilelib.taglib != nil {
707                         tilelib.variant = make([][][blake2b.Size256]byte, tilelib.taglib.Len())
708                         tilelib.vlock = make([]sync.Locker, tilelib.taglib.Len())
709                         for i := range tilelib.vlock {
710                                 tilelib.vlock[i] = new(sync.Mutex)
711                         }
712                 }
713                 if int(tag) >= len(tilelib.variant) {
714                         oldlen := len(tilelib.vlock)
715                         for i := 0; i < oldlen; i++ {
716                                 tilelib.vlock[i].Lock()
717                         }
718                         // If we haven't seen the tag library yet (as
719                         // in a merge), tilelib.taglib.Len() is
720                         // zero. We can still behave correctly, we
721                         // just need to expand the tilelib.variant and
722                         // tilelib.vlock slices as needed.
723                         if int(tag) >= cap(tilelib.variant) {
724                                 // Allocate 2x capacity.
725                                 newslice := make([][][blake2b.Size256]byte, int(tag)+1, (int(tag)+1)*2)
726                                 copy(newslice, tilelib.variant)
727                                 tilelib.variant = newslice[:int(tag)+1]
728                                 newvlock := make([]sync.Locker, int(tag)+1, (int(tag)+1)*2)
729                                 copy(newvlock, tilelib.vlock)
730                                 tilelib.vlock = newvlock[:int(tag)+1]
731                         } else {
732                                 // Use previously allocated capacity,
733                                 // avoiding copy.
734                                 tilelib.variant = tilelib.variant[:int(tag)+1]
735                                 tilelib.vlock = tilelib.vlock[:int(tag)+1]
736                         }
737                         for i := oldlen; i < len(tilelib.vlock); i++ {
738                                 tilelib.vlock[i] = new(sync.Mutex)
739                         }
740                         for i := 0; i < oldlen; i++ {
741                                 tilelib.vlock[i].Unlock()
742                         }
743                 }
744                 vlock = tilelib.vlock[tag]
745                 tilelib.mtx.Unlock()
746         }
747
748         vlock.Lock()
749         for i, varhash := range tilelib.variant[tag] {
750                 if varhash == seqhash {
751                         vlock.Unlock()
752                         return tileLibRef{Tag: tag, Variant: tileVariantID(i + 1)}
753                 }
754         }
755         atomic.AddInt64(&tilelib.variants, 1)
756         tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
757         variant := tileVariantID(len(tilelib.variant[tag]))
758         vlock.Unlock()
759
760         if tilelib.retainTileSequences && !dropSeq {
761                 seqCopy := append([]byte(nil), seq...)
762                 if tilelib.seq2 == nil {
763                         tilelib.mtx.Lock()
764                         if tilelib.seq2 == nil {
765                                 tilelib.seq2lock = map[[2]byte]sync.Locker{}
766                                 m := map[[2]byte]map[[blake2b.Size256]byte][]byte{}
767                                 var k [2]byte
768                                 for i := 0; i < 256; i++ {
769                                         k[0] = byte(i)
770                                         for j := 0; j < 256; j++ {
771                                                 k[1] = byte(j)
772                                                 m[k] = map[[blake2b.Size256]byte][]byte{}
773                                                 tilelib.seq2lock[k] = &sync.Mutex{}
774                                         }
775                                 }
776                                 tilelib.seq2 = m
777                         }
778                         tilelib.mtx.Unlock()
779                 }
780                 var k [2]byte
781                 copy(k[:], seqhash[:])
782                 locker := tilelib.seq2lock[k]
783                 locker.Lock()
784                 tilelib.seq2[k][seqhash] = seqCopy
785                 locker.Unlock()
786         }
787
788         saveSeq := seq
789         if dropSeq {
790                 // Save the hash, but not the sequence
791                 saveSeq = nil
792         }
793         if tilelib.encoder != nil {
794                 tilelib.encoder.Encode(LibraryEntry{
795                         TileVariants: []TileVariant{{
796                                 Tag:      tag,
797                                 Ref:      usedByRef,
798                                 Variant:  variant,
799                                 Blake2b:  seqhash,
800                                 Sequence: saveSeq,
801                         }},
802                 })
803         }
804         if tilelib.onAddTileVariant != nil {
805                 tilelib.onAddTileVariant(tileLibRef{tag, variant}, seqhash, saveSeq)
806         }
807         return tileLibRef{Tag: tag, Variant: variant}
808 }
809
810 func (tilelib *tileLibrary) hashSequence(hash [blake2b.Size256]byte) []byte {
811         var partition [2]byte
812         copy(partition[:], hash[:])
813         return tilelib.seq2[partition][hash]
814 }
815
816 func (tilelib *tileLibrary) TileVariantSequence(libref tileLibRef) []byte {
817         if libref.Variant == 0 || len(tilelib.variant) <= int(libref.Tag) || len(tilelib.variant[libref.Tag]) < int(libref.Variant) {
818                 return nil
819         }
820         return tilelib.hashSequence(tilelib.variant[libref.Tag][libref.Variant-1])
821 }
822
823 // Tidy deletes unreferenced tile variants and renumbers variants so
824 // more common variants have smaller IDs.
825 func (tilelib *tileLibrary) Tidy() {
826         log.Print("Tidy: compute inref")
827         inref := map[tileLibRef]bool{}
828         for _, refseq := range tilelib.refseqs {
829                 for _, librefs := range refseq {
830                         for _, libref := range librefs {
831                                 inref[libref] = true
832                         }
833                 }
834         }
835         log.Print("Tidy: compute remap")
836         remap := make([][]tileVariantID, len(tilelib.variant))
837         throttle := throttle{Max: runtime.NumCPU() + 1}
838         for tag, oldvariants := range tilelib.variant {
839                 tag, oldvariants := tagID(tag), oldvariants
840                 if tag%1000000 == 0 {
841                         log.Printf("Tidy: tag %d", tag)
842                 }
843                 throttle.Acquire()
844                 go func() {
845                         defer throttle.Release()
846                         uses := make([]int, len(oldvariants))
847                         for _, cg := range tilelib.compactGenomes {
848                                 for phase := 0; phase < 2; phase++ {
849                                         cgi := int(tag)*2 + phase
850                                         if cgi < len(cg) && cg[cgi] > 0 {
851                                                 uses[cg[cgi]-1]++
852                                         }
853                                 }
854                         }
855
856                         // Compute desired order of variants:
857                         // neworder[x] == index in oldvariants that
858                         // should move to position x.
859                         neworder := make([]int, len(oldvariants))
860                         for i := range neworder {
861                                 neworder[i] = i
862                         }
863                         sort.Slice(neworder, func(i, j int) bool {
864                                 if cmp := uses[neworder[i]] - uses[neworder[j]]; cmp != 0 {
865                                         return cmp > 0
866                                 } else {
867                                         return bytes.Compare(oldvariants[neworder[i]][:], oldvariants[neworder[j]][:]) < 0
868                                 }
869                         })
870
871                         // Replace tilelib.variant[tag] with a new
872                         // re-ordered slice of hashes, and make a
873                         // mapping from old to new variant IDs.
874                         remaptag := make([]tileVariantID, len(oldvariants)+1)
875                         newvariants := make([][blake2b.Size256]byte, 0, len(neworder))
876                         for _, oldi := range neworder {
877                                 if uses[oldi] > 0 || inref[tileLibRef{Tag: tag, Variant: tileVariantID(oldi + 1)}] {
878                                         newvariants = append(newvariants, oldvariants[oldi])
879                                         remaptag[oldi+1] = tileVariantID(len(newvariants))
880                                 }
881                         }
882                         tilelib.variant[tag] = newvariants
883                         remap[tag] = remaptag
884                 }()
885         }
886         throttle.Wait()
887
888         // Apply remap to genomes and reference sequences, so they
889         // refer to the same tile variants using the changed IDs.
890         log.Print("Tidy: apply remap")
891         var wg sync.WaitGroup
892         for _, cg := range tilelib.compactGenomes {
893                 cg := cg
894                 wg.Add(1)
895                 go func() {
896                         defer wg.Done()
897                         for idx, variant := range cg {
898                                 cg[idx] = remap[tagID(idx/2)][variant]
899                         }
900                 }()
901         }
902         for _, refcs := range tilelib.refseqs {
903                 for _, refseq := range refcs {
904                         refseq := refseq
905                         wg.Add(1)
906                         go func() {
907                                 defer wg.Done()
908                                 for i, tv := range refseq {
909                                         refseq[i].Variant = remap[tv.Tag][tv.Variant]
910                                 }
911                         }()
912                 }
913         }
914         wg.Wait()
915         log.Print("Tidy: done")
916 }
917
918 func countBases(seq []byte) int {
919         n := 0
920         for _, c := range seq {
921                 if isbase[c] {
922                         n++
923                 }
924         }
925         return n
926 }