Add -match-genome=regexp filter.
[lightning.git] / tilelib.go
index 8960d0e86d8659bf94335ab0bb86b8b5c4aced11..a3b63ec78f9bbf2481301eb0f689c7eac2028edd 100644 (file)
@@ -1,3 +1,7 @@
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
 package lightning
 
 import (
@@ -62,9 +66,9 @@ type tileLibrary struct {
        variant        [][][blake2b.Size256]byte
        refseqs        map[string]map[string][]tileLibRef
        compactGenomes map[string][]tileVariantID
-       // count [][]int
-       seq      map[[blake2b.Size256]byte][]byte
-       variants int64
+       seq2           map[[2]byte]map[[blake2b.Size256]byte][]byte
+       seq2lock       map[[2]byte]sync.Locker
+       variants       int64
        // if non-nil, write out any tile variants added while tiling
        encoder *gob.Encoder
 
@@ -143,7 +147,7 @@ func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap m
                                        }
                                        return
                                }
-                               log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
+                               // log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
                                cg.Variants[i] = newvariant
                        }
                        if onLoadGenome != nil {
@@ -174,8 +178,9 @@ func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap m
 }
 
 func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, variantmap map[tileLibRef]tileVariantID) error {
-       log.Debugf("loadCompactSequences: %d", len(cseqs))
+       log.Infof("loadCompactSequences: %d todo", len(cseqs))
        for _, cseq := range cseqs {
+               log.Infof("loadCompactSequences: checking %s", cseq.Name)
                for _, tseq := range cseq.TileSequences {
                        for i, libref := range tseq {
                                if libref.Variant == 0 {
@@ -198,6 +203,7 @@ func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, varian
                                return err
                        }
                }
+               log.Infof("loadCompactSequences: checking %s done", cseq.Name)
        }
        tilelib.mtx.Lock()
        defer tilelib.mtx.Unlock()
@@ -207,6 +213,7 @@ func (tilelib *tileLibrary) loadCompactSequences(cseqs []CompactSequence, varian
        for _, cseq := range cseqs {
                tilelib.refseqs[cseq.Name] = cseq.TileSequences
        }
+       log.Info("loadCompactSequences: done")
        return nil
 }
 
@@ -246,17 +253,14 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()
        var mtx sync.Mutex
-       cgs := []CompactGenome{}
-       cseqs := []CompactSequence{}
-       variantmap := map[tileLibRef]tileVariantID{}
+       allcgs := make([][]CompactGenome, len(files))
+       allcseqs := make([][]CompactSequence, len(files))
+       allvariantmap := map[tileLibRef]tileVariantID{}
        errs := make(chan error, len(files))
        log.Infof("LoadDir: read %d files", len(files))
-       throttle := throttle{Max: runtime.GOMAXPROCS(0)/2 + 1}
-       for _, path := range files {
-               path := path
+       for fileno, path := range files {
+               fileno, path := fileno, path
                go func() {
-                       throttle.Acquire()
-                       defer throttle.Release()
                        f, err := open(path)
                        if err != nil {
                                errs <- err
@@ -264,7 +268,11 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
                        }
                        defer f.Close()
                        defer log.Infof("LoadDir: finished reading %s", path)
-                       errs <- DecodeLibrary(f, strings.HasSuffix(path, ".gz"), func(ent *LibraryEntry) error {
+
+                       var variantmap = map[tileLibRef]tileVariantID{}
+                       var cgs []CompactGenome
+                       var cseqs []CompactSequence
+                       err = DecodeLibrary(f, strings.HasSuffix(path, ".gz"), func(ent *LibraryEntry) error {
                                if ctx.Err() != nil {
                                        return ctx.Err()
                                }
@@ -281,19 +289,21 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
                                        }
                                        mtx.Unlock()
                                }
-                               variantmapadd := map[tileLibRef]tileVariantID{}
                                for _, tv := range ent.TileVariants {
-                                       variantmapadd[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).Variant
+                                       variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).Variant
                                }
-                               mtx.Lock()
                                cgs = append(cgs, ent.CompactGenomes...)
                                cseqs = append(cseqs, ent.CompactSequences...)
-                               for k, v := range variantmapadd {
-                                       variantmap[k] = v
-                               }
-                               mtx.Unlock()
                                return nil
                        })
+                       allcgs[fileno] = cgs
+                       allcseqs[fileno] = cseqs
+                       mtx.Lock()
+                       defer mtx.Unlock()
+                       for k, v := range variantmap {
+                               allvariantmap[k] = v
+                       }
+                       errs <- err
                }()
        }
        for range files {
@@ -302,22 +312,34 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
                        return err
                }
        }
+
        log.Info("LoadDir: loadCompactGenomes")
-       err = tilelib.loadCompactGenomes(cgs, variantmap, onLoadGenome)
+       var flatcgs []CompactGenome
+       for _, cgs := range allcgs {
+               flatcgs = append(flatcgs, cgs...)
+       }
+       err = tilelib.loadCompactGenomes(flatcgs, allvariantmap, onLoadGenome)
        if err != nil {
                return err
        }
+
        log.Info("LoadDir: loadCompactSequences")
-       err = tilelib.loadCompactSequences(cseqs, variantmap)
+       var flatcseqs []CompactSequence
+       for _, cseqs := range allcseqs {
+               flatcseqs = append(flatcseqs, cseqs...)
+       }
+       err = tilelib.loadCompactSequences(flatcseqs, allvariantmap)
        if err != nil {
                return err
        }
+
        log.Info("LoadDir done")
        return nil
 }
 
 func (tilelib *tileLibrary) WriteDir(dir string) error {
-       nfiles := 128
+       ntilefiles := 128
+       nfiles := ntilefiles + len(tilelib.refseqs)
        files := make([]*os.File, nfiles)
        for i := range files {
                f, err := os.OpenFile(fmt.Sprintf("%s/library.%04d.gob.gz", dir, i), os.O_CREATE|os.O_WRONLY, 0666)
@@ -347,6 +369,12 @@ func (tilelib *tileLibrary) WriteDir(dir string) error {
        }
        sort.Strings(cgnames)
 
+       refnames := make([]string, 0, len(tilelib.refseqs))
+       for name := range tilelib.refseqs {
+               refnames = append(refnames, name)
+       }
+       sort.Strings(refnames)
+
        log.Infof("WriteDir: writing %d files", nfiles)
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
@@ -359,21 +387,17 @@ func (tilelib *tileLibrary) WriteDir(dir string) error {
                                errs <- err
                                return
                        }
-                       if start == 0 {
-                               // For now, just write all the refs to
-                               // the first file
-                               for name, tseqs := range tilelib.refseqs {
-                                       err := encoders[start].Encode(LibraryEntry{CompactSequences: []CompactSequence{{
-                                               Name:          name,
-                                               TileSequences: tseqs,
-                                       }}})
-                                       if err != nil {
-                                               errs <- err
-                                               return
-                                       }
-                               }
+                       if refidx := start - ntilefiles; refidx >= 0 {
+                               // write each ref to its own file
+                               // (they seem to load very slowly)
+                               name := refnames[refidx]
+                               errs <- encoders[start].Encode(LibraryEntry{CompactSequences: []CompactSequence{{
+                                       Name:          name,
+                                       TileSequences: tilelib.refseqs[name],
+                               }}})
+                               return
                        }
-                       for i := start; i < len(cgnames); i += nfiles {
+                       for i := start; i < len(cgnames); i += ntilefiles {
                                err := encoders[start].Encode(LibraryEntry{CompactGenomes: []CompactGenome{{
                                        Name:     cgnames[i],
                                        Variants: tilelib.compactGenomes[cgnames[i]],
@@ -384,14 +408,14 @@ func (tilelib *tileLibrary) WriteDir(dir string) error {
                                }
                        }
                        tvs := []TileVariant{}
-                       for tag := start; tag < len(tilelib.variant) && ctx.Err() == nil; tag += nfiles {
+                       for tag := start; tag < len(tilelib.variant) && ctx.Err() == nil; tag += ntilefiles {
                                tvs = tvs[:0]
                                for idx, hash := range tilelib.variant[tag] {
                                        tvs = append(tvs, TileVariant{
                                                Tag:      tagID(tag),
                                                Variant:  tileVariantID(idx + 1),
                                                Blake2b:  hash,
-                                               Sequence: tilelib.seq[hash],
+                                               Sequence: tilelib.hashSequence(hash),
                                        })
                                }
                                err := encoders[start].Encode(LibraryEntry{TileVariants: tvs})
@@ -713,12 +737,31 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
        vlock.Unlock()
 
        if tilelib.retainTileSequences && !dropSeq {
-               tilelib.mtx.Lock()
-               if tilelib.seq == nil {
-                       tilelib.seq = map[[blake2b.Size256]byte][]byte{}
+               seqCopy := append([]byte(nil), seq...)
+               if tilelib.seq2 == nil {
+                       tilelib.mtx.Lock()
+                       if tilelib.seq2 == nil {
+                               tilelib.seq2lock = map[[2]byte]sync.Locker{}
+                               m := map[[2]byte]map[[blake2b.Size256]byte][]byte{}
+                               var k [2]byte
+                               for i := 0; i < 256; i++ {
+                                       k[0] = byte(i)
+                                       for j := 0; j < 256; j++ {
+                                               k[1] = byte(j)
+                                               m[k] = map[[blake2b.Size256]byte][]byte{}
+                                               tilelib.seq2lock[k] = &sync.Mutex{}
+                                       }
+                               }
+                               tilelib.seq2 = m
+                       }
+                       tilelib.mtx.Unlock()
                }
-               tilelib.seq[seqhash] = append([]byte(nil), seq...)
-               tilelib.mtx.Unlock()
+               var k [2]byte
+               copy(k[:], seqhash[:])
+               locker := tilelib.seq2lock[k]
+               locker.Lock()
+               tilelib.seq2[k][seqhash] = seqCopy
+               locker.Unlock()
        }
 
        if tilelib.encoder != nil {
@@ -739,11 +782,17 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
        return tileLibRef{Tag: tag, Variant: variant}
 }
 
+func (tilelib *tileLibrary) hashSequence(hash [blake2b.Size256]byte) []byte { // hashSequence looks up the sequence stored in seq2 for the given hash; a missing entry yields nil.
+       var partition [2]byte // seq2 is keyed first by a 2-byte partition
+       copy(partition[:], hash[:]) // copy stops at len(partition), so this takes the hash's first two bytes
+       return tilelib.seq2[partition][hash] // no lock is taken here; NOTE(review): getRef writes these inner maps under seq2lock -- confirm callers never run concurrently with getRef
+}
+
 func (tilelib *tileLibrary) TileVariantSequence(libref tileLibRef) []byte {
        if libref.Variant == 0 || len(tilelib.variant) <= int(libref.Tag) || len(tilelib.variant[libref.Tag]) < int(libref.Variant) {
                return nil
        }
-       return tilelib.seq[tilelib.variant[libref.Tag][libref.Variant-1]]
+       return tilelib.hashSequence(tilelib.variant[libref.Tag][libref.Variant-1]) // variant numbers are 1-based (0 is rejected above), hence the -1 when indexing the per-tag hash list
 }
 
 // Tidy deletes unreferenced tile variants and renumbers variants so