Option to export one vcf/csv file per chromosome.
[lightning.git] / tilelib.go
index e0d0e1bc4f4a603b7ec8e3801a26d71031e32d9d..8073eba18360733918b9fa705618e55e088581c0 100644 (file)
@@ -62,9 +62,9 @@ type tileLibrary struct {
        variant        [][][blake2b.Size256]byte
        refseqs        map[string]map[string][]tileLibRef
        compactGenomes map[string][]tileVariantID
-       // count [][]int
-       seq      map[[blake2b.Size256]byte][]byte
-       variants int64
+       seq2           map[[2]byte]map[[blake2b.Size256]byte][]byte
+       seq2lock       map[[2]byte]sync.Locker
+       variants       int64
        // if non-nil, write out any tile variants added while tiling
        encoder *gob.Encoder
 
@@ -143,7 +143,7 @@ func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap m
                                        }
                                        return
                                }
-                               log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
+                               // log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
                                cg.Variants[i] = newvariant
                        }
                        if onLoadGenome != nil {
@@ -246,13 +246,13 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
        ctx, cancel := context.WithCancel(ctx)
        defer cancel()
        var mtx sync.Mutex
-       cgs := []CompactGenome{}
-       cseqs := []CompactSequence{}
-       variantmap := map[tileLibRef]tileVariantID{}
+       allcgs := make([][]CompactGenome, len(files))
+       allcseqs := make([][]CompactSequence, len(files))
+       allvariantmap := map[tileLibRef]tileVariantID{}
        errs := make(chan error, len(files))
        log.Infof("LoadDir: read %d files", len(files))
-       for _, path := range files {
-               path := path
+       for fileno, path := range files {
+               fileno, path := fileno, path
                go func() {
                        f, err := open(path)
                        if err != nil {
@@ -261,7 +261,11 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
                        }
                        defer f.Close()
                        defer log.Infof("LoadDir: finished reading %s", path)
-                       errs <- DecodeLibrary(f, strings.HasSuffix(path, ".gz"), func(ent *LibraryEntry) error {
+
+                       var variantmap = map[tileLibRef]tileVariantID{}
+                       var cgs []CompactGenome
+                       var cseqs []CompactSequence
+                       err = DecodeLibrary(f, strings.HasSuffix(path, ".gz"), func(ent *LibraryEntry) error {
                                if ctx.Err() != nil {
                                        return ctx.Err()
                                }
@@ -278,19 +282,21 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
                                        }
                                        mtx.Unlock()
                                }
-                               variantmapadd := map[tileLibRef]tileVariantID{}
                                for _, tv := range ent.TileVariants {
-                                       variantmapadd[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).Variant
+                                       variantmap[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = tilelib.getRef(tv.Tag, tv.Sequence).Variant
                                }
-                               mtx.Lock()
                                cgs = append(cgs, ent.CompactGenomes...)
                                cseqs = append(cseqs, ent.CompactSequences...)
-                               for k, v := range variantmapadd {
-                                       variantmap[k] = v
-                               }
-                               mtx.Unlock()
                                return nil
                        })
+                       allcgs[fileno] = cgs
+                       allcseqs[fileno] = cseqs
+                       mtx.Lock()
+                       defer mtx.Unlock()
+                       for k, v := range variantmap {
+                               allvariantmap[k] = v
+                       }
+                       errs <- err
                }()
        }
        for range files {
@@ -299,16 +305,27 @@ func (tilelib *tileLibrary) LoadDir(ctx context.Context, path string, onLoadGeno
                        return err
                }
        }
+
        log.Info("LoadDir: loadCompactGenomes")
-       err = tilelib.loadCompactGenomes(cgs, variantmap, onLoadGenome)
+       var flatcgs []CompactGenome
+       for _, cgs := range allcgs {
+               flatcgs = append(flatcgs, cgs...)
+       }
+       err = tilelib.loadCompactGenomes(flatcgs, allvariantmap, onLoadGenome)
        if err != nil {
                return err
        }
+
        log.Info("LoadDir: loadCompactSequences")
-       err = tilelib.loadCompactSequences(cseqs, variantmap)
+       var flatcseqs []CompactSequence
+       for _, cseqs := range allcseqs {
+               flatcseqs = append(flatcseqs, cseqs...)
+       }
+       err = tilelib.loadCompactSequences(flatcseqs, allvariantmap)
        if err != nil {
                return err
        }
+
        log.Info("LoadDir done")
        return nil
 }
@@ -338,6 +355,12 @@ func (tilelib *tileLibrary) WriteDir(dir string) error {
                encoders[i] = gob.NewEncoder(zws[i])
        }
 
+       cgnames := make([]string, 0, len(tilelib.compactGenomes))
+       for name := range tilelib.compactGenomes {
+               cgnames = append(cgnames, name)
+       }
+       sort.Strings(cgnames)
+
        log.Infof("WriteDir: writing %d files", nfiles)
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
@@ -351,18 +374,8 @@ func (tilelib *tileLibrary) WriteDir(dir string) error {
                                return
                        }
                        if start == 0 {
-                               // For now, just write all the genomes and refs
-                               // to the first file
-                               for name, cg := range tilelib.compactGenomes {
-                                       err := encoders[start].Encode(LibraryEntry{CompactGenomes: []CompactGenome{{
-                                               Name:     name,
-                                               Variants: cg,
-                                       }}})
-                                       if err != nil {
-                                               errs <- err
-                                               return
-                                       }
-                               }
+                               // For now, just write all the refs to
+                               // the first file
                                for name, tseqs := range tilelib.refseqs {
                                        err := encoders[start].Encode(LibraryEntry{CompactSequences: []CompactSequence{{
                                                Name:          name,
@@ -374,6 +387,16 @@ func (tilelib *tileLibrary) WriteDir(dir string) error {
                                        }
                                }
                        }
+                       for i := start; i < len(cgnames); i += nfiles {
+                               err := encoders[start].Encode(LibraryEntry{CompactGenomes: []CompactGenome{{
+                                       Name:     cgnames[i],
+                                       Variants: tilelib.compactGenomes[cgnames[i]],
+                               }}})
+                               if err != nil {
+                                       errs <- err
+                                       return
+                               }
+                       }
                        tvs := []TileVariant{}
                        for tag := start; tag < len(tilelib.variant) && ctx.Err() == nil; tag += nfiles {
                                tvs = tvs[:0]
@@ -382,7 +405,7 @@ func (tilelib *tileLibrary) WriteDir(dir string) error {
                                                Tag:      tagID(tag),
                                                Variant:  tileVariantID(idx + 1),
                                                Blake2b:  hash,
-                                               Sequence: tilelib.seq[hash],
+                                               Sequence: tilelib.hashSequence(hash),
                                        })
                                }
                                err := encoders[start].Encode(LibraryEntry{TileVariants: tvs})
@@ -704,12 +727,31 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
        vlock.Unlock()
 
        if tilelib.retainTileSequences && !dropSeq {
-               tilelib.mtx.Lock()
-               if tilelib.seq == nil {
-                       tilelib.seq = map[[blake2b.Size256]byte][]byte{}
+               seqCopy := append([]byte(nil), seq...)
+               if tilelib.seq2 == nil {
+                       tilelib.mtx.Lock()
+                       if tilelib.seq2 == nil {
+                               tilelib.seq2lock = map[[2]byte]sync.Locker{}
+                               m := map[[2]byte]map[[blake2b.Size256]byte][]byte{}
+                               var k [2]byte
+                               for i := 0; i < 256; i++ {
+                                       k[0] = byte(i)
+                                       for j := 0; j < 256; j++ {
+                                               k[1] = byte(j)
+                                               m[k] = map[[blake2b.Size256]byte][]byte{}
+                                               tilelib.seq2lock[k] = &sync.Mutex{}
+                                       }
+                               }
+                               tilelib.seq2 = m
+                       }
+                       tilelib.mtx.Unlock()
                }
-               tilelib.seq[seqhash] = append([]byte(nil), seq...)
-               tilelib.mtx.Unlock()
+               var k [2]byte
+               copy(k[:], seqhash[:])
+               locker := tilelib.seq2lock[k]
+               locker.Lock()
+               tilelib.seq2[k][seqhash] = seqCopy
+               locker.Unlock()
        }
 
        if tilelib.encoder != nil {
@@ -730,11 +772,17 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
        return tileLibRef{Tag: tag, Variant: variant}
 }
 
+func (tilelib *tileLibrary) hashSequence(hash [blake2b.Size256]byte) []byte { // hashSequence looks up the sequence stored in tilelib.seq2 under the given hash.
+       var partition [2]byte // key into the outer seq2 map
+       copy(partition[:], hash[:]) // copy stops at len(partition), so only hash[0] and hash[1] form the key
+       return tilelib.seq2[partition][hash] // a missing partition or hash yields nil; no seq2lock entry is acquired here -- NOTE(review): confirm callers never run concurrently with writers
+}
+
 func (tilelib *tileLibrary) TileVariantSequence(libref tileLibRef) []byte { // TileVariantSequence returns the sequence for libref, or nil when libref has no entry in tilelib.variant.
        if libref.Variant == 0 || len(tilelib.variant) <= int(libref.Tag) || len(tilelib.variant[libref.Tag]) < int(libref.Variant) { // Variant 0, a tag beyond tilelib.variant, or a variant beyond that tag's list
                return nil
        }
-       return tilelib.seq[tilelib.variant[libref.Tag][libref.Variant-1]]
+       return tilelib.hashSequence(tilelib.variant[libref.Tag][libref.Variant-1]) // variant IDs are 1-based, so Variant-1 indexes the tag's hash list
 }
 
 // Tidy deletes unreferenced tile variants and renumbers variants so