Use tilelib to load for export and pca.
[lightning.git] / tilelib.go
index 7f19b0d4081c2f70c5fa23eed610e44bf0cfc441..31fbb314a5eabf79187c90cdf800c28c07a41d11 100644 (file)
@@ -53,6 +53,7 @@ type tileLibrary struct {
        taglib         *tagLibrary
        variant        [][][blake2b.Size256]byte
        refseqs        map[string]map[string][]tileLibRef
+       compactGenomes map[string][]tileVariantID
        // count [][]int
        // seq map[[blake2b.Size]byte][]byte
        variants int
@@ -112,11 +113,11 @@ func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap m
        var wg sync.WaitGroup
        errs := make(chan error, 1)
        for _, cg := range cgs {
-               name, variants := cg.Name, cg.Variants
                wg.Add(1)
+               cg := cg
                go func() {
                        defer wg.Done()
-                       for i, variant := range variants {
+                       for i, variant := range cg.Variants {
                                if len(errs) > 0 {
                                        return
                                }
@@ -126,15 +127,15 @@ func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap m
                                tag := tagID(i / 2)
                                newvariant, ok := variantmap[tileLibRef{Tag: tag, Variant: variant}]
                                if !ok {
-                                       err := fmt.Errorf("oops: genome %q has variant %d for tag %d, but that variant was not in its library", name, variant, tag)
+                                       err := fmt.Errorf("oops: genome %q has variant %d for tag %d, but that variant was not in its library", cg.Name, variant, tag)
                                        select {
                                        case errs <- err:
                                        default:
                                        }
                                        return
                                }
-                               log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", name, tag, variant, newvariant)
-                               variants[i] = newvariant
+                               log.Tracef("loadCompactGenomes: cg %s tag %d variant %d => %d", cg.Name, tag, variant, newvariant)
+                               cg.Variants[i] = newvariant
                        }
                        if onLoadGenome != nil {
                                onLoadGenome(cg)
@@ -151,6 +152,11 @@ func (tilelib *tileLibrary) loadCompactGenomes(cgs []CompactGenome, variantmap m
                                        return
                                }
                        }
+                       if tilelib.compactGenomes != nil {
+                               tilelib.mtx.Lock()
+                               defer tilelib.mtx.Unlock()
+                               tilelib.compactGenomes[cg.Name] = cg.Variants
+                       }
                }()
        }
        wg.Wait()
@@ -235,7 +241,17 @@ func (tilelib *tileLibrary) LoadGob(ctx context.Context, rdr io.Reader, onLoadGe
        return nil
 }
 
-func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, error) {
+type importStats struct {
+       InputFile              string
+       InputLabel             string
+       InputLength            int
+       InputCoverage          int
+       TileCoverage           int
+       PathLength             int
+       DroppedOutOfOrderTiles int
+}
+
+func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq, []importStats, error) {
        ret := tileSeq{}
        type jobT struct {
                label string
@@ -269,6 +285,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
        totalFoundTags := 0
        totalPathLen := 0
        skippedSequences := 0
+       stats := make([]importStats, 0, len(todo))
        for job := range todo {
                if len(job.fasta) == 0 {
                        continue
@@ -284,6 +301,7 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
                })
                totalFoundTags += len(found)
 
+               basesOut := 0
                skipped := 0
                path = path[:0]
                last := foundtag{tagid: -1}
@@ -297,27 +315,62 @@ func (tilelib *tileLibrary) TileFasta(filelabel string, rdr io.Reader) (tileSeq,
                }
                for i, f := range found {
                        log.Tracef("%s %s found[%d] == %#v", filelabel, job.label, i, f)
-                       if last.taglen > 0 {
-                               path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen]))
+                       if last.tagid < 0 {
+                               // first tag in sequence
+                               last = foundtag{tagid: f.tagid}
+                               continue
+                       }
+                       libref := tilelib.getRef(last.tagid, job.fasta[last.pos:f.pos+f.taglen])
+                       path = append(path, libref)
+                       if libref.Variant > 0 {
+                               // Count output coverage from
+                               // the end of the previous tag
+                               // (if any) to the end of the
+                               // current tag, IOW don't
+                               // double-count coverage for
+                               // the previous tag.
+                               basesOut += countBases(job.fasta[last.pos+last.taglen : f.pos+f.taglen])
                        } else {
-                               f.pos = 0
+                               // If we dropped this tile
+                               // (because !includeNoCalls),
+                               // set taglen=0 so the
+                               // overlapping tag is counted
+                               // toward coverage on the
+                               // following tile.
+                               f.taglen = 0
                        }
                        last = f
                }
-               if last.taglen > 0 {
-                       path = append(path, tilelib.getRef(last.tagid, job.fasta[last.pos:]))
-               } else {
+               if last.tagid < 0 {
                        log.Warnf("%s %s no tags found", filelabel, job.label)
+               } else {
+                       libref := tilelib.getRef(last.tagid, job.fasta[last.pos:])
+                       path = append(path, libref)
+                       if libref.Variant > 0 {
+                               basesOut += countBases(job.fasta[last.pos+last.taglen:])
+                       }
                }
 
                pathcopy := make([]tileLibRef, len(path))
                copy(pathcopy, path)
                ret[job.label] = pathcopy
-               log.Debugf("%s %s tiled with path len %d, skipped %d", filelabel, job.label, len(path), skipped)
+
+               basesIn := countBases(job.fasta)
+               log.Infof("%s %s fasta in %d coverage in %d coverage out %d path len %d skipped %d", filelabel, job.label, len(job.fasta), basesIn, basesOut, len(path), skipped)
+               stats = append(stats, importStats{
+                       InputFile:              filelabel,
+                       InputLabel:             job.label,
+                       InputLength:            len(job.fasta),
+                       InputCoverage:          basesIn,
+                       TileCoverage:           basesOut,
+                       PathLength:             len(path),
+                       DroppedOutOfOrderTiles: skipped,
+               })
+
                totalPathLen += len(path)
        }
        log.Printf("%s tiled with total path len %d in %d sequences (skipped %d sequences with '_' in name, skipped %d out-of-order tags)", filelabel, totalPathLen, len(ret), skippedSequences, totalFoundTags-totalPathLen)
-       return ret, scanner.Err()
+       return ret, stats, scanner.Err()
 }
 
 func (tilelib *tileLibrary) Len() int {
@@ -386,3 +439,13 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
        }
        return tileLibRef{Tag: tag, Variant: variant}
 }
+
+func countBases(seq []byte) int {
+       n := 0
+       for _, c := range seq {
+               if isbase[c] {
+                       n++
+               }
+       }
+       return n
+}