Fix coordinates in hgvs annotations.
[lightning.git] / slicenumpy.go
index d780eb55c7e59ba84eb19b1e5d06790443e72a21..82cfff75ce9edc9f3ca3e1357956ac0b3ea3b635 100644 (file)
@@ -17,6 +17,7 @@ import (
        "os"
        "regexp"
        "runtime"
+       "runtime/debug"
        "sort"
        "strconv"
        "strings"
@@ -248,9 +249,10 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                log.Printf("after applying mask, len(reftile) == %d", len(reftile))
        }
 
+       type hgvsColSet map[hgvs.Variant][2][]int8
+       encodeHGVS := throttle{Max: len(refseq)}
+       encodeHGVSTodo := map[string]chan hgvsColSet{}
        tmpHGVSCols := map[string]*os.File{}
-       bufHGVSCols := map[string]*bufio.Writer{}
-       encodeHGVSCols := map[string]*gob.Encoder{}
        if *hgvsChunked {
                for seqname := range refseq {
                        var f *os.File
@@ -262,8 +264,20 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                        bufw := bufio.NewWriterSize(f, 1<<24)
                        enc := gob.NewEncoder(bufw)
                        tmpHGVSCols[seqname] = f
-                       bufHGVSCols[seqname] = bufw
-                       encodeHGVSCols[seqname] = enc
+                       todo := make(chan hgvsColSet, 128)
+                       encodeHGVSTodo[seqname] = todo
+                       encodeHGVS.Go(func() error {
+                               for colset := range todo {
+                                       err := enc.Encode(colset)
+                                       if err != nil {
+                                               encodeHGVS.Report(err)
+                                               for range todo {
+                                               }
+                                               return err
+                                       }
+                               }
+                               return bufw.Flush()
+                       })
                }
        }
 
@@ -309,9 +323,16 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                        seq[tv.Tag] = variants
                                }
                                for _, cg := range ent.CompactGenomes {
-                                       if matchGenome.MatchString(cg.Name) {
-                                               cgs[cg.Name] = cg
+                                       if !matchGenome.MatchString(cg.Name) {
+                                               continue
+                                       }
+                                       // pad to full slice size
+                                       // to avoid out-of-bounds
+                                       // checks later
+                                       if sliceSize := 2 * int(cg.EndTag-cg.StartTag); len(cg.Variants) < sliceSize {
+                                               cg.Variants = append(cg.Variants, make([]tileVariantID, sliceSize-len(cg.Variants))...)
                                        }
+                                       cgs[cg.Name] = cg
                                }
                                return nil
                        })
@@ -340,12 +361,10 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
 
                                        for _, cg := range cgs {
                                                idx := int(tag-tagstart) * 2
-                                               if idx < len(cg.Variants) {
-                                                       for allele := 0; allele < 2; allele++ {
-                                                               v := cg.Variants[idx+allele]
-                                                               if v > 0 && len(variants[v].Sequence) > 0 {
-                                                                       count[variants[v].Blake2b]++
-                                                               }
+                                               for allele := 0; allele < 2; allele++ {
+                                                       v := cg.Variants[idx+allele]
+                                                       if v > 0 && len(variants[v].Sequence) > 0 {
+                                                               count[variants[v].Blake2b]++
                                                        }
                                                }
                                        }
@@ -435,8 +454,10 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                                continue
                                        }
                                        diffs, _ := hgvs.Diff(reftilestr, strings.ToUpper(string(tv.Sequence)), 0)
+                                       for i := range diffs {
+                                               diffs[i].Position += rt.pos
+                                       }
                                        for _, diff := range diffs {
-                                               diff.Position += rt.pos
                                                fmt.Fprintf(annow, "%d,%d,%d,%s:g.%s,%s,%d,%s,%s,%s\n", tag, outcol, v, rt.seqname, diff.String(), rt.seqname, diff.Position, diff.Ref, diff.New, diff.Left)
                                        }
                                        if *hgvsChunked {
@@ -451,7 +472,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                        // =ref or a different variant in that
                                        // position, or (-1) is lacking
                                        // coverage / couldn't be diffed.
-                                       hgvsCol := map[hgvs.Variant][2][]int8{}
+                                       hgvsCol := hgvsColSet{}
                                        for _, diffs := range variantDiffs {
                                                for _, diff := range diffs {
                                                        if _, ok := hgvsCol[diff]; ok {
@@ -486,7 +507,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                                        }
                                                }
                                        }
-                                       encodeHGVSCols[rt.seqname].Encode(hgvsCol)
+                                       encodeHGVSTodo[rt.seqname] <- hgvsCol
                                }
                                outcol++
                        }
@@ -521,6 +542,8 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                }
                        }
                        seq = nil
+                       cgs = nil
+                       debug.FreeOSMemory()
                        throttleNumpyMem.Release()
 
                        if *mergeOutput || *hgvsSingle {
@@ -533,6 +556,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                                if err != nil {
                                        return err
                                }
+                               debug.FreeOSMemory()
                        }
                        log.Infof("%s: done (%d/%d)", infile, int(atomic.AddInt64(&done, 1)), len(infiles))
                        return nil
@@ -545,11 +569,11 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
        if *hgvsChunked {
                log.Info("flushing hgvsCols temp files")
                for seqname := range refseq {
-                       err = bufHGVSCols[seqname].Flush()
-                       if err != nil {
-                               return 1
-                       }
-                       bufHGVSCols[seqname] = nil // free buffer memory
+                       close(encodeHGVSTodo[seqname])
+               }
+               err = encodeHGVS.Wait()
+               if err != nil {
+                       return 1
                }
                for seqname := range refseq {
                        log.Infof("%s: reading hgvsCols from temp file", seqname)
@@ -558,7 +582,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
                        if err != nil {
                                return 1
                        }
-                       var hgvsCols map[hgvs.Variant][2][]int8
+                       var hgvsCols hgvsColSet
                        dec := gob.NewDecoder(bufio.NewReaderSize(f, 1<<24))
                        for err == nil {
                                err = dec.Decode(&hgvsCols)