Fix some tests.
[lightning.git] / annotate.go
index 7787ae231838430840d6ae4ef5631ac08750f9b6..00f1cad1556147c2b2ba31f2e8e3b4483ad02e20 100644 (file)
@@ -1,4 +1,8 @@
-package main
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
 
 import (
        "bufio"
@@ -23,9 +27,11 @@ import (
 )
 
 type annotatecmd struct {
-       variantHash bool
-       maxTileSize int
-       tag2tagid   map[string]tagID
+       dropTiles        []bool
+       variantHash      bool
+       maxTileSize      int
+       tag2tagid        map[string]tagID
+       reportAnnotation func(tag tagID, outcol int, variant tileVariantID, refname string, seqname string, pdi hgvs.Variant)
 }
 
 func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
@@ -51,6 +57,9 @@ func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader,
                return 0
        } else if err != nil {
                return 2
+       } else if flags.NArg() > 0 {
+               err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
+               return 2
        }
 
        if *pprof != "" {
@@ -75,13 +84,13 @@ func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader,
                if err != nil {
                        return 1
                }
-               runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-max-tile-size", strconv.Itoa(cmd.maxTileSize), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.tsv"}
+               runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-max-tile-size", strconv.Itoa(cmd.maxTileSize), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.csv"}
                var output string
                output, err = runner.Run()
                if err != nil {
                        return 1
                }
-               fmt.Fprintln(stdout, output+"/tilevariants.tsv")
+               fmt.Fprintln(stdout, output+"/tilevariants.csv")
                return 0
        }
 
@@ -106,13 +115,13 @@ func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader,
                }
                defer output.Close()
        }
-       bufw := bufio.NewWriter(output)
+       bufw := bufio.NewWriterSize(output, 4*1024*1024)
 
        tilelib := &tileLibrary{
-               includeNoCalls:      true,
+               retainNoCalls:       true,
                retainTileSequences: true,
        }
-       err = tilelib.LoadGob(context.Background(), input, nil)
+       err = tilelib.LoadGob(context.Background(), input, strings.HasSuffix(*inputFilename, ".gz"))
        if err != nil {
                return 1
        }
@@ -183,6 +192,9 @@ func (cmd *annotatecmd) exportTileDiffs(outw io.Writer, tilelib *tileLibrary) er
                for _, seqname := range seqnames {
                        seqname := seqname
                        throttle.Acquire()
+                       if throttle.Err() != nil {
+                               break
+                       }
                        go func() {
                                defer throttle.Release()
                                throttle.Report(cmd.annotateSequence(throttle, outch, tilelib, taglen, refname, seqname, refcs[seqname], len(refs) > 1))
@@ -194,6 +206,10 @@ func (cmd *annotatecmd) exportTileDiffs(outw io.Writer, tilelib *tileLibrary) er
 }
 
 func (cmd *annotatecmd) annotateSequence(throttle *throttle, outch chan<- string, tilelib *tileLibrary, taglen int, refname, seqname string, reftiles []tileLibRef, refnamecol bool) error {
+       refnamefield := ""
+       if refnamecol {
+               refnamefield = "," + trimFilenameForLabel(refname)
+       }
        var refseq []byte
        // tilestart[123] is the index into refseq
        // where the tile for tag 123 was placed.
@@ -216,23 +232,37 @@ func (cmd *annotatecmd) annotateSequence(throttle *throttle, outch chan<- string
                tileend[libref.Tag] = len(refseq)
        }
        log.Infof("seq %s len(refseq) %d len(tilestart) %d", seqname, len(refseq), len(tilestart))
+       // outcol is tag's index in the subset of tags that aren't
+       // dropped. If there are 10M tags and half are dropped by
+       // dropTiles, tag ranges from 0 to 10M-1 and outcol ranges
+       // from 0 to 5M-1.
+       //
+       // IOW, in the matrix built by cgs2array(), {tag} is
+       // represented by columns {outcol}*2 and {outcol}*2+1.
+       outcol := -1
        for tag, tvs := range tilelib.variant {
+               if len(cmd.dropTiles) > tag && cmd.dropTiles[tag] {
+                       continue
+               }
                tag := tagID(tag)
+               outcol++
+               // Must shadow outcol var to use safely in goroutine below.
+               outcol := outcol
                refstart, ok := tilestart[tag]
                if !ok {
-                       // Tag didn't place on this
-                       // reference sequence. (It
-                       // might place on the same
-                       // chromosome in a genome
-                       // anyway, but we don't output
-                       // the annotations that would
-                       // result.)
+                       // Tag didn't place on this reference
+                       // sequence. (It might place on the same
+                       // chromosome in a genome anyway, but we don't
+                       // output the annotations that would result.)
+                       // outch <- fmt.Sprintf("%d,%d,-1%s\n", tag, outcol, refnamefield)
                        continue
                }
                for variant := 1; variant <= len(tvs); variant++ {
                        variant, hash := tileVariantID(variant), tvs[variant-1]
                        tileseq := tilelib.TileVariantSequence(tileLibRef{Tag: tag, Variant: variant})
-                       if len(tileseq) < taglen {
+                       if len(tileseq) == 0 {
+                               continue
+                       } else if len(tileseq) < taglen {
                                return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tileseq), taglen)
                        }
                        var refpart []byte
@@ -273,11 +303,10 @@ func (cmd *annotatecmd) annotateSequence(throttle *throttle, outch chan<- string
                                        } else {
                                                varid = fmt.Sprintf("%d", variant)
                                        }
-                                       refnamefield := ""
-                                       if refnamecol {
-                                               refnamefield = "\t" + refname
+                                       outch <- fmt.Sprintf("%d,%d,%s%s,%s:g.%s\n", tag, outcol, varid, refnamefield, seqname, diff.String())
+                                       if cmd.reportAnnotation != nil {
+                                               cmd.reportAnnotation(tag, outcol, variant, refname, seqname, diff)
                                        }
-                                       outch <- fmt.Sprintf("%d\t%s%s\t%s:g.%s\n", tag, varid, refnamefield, seqname, diff.String())
                                }
                        }()
                }