Fix some tests.
[lightning.git] / stats.go
index 6c0208aaf5a23ffaaf940f076a6692f91d47a7b2..2f9bd4192c9ea3fc7f3a29cf10db29ec2cdc1bc8 100644 (file)
--- a/stats.go
+++ b/stats.go
@@ -1,8 +1,11 @@
-package main
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
 
 import (
        "bufio"
-       "encoding/gob"
        "encoding/json"
        "errors"
        "flag"
@@ -12,14 +15,17 @@ import (
        "net/http"
        _ "net/http/pprof"
        "os"
+       "strings"
 
        "git.arvados.org/arvados.git/sdk/go/arvados"
        log "github.com/sirupsen/logrus"
 )
 
-type stats struct{}
+type statscmd struct { // statscmd carries the options of the "stats" command; RunCommand and doStats are its methods.
+       debugUnplaced bool // bound to the -debug-unplaced flag in RunCommand; when true, doStats adds every tag with zero placements to UnplacedTags in its JSON output
+}
 
-func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+func (cmd *statscmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
        var err error
        defer func() {
                if err != nil {
@@ -34,12 +40,16 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout
        priority := flags.Int("priority", 500, "container request priority")
        inputFilename := flags.String("i", "-", "input `file`")
        outputFilename := flags.String("o", "-", "output `file`")
+       flags.BoolVar(&cmd.debugUnplaced, "debug-unplaced", false, "output full list of unplaced tags")
        err = flags.Parse(args)
        if err == flag.ErrHelp {
                err = nil
                return 0
        } else if err != nil {
                return 2
+       } else if flags.NArg() > 0 {
+               err = fmt.Errorf("errant command line arguments after parsed flags: %v", flags.Args())
+               return 2
        }
 
        if *pprof != "" {
@@ -65,7 +75,7 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout
                if err != nil {
                        return 1
                }
-               runner.Args = []string{"stats", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/stats.json"}
+               runner.Args = []string{"stats", "-local=true", fmt.Sprintf("-debug-unplaced=%v", cmd.debugUnplaced), "-i", *inputFilename, "-o", "/mnt/output/stats.json"}
                var output string
                output, err = runner.Run()
                if err != nil {
@@ -98,7 +108,10 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout
        }
 
        bufw := bufio.NewWriter(output)
-       cmd.readLibrary(input, bufw)
+       err = cmd.doStats(input, strings.HasSuffix(*inputFilename, ".gz"), bufw)
+       if err != nil {
+               return 1
+       }
        err = bufw.Flush()
        if err != nil {
                return 1
@@ -110,36 +123,43 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout
        return 0
 }
 
-func (cmd *stats) readLibrary(input io.Reader, output io.Writer) error {
+func (cmd *statscmd) doStats(input io.Reader, gz bool, output io.Writer) error {
        var ret struct {
                Genomes          int
+               CalledBases      []int64
                Tags             int
+               TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times
                TileVariants     int
                VariantsBySize   []int
                NCVariantsBySize []int
+               UnplacedTags     []string `json:",omitempty"`
        }
 
-       dec := gob.NewDecoder(bufio.NewReaderSize(input, 1<<26))
-       for {
-               var ent LibraryEntry
-               err := dec.Decode(&ent)
-               if err == io.EOF {
-                       break
-               } else if err != nil {
-                       return err
-               }
+       var tagSet [][]byte
+       var tagPlacements []int
+       tileVariantCalls := map[tileLibRef]int{}
+       err := DecodeLibrary(input, gz, func(ent *LibraryEntry) error {
                ret.Genomes += len(ent.CompactGenomes)
-               ret.Tags += len(ent.TagSet)
                ret.TileVariants += len(ent.TileVariants)
+               if len(ent.TagSet) > 0 {
+                       if ret.Tags > 0 {
+                               return errors.New("invalid input: contains multiple tagsets")
+                       }
+                       ret.Tags = len(ent.TagSet)
+                       tagSet = ent.TagSet
+               }
                for _, tv := range ent.TileVariants {
                        if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 {
                                ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...)
                                ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...)
                        }
 
+                       calls := 0
                        hasNoCalls := false
                        for _, b := range tv.Sequence {
-                               if b != 'a' && b != 'c' && b != 'g' && b != 't' {
+                               if b == 'a' || b == 'c' || b == 'g' || b == 't' {
+                                       calls++
+                               } else {
                                        hasNoCalls = true
                                }
                        }
@@ -149,7 +169,36 @@ func (cmd *stats) readLibrary(input io.Reader, output io.Writer) error {
                        } else {
                                ret.VariantsBySize[len(tv.Sequence)]++
                        }
+
+                       tileVariantCalls[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = calls
                }
+               for _, g := range ent.CompactGenomes {
+                       if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 {
+                               tagPlacements = append(tagPlacements, make([]int, need)...)
+                       }
+                       calledBases := int64(0)
+                       for idx, v := range g.Variants {
+                               if v > 0 {
+                                       tagPlacements[idx/2]++
+                                       calledBases += int64(tileVariantCalls[tileLibRef{Tag: tagID(idx / 2), Variant: v}])
+                               }
+                       }
+                       ret.CalledBases = append(ret.CalledBases, calledBases)
+               }
+               return nil
+       })
+       if err != nil {
+               return err
        }
+       for id, p := range tagPlacements {
+               for len(ret.TagsPlacedNTimes) <= p {
+                       ret.TagsPlacedNTimes = append(ret.TagsPlacedNTimes, 0)
+               }
+               ret.TagsPlacedNTimes[p]++
+               if cmd.debugUnplaced && p == 0 {
+                       ret.UnplacedTags = append(ret.UnplacedTags, fmt.Sprintf("%d %s", id, tagSet[id]))
+               }
+       }
+
        return json.NewEncoder(output).Encode(ret)
 }