X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/329d3bb23f0489bb0d8ca480435d183038dff169..870319feefd956f48d33d0d4cd146d1da16d9c55:/stats.go diff --git a/stats.go b/stats.go index 518cd132ac..460eef47fb 100644 --- a/stats.go +++ b/stats.go @@ -2,7 +2,6 @@ package main import ( "bufio" - "encoding/gob" "encoding/json" "errors" "flag" @@ -12,16 +11,17 @@ import ( "net/http" _ "net/http/pprof" "os" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" log "github.com/sirupsen/logrus" ) -type stats struct { +type statscmd struct { debugUnplaced bool } -func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { +func (cmd *statscmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var err error defer func() { if err != nil { @@ -101,7 +101,7 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout } bufw := bufio.NewWriter(output) - err = cmd.doStats(input, bufw) + err = cmd.doStats(input, strings.HasSuffix(*inputFilename, ".gz"), bufw) if err != nil { return 1 } @@ -116,9 +116,10 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout return 0 } -func (cmd *stats) doStats(input io.Reader, output io.Writer) error { +func (cmd *statscmd) doStats(input io.Reader, gz bool, output io.Writer) error { var ret struct { Genomes int + CalledBases []int64 Tags int TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times TileVariants int @@ -129,15 +130,8 @@ func (cmd *stats) doStats(input io.Reader, output io.Writer) error { var tagSet [][]byte var tagPlacements []int - dec := gob.NewDecoder(bufio.NewReaderSize(input, 1<<26)) - for { - var ent LibraryEntry - err := dec.Decode(&ent) - if err == io.EOF { - break - } else if err != nil { - return fmt.Errorf("gob decode: %w", err) - } + tileVariantCalls := map[tileLibRef]int{} + err := DecodeLibrary(input, gz, func(ent *LibraryEntry) error { ret.Genomes += len(ent.CompactGenomes) ret.TileVariants += len(ent.TileVariants) if len(ent.TagSet) > 0 { @@ -147,25 +141,18 @@ func (cmd *stats) doStats(input io.Reader, output io.Writer) error { ret.Tags = len(ent.TagSet) tagSet = ent.TagSet } - for _, g := range ent.CompactGenomes { - if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 { - tagPlacements = append(tagPlacements, make([]int, need)...) - } - for idx, v := range g.Variants { - if v > 0 { - tagPlacements[idx/2]++ - } - } - } for _, tv := range ent.TileVariants { if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 { ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...) ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...) } + calls := 0 hasNoCalls := false for _, b := range tv.Sequence { - if b != 'a' && b != 'c' && b != 'g' && b != 't' { + if b == 'a' || b == 'c' || b == 'g' || b == 't' { + calls++ + } else { hasNoCalls = true } } @@ -175,7 +162,26 @@ func (cmd *stats) doStats(input io.Reader, output io.Writer) error { } else { ret.VariantsBySize[len(tv.Sequence)]++ } + + tileVariantCalls[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = calls } + for _, g := range ent.CompactGenomes { + if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 { + tagPlacements = append(tagPlacements, make([]int, need)...) + } + calledBases := int64(0) + for idx, v := range g.Variants { + if v > 0 { + tagPlacements[idx/2]++ + calledBases += int64(tileVariantCalls[tileLibRef{Tag: tagID(idx / 2), Variant: v}]) + } + } + ret.CalledBases = append(ret.CalledBases, calledBases) + } + return nil + }) + if err != nil { + return err } for id, p := range tagPlacements { for len(ret.TagsPlacedNTimes) <= p {