X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/7ebff453d609e6231a57ea831c31fe5374695550..79e0bd0ff72ae1759e8b780e5d8b61b09c9a0399:/stats.go diff --git a/stats.go b/stats.go index 62fb17d4fd..69e66da9a7 100644 --- a/stats.go +++ b/stats.go @@ -1,8 +1,11 @@ -package main +// Copyright (C) The Lightning Authors. All rights reserved. +// +// SPDX-License-Identifier: AGPL-3.0 + +package lightning import ( "bufio" - "encoding/gob" "encoding/json" "errors" "flag" @@ -12,16 +15,17 @@ import ( "net/http" _ "net/http/pprof" "os" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" log "github.com/sirupsen/logrus" ) -type stats struct { +type statscmd struct { debugUnplaced bool } -func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { +func (cmd *statscmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var err error defer func() { if err != nil { @@ -101,7 +105,7 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout } bufw := bufio.NewWriter(output) - err = cmd.doStats(input, bufw) + err = cmd.doStats(input, strings.HasSuffix(*inputFilename, ".gz"), bufw) if err != nil { return 1 } @@ -116,9 +120,10 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout return 0 } -func (cmd *stats) doStats(input io.Reader, output io.Writer) error { +func (cmd *statscmd) doStats(input io.Reader, gz bool, output io.Writer) error { var ret struct { Genomes int + CalledBases []int64 Tags int TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times TileVariants int @@ -129,15 +134,8 @@ func (cmd *stats) doStats(input io.Reader, output io.Writer) error { var tagSet [][]byte var tagPlacements []int - dec := gob.NewDecoder(bufio.NewReaderSize(input, 1<<26)) - for { - var ent LibraryEntry - err := dec.Decode(&ent) - if err == io.EOF { - break - } else if err != nil { - return err - } + tileVariantCalls := map[tileLibRef]int{} + err := DecodeLibrary(input, gz, func(ent *LibraryEntry) error { ret.Genomes += len(ent.CompactGenomes) ret.TileVariants += len(ent.TileVariants) if len(ent.TagSet) > 0 { @@ -147,25 +145,18 @@ func (cmd *stats) doStats(input io.Reader, output io.Writer) error { ret.Tags = len(ent.TagSet) tagSet = ent.TagSet } - for _, g := range ent.CompactGenomes { - if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 { - tagPlacements = append(tagPlacements, make([]int, need)...) - } - for idx, v := range g.Variants { - if v > 0 { - tagPlacements[idx/2]++ - } - } - } for _, tv := range ent.TileVariants { if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 { ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...) ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...) } + calls := 0 hasNoCalls := false for _, b := range tv.Sequence { - if b != 'a' && b != 'c' && b != 'g' && b != 't' { + if b == 'a' || b == 'c' || b == 'g' || b == 't' { + calls++ + } else { hasNoCalls = true } } @@ -175,7 +166,26 @@ func (cmd *stats) doStats(input io.Reader, output io.Writer) error { } else { ret.VariantsBySize[len(tv.Sequence)]++ } + + tileVariantCalls[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = calls + } + for _, g := range ent.CompactGenomes { + if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 { + tagPlacements = append(tagPlacements, make([]int, need)...) + } + calledBases := int64(0) + for idx, v := range g.Variants { + if v > 0 { + tagPlacements[idx/2]++ + calledBases += int64(tileVariantCalls[tileLibRef{Tag: tagID(idx / 2), Variant: v}]) + } + } + ret.CalledBases = append(ret.CalledBases, calledBases) } + return nil + }) + if err != nil { + return err } for id, p := range tagPlacements { for len(ret.TagsPlacedNTimes) <= p {