X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/33de1a2524e955ab51f71f428c580e67c6874e43..69b71af4136fdaeeb5c2afbc559208dc5f428c48:/stats.go diff --git a/stats.go b/stats.go index 6c0208aaf5..460eef47fb 100644 --- a/stats.go +++ b/stats.go @@ -2,7 +2,6 @@ package main import ( "bufio" - "encoding/gob" "encoding/json" "errors" "flag" @@ -12,14 +11,17 @@ import ( "net/http" _ "net/http/pprof" "os" + "strings" "git.arvados.org/arvados.git/sdk/go/arvados" log "github.com/sirupsen/logrus" ) -type stats struct{} +type statscmd struct { + debugUnplaced bool +} -func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { +func (cmd *statscmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var err error defer func() { if err != nil { @@ -34,6 +36,7 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout priority := flags.Int("priority", 500, "container request priority") inputFilename := flags.String("i", "-", "input `file`") outputFilename := flags.String("o", "-", "output `file`") + flags.BoolVar(&cmd.debugUnplaced, "debug-unplaced", false, "output full list of unplaced tags") err = flags.Parse(args) if err == flag.ErrHelp { err = nil @@ -65,7 +68,7 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout if err != nil { return 1 } - runner.Args = []string{"stats", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/stats.json"} + runner.Args = []string{"stats", "-local=true", fmt.Sprintf("-debug-unplaced=%v", cmd.debugUnplaced), "-i", *inputFilename, "-o", "/mnt/output/stats.json"} var output string output, err = runner.Run() if err != nil { @@ -98,7 +101,10 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout } bufw := bufio.NewWriter(output) - cmd.readLibrary(input, bufw) + err = cmd.doStats(input, strings.HasSuffix(*inputFilename, ".gz"), bufw) + if err != nil { + return 1 + } err = bufw.Flush() if err != nil { return 1 @@ -110,36 +116,43 @@ func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout return 0 } -func (cmd *stats) readLibrary(input io.Reader, output io.Writer) error { +func (cmd *statscmd) doStats(input io.Reader, gz bool, output io.Writer) error { var ret struct { Genomes int + CalledBases []int64 Tags int + TagsPlacedNTimes []int // a[x]==y means there were y tags that placed x times TileVariants int VariantsBySize []int NCVariantsBySize []int + UnplacedTags []string `json:",omitempty"` } - dec := gob.NewDecoder(bufio.NewReaderSize(input, 1<<26)) - for { - var ent LibraryEntry - err := dec.Decode(&ent) - if err == io.EOF { - break - } else if err != nil { - return err - } + var tagSet [][]byte + var tagPlacements []int + tileVariantCalls := map[tileLibRef]int{} + err := DecodeLibrary(input, gz, func(ent *LibraryEntry) error { ret.Genomes += len(ent.CompactGenomes) - ret.Tags += len(ent.TagSet) ret.TileVariants += len(ent.TileVariants) + if len(ent.TagSet) > 0 { + if ret.Tags > 0 { + return errors.New("invalid input: contains multiple tagsets") + } + ret.Tags = len(ent.TagSet) + tagSet = ent.TagSet + } for _, tv := range ent.TileVariants { if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 { ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...) ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...) } + calls := 0 hasNoCalls := false for _, b := range tv.Sequence { - if b != 'a' && b != 'c' && b != 'g' && b != 't' { + if b == 'a' || b == 'c' || b == 'g' || b == 't' { + calls++ + } else { hasNoCalls = true } } @@ -149,7 +162,36 @@ func (cmd *stats) readLibrary(input io.Reader, output io.Writer) error { } else { ret.VariantsBySize[len(tv.Sequence)]++ } + + tileVariantCalls[tileLibRef{Tag: tv.Tag, Variant: tv.Variant}] = calls } + for _, g := range ent.CompactGenomes { + if need := (len(g.Variants)+1)/2 - len(tagPlacements); need > 0 { + tagPlacements = append(tagPlacements, make([]int, need)...) + } + calledBases := int64(0) + for idx, v := range g.Variants { + if v > 0 { + tagPlacements[idx/2]++ + calledBases += int64(tileVariantCalls[tileLibRef{Tag: tagID(idx / 2), Variant: v}]) + } + } + ret.CalledBases = append(ret.CalledBases, calledBases) + } + return nil + }) + if err != nil { + return err } + for id, p := range tagPlacements { + for len(ret.TagsPlacedNTimes) <= p { + ret.TagsPlacedNTimes = append(ret.TagsPlacedNTimes, 0) + } + ret.TagsPlacedNTimes[p]++ + if cmd.debugUnplaced && p == 0 { + ret.UnplacedTags = append(ret.UnplacedTags, fmt.Sprintf("%d %s", id, tagSet[id])) + } + } + return json.NewEncoder(output).Encode(ret) }