From: Tom Clegg Date: Wed, 7 Oct 2020 13:22:46 +0000 (-0400) Subject: Add stats command. X-Git-Url: https://git.arvados.org/lightning.git/commitdiff_plain/20123d5f8ceb57ea4e5253cb0c57da194d66da63 Add stats command. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/cmd.go b/cmd.go index 833dddb275..8fc3636d6c 100644 --- a/cmd.go +++ b/cmd.go @@ -26,6 +26,7 @@ var ( "pca-py": &pythonPCA{}, "plot": &pythonPlot{}, "diff-fasta": &diffFasta{}, + "stats": &stats{}, }) ) diff --git a/example-su92l-1kg.sh b/example-su92l-1kg.sh index d59b55cb5d..1e5ecbc1b3 100755 --- a/example-su92l-1kg.sh +++ b/example-su92l-1kg.sh @@ -17,6 +17,7 @@ tagset=su92l-4zz18-92bx4zjg5hgs3yc/tagset.fa.gz genome=$(lightning ref2genome -project ${project} -priority ${priority} -ref ${ref_fa}) fasta=$(lightning vcf2fasta -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf}) unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true ${fasta}) +stats=$(lightning stats -project ${project} -priority ${priority} -i ${unfiltered}) filtered=$(lightning filter -project ${project} -priority ${priority} -i ${unfiltered} -min-coverage "0.9" -max-variants "30") #numpy=$(lightning export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot) #pca=$(lightning pca-py -project ${project} -priority ${priority} -i ${numpy}) diff --git a/stats.go b/stats.go new file mode 100644 index 0000000000..207bd25f81 --- /dev/null +++ b/stats.go @@ -0,0 +1,155 @@ +package main + +import ( + "bufio" + "encoding/gob" + "encoding/json" + "errors" + "flag" + "fmt" + "io" + "io/ioutil" + "net/http" + _ "net/http/pprof" + "os" + + "git.arvados.org/arvados.git/sdk/go/arvados" + log "github.com/sirupsen/logrus" +) + +type stats struct{} + +func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { + var err error + defer func() { + if err != nil { + fmt.Fprintf(stderr, "%s\n", err) + } + }() + flags := flag.NewFlagSet("", flag.ContinueOnError) + flags.SetOutput(stderr) + pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") + runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)") + projectUUID := flags.String("project", "", "project `UUID` for output data") + priority := flags.Int("priority", 500, "container request priority") + inputFilename := flags.String("i", "-", "input `file`") + outputFilename := flags.String("o", "-", "output `file`") + err = flags.Parse(args) + if err == flag.ErrHelp { + err = nil + return 0 + } else if err != nil { + return 2 + } + + if *pprof != "" { + go func() { + log.Println(http.ListenAndServe(*pprof, nil)) + }() + } + + if !*runlocal { + if *outputFilename != "-" { + err = errors.New("cannot specify output file in container mode: not implemented") + return 1 + } + runner := arvadosContainerRunner{ + Name: "lightning stats", + Client: arvados.NewClientFromEnv(), + ProjectUUID: *projectUUID, + RAM: 16000000000, + VCPUs: 1, + Priority: *priority, + } + err = runner.TranslatePaths(inputFilename) + if err != nil { + return 1 + } + runner.Args = []string{"stats", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/stats.json"} + var output string + output, err = runner.Run() + if err != nil { + return 1 + } + fmt.Fprintln(stdout, output+"/stats.json") + return 0 + } + + var input io.ReadCloser + if *inputFilename == "-" { + input = ioutil.NopCloser(stdin) + } else { + input, err = os.Open(*inputFilename) + if err != nil { + return 1 + } + defer input.Close() + } + + var output io.WriteCloser + if *outputFilename == "-" { + output = nopCloser{stdout} + } else { + output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777) + if err != nil { + return 1 + } + defer output.Close() + } + + bufw := bufio.NewWriter(output) + cmd.readLibrary(input, bufw) + err = bufw.Flush() + if err != nil { + return 1 + } + err = output.Close() + if err != nil { + return 1 + } + return 0 +} + +func (cmd *stats) readLibrary(input io.Reader, output io.Writer) error { + var ret struct { + Genomes int + Tags int + TileVariants int + VariantsBySize []int + NCVariantsBySize []int + } + + dec := gob.NewDecoder(input) + for { + var ent LibraryEntry + err := dec.Decode(&ent) + if err == io.EOF { + break + } else if err != nil { + return err + } + ret.Genomes += len(ent.CompactGenomes) + ret.Tags += len(ent.TagSet) + ret.TileVariants += len(ent.TileVariants) + for _, tv := range ent.TileVariants { + if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 { + ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...) + ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...) + } + + hasNoCalls := false + for _, b := range tv.Sequence { + if b != 'a' && b != 'c' && b != 'g' && b != 't' { + hasNoCalls = true + } + } + + if hasNoCalls { + ret.NCVariantsBySize[len(tv.Sequence)]++ + } else { + ret.VariantsBySize[len(tv.Sequence)]++ + } + } + } + return json.NewEncoder(output).Encode(ret) +}