Add stats command.
authorTom Clegg <tom@tomclegg.ca>
Wed, 7 Oct 2020 13:22:46 +0000 (09:22 -0400)
committerTom Clegg <tom@tomclegg.ca>
Wed, 7 Oct 2020 13:22:46 +0000 (09:22 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

cmd.go
example-su92l-1kg.sh
stats.go [new file with mode: 0644]

diff --git a/cmd.go b/cmd.go
index 833dddb275e9900c887832a2ae82b48a0bafd4a4..8fc3636d6c577b68ab8d439179f327dee2726f5e 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -26,6 +26,7 @@ var (
                "pca-py":             &pythonPCA{},
                "plot":               &pythonPlot{},
                "diff-fasta":         &diffFasta{},
+               "stats":              &stats{},
        })
 )
 
index d59b55cb5d62d42d019b6c52ea0643f5c048f1f3..1e5ecbc1b348535695714d096c7e88c173202c24 100755 (executable)
@@ -17,6 +17,7 @@ tagset=su92l-4zz18-92bx4zjg5hgs3yc/tagset.fa.gz
 genome=$(lightning     ref2genome   -project ${project} -priority ${priority} -ref ${ref_fa})
 fasta=$(lightning      vcf2fasta    -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf})
 unfiltered=$(lightning import       -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true ${fasta})
+stats=$(lightning      stats        -project ${project} -priority ${priority} -i ${unfiltered})
 filtered=$(lightning   filter       -project ${project} -priority ${priority} -i ${unfiltered} -min-coverage "0.9" -max-variants "30")
 #numpy=$(lightning     export-numpy -project ${project} -priority ${priority} -i ${filtered} -one-hot)
 #pca=$(lightning       pca-py       -project ${project} -priority ${priority} -i ${numpy})
diff --git a/stats.go b/stats.go
new file mode 100644 (file)
index 0000000..207bd25
--- /dev/null
+++ b/stats.go
@@ -0,0 +1,155 @@
+package main
+
+import (
+       "bufio"
+       "encoding/gob"
+       "encoding/json"
+       "errors"
+       "flag"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net/http"
+       _ "net/http/pprof"
+       "os"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       log "github.com/sirupsen/logrus"
+)
+
+type stats struct{}
+
+func (cmd *stats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       priority := flags.Int("priority", 500, "container request priority")
+       inputFilename := flags.String("i", "-", "input `file`")
+       outputFilename := flags.String("o", "-", "output `file`")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       }
+
+       if *pprof != "" {
+               go func() {
+                       log.Println(http.ListenAndServe(*pprof, nil))
+               }()
+       }
+
+       if !*runlocal {
+               if *outputFilename != "-" {
+                       err = errors.New("cannot specify output file in container mode: not implemented")
+                       return 1
+               }
+               runner := arvadosContainerRunner{
+                       Name:        "lightning stats",
+                       Client:      arvados.NewClientFromEnv(),
+                       ProjectUUID: *projectUUID,
+                       RAM:         16000000000,
+                       VCPUs:       1,
+                       Priority:    *priority,
+               }
+               err = runner.TranslatePaths(inputFilename)
+               if err != nil {
+                       return 1
+               }
+               runner.Args = []string{"stats", "-local=true", "-i", *inputFilename, "-o", "/mnt/output/stats.json"}
+               var output string
+               output, err = runner.Run()
+               if err != nil {
+                       return 1
+               }
+               fmt.Fprintln(stdout, output+"/stats.json")
+               return 0
+       }
+
+       var input io.ReadCloser
+       if *inputFilename == "-" {
+               input = ioutil.NopCloser(stdin)
+       } else {
+               input, err = os.Open(*inputFilename)
+               if err != nil {
+                       return 1
+               }
+               defer input.Close()
+       }
+
+       var output io.WriteCloser
+       if *outputFilename == "-" {
+               output = nopCloser{stdout}
+       } else {
+               output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
+               if err != nil {
+                       return 1
+               }
+               defer output.Close()
+       }
+
+       bufw := bufio.NewWriter(output)
+       cmd.readLibrary(input, bufw)
+       err = bufw.Flush()
+       if err != nil {
+               return 1
+       }
+       err = output.Close()
+       if err != nil {
+               return 1
+       }
+       return 0
+}
+
+func (cmd *stats) readLibrary(input io.Reader, output io.Writer) error {
+       var ret struct {
+               Genomes          int
+               Tags             int
+               TileVariants     int
+               VariantsBySize   []int
+               NCVariantsBySize []int
+       }
+
+       dec := gob.NewDecoder(input)
+       for {
+               var ent LibraryEntry
+               err := dec.Decode(&ent)
+               if err == io.EOF {
+                       break
+               } else if err != nil {
+                       return err
+               }
+               ret.Genomes += len(ent.CompactGenomes)
+               ret.Tags += len(ent.TagSet)
+               ret.TileVariants += len(ent.TileVariants)
+               for _, tv := range ent.TileVariants {
+                       if need := 1 + len(tv.Sequence) - len(ret.VariantsBySize); need > 0 {
+                               ret.VariantsBySize = append(ret.VariantsBySize, make([]int, need)...)
+                               ret.NCVariantsBySize = append(ret.NCVariantsBySize, make([]int, need)...)
+                       }
+
+                       hasNoCalls := false
+                       for _, b := range tv.Sequence {
+                               if b != 'a' && b != 'c' && b != 'g' && b != 't' {
+                                       hasNoCalls = true
+                               }
+                       }
+
+                       if hasNoCalls {
+                               ret.NCVariantsBySize[len(tv.Sequence)]++
+                       } else {
+                               ret.VariantsBySize[len(tv.Sequence)]++
+                       }
+               }
+       }
+       return json.NewEncoder(output).Encode(ret)
+}