Add tilestats cmd.
authorTom Clegg <tom@curii.com>
Wed, 16 Mar 2022 22:08:31 +0000 (18:08 -0400)
committerTom Clegg <tom@curii.com>
Wed, 16 Mar 2022 22:08:31 +0000 (18:08 -0400)
refs #18582

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

cmd.go
slicenumpy.go
tilestats.go [new file with mode: 0644]

diff --git a/cmd.go b/cmd.go
index 514bd1ebb42ee99dec73bc18761a08a70641b1a1..b6c28f4deb35dbdd180b3804d9b694132d51c8af 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -33,6 +33,7 @@ var (
                "flake":              &flakecmd{},
                "slice":              &slicecmd{},
                "slice-numpy":        &sliceNumpy{},
+               "tiling-stats":       &tilingStats{},
                "anno2vcf":           &anno2vcf{},
                "numpy-comvar":       &numpyComVar{},
                "filter":             &filtercmd{},
index e0d58c48917d4f16bb29fdc22a6be2d8ca7f4a96..a2e03f494462a94da2233899ea2488ef6c4c5087 100644 (file)
@@ -166,7 +166,7 @@ func (cmd *sliceNumpy) RunCommand(prog string, args []string, stdin io.Reader, s
 
        cmd.cgnames = nil
        var tagset [][]byte
-       DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
+       err = DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
                if len(ent.TagSet) > 0 {
                        tagset = ent.TagSet
                }
diff --git a/tilestats.go b/tilestats.go
new file mode 100644 (file)
index 0000000..733b0fd
--- /dev/null
@@ -0,0 +1,174 @@
+// Copyright (C) The Lightning Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package lightning
+
+import (
+       "bufio"
+       "flag"
+       "fmt"
+       "io"
+       "net/http"
+       _ "net/http/pprof"
+       "os"
+       "path/filepath"
+       "sort"
+       "strings"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       log "github.com/sirupsen/logrus"
+)
+
+type tilingStats struct {
+}
+
+func (cmd *tilingStats) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       priority := flags.Int("priority", 500, "container request priority")
+       inputDir := flags.String("input-dir", "./in", "input `directory`")
+       outputDir := flags.String("output-dir", "./out", "output `directory`")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       }
+
+       if *pprof != "" {
+               go func() {
+                       log.Println(http.ListenAndServe(*pprof, nil))
+               }()
+       }
+
+       if !*runlocal {
+               runner := arvadosContainerRunner{
+                       Name:        "lightning tiling-stats",
+                       Client:      arvados.NewClientFromEnv(),
+                       ProjectUUID: *projectUUID,
+                       RAM:         8000000000,
+                       VCPUs:       2,
+                       Priority:    *priority,
+                       KeepCache:   2,
+                       APIAccess:   true,
+               }
+               err = runner.TranslatePaths(inputDir)
+               if err != nil {
+                       return 1
+               }
+               runner.Args = []string{"tiling-stats", "-local=true",
+                       "-pprof=:6060",
+                       "-input-dir=" + *inputDir,
+                       "-output-dir=/mnt/output",
+               }
+               var output string
+               output, err = runner.Run()
+               if err != nil {
+                       return 1
+               }
+               fmt.Fprintln(stdout, output)
+               return 0
+       }
+
+       infiles, err := allFiles(*inputDir, matchGobFile)
+       if err != nil {
+               return 1
+       }
+       if len(infiles) == 0 {
+               err = fmt.Errorf("no input files found in %s", *inputDir)
+               return 1
+       }
+       sort.Strings(infiles)
+
+       var refseqs []CompactSequence
+       var reftiledata = make(map[tileLibRef][]byte, 11000000)
+       in0, err := open(infiles[0])
+       if err != nil {
+               return 1
+       }
+       defer in0.Close()
+       var taglen int
+       err = DecodeLibrary(in0, strings.HasSuffix(infiles[0], ".gz"), func(ent *LibraryEntry) error {
+               if len(ent.TagSet) > 0 {
+                       taglen = len(ent.TagSet[0])
+               }
+               refseqs = append(refseqs, ent.CompactSequences...)
+               for _, tv := range ent.TileVariants {
+                       if tv.Ref {
+                               reftiledata[tileLibRef{tv.Tag, tv.Variant}] = tv.Sequence
+                       }
+               }
+               return nil
+       })
+       if err != nil {
+               return 1
+       }
+       in0.Close()
+       if len(refseqs) == 0 {
+               err = fmt.Errorf("%s: reference sequence not found", infiles[0])
+               return 1
+       }
+       if taglen == 0 {
+               err = fmt.Errorf("%s: tagset not found", infiles[0])
+               return 1
+       }
+
+       for _, cseq := range refseqs {
+               _, basename := filepath.Split(cseq.Name)
+               bedname := fmt.Sprintf("%s/%s.bed", *outputDir, basename)
+               log.Infof("writing %s", bedname)
+               var f *os.File
+               f, err = os.OpenFile(bedname, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0777)
+               if err != nil {
+                       return 1
+               }
+               defer f.Close()
+               bufw := bufio.NewWriterSize(f, 1<<24)
+               seqnames := make([]string, 0, len(cseq.TileSequences))
+               for seqname := range cseq.TileSequences {
+                       seqnames = append(seqnames, seqname)
+               }
+               sort.Strings(seqnames)
+               for _, seqname := range seqnames {
+                       pos := 0
+                       for _, libref := range cseq.TileSequences[seqname] {
+                               tiledata := reftiledata[libref]
+                               if len(tiledata) <= taglen {
+                                       err = fmt.Errorf("bogus input data: ref tile libref %v has len %d < taglen %d", libref, len(tiledata), taglen)
+                                       return 1
+                               }
+                               score := 1000 * countBases(tiledata) / len(tiledata)
+                               _, err = fmt.Fprintf(bufw, "%s %d %d %d %d . %d %d\n",
+                                       seqname,
+                                       pos, pos+len(tiledata),
+                                       libref.Tag,
+                                       score,
+                                       pos+taglen, pos+len(tiledata)-taglen)
+                               if err != nil {
+                                       return 1
+                               }
+                               pos += len(tiledata) - taglen
+                       }
+               }
+               err = bufw.Flush()
+               if err != nil {
+                       return 1
+               }
+               err = f.Close()
+               if err != nil {
+                       return 1
+               }
+       }
+       return 0
+}