Track which tile variants each hgvs.Variant appeared in.
[lightning.git] / dumpgob.go
1 package lightning
2
3 import (
4         "bufio"
5         "errors"
6         "flag"
7         "fmt"
8         "io"
9         "net/http"
10         _ "net/http/pprof"
11         "os"
12         "strings"
13
14         "git.arvados.org/arvados.git/sdk/go/arvados"
15         log "github.com/sirupsen/logrus"
16 )
17
18 type dumpGob struct{}
19
20 func (cmd *dumpGob) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
21         var err error
22         defer func() {
23                 if err != nil {
24                         fmt.Fprintf(stderr, "%s\n", err)
25                 }
26         }()
27         flags := flag.NewFlagSet("", flag.ContinueOnError)
28         flags.SetOutput(stderr)
29         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
30         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
31         projectUUID := flags.String("project", "", "project `UUID` for output data")
32         priority := flags.Int("priority", 500, "container request priority")
33         inputFilename := flags.String("i", "-", "input `file` (library)")
34         outputFilename := flags.String("o", "-", "output `file`")
35         err = flags.Parse(args)
36         if err == flag.ErrHelp {
37                 err = nil
38                 return 0
39         } else if err != nil {
40                 return 2
41         }
42
43         if *pprof != "" {
44                 go func() {
45                         log.Println(http.ListenAndServe(*pprof, nil))
46                 }()
47         }
48
49         if !*runlocal {
50                 if *outputFilename != "-" {
51                         err = errors.New("cannot specify output file in container mode: not implemented")
52                         return 1
53                 }
54                 runner := arvadosContainerRunner{
55                         Name:        "lightning dumpgob",
56                         Client:      arvados.NewClientFromEnv(),
57                         ProjectUUID: *projectUUID,
58                         RAM:         4000000000,
59                         VCPUs:       1,
60                         Priority:    *priority,
61                 }
62                 err = runner.TranslatePaths(inputFilename)
63                 if err != nil {
64                         return 1
65                 }
66                 runner.Args = []string{"dumpgob", "-local=true", fmt.Sprintf("-pprof=%v", *pprof), "-i", *inputFilename, "-o", "/mnt/output/dumpgob.txt"}
67                 var output string
68                 output, err = runner.Run()
69                 if err != nil {
70                         return 1
71                 }
72                 fmt.Fprintln(stdout, output+"/dumpgob.txt")
73                 return 0
74         }
75
76         input, err := open(*inputFilename)
77         if err != nil {
78                 return 1
79         }
80         defer input.Close()
81         output, err := os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0644)
82         if err != nil {
83                 return 1
84         }
85         defer output.Close()
86         bufw := bufio.NewWriterSize(output, 8*1024*1024)
87
88         var n, nCG, nCS, nTV int
89         err = DecodeLibrary(input, strings.HasSuffix(*inputFilename, ".gz"), func(ent *LibraryEntry) error {
90                 if n%1000000 == 0 {
91                         fmt.Fprintf(stderr, "ent %d\n", n)
92                 }
93                 n++
94                 if len(ent.TagSet) > 0 {
95                         fmt.Fprintf(bufw, "ent %d: TagSet, len %d, taglen %d\n", n, len(ent.TagSet), len(ent.TagSet[0]))
96                 }
97                 for _, cg := range ent.CompactGenomes {
98                         nCG++
99                         fmt.Fprintf(bufw, "ent %d: CompactGenome, name %q, len(Variants) %d\n", n, cg.Name, len(cg.Variants))
100                 }
101                 for _, cs := range ent.CompactSequences {
102                         nCS++
103                         fmt.Fprintf(bufw, "ent %d: CompactSequence, name %q, len(TileSequences) %d\n", n, cs.Name, len(cs.TileSequences))
104                 }
105                 for _, tv := range ent.TileVariants {
106                         nTV++
107                         fmt.Fprintf(bufw, "ent %d: TileVariant, tag %d, variant %d, hash %x, len(seq) %d\n", n, tv.Tag, tv.Variant, tv.Blake2b, len(tv.Sequence))
108                 }
109                 return nil
110         })
111         if err != nil {
112                 return 1
113         }
114         fmt.Fprintf(bufw, "total: ents %d, CompactGenomes %d, CompactSequences %d, TileVariants %d\n", n, nCG, nCS, nTV)
115         err = bufw.Flush()
116         if err != nil {
117                 return 1
118         }
119         err = output.Close()
120         if err != nil {
121                 return 1
122         }
123         return 0
124 }