Export HGVS.
[lightning.git] / exporthgvs.go
1 package main
2
3 import (
4         "bufio"
5         "bytes"
6         "context"
7         "errors"
8         "flag"
9         "fmt"
10         "io"
11         "net/http"
12         _ "net/http/pprof"
13         "os"
14         "sort"
15         "strings"
16         "sync"
17         "time"
18
19         "git.arvados.org/arvados.git/sdk/go/arvados"
20         "github.com/arvados/lightning/hgvs"
21         log "github.com/sirupsen/logrus"
22 )
23
24 type exportHGVS struct {
25 }
26
27 func (cmd *exportHGVS) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
28         var err error
29         defer func() {
30                 if err != nil {
31                         fmt.Fprintf(stderr, "%s\n", err)
32                 }
33         }()
34         flags := flag.NewFlagSet("", flag.ContinueOnError)
35         flags.SetOutput(stderr)
36         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
37         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
38         projectUUID := flags.String("project", "", "project `UUID` for output data")
39         priority := flags.Int("priority", 500, "container request priority")
40         refname := flags.String("ref", "", "reference genome `name`")
41         inputFilename := flags.String("i", "-", "input `file` (library)")
42         outputFilename := flags.String("o", "-", "fasta output `file`")
43         pick := flags.String("pick", "", "`name` of single genome to export")
44         err = flags.Parse(args)
45         if err == flag.ErrHelp {
46                 err = nil
47                 return 0
48         } else if err != nil {
49                 return 2
50         }
51
52         if *pprof != "" {
53                 go func() {
54                         log.Println(http.ListenAndServe(*pprof, nil))
55                 }()
56         }
57
58         if !*runlocal {
59                 if *outputFilename != "-" {
60                         err = errors.New("cannot specify output file in container mode: not implemented")
61                         return 1
62                 }
63                 runner := arvadosContainerRunner{
64                         Name:        "lightning export-hgvs",
65                         Client:      arvados.NewClientFromEnv(),
66                         ProjectUUID: *projectUUID,
67                         RAM:         128000000000,
68                         VCPUs:       2,
69                         Priority:    *priority,
70                 }
71                 err = runner.TranslatePaths(inputFilename)
72                 if err != nil {
73                         return 1
74                 }
75                 runner.Args = []string{"export-hgvs", "-local=true", "-pick", *pick, "-ref", *refname, "-i", *inputFilename, "-o", "/mnt/output/export.csv"}
76                 var output string
77                 output, err = runner.Run()
78                 if err != nil {
79                         return 1
80                 }
81                 fmt.Fprintln(stdout, output+"/export.csv")
82                 return 0
83         }
84
85         input, err := os.Open(*inputFilename)
86         if err != nil {
87                 return 1
88         }
89         defer input.Close()
90
91         // Error out early if seeking doesn't work on the input file.
92         _, err = input.Seek(0, io.SeekEnd)
93         if err != nil {
94                 return 1
95         }
96         _, err = input.Seek(0, io.SeekStart)
97         if err != nil {
98                 return 1
99         }
100
101         var mtx sync.Mutex
102         var cgs []CompactGenome
103         var tilelib tileLibrary
104         err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) {
105                 if *pick != "" && *pick != cg.Name {
106                         return
107                 }
108                 log.Debugf("export: pick %q", cg.Name)
109                 mtx.Lock()
110                 defer mtx.Unlock()
111                 cgs = append(cgs, cg)
112         })
113         if err != nil {
114                 return 1
115         }
116         sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
117         log.Printf("export: pick %q => %d genomes", *pick, len(cgs))
118
119         refseq, ok := tilelib.refseqs[*refname]
120         if !ok {
121                 err = fmt.Errorf("reference name %q not found in input; have %v", *refname, func() (names []string) {
122                         for name := range tilelib.refseqs {
123                                 names = append(names, name)
124                         }
125                         return
126                 }())
127                 return 1
128         }
129
130         _, err = input.Seek(0, io.SeekStart)
131         if err != nil {
132                 return 1
133         }
134
135         var output io.WriteCloser
136         if *outputFilename == "-" {
137                 output = nopCloser{stdout}
138         } else {
139                 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
140                 if err != nil {
141                         return 1
142                 }
143                 defer output.Close()
144         }
145         bufw := bufio.NewWriter(output)
146         err = cmd.export(bufw, input, tilelib.taglib.keylen, refseq, cgs)
147         if err != nil {
148                 return 1
149         }
150         err = bufw.Flush()
151         if err != nil {
152                 return 1
153         }
154         err = output.Close()
155         if err != nil {
156                 return 1
157         }
158         err = input.Close()
159         if err != nil {
160                 return 1
161         }
162         return 0
163 }
164
165 func (cmd *exportHGVS) export(out io.Writer, librdr io.Reader, taglen int, refseq map[string][]tileLibRef, cgs []CompactGenome) error {
166         need := map[tileLibRef]bool{}
167         var seqnames []string
168         for seqname, librefs := range refseq {
169                 seqnames = append(seqnames, seqname)
170                 for _, libref := range librefs {
171                         need[libref] = true
172                 }
173         }
174         sort.Strings(seqnames)
175
176         for _, cg := range cgs {
177                 for i, variant := range cg.Variants {
178                         if variant == 0 {
179                                 continue
180                         }
181                         libref := tileLibRef{Tag: tagID(i / 2), Variant: variant}
182                         need[libref] = true
183                 }
184         }
185
186         log.Infof("export: loading %d tile variants", len(need))
187         tileVariant := map[tileLibRef]TileVariant{}
188         err := DecodeLibrary(librdr, func(ent *LibraryEntry) error {
189                 for _, tv := range ent.TileVariants {
190                         libref := tileLibRef{Tag: tv.Tag, Variant: tv.Variant}
191                         if need[libref] {
192                                 tileVariant[libref] = tv
193                         }
194                 }
195                 return nil
196         })
197         if err != nil {
198                 return err
199         }
200
201         log.Infof("export: loaded %d tile variants", len(tileVariant))
202         var missing []tileLibRef
203         for libref := range need {
204                 if _, ok := tileVariant[libref]; !ok {
205                         missing = append(missing, libref)
206                 }
207         }
208         if len(missing) > 0 {
209                 if limit := 100; len(missing) > limit {
210                         log.Warnf("first %d missing tiles: %v", limit, missing[:limit])
211                 } else {
212                         log.Warnf("missing tiles: %v", missing)
213                 }
214                 return fmt.Errorf("%d needed tiles are missing from library", len(missing))
215         }
216
217         refpos := 0
218         for _, seqname := range seqnames {
219                 variantAt := map[int][]hgvs.Variant{} // variantAt[chromOffset][genomeIndex*2+phase]
220                 for refstep, libref := range refseq[seqname] {
221                         reftile := tileVariant[libref]
222                         for cgidx, cg := range cgs {
223                                 for phase := 0; phase < 2; phase++ {
224                                         if len(cg.Variants) <= int(libref.Tag)*2+phase {
225                                                 continue
226                                         }
227                                         variant := cg.Variants[int(libref.Tag)*2+phase]
228                                         if variant == 0 {
229                                                 continue
230                                         }
231                                         genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}]
232                                         if variant == libref.Variant {
233                                                 continue
234                                         }
235                                         refSequence := reftile.Sequence
236                                         // If needed, extend the
237                                         // reference sequence up to
238                                         // the tag at the end of the
239                                         // genometile sequence.
240                                         refstepend := refstep + 1
241                                         for refstepend < len(refseq[seqname]) && len(refSequence) >= taglen && !bytes.EqualFold(refSequence[len(refSequence)-taglen:], genometile.Sequence[len(genometile.Sequence)-taglen:]) {
242                                                 if &refSequence[0] == &reftile.Sequence[0] {
243                                                         refSequence = append([]byte(nil), refSequence...)
244                                                 }
245                                                 refSequence = append(refSequence, tileVariant[refseq[seqname][refstepend]].Sequence...)
246                                                 refstepend++
247                                         }
248                                         vars, _ := hgvs.Diff(strings.ToUpper(string(refSequence)), strings.ToUpper(string(genometile.Sequence)), time.Second)
249                                         for _, v := range vars {
250                                                 v.Position += refpos
251                                                 log.Debugf("%s seq %s phase %d tag %d tile diff %s\n", cg.Name, seqname, phase, libref.Tag, v.String())
252                                                 varslice := variantAt[v.Position]
253                                                 if varslice == nil {
254                                                         varslice = make([]hgvs.Variant, len(cgs)*2)
255                                                 }
256                                                 varslice[cgidx*2+phase] = v
257                                                 variantAt[v.Position] = varslice
258                                         }
259                                 }
260                         }
261                         refpos += len(reftile.Sequence) - taglen
262
263                         // Flush entries from variantAt that are
264                         // behind refpos. Flush all entries if this is
265                         // the last reftile of the path/chromosome.
266                         var flushpos []int
267                         lastrefstep := refstep == len(refseq[seqname])-1
268                         for pos := range variantAt {
269                                 if lastrefstep || pos <= refpos {
270                                         flushpos = append(flushpos, pos)
271                                 }
272                         }
273                         sort.Slice(flushpos, func(i, j int) bool { return flushpos[i] < flushpos[j] })
274                         for _, pos := range flushpos {
275                                 varslice := variantAt[pos]
276                                 delete(variantAt, pos)
277                                 for i := range varslice {
278                                         if varslice[i].Position == 0 {
279                                                 varslice[i].Position = pos
280                                         }
281                                 }
282                                 for i := 0; i < len(cgs); i++ {
283                                         if i > 0 {
284                                                 out.Write([]byte{'\t'})
285                                         }
286                                         var1, var2 := varslice[i*2], varslice[i*2+1]
287                                         if var1.Position == 0 && var2.Position == 0 {
288                                                 out.Write([]byte{'.'})
289                                         } else if var1 == var2 {
290                                                 fmt.Fprintf(out, "%s:g.%s", seqname, var1.String())
291                                         } else {
292                                                 if var1.Position == 0 {
293                                                         var1.Position = pos
294                                                 }
295                                                 if var2.Position == 0 {
296                                                         var2.Position = pos
297                                                 }
298                                                 fmt.Fprintf(out, "%s:g.[%s];[%s]", seqname, var1.String(), var2.String())
299                                         }
300                                 }
301                                 out.Write([]byte{'\n'})
302                         }
303                 }
304         }
305         return nil
306 }