Export tile annotations.
[lightning.git] / annotate.go
1 package main
2
3 import (
4         "bufio"
5         "errors"
6         "flag"
7         "fmt"
8         "io"
9         "io/ioutil"
10         "net/http"
11         _ "net/http/pprof"
12         "os"
13         "sort"
14         "strconv"
15         "strings"
16
17         "git.arvados.org/arvados.git/sdk/go/arvados"
18         "github.com/arvados/lightning/hgvs"
19         log "github.com/sirupsen/logrus"
20 )
21
22 type annotatecmd struct {
23         variantHash bool
24 }
25
26 func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
27         var err error
28         defer func() {
29                 if err != nil {
30                         fmt.Fprintf(stderr, "%s\n", err)
31                 }
32         }()
33         flags := flag.NewFlagSet("", flag.ContinueOnError)
34         flags.SetOutput(stderr)
35         pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
36         runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
37         projectUUID := flags.String("project", "", "project `UUID` for output data")
38         priority := flags.Int("priority", 500, "container request priority")
39         inputFilename := flags.String("i", "-", "input `file` (library)")
40         outputFilename := flags.String("o", "-", "output `file`")
41         flags.BoolVar(&cmd.variantHash, "variant-hash", false, "output variant hash instead of index")
42         err = flags.Parse(args)
43         if err == flag.ErrHelp {
44                 err = nil
45                 return 0
46         } else if err != nil {
47                 return 2
48         }
49
50         if *pprof != "" {
51                 go func() {
52                         log.Println(http.ListenAndServe(*pprof, nil))
53                 }()
54         }
55         if !*runlocal {
56                 if *outputFilename != "-" {
57                         err = errors.New("cannot specify output file in container mode: not implemented")
58                         return 1
59                 }
60                 runner := arvadosContainerRunner{
61                         Name:        "lightning annotate",
62                         Client:      arvados.NewClientFromEnv(),
63                         ProjectUUID: *projectUUID,
64                         RAM:         120000000000,
65                         VCPUs:       2,
66                         Priority:    *priority,
67                 }
68                 err = runner.TranslatePaths(inputFilename)
69                 if err != nil {
70                         return 1
71                 }
72                 runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.tsv"}
73                 var output string
74                 output, err = runner.Run()
75                 if err != nil {
76                         return 1
77                 }
78                 fmt.Fprintln(stdout, output+"/tilevariants.tsv")
79                 return 0
80         }
81
82         var input io.ReadCloser
83         if *inputFilename == "-" {
84                 input = ioutil.NopCloser(stdin)
85         } else {
86                 input, err = os.Open(*inputFilename)
87                 if err != nil {
88                         return 1
89                 }
90                 defer input.Close()
91         }
92
93         var output io.WriteCloser
94         if *outputFilename == "-" {
95                 output = nopCloser{stdout}
96         } else {
97                 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0666)
98                 if err != nil {
99                         return 1
100                 }
101                 defer output.Close()
102         }
103         bufw := bufio.NewWriter(output)
104
105         err = cmd.exportTileDiffs(bufw, input)
106         if err != nil {
107                 return 1
108         }
109         err = bufw.Flush()
110         if err != nil {
111                 return 1
112         }
113         err = output.Close()
114         if err != nil {
115                 return 1
116         }
117         err = input.Close()
118         if err != nil {
119                 return 1
120         }
121         return 0
122 }
123
124 func (cmd *annotatecmd) exportTileDiffs(outw io.Writer, librdr io.Reader) error {
125         var refs []CompactSequence
126         var tiles [][]TileVariant
127         var tagset [][]byte
128         var taglen int
129         err := DecodeLibrary(librdr, func(ent *LibraryEntry) error {
130                 if len(ent.TagSet) > 0 {
131                         if tagset != nil {
132                                 return errors.New("error: not implemented: input has multiple tagsets")
133                         }
134                         tagset = ent.TagSet
135                         taglen = len(tagset[0])
136                         tiles = make([][]TileVariant, len(tagset))
137                 }
138                 for _, tv := range ent.TileVariants {
139                         if tv.Tag >= tagID(len(tiles)) {
140                                 return fmt.Errorf("error: reading tile variant for tag %d but only %d tags were loaded", tv.Tag, len(tiles))
141                         }
142                         for len(tiles[tv.Tag]) <= int(tv.Variant) {
143                                 tiles[tv.Tag] = append(tiles[tv.Tag], TileVariant{})
144                         }
145                         tiles[tv.Tag][tv.Variant] = tv
146                 }
147                 for _, cs := range ent.CompactSequences {
148                         refs = append(refs, cs)
149                 }
150                 return nil
151         })
152         if err != nil {
153                 return err
154         }
155         tag2tagid := make(map[string]tagID, len(tagset))
156         for tagid, tagseq := range tagset {
157                 tag2tagid[string(tagseq)] = tagID(tagid)
158         }
159         sort.Slice(refs, func(i, j int) bool { return refs[i].Name < refs[j].Name })
160         log.Infof("len(refs) %d", len(refs))
161         for _, refcs := range refs {
162                 var seqnames []string
163                 for seqname := range refcs.TileSequences {
164                         seqnames = append(seqnames, seqname)
165                 }
166                 sort.Strings(seqnames)
167                 for _, seqname := range seqnames {
168                         var refseq []byte
169                         // tilestart[123] is the index into refseq
170                         // where the tile for tag 123 was placed.
171                         tilestart := map[tagID]int{}
172                         tileend := map[tagID]int{}
173                         for _, libref := range refcs.TileSequences[seqname] {
174                                 if libref.Variant < 1 {
175                                         return fmt.Errorf("reference %q seq %q uses variant zero at tag %d", refcs.Name, seqname, libref.Tag)
176                                 }
177                                 seq := tiles[libref.Tag][libref.Variant].Sequence
178                                 if len(seq) < taglen {
179                                         return fmt.Errorf("reference %q seq %q uses tile %d variant %d with sequence len %d < taglen %d", refcs.Name, seqname, libref.Tag, libref.Variant, len(seq), taglen)
180                                 }
181                                 overlap := taglen
182                                 if len(refseq) == 0 {
183                                         overlap = 0
184                                 }
185                                 tilestart[libref.Tag] = len(refseq) - overlap
186                                 refseq = append(refseq, seq[overlap:]...)
187                                 tileend[libref.Tag] = len(refseq)
188                         }
189                         log.Infof("seq %s len(refseq) %d len(tilestart) %d", seqname, len(refseq), len(tilestart))
190                         for tag, tvs := range tiles {
191                                 tag := tagID(tag)
192                                 refstart, ok := tilestart[tag]
193                                 if !ok {
194                                         // Tag didn't place on this
195                                         // reference sequence. (It
196                                         // might place on the same
197                                         // chromosome in a genome
198                                         // anyway, but we don't output
199                                         // the annotations that would
200                                         // result.)
201                                         continue
202                                 }
203                                 for variant, tv := range tvs {
204                                         if variant == 0 {
205                                                 continue
206                                         }
207                                         if len(tv.Sequence) < taglen {
208                                                 return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tv.Sequence), taglen)
209                                         }
210                                         var refpart []byte
211                                         endtag := string(tv.Sequence[len(tv.Sequence)-taglen:])
212                                         if endtagid, ok := tag2tagid[endtag]; !ok {
213                                                 // Tile variant doesn't end on a tag, so it can only place at the end of a chromosome.
214                                                 refpart = refseq[refstart:]
215                                                 log.Warnf("%x tilevar %d,%d endtag not in ref: %s", tv.Blake2b[:13], tag, variant, endtag)
216                                         } else if refendtagstart, ok := tilestart[endtagid]; !ok {
217                                                 // Ref ends a chromsome with a (possibly very large) variant of this tile, but genomes with this tile don't.
218                                                 // Give up. (TODO: something smarter)
219                                                 log.Warnf("%x not annotating tilevar %d,%d because end tag %d is not in ref", tv.Blake2b[:13], tag, variant, endtagid)
220                                                 continue
221                                         } else {
222                                                 // Non-terminal tile vs. non-terminal reference.
223                                                 refpart = refseq[refstart : refendtagstart+taglen]
224                                                 log.Tracef("\n%x tilevar %d,%d endtag %s endtagid %d refendtagstart %d", tv.Blake2b[:13], tag, variant, endtag, endtagid, refendtagstart)
225                                         }
226                                         // log.Printf("\n%x @ refstart %d \n< %s\n> %s\n", tv.Blake2b, refstart, refpart, tv.Sequence)
227                                         diffs, _ := hgvs.Diff(strings.ToUpper(string(refpart)), strings.ToUpper(string(tv.Sequence)), 0)
228                                         for _, diff := range diffs {
229                                                 diff.Position += refstart
230                                                 var varid string
231                                                 if cmd.variantHash {
232                                                         varid = fmt.Sprintf("%x", tv.Blake2b)[:13]
233                                                 } else {
234                                                         varid = strconv.Itoa(variant)
235                                                 }
236                                                 fmt.Fprintf(outw, "%d\t%s\t%s\t%s:g.%s\n", tag, varid, refcs.Name, seqname, diff.String())
237                                         }
238                                 }
239                         }
240                 }
241         }
242         return nil
243 }