19 "git.arvados.org/arvados.git/sdk/go/arvados"
20 "github.com/arvados/lightning/hgvs"
21 log "github.com/sirupsen/logrus"
24 type annotatecmd struct {
29 func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
33 fmt.Fprintf(stderr, "%s\n", err)
36 flags := flag.NewFlagSet("", flag.ContinueOnError)
37 flags.SetOutput(stderr)
38 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
39 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
40 projectUUID := flags.String("project", "", "project `UUID` for output data")
41 priority := flags.Int("priority", 500, "container request priority")
42 inputFilename := flags.String("i", "-", "input `file` (library)")
43 outputFilename := flags.String("o", "-", "output `file`")
44 flags.BoolVar(&cmd.variantHash, "variant-hash", false, "output variant hash instead of index")
45 flags.IntVar(&cmd.maxTileSize, "max-tile-size", 50000, "don't try to make annotations for tiles bigger than given `size`")
46 err = flags.Parse(args)
47 if err == flag.ErrHelp {
50 } else if err != nil {
56 log.Println(http.ListenAndServe(*pprof, nil))
60 if *outputFilename != "-" {
61 err = errors.New("cannot specify output file in container mode: not implemented")
64 runner := arvadosContainerRunner{
65 Name: "lightning annotate",
66 Client: arvados.NewClientFromEnv(),
67 ProjectUUID: *projectUUID,
72 err = runner.TranslatePaths(inputFilename)
76 runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-max-tile-size", strconv.Itoa(cmd.maxTileSize), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.tsv"}
78 output, err = runner.Run()
82 fmt.Fprintln(stdout, output+"/tilevariants.tsv")
86 var input io.ReadCloser
87 if *inputFilename == "-" {
88 input = ioutil.NopCloser(stdin)
90 input, err = os.Open(*inputFilename)
97 var output io.WriteCloser
98 if *outputFilename == "-" {
99 output = nopCloser{stdout}
101 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0666)
107 bufw := bufio.NewWriter(output)
109 err = cmd.exportTileDiffs(bufw, input)
128 func (cmd *annotatecmd) exportTileDiffs(outw io.Writer, librdr io.Reader) error {
129 var refs []CompactSequence
130 var tiles [][]TileVariant
133 err := DecodeLibrary(librdr, func(ent *LibraryEntry) error {
134 if len(ent.TagSet) > 0 {
136 return errors.New("error: not implemented: input has multiple tagsets")
139 taglen = len(tagset[0])
140 tiles = make([][]TileVariant, len(tagset))
142 for _, tv := range ent.TileVariants {
143 if tv.Tag >= tagID(len(tiles)) {
144 return fmt.Errorf("error: reading tile variant for tag %d but only %d tags were loaded", tv.Tag, len(tiles))
146 for len(tiles[tv.Tag]) <= int(tv.Variant) {
147 tiles[tv.Tag] = append(tiles[tv.Tag], TileVariant{})
149 tiles[tv.Tag][tv.Variant] = tv
151 for _, cs := range ent.CompactSequences {
152 refs = append(refs, cs)
159 tag2tagid := make(map[string]tagID, len(tagset))
160 for tagid, tagseq := range tagset {
161 tag2tagid[string(tagseq)] = tagID(tagid)
163 sort.Slice(refs, func(i, j int) bool { return refs[i].Name < refs[j].Name })
164 log.Infof("len(refs) %d", len(refs))
166 outch := make(chan string, 1)
167 var outwg sync.WaitGroup
172 for s := range outch {
173 io.WriteString(outw, s)
178 limiter := make(chan bool, runtime.NumCPU()+1)
179 var diffwg sync.WaitGroup
182 for _, refcs := range refs {
184 var seqnames []string
185 for seqname := range refcs.TileSequences {
186 seqnames = append(seqnames, seqname)
188 sort.Strings(seqnames)
189 for _, seqname := range seqnames {
192 // tilestart[123] is the index into refseq
193 // where the tile for tag 123 was placed.
194 tilestart := map[tagID]int{}
195 tileend := map[tagID]int{}
196 for _, libref := range refcs.TileSequences[seqname] {
197 if libref.Variant < 1 {
198 return fmt.Errorf("reference %q seq %q uses variant zero at tag %d", refcs.Name, seqname, libref.Tag)
200 seq := tiles[libref.Tag][libref.Variant].Sequence
201 if len(seq) < taglen {
202 return fmt.Errorf("reference %q seq %q uses tile %d variant %d with sequence len %d < taglen %d", refcs.Name, seqname, libref.Tag, libref.Variant, len(seq), taglen)
205 if len(refseq) == 0 {
208 tilestart[libref.Tag] = len(refseq) - overlap
209 refseq = append(refseq, seq[overlap:]...)
210 tileend[libref.Tag] = len(refseq)
212 log.Infof("seq %s len(refseq) %d len(tilestart) %d", seqname, len(refseq), len(tilestart))
213 for tag, tvs := range tiles {
215 refstart, ok := tilestart[tag]
217 // Tag didn't place on this
218 // reference sequence. (It
219 // might place on the same
220 // chromosome in a genome
221 // anyway, but we don't output
222 // the annotations that would
226 for variant, tv := range tvs {
227 variant, tv := variant, tv
231 if len(tv.Sequence) < taglen {
232 return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tv.Sequence), taglen)
235 endtag := string(tv.Sequence[len(tv.Sequence)-taglen:])
236 if endtagid, ok := tag2tagid[endtag]; !ok {
237 // Tile variant doesn't end on a tag, so it can only place at the end of a chromosome.
238 refpart = refseq[refstart:]
239 log.Warnf("%x tilevar %d,%d endtag not in ref: %s", tv.Blake2b[:13], tag, variant, endtag)
240 } else if refendtagstart, ok := tilestart[endtagid]; !ok {
241 // Ref ends a chromsome with a (possibly very large) variant of this tile, but genomes with this tile don't.
242 // Give up. (TODO: something smarter)
243 log.Debugf("%x not annotating tilevar %d,%d because end tag %d is not in ref", tv.Blake2b[:13], tag, variant, endtagid)
246 // Non-terminal tile vs. non-terminal reference.
247 refpart = refseq[refstart : refendtagstart+taglen]
248 log.Tracef("\n%x tilevar %d,%d endtag %s endtagid %d refendtagstart %d", tv.Blake2b[:13], tag, variant, endtag, endtagid, refendtagstart)
250 if len(refpart) > cmd.maxTileSize {
251 log.Warnf("%x tilevar %d,%d skipping long diff, ref %s seq %s ref len %d", tv.Blake2b[:13], tag, variant, refcs.Name, seqname, len(refpart))
254 if len(tv.Sequence) > cmd.maxTileSize {
255 log.Warnf("%x tilevar %d,%d skipping long diff, ref %s seq %s variant len %d", tv.Blake2b[:13], tag, variant, refcs.Name, seqname, len(tv.Sequence))
258 // log.Printf("\n%x @ refstart %d \n< %s\n> %s\n", tv.Blake2b, refstart, refpart, tv.Sequence)
267 diffs, _ := hgvs.Diff(strings.ToUpper(string(refpart)), strings.ToUpper(string(tv.Sequence)), 0)
268 for _, diff := range diffs {
269 diff.Position += refstart
272 varid = fmt.Sprintf("%x", tv.Blake2b)[:13]
274 varid = strconv.Itoa(variant)
276 outch <- fmt.Sprintf("%d\t%s\t%s\t%s:g.%s\n", tag, varid, refcs.Name, seqname, diff.String())