20 "git.arvados.org/arvados.git/sdk/go/arvados"
21 "github.com/arvados/lightning/hgvs"
22 log "github.com/sirupsen/logrus"
25 type annotatecmd struct {
30 func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
34 fmt.Fprintf(stderr, "%s\n", err)
37 flags := flag.NewFlagSet("", flag.ContinueOnError)
38 flags.SetOutput(stderr)
39 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
40 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
41 projectUUID := flags.String("project", "", "project `UUID` for output data")
42 priority := flags.Int("priority", 500, "container request priority")
43 inputFilename := flags.String("i", "-", "input `file` (library)")
44 outputFilename := flags.String("o", "-", "output `file`")
45 flags.BoolVar(&cmd.variantHash, "variant-hash", false, "output variant hash instead of index")
46 flags.IntVar(&cmd.maxTileSize, "max-tile-size", 50000, "don't try to make annotations for tiles bigger than given `size`")
47 err = flags.Parse(args)
48 if err == flag.ErrHelp {
51 } else if err != nil {
57 log.Println(http.ListenAndServe(*pprof, nil))
61 if *outputFilename != "-" {
62 err = errors.New("cannot specify output file in container mode: not implemented")
65 runner := arvadosContainerRunner{
66 Name: "lightning annotate",
67 Client: arvados.NewClientFromEnv(),
68 ProjectUUID: *projectUUID,
73 err = runner.TranslatePaths(inputFilename)
77 runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-max-tile-size", strconv.Itoa(cmd.maxTileSize), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.tsv"}
79 output, err = runner.Run()
83 fmt.Fprintln(stdout, output+"/tilevariants.tsv")
87 var input io.ReadCloser
88 if *inputFilename == "-" {
89 input = ioutil.NopCloser(stdin)
91 input, err = os.Open(*inputFilename)
98 var output io.WriteCloser
99 if *outputFilename == "-" {
100 output = nopCloser{stdout}
102 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0666)
108 bufw := bufio.NewWriter(output)
110 tilelib := &tileLibrary{
111 includeNoCalls: true,
112 retainTileSequences: true,
114 err = tilelib.LoadGob(context.Background(), input, nil)
118 err = cmd.exportTileDiffs(bufw, tilelib)
137 func (cmd *annotatecmd) exportTileDiffs(outw io.Writer, tilelib *tileLibrary) error {
138 tagset := tilelib.taglib.Tags()
139 if len(tagset) == 0 {
140 return errors.New("cannot annotate library without tags")
142 taglen := len(tagset[0])
144 for name := range tilelib.refseqs {
145 refs = append(refs, name)
147 tag2tagid := make(map[string]tagID, len(tagset))
148 for tagid, tagseq := range tagset {
149 tag2tagid[string(tagseq)] = tagID(tagid)
152 log.Infof("len(refs) %d", len(refs))
154 outch := make(chan string, 1)
155 var outwg sync.WaitGroup
160 for s := range outch {
161 io.WriteString(outw, s)
166 limiter := make(chan bool, runtime.NumCPU()+1)
167 var diffwg sync.WaitGroup
170 for _, refname := range refs {
172 refcs := tilelib.refseqs[refname]
173 var seqnames []string
174 for seqname := range refcs {
175 seqnames = append(seqnames, seqname)
177 sort.Strings(seqnames)
178 for _, seqname := range seqnames {
181 // tilestart[123] is the index into refseq
182 // where the tile for tag 123 was placed.
183 tilestart := map[tagID]int{}
184 tileend := map[tagID]int{}
185 for _, libref := range refcs[seqname] {
186 if libref.Variant < 1 {
187 return fmt.Errorf("reference %q seq %q uses variant zero at tag %d", refname, seqname, libref.Tag)
189 seq := tilelib.TileVariantSequence(libref)
190 if len(seq) < taglen {
191 return fmt.Errorf("reference %q seq %q uses tile %d variant %d with sequence len %d < taglen %d", refname, seqname, libref.Tag, libref.Variant, len(seq), taglen)
194 if len(refseq) == 0 {
197 tilestart[libref.Tag] = len(refseq) - overlap
198 refseq = append(refseq, seq[overlap:]...)
199 tileend[libref.Tag] = len(refseq)
201 log.Infof("seq %s len(refseq) %d len(tilestart) %d", seqname, len(refseq), len(tilestart))
202 for tag, tvs := range tilelib.variant {
204 refstart, ok := tilestart[tag]
206 // Tag didn't place on this
207 // reference sequence. (It
208 // might place on the same
209 // chromosome in a genome
210 // anyway, but we don't output
211 // the annotations that would
215 for variant := 1; variant <= len(tvs); variant++ {
216 variant, hash := tileVariantID(variant), tvs[variant-1]
217 tileseq := tilelib.TileVariantSequence(tileLibRef{Tag: tag, Variant: variant})
218 if len(tileseq) < taglen {
219 return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tileseq), taglen)
222 endtag := string(tileseq[len(tileseq)-taglen:])
223 if endtagid, ok := tag2tagid[endtag]; !ok {
224 // Tile variant doesn't end on a tag, so it can only place at the end of a chromosome.
225 refpart = refseq[refstart:]
226 log.Warnf("%x tilevar %d,%d endtag not in ref: %s", hash[:13], tag, variant, endtag)
227 } else if refendtagstart, ok := tilestart[endtagid]; !ok {
228 // Ref ends a chromsome with a (possibly very large) variant of this tile, but genomes with this tile don't.
229 // Give up. (TODO: something smarter)
230 log.Debugf("%x not annotating tilevar %d,%d because end tag %d is not in ref", hash[:13], tag, variant, endtagid)
233 // Non-terminal tile vs. non-terminal reference.
234 refpart = refseq[refstart : refendtagstart+taglen]
235 log.Tracef("\n%x tilevar %d,%d endtag %s endtagid %d refendtagstart %d", hash[:13], tag, variant, endtag, endtagid, refendtagstart)
237 if len(refpart) > cmd.maxTileSize {
238 log.Warnf("%x tilevar %d,%d skipping long diff, ref %s seq %s pos %d ref len %d", hash[:13], tag, variant, refname, seqname, refstart, len(refpart))
241 if len(tileseq) > cmd.maxTileSize {
242 log.Warnf("%x tilevar %d,%d skipping long diff, ref %s seq %s variant len %d", hash[:13], tag, variant, refname, seqname, len(tileseq))
245 // log.Printf("\n%x @ refstart %d \n< %s\n> %s\n", tv.Blake2b, refstart, refpart, tileseq)
254 diffs, _ := hgvs.Diff(strings.ToUpper(string(refpart)), strings.ToUpper(string(tileseq)), 0)
255 for _, diff := range diffs {
256 diff.Position += refstart
259 varid = fmt.Sprintf("%x", hash)[:13]
261 varid = fmt.Sprintf("%d", variant)
263 outch <- fmt.Sprintf("%d\t%s\t%s\t%s:g.%s\n", tag, varid, refname, seqname, diff.String())