19 "git.arvados.org/arvados.git/sdk/go/arvados"
20 "github.com/arvados/lightning/hgvs"
21 log "github.com/sirupsen/logrus"
24 type exportHGVS struct {
27 func (cmd *exportHGVS) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
31 fmt.Fprintf(stderr, "%s\n", err)
34 flags := flag.NewFlagSet("", flag.ContinueOnError)
35 flags.SetOutput(stderr)
36 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
37 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
38 projectUUID := flags.String("project", "", "project `UUID` for output data")
39 priority := flags.Int("priority", 500, "container request priority")
40 refname := flags.String("ref", "", "reference genome `name`")
41 inputFilename := flags.String("i", "-", "input `file` (library)")
42 outputFilename := flags.String("o", "-", "fasta output `file`")
43 pick := flags.String("pick", "", "`name` of single genome to export")
44 err = flags.Parse(args)
45 if err == flag.ErrHelp {
48 } else if err != nil {
54 log.Println(http.ListenAndServe(*pprof, nil))
59 if *outputFilename != "-" {
60 err = errors.New("cannot specify output file in container mode: not implemented")
63 runner := arvadosContainerRunner{
64 Name: "lightning export-hgvs",
65 Client: arvados.NewClientFromEnv(),
66 ProjectUUID: *projectUUID,
71 err = runner.TranslatePaths(inputFilename)
75 runner.Args = []string{"export-hgvs", "-local=true", "-pick", *pick, "-ref", *refname, "-i", *inputFilename, "-o", "/mnt/output/export.csv"}
77 output, err = runner.Run()
81 fmt.Fprintln(stdout, output+"/export.csv")
85 input, err := os.Open(*inputFilename)
91 // Error out early if seeking doesn't work on the input file.
92 _, err = input.Seek(0, io.SeekEnd)
96 _, err = input.Seek(0, io.SeekStart)
102 var cgs []CompactGenome
103 var tilelib tileLibrary
104 err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) {
105 if *pick != "" && *pick != cg.Name {
108 log.Debugf("export: pick %q", cg.Name)
111 cgs = append(cgs, cg)
116 sort.Slice(cgs, func(i, j int) bool { return cgs[i].Name < cgs[j].Name })
117 log.Printf("export: pick %q => %d genomes", *pick, len(cgs))
119 refseq, ok := tilelib.refseqs[*refname]
121 err = fmt.Errorf("reference name %q not found in input; have %v", *refname, func() (names []string) {
122 for name := range tilelib.refseqs {
123 names = append(names, name)
130 _, err = input.Seek(0, io.SeekStart)
135 var output io.WriteCloser
136 if *outputFilename == "-" {
137 output = nopCloser{stdout}
139 output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
145 bufw := bufio.NewWriter(output)
146 err = cmd.export(bufw, input, tilelib.taglib.keylen, refseq, cgs)
165 func (cmd *exportHGVS) export(out io.Writer, librdr io.Reader, taglen int, refseq map[string][]tileLibRef, cgs []CompactGenome) error {
166 need := map[tileLibRef]bool{}
167 var seqnames []string
168 for seqname, librefs := range refseq {
169 seqnames = append(seqnames, seqname)
170 for _, libref := range librefs {
174 sort.Strings(seqnames)
176 for _, cg := range cgs {
177 for i, variant := range cg.Variants {
181 libref := tileLibRef{Tag: tagID(i / 2), Variant: variant}
186 log.Infof("export: loading %d tile variants", len(need))
187 tileVariant := map[tileLibRef]TileVariant{}
188 err := DecodeLibrary(librdr, func(ent *LibraryEntry) error {
189 for _, tv := range ent.TileVariants {
190 libref := tileLibRef{Tag: tv.Tag, Variant: tv.Variant}
192 tileVariant[libref] = tv
201 log.Infof("export: loaded %d tile variants", len(tileVariant))
202 var missing []tileLibRef
203 for libref := range need {
204 if _, ok := tileVariant[libref]; !ok {
205 missing = append(missing, libref)
208 if len(missing) > 0 {
209 if limit := 100; len(missing) > limit {
210 log.Warnf("first %d missing tiles: %v", limit, missing[:limit])
212 log.Warnf("missing tiles: %v", missing)
214 return fmt.Errorf("%d needed tiles are missing from library", len(missing))
218 for _, seqname := range seqnames {
219 variantAt := map[int][]hgvs.Variant{} // variantAt[chromOffset][genomeIndex*2+phase]
220 for refstep, libref := range refseq[seqname] {
221 reftile := tileVariant[libref]
222 for cgidx, cg := range cgs {
223 for phase := 0; phase < 2; phase++ {
224 if len(cg.Variants) <= int(libref.Tag)*2+phase {
227 variant := cg.Variants[int(libref.Tag)*2+phase]
231 genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}]
232 if variant == libref.Variant {
235 refSequence := reftile.Sequence
236 // If needed, extend the
237 // reference sequence up to
238 // the tag at the end of the
239 // genometile sequence.
240 refstepend := refstep + 1
241 for refstepend < len(refseq[seqname]) && len(refSequence) >= taglen && !bytes.EqualFold(refSequence[len(refSequence)-taglen:], genometile.Sequence[len(genometile.Sequence)-taglen:]) {
242 if &refSequence[0] == &reftile.Sequence[0] {
243 refSequence = append([]byte(nil), refSequence...)
245 refSequence = append(refSequence, tileVariant[refseq[seqname][refstepend]].Sequence...)
248 vars, _ := hgvs.Diff(strings.ToUpper(string(refSequence)), strings.ToUpper(string(genometile.Sequence)), time.Second)
249 for _, v := range vars {
251 log.Debugf("%s seq %s phase %d tag %d tile diff %s\n", cg.Name, seqname, phase, libref.Tag, v.String())
252 varslice := variantAt[v.Position]
254 varslice = make([]hgvs.Variant, len(cgs)*2)
256 varslice[cgidx*2+phase] = v
257 variantAt[v.Position] = varslice
261 refpos += len(reftile.Sequence) - taglen
263 // Flush entries from variantAt that are
264 // behind refpos. Flush all entries if this is
265 // the last reftile of the path/chromosome.
267 lastrefstep := refstep == len(refseq[seqname])-1
268 for pos := range variantAt {
269 if lastrefstep || pos <= refpos {
270 flushpos = append(flushpos, pos)
273 sort.Slice(flushpos, func(i, j int) bool { return flushpos[i] < flushpos[j] })
274 for _, pos := range flushpos {
275 varslice := variantAt[pos]
276 delete(variantAt, pos)
277 for i := range varslice {
278 if varslice[i].Position == 0 {
279 varslice[i].Position = pos
282 for i := 0; i < len(cgs); i++ {
284 out.Write([]byte{'\t'})
286 var1, var2 := varslice[i*2], varslice[i*2+1]
287 if var1.Position == 0 && var2.Position == 0 {
288 out.Write([]byte{'.'})
289 } else if var1 == var2 {
290 fmt.Fprintf(out, "%s:g.%s", seqname, var1.String())
292 if var1.Position == 0 {
295 if var2.Position == 0 {
298 fmt.Fprintf(out, "%s:g.[%s];[%s]", seqname, var1.String(), var2.String())
301 out.Write([]byte{'\n'})