From: Tom Clegg Date: Wed, 28 Oct 2020 07:16:06 +0000 (-0400) Subject: Export tile annotations. X-Git-Url: https://git.arvados.org/lightning.git/commitdiff_plain/fd3f45ef07a22854715503a5b4ebb05aeccd77fc?hp=3144aedf02bc1c241ac0666bf01905814eeed71a Export tile annotations. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/annotate.go b/annotate.go new file mode 100644 index 0000000000..234bb67908 --- /dev/null +++ b/annotate.go @@ -0,0 +1,243 @@ +package main + +import ( + "bufio" + "errors" + "flag" + "fmt" + "io" + "io/ioutil" + "net/http" + _ "net/http/pprof" + "os" + "sort" + "strconv" + "strings" + + "git.arvados.org/arvados.git/sdk/go/arvados" + "github.com/arvados/lightning/hgvs" + log "github.com/sirupsen/logrus" +) + +type annotatecmd struct { + variantHash bool +} + +func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { + var err error + defer func() { + if err != nil { + fmt.Fprintf(stderr, "%s\n", err) + } + }() + flags := flag.NewFlagSet("", flag.ContinueOnError) + flags.SetOutput(stderr) + pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") + runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)") + projectUUID := flags.String("project", "", "project `UUID` for output data") + priority := flags.Int("priority", 500, "container request priority") + inputFilename := flags.String("i", "-", "input `file` (library)") + outputFilename := flags.String("o", "-", "output `file`") + flags.BoolVar(&cmd.variantHash, "variant-hash", false, "output variant hash instead of index") + err = flags.Parse(args) + if err == flag.ErrHelp { + err = nil + return 0 + } else if err != nil { + return 2 + } + + if *pprof != "" { + go func() { + log.Println(http.ListenAndServe(*pprof, nil)) + }() + } + if !*runlocal { + if *outputFilename != "-" { + err = errors.New("cannot specify output file in container mode: not implemented") + return 1 + } + runner := arvadosContainerRunner{ + Name: "lightning annotate", + Client: arvados.NewClientFromEnv(), + ProjectUUID: *projectUUID, + RAM: 120000000000, + VCPUs: 2, + Priority: *priority, + } + err = runner.TranslatePaths(inputFilename) + if err != nil { + return 1 + } + runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.tsv"} + var output string + output, err = runner.Run() + if err != nil { + return 1 + } + fmt.Fprintln(stdout, output+"/tilevariants.tsv") + return 0 + } + + var input io.ReadCloser + if *inputFilename == "-" { + input = ioutil.NopCloser(stdin) + } else { + input, err = os.Open(*inputFilename) + if err != nil { + return 1 + } + defer input.Close() + } + + var output io.WriteCloser + if *outputFilename == "-" { + output = nopCloser{stdout} + } else { + output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0666) + if err != nil { + return 1 + } + defer output.Close() + } + bufw := bufio.NewWriter(output) + + err = cmd.exportTileDiffs(bufw, input) + if err != nil { + return 1 + } + err = bufw.Flush() + if err != nil { + return 1 + } + err = output.Close() + if err != nil { + return 1 + } + err = input.Close() + if err != nil { + return 1 + } + return 0 +} + +func (cmd *annotatecmd) exportTileDiffs(outw io.Writer, librdr io.Reader) error { + var refs []CompactSequence + var tiles [][]TileVariant + var tagset [][]byte + var taglen int + err := DecodeLibrary(librdr, func(ent *LibraryEntry) error { + if len(ent.TagSet) > 0 { + if tagset != nil { + return errors.New("error: not implemented: input has multiple tagsets") + } + tagset = ent.TagSet + taglen = len(tagset[0]) + tiles = make([][]TileVariant, len(tagset)) + } + for _, tv := range ent.TileVariants { + if tv.Tag >= tagID(len(tiles)) { + return fmt.Errorf("error: reading tile variant for tag %d but only %d tags were loaded", tv.Tag, len(tiles)) + } + for len(tiles[tv.Tag]) <= int(tv.Variant) { + tiles[tv.Tag] = append(tiles[tv.Tag], TileVariant{}) + } + tiles[tv.Tag][tv.Variant] = tv + } + for _, cs := range ent.CompactSequences { + refs = append(refs, cs) + } + return nil + }) + if err != nil { + return err + } + tag2tagid := make(map[string]tagID, len(tagset)) + for tagid, tagseq := range tagset { + tag2tagid[string(tagseq)] = tagID(tagid) + } + sort.Slice(refs, func(i, j int) bool { return refs[i].Name < refs[j].Name }) + log.Infof("len(refs) %d", len(refs)) + for _, refcs := range refs { + var seqnames []string + for seqname := range refcs.TileSequences { + seqnames = append(seqnames, seqname) + } + sort.Strings(seqnames) + for _, seqname := range seqnames { + var refseq []byte + // tilestart[123] is the index into refseq + // where the tile for tag 123 was placed. + tilestart := map[tagID]int{} + tileend := map[tagID]int{} + for _, libref := range refcs.TileSequences[seqname] { + if libref.Variant < 1 { + return fmt.Errorf("reference %q seq %q uses variant zero at tag %d", refcs.Name, seqname, libref.Tag) + } + seq := tiles[libref.Tag][libref.Variant].Sequence + if len(seq) < taglen { + return fmt.Errorf("reference %q seq %q uses tile %d variant %d with sequence len %d < taglen %d", refcs.Name, seqname, libref.Tag, libref.Variant, len(seq), taglen) + } + overlap := taglen + if len(refseq) == 0 { + overlap = 0 + } + tilestart[libref.Tag] = len(refseq) - overlap + refseq = append(refseq, seq[overlap:]...) + tileend[libref.Tag] = len(refseq) + } + log.Infof("seq %s len(refseq) %d len(tilestart) %d", seqname, len(refseq), len(tilestart)) + for tag, tvs := range tiles { + tag := tagID(tag) + refstart, ok := tilestart[tag] + if !ok { + // Tag didn't place on this + // reference sequence. (It + // might place on the same + // chromosome in a genome + // anyway, but we don't output + // the annotations that would + // result.) + continue + } + for variant, tv := range tvs { + if variant == 0 { + continue + } + if len(tv.Sequence) < taglen { + return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tv.Sequence), taglen) + } + var refpart []byte + endtag := string(tv.Sequence[len(tv.Sequence)-taglen:]) + if endtagid, ok := tag2tagid[endtag]; !ok { + // Tile variant doesn't end on a tag, so it can only place at the end of a chromosome. + refpart = refseq[refstart:] + log.Warnf("%x tilevar %d,%d endtag not in ref: %s", tv.Blake2b[:13], tag, variant, endtag) + } else if refendtagstart, ok := tilestart[endtagid]; !ok { + // Ref ends a chromsome with a (possibly very large) variant of this tile, but genomes with this tile don't. + // Give up. (TODO: something smarter) + log.Warnf("%x not annotating tilevar %d,%d because end tag %d is not in ref", tv.Blake2b[:13], tag, variant, endtagid) + continue + } else { + // Non-terminal tile vs. non-terminal reference. + refpart = refseq[refstart : refendtagstart+taglen] + log.Tracef("\n%x tilevar %d,%d endtag %s endtagid %d refendtagstart %d", tv.Blake2b[:13], tag, variant, endtag, endtagid, refendtagstart) + } + // log.Printf("\n%x @ refstart %d \n< %s\n> %s\n", tv.Blake2b, refstart, refpart, tv.Sequence) + diffs, _ := hgvs.Diff(strings.ToUpper(string(refpart)), strings.ToUpper(string(tv.Sequence)), 0) + for _, diff := range diffs { + diff.Position += refstart + var varid string + if cmd.variantHash { + varid = fmt.Sprintf("%x", tv.Blake2b)[:13] + } else { + varid = strconv.Itoa(variant) + } + fmt.Fprintf(outw, "%d\t%s\t%s\t%s:g.%s\n", tag, varid, refcs.Name, seqname, diff.String()) + } + } + } + } + } + return nil +} diff --git a/cmd.go b/cmd.go index 18e18f0d4a..8cbcdb681c 100644 --- a/cmd.go +++ b/cmd.go @@ -22,6 +22,7 @@ var ( "vcf2fasta": &vcf2fasta{}, "import": &importer{}, "import-stats-plot": &importstatsplot{}, + "annotate": &annotatecmd{}, "export": &exporter{}, "export-numpy": &exportNumpy{}, "filter": &filterer{}, diff --git a/example-su92l-1kg.sh b/example-su92l-1kg.sh index 5689b92904..4e6082353a 100755 --- a/example-su92l-1kg.sh +++ b/example-su92l-1kg.sh @@ -27,7 +27,7 @@ unfiltered=$(lightning import -project ${project} -priority ${priority} -t # unfiltered=su92l-4zz18-mz3546bib6oj1gg/library.gob merged=$(lightning merge -project ${project} -priority ${priority} ${unfiltered} ${ref37_lib}) ; echo merged=${merged} -# merged=su92l-4zz18-bfs0wc658kbf3lf/library.gob +# merged=su92l-4zz18-svw5xqe5g0ct2v1/library.gob # 2400s exportvcf=$(lightning export -project ${project} -priority ${priority} -i ${merged} -output-format vcf -ref /mnt/su92l-4zz18-caw3g2ji89jxix8/human_g1k_v37.fasta.gz -output-bed export.bed) ; echo exportvcf=${exportvcf} @@ -35,8 +35,11 @@ exportvcf=$(lightning export -project ${project} -priority ${priority} -i # 5506s stats=$(lightning stats -project ${project} -priority ${priority} -i ${merged}) ; echo stats=${stats} + filtered=$(lightning filter -project ${project} -priority ${priority} -i ${merged} -min-coverage "0.9" -max-variants "30") ; echo filtered=${filtered} +annotations=$(lightning annotate -project ${project} -priority ${priority} -i ${merged}) ; echo annotated=${annotated} + pca=$(lightning pca-go -project ${project} -priority ${priority} -i ${filtered} -one-hot) ; echo pca=${pca} plot=$(lightning plot -project ${project} -priority ${priority} -i ${pca} -labels-csv ${info}/sample_info.csv -sample-fasta-dir ${fasta}) echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}" diff --git a/pipeline_test.go b/pipeline_test.go index bf6cf5597c..49dc39ddac 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -6,6 +6,8 @@ import ( "io" "io/ioutil" "os" + "sort" + "strings" "sync" "gopkg.in/check.v1" @@ -134,4 +136,33 @@ chr2 224 372 5 750 . 248 348 chr2 348 496 6 1000 . 372 472 chr2 472 572 7 1000 . 496 572 `) + + annotateout := &bytes.Buffer{} + code = (&annotatecmd{}).RunCommand("lightning annotate", []string{"-local", "-variant-hash=true", "-i", tmpdir + "/merged.gob"}, bytes.NewReader(nil), annotateout, os.Stderr) + c.Check(code, check.Equals, 0) + c.Check(annotateout.Len() > 0, check.Equals, true) + sorted := sortLines(annotateout.String()) + c.Logf("%s", sorted) + c.Check(sorted, check.Equals, sortLines(`0 8d4fe9a63921b testdata/ref.fasta chr1:g.161A>T +0 8d4fe9a63921b testdata/ref.fasta chr1:g.178A>T +0 8d4fe9a63921b testdata/ref.fasta chr1:g.1_3delinsGGC +0 8d4fe9a63921b testdata/ref.fasta chr1:g.222_224del +0 ba4263ca4199c testdata/ref.fasta chr1:g.1_3delinsGGC +0 ba4263ca4199c testdata/ref.fasta chr1:g.222_224del +0 ba4263ca4199c testdata/ref.fasta chr1:g.41_42delinsAA +1 139890345dbb8 testdata/ref.fasta chr1:g.302_305delinsAAAA +4 cbfca15d241d3 testdata/ref.fasta chr2:g.125_127delinsAAA +4 cbfca15d241d3 testdata/ref.fasta chr2:g.1_3delinsAAA +4 f5fafe9450b02 testdata/ref.fasta chr2:g.241_245delinsAAAAA +4 f5fafe9450b02 testdata/ref.fasta chr2:g.291C>A +4 fe9a71a42adb4 testdata/ref.fasta chr2:g.125_127delinsAAA +6 e36dce85efbef testdata/ref.fasta chr2:g.471_472delinsAA +6 f81388b184f4a testdata/ref.fasta chr2:g.470_472del +`)) +} + +func sortLines(txt string) string { + lines := strings.Split(strings.TrimRightFunc(txt, func(c rune) bool { return c == '\n' }), "\n") + sort.Strings(lines) + return strings.Join(lines, "\n") + "\n" }