Export tile annotations.
authorTom Clegg <tom@tomclegg.ca>
Wed, 28 Oct 2020 07:16:06 +0000 (03:16 -0400)
committerTom Clegg <tom@tomclegg.ca>
Wed, 28 Oct 2020 07:16:06 +0000 (03:16 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

annotate.go [new file with mode: 0644]
cmd.go
example-su92l-1kg.sh
pipeline_test.go

diff --git a/annotate.go b/annotate.go
new file mode 100644 (file)
index 0000000..234bb67
--- /dev/null
@@ -0,0 +1,243 @@
+package main
+
+import (
+       "bufio"
+       "errors"
+       "flag"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net/http"
+       _ "net/http/pprof"
+       "os"
+       "sort"
+       "strconv"
+       "strings"
+
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "github.com/arvados/lightning/hgvs"
+       log "github.com/sirupsen/logrus"
+)
+
+type annotatecmd struct {
+       variantHash bool
+}
+
+func (cmd *annotatecmd) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
+       var err error
+       defer func() {
+               if err != nil {
+                       fmt.Fprintf(stderr, "%s\n", err)
+               }
+       }()
+       flags := flag.NewFlagSet("", flag.ContinueOnError)
+       flags.SetOutput(stderr)
+       pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
+       runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
+       projectUUID := flags.String("project", "", "project `UUID` for output data")
+       priority := flags.Int("priority", 500, "container request priority")
+       inputFilename := flags.String("i", "-", "input `file` (library)")
+       outputFilename := flags.String("o", "-", "output `file`")
+       flags.BoolVar(&cmd.variantHash, "variant-hash", false, "output variant hash instead of index")
+       err = flags.Parse(args)
+       if err == flag.ErrHelp {
+               err = nil
+               return 0
+       } else if err != nil {
+               return 2
+       }
+
+       if *pprof != "" {
+               go func() {
+                       log.Println(http.ListenAndServe(*pprof, nil))
+               }()
+       }
+       if !*runlocal {
+               if *outputFilename != "-" {
+                       err = errors.New("cannot specify output file in container mode: not implemented")
+                       return 1
+               }
+               runner := arvadosContainerRunner{
+                       Name:        "lightning annotate",
+                       Client:      arvados.NewClientFromEnv(),
+                       ProjectUUID: *projectUUID,
+                       RAM:         120000000000,
+                       VCPUs:       2,
+                       Priority:    *priority,
+               }
+               err = runner.TranslatePaths(inputFilename)
+               if err != nil {
+                       return 1
+               }
+               runner.Args = []string{"annotate", "-local=true", fmt.Sprintf("-variant-hash=%v", cmd.variantHash), "-i", *inputFilename, "-o", "/mnt/output/tilevariants.tsv"}
+               var output string
+               output, err = runner.Run()
+               if err != nil {
+                       return 1
+               }
+               fmt.Fprintln(stdout, output+"/tilevariants.tsv")
+               return 0
+       }
+
+       var input io.ReadCloser
+       if *inputFilename == "-" {
+               input = ioutil.NopCloser(stdin)
+       } else {
+               input, err = os.Open(*inputFilename)
+               if err != nil {
+                       return 1
+               }
+               defer input.Close()
+       }
+
+       var output io.WriteCloser
+       if *outputFilename == "-" {
+               output = nopCloser{stdout}
+       } else {
+               output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0666)
+               if err != nil {
+                       return 1
+               }
+               defer output.Close()
+       }
+       bufw := bufio.NewWriter(output)
+
+       err = cmd.exportTileDiffs(bufw, input)
+       if err != nil {
+               return 1
+       }
+       err = bufw.Flush()
+       if err != nil {
+               return 1
+       }
+       err = output.Close()
+       if err != nil {
+               return 1
+       }
+       err = input.Close()
+       if err != nil {
+               return 1
+       }
+       return 0
+}
+
+func (cmd *annotatecmd) exportTileDiffs(outw io.Writer, librdr io.Reader) error {
+       var refs []CompactSequence
+       var tiles [][]TileVariant
+       var tagset [][]byte
+       var taglen int
+       err := DecodeLibrary(librdr, func(ent *LibraryEntry) error {
+               if len(ent.TagSet) > 0 {
+                       if tagset != nil {
+                               return errors.New("error: not implemented: input has multiple tagsets")
+                       }
+                       tagset = ent.TagSet
+                       taglen = len(tagset[0])
+                       tiles = make([][]TileVariant, len(tagset))
+               }
+               for _, tv := range ent.TileVariants {
+                       if tv.Tag >= tagID(len(tiles)) {
+                               return fmt.Errorf("error: reading tile variant for tag %d but only %d tags were loaded", tv.Tag, len(tiles))
+                       }
+                       for len(tiles[tv.Tag]) <= int(tv.Variant) {
+                               tiles[tv.Tag] = append(tiles[tv.Tag], TileVariant{})
+                       }
+                       tiles[tv.Tag][tv.Variant] = tv
+               }
+               for _, cs := range ent.CompactSequences {
+                       refs = append(refs, cs)
+               }
+               return nil
+       })
+       if err != nil {
+               return err
+       }
+       tag2tagid := make(map[string]tagID, len(tagset))
+       for tagid, tagseq := range tagset {
+               tag2tagid[string(tagseq)] = tagID(tagid)
+       }
+       sort.Slice(refs, func(i, j int) bool { return refs[i].Name < refs[j].Name })
+       log.Infof("len(refs) %d", len(refs))
+       for _, refcs := range refs {
+               var seqnames []string
+               for seqname := range refcs.TileSequences {
+                       seqnames = append(seqnames, seqname)
+               }
+               sort.Strings(seqnames)
+               for _, seqname := range seqnames {
+                       var refseq []byte
+                       // tilestart[123] is the index into refseq
+                       // where the tile for tag 123 was placed.
+                       tilestart := map[tagID]int{}
+                       tileend := map[tagID]int{}
+                       for _, libref := range refcs.TileSequences[seqname] {
+                               if libref.Variant < 1 {
+                                       return fmt.Errorf("reference %q seq %q uses variant zero at tag %d", refcs.Name, seqname, libref.Tag)
+                               }
+                               seq := tiles[libref.Tag][libref.Variant].Sequence
+                               if len(seq) < taglen {
+                                       return fmt.Errorf("reference %q seq %q uses tile %d variant %d with sequence len %d < taglen %d", refcs.Name, seqname, libref.Tag, libref.Variant, len(seq), taglen)
+                               }
+                               overlap := taglen
+                               if len(refseq) == 0 {
+                                       overlap = 0
+                               }
+                               tilestart[libref.Tag] = len(refseq) - overlap
+                               refseq = append(refseq, seq[overlap:]...)
+                               tileend[libref.Tag] = len(refseq)
+                       }
+                       log.Infof("seq %s len(refseq) %d len(tilestart) %d", seqname, len(refseq), len(tilestart))
+                       for tag, tvs := range tiles {
+                               tag := tagID(tag)
+                               refstart, ok := tilestart[tag]
+                               if !ok {
+                                       // Tag didn't place on this
+                                       // reference sequence. (It
+                                       // might place on the same
+                                       // chromosome in a genome
+                                       // anyway, but we don't output
+                                       // the annotations that would
+                                       // result.)
+                                       continue
+                               }
+                               for variant, tv := range tvs {
+                                       if variant == 0 {
+                                               continue
+                                       }
+                                       if len(tv.Sequence) < taglen {
+                                               return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tv.Sequence), taglen)
+                                       }
+                                       var refpart []byte
+                                       endtag := string(tv.Sequence[len(tv.Sequence)-taglen:])
+                                       if endtagid, ok := tag2tagid[endtag]; !ok {
+                                               // Tile variant doesn't end on a tag, so it can only place at the end of a chromosome.
+                                               refpart = refseq[refstart:]
+                                               log.Warnf("%x tilevar %d,%d endtag not in ref: %s", tv.Blake2b[:13], tag, variant, endtag)
+                                       } else if refendtagstart, ok := tilestart[endtagid]; !ok {
+                                               // Ref ends a chromsome with a (possibly very large) variant of this tile, but genomes with this tile don't.
+                                               // Give up. (TODO: something smarter)
+                                               log.Warnf("%x not annotating tilevar %d,%d because end tag %d is not in ref", tv.Blake2b[:13], tag, variant, endtagid)
+                                               continue
+                                       } else {
+                                               // Non-terminal tile vs. non-terminal reference.
+                                               refpart = refseq[refstart : refendtagstart+taglen]
+                                               log.Tracef("\n%x tilevar %d,%d endtag %s endtagid %d refendtagstart %d", tv.Blake2b[:13], tag, variant, endtag, endtagid, refendtagstart)
+                                       }
+                                       // log.Printf("\n%x @ refstart %d \n< %s\n> %s\n", tv.Blake2b, refstart, refpart, tv.Sequence)
+                                       diffs, _ := hgvs.Diff(strings.ToUpper(string(refpart)), strings.ToUpper(string(tv.Sequence)), 0)
+                                       for _, diff := range diffs {
+                                               diff.Position += refstart
+                                               var varid string
+                                               if cmd.variantHash {
+                                                       varid = fmt.Sprintf("%x", tv.Blake2b)[:13]
+                                               } else {
+                                                       varid = strconv.Itoa(variant)
+                                               }
+                                               fmt.Fprintf(outw, "%d\t%s\t%s\t%s:g.%s\n", tag, varid, refcs.Name, seqname, diff.String())
+                                       }
+                               }
+                       }
+               }
+       }
+       return nil
+}
diff --git a/cmd.go b/cmd.go
index 18e18f0d4a16c034c8aca21fced9dd3d7fc2d04e..8cbcdb681c11e33204cf16664c88ce39f676035b 100644 (file)
--- a/cmd.go
+++ b/cmd.go
@@ -22,6 +22,7 @@ var (
                "vcf2fasta":          &vcf2fasta{},
                "import":             &importer{},
                "import-stats-plot":  &importstatsplot{},
+               "annotate":           &annotatecmd{},
                "export":             &exporter{},
                "export-numpy":       &exportNumpy{},
                "filter":             &filterer{},
index 5689b9290474e712a33ee5ed0b38f27254a7d388..4e6082353afaba962676cf7537354984b42013b8 100755 (executable)
@@ -27,7 +27,7 @@ unfiltered=$(lightning import       -project ${project} -priority ${priority} -t
 # unfiltered=su92l-4zz18-mz3546bib6oj1gg/library.gob
 
 merged=$(lightning     merge        -project ${project} -priority ${priority} ${unfiltered} ${ref37_lib})                                              ; echo merged=${merged}
-# merged=su92l-4zz18-bfs0wc658kbf3lf/library.gob
+# merged=su92l-4zz18-svw5xqe5g0ct2v1/library.gob
 # 2400s
 
 exportvcf=$(lightning  export       -project ${project} -priority ${priority} -i ${merged} -output-format vcf -ref /mnt/su92l-4zz18-caw3g2ji89jxix8/human_g1k_v37.fasta.gz -output-bed export.bed) ; echo exportvcf=${exportvcf}
@@ -35,8 +35,11 @@ exportvcf=$(lightning  export       -project ${project} -priority ${priority} -i
 # 5506s
 
 stats=$(lightning      stats        -project ${project} -priority ${priority} -i ${merged})                                                            ; echo stats=${stats}
+
 filtered=$(lightning   filter       -project ${project} -priority ${priority} -i ${merged} -min-coverage "0.9" -max-variants "30")                     ; echo filtered=${filtered}
 
+annotations=$(lightning annotate    -project ${project} -priority ${priority} -i ${merged})                                                            ; echo annotated=${annotated}
+
 pca=$(lightning        pca-go       -project ${project} -priority ${priority} -i ${filtered} -one-hot)                                                 ; echo pca=${pca}
 plot=$(lightning       plot         -project ${project} -priority ${priority} -i ${pca} -labels-csv ${info}/sample_info.csv -sample-fasta-dir ${fasta})
 echo >&2 "https://workbench2.${plot%%-*}.arvadosapi.com/collections/${plot}"
index bf6cf5597c590ae8a8cd4a0c5a612c22f612f3d1..49dc39ddacf386cf89afa68591b8fa7d2153bbea 100644 (file)
@@ -6,6 +6,8 @@ import (
        "io"
        "io/ioutil"
        "os"
+       "sort"
+       "strings"
        "sync"
 
        "gopkg.in/check.v1"
@@ -134,4 +136,33 @@ chr2 224 372 5 750 . 248 348
 chr2 348 496 6 1000 . 372 472
 chr2 472 572 7 1000 . 496 572
 `)
+
+       annotateout := &bytes.Buffer{}
+       code = (&annotatecmd{}).RunCommand("lightning annotate", []string{"-local", "-variant-hash=true", "-i", tmpdir + "/merged.gob"}, bytes.NewReader(nil), annotateout, os.Stderr)
+       c.Check(code, check.Equals, 0)
+       c.Check(annotateout.Len() > 0, check.Equals, true)
+       sorted := sortLines(annotateout.String())
+       c.Logf("%s", sorted)
+       c.Check(sorted, check.Equals, sortLines(`0      8d4fe9a63921b   testdata/ref.fasta      chr1:g.161A>T
+0      8d4fe9a63921b   testdata/ref.fasta      chr1:g.178A>T
+0      8d4fe9a63921b   testdata/ref.fasta      chr1:g.1_3delinsGGC
+0      8d4fe9a63921b   testdata/ref.fasta      chr1:g.222_224del
+0      ba4263ca4199c   testdata/ref.fasta      chr1:g.1_3delinsGGC
+0      ba4263ca4199c   testdata/ref.fasta      chr1:g.222_224del
+0      ba4263ca4199c   testdata/ref.fasta      chr1:g.41_42delinsAA
+1      139890345dbb8   testdata/ref.fasta      chr1:g.302_305delinsAAAA
+4      cbfca15d241d3   testdata/ref.fasta      chr2:g.125_127delinsAAA
+4      cbfca15d241d3   testdata/ref.fasta      chr2:g.1_3delinsAAA
+4      f5fafe9450b02   testdata/ref.fasta      chr2:g.241_245delinsAAAAA
+4      f5fafe9450b02   testdata/ref.fasta      chr2:g.291C>A
+4      fe9a71a42adb4   testdata/ref.fasta      chr2:g.125_127delinsAAA
+6      e36dce85efbef   testdata/ref.fasta      chr2:g.471_472delinsAA
+6      f81388b184f4a   testdata/ref.fasta      chr2:g.470_472del
+`))
+}
+
+func sortLines(txt string) string {
+       lines := strings.Split(strings.TrimRightFunc(txt, func(c rune) bool { return c == '\n' }), "\n")
+       sort.Strings(lines)
+       return strings.Join(lines, "\n") + "\n"
 }