Export bed file.
authorTom Clegg <tom@tomclegg.ca>
Wed, 21 Oct 2020 20:22:55 +0000 (16:22 -0400)
committerTom Clegg <tom@tomclegg.ca>
Wed, 21 Oct 2020 20:22:55 +0000 (16:22 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

export.go
pipeline_test.go

index 6fde64f50800ecf67fb0a39a70ff1dd29163c010..256af2129604bac58c0b1b878db004cb66f86b49 100644 (file)
--- a/export.go
+++ b/export.go
@@ -56,6 +56,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        inputFilename := flags.String("i", "-", "input `file` (library)")
        outputFilename := flags.String("o", "-", "output `file`")
        outputFormatStr := flags.String("output-format", "hgvs", "output `format`: hgvs or vcf")
+       outputBed := flags.String("output-bed", "", "also output bed `file`")
        pick := flags.String("pick", "", "`name` of single genome to export")
        err = flags.Parse(args)
        if err == flag.ErrHelp {
@@ -95,7 +96,14 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
                if err != nil {
                        return 1
                }
-               runner.Args = []string{"export", "-local=true", "-pick", *pick, "-ref", *refname, "-output-format", *outputFormatStr, "-i", *inputFilename, "-o", "/mnt/output/export.csv"}
+               if *outputBed != "" {
+                       if strings.Contains(*outputBed, "/") {
+                               err = fmt.Errorf("cannot use -output-bed filename %q containing '/' char", *outputBed)
+                               return 1
+                       }
+                       *outputBed = "/mnt/output/" + *outputBed
+               }
+               runner.Args = []string{"export", "-local=true", "-pick", *pick, "-ref", *refname, "-output-format", *outputFormatStr, "-output-bed", *outputBed, "-i", *inputFilename, "-o", "/mnt/output/export.csv"}
                var output string
                output, err = runner.Run()
                if err != nil {
@@ -159,14 +167,26 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        if *outputFilename == "-" {
                output = nopCloser{stdout}
        } else {
-               output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
+               output, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0666)
                if err != nil {
                        return 1
                }
                defer output.Close()
        }
        bufw := bufio.NewWriter(output)
-       err = cmd.export(bufw, input, tilelib.taglib.keylen, refseq, cgs)
+
+       var bedout *os.File
+       var bedbufw *bufio.Writer
+       if *outputBed != "" {
+               bedout, err = os.OpenFile(*outputBed, os.O_CREATE|os.O_WRONLY, 0666)
+               if err != nil {
+                       return 1
+               }
+               defer bedout.Close()
+               bedbufw = bufio.NewWriter(bedout)
+       }
+
+       err = cmd.export(bufw, bedout, input, tilelib.taglib.keylen, refseq, cgs)
        if err != nil {
                return 1
        }
@@ -178,6 +198,16 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        if err != nil {
                return 1
        }
+       if bedout != nil {
+               err = bedbufw.Flush()
+               if err != nil {
+                       return 1
+               }
+               err = bedout.Close()
+               if err != nil {
+                       return 1
+               }
+       }
        err = input.Close()
        if err != nil {
                return 1
@@ -185,7 +215,7 @@ func (cmd *exporter) RunCommand(prog string, args []string, stdin io.Reader, std
        return 0
 }
 
-func (cmd *exporter) export(out io.Writer, librdr io.Reader, taglen int, refseq map[string][]tileLibRef, cgs []CompactGenome) error {
+func (cmd *exporter) export(out, bedout io.Writer, librdr io.Reader, taglen int, refseq map[string][]tileLibRef, cgs []CompactGenome) error {
        need := map[tileLibRef]bool{}
        var seqnames []string
        for seqname, librefs := range refseq {
@@ -244,6 +274,7 @@ func (cmd *exporter) export(out io.Writer, librdr io.Reader, taglen int, refseq
                variantAt := map[int][]hgvs.Variant{} // variantAt[chromOffset][genomeIndex*2+phase]
                for refstep, libref := range refseq[seqname] {
                        reftile := tileVariant[libref]
+                       coverage := int64(0) // number of ref bases that are called in genomes -- max is len(reftile.Sequence)*len(cgs)*2
                        for cgidx, cg := range cgs {
                                for phase := 0; phase < 2; phase++ {
                                        if len(cg.Variants) <= int(libref.Tag)*2+phase {
@@ -270,6 +301,7 @@ func (cmd *exporter) export(out io.Writer, librdr io.Reader, taglen int, refseq
                                                refSequence = append(refSequence, tileVariant[refseq[seqname][refstepend]].Sequence...)
                                                refstepend++
                                        }
+                                       // (TODO: handle no-calls)
                                        vars, _ := hgvs.Diff(strings.ToUpper(string(refSequence)), strings.ToUpper(string(genometile.Sequence)), time.Second)
                                        for _, v := range vars {
                                                if cmd.outputFormat.PadLeft {
@@ -284,6 +316,7 @@ func (cmd *exporter) export(out io.Writer, librdr io.Reader, taglen int, refseq
                                                }
                                                varslice[cgidx*2+phase] = v
                                        }
+                                       coverage += int64(len(reftile.Sequence))
                                }
                        }
                        refpos += len(reftile.Sequence) - taglen
@@ -309,6 +342,25 @@ func (cmd *exporter) export(out io.Writer, librdr io.Reader, taglen int, refseq
                                }
                                cmd.outputFormat.Print(out, seqname, varslice)
                        }
+                       if bedout != nil && len(reftile.Sequence) > 0 {
+                               tilestart := refpos - len(reftile.Sequence) + taglen
+                               tileend := refpos
+                               if !lastrefstep {
+                                       tileend += taglen
+                               }
+                               thickstart := tilestart + taglen
+                               if refstep == 0 {
+                                       thickstart = 0
+                               }
+                               thickend := refpos
+                               // coverage score, 0 to 1000
+                               score := 1000 * coverage / int64(len(reftile.Sequence)) / int64(len(cgs)) / 2
+                               fmt.Fprintf(bedout, "%s %d %d %d %d . %d %d\n",
+                                       seqname, tilestart, tileend,
+                                       libref.Tag,
+                                       score,
+                                       thickstart, thickend)
+                       }
                }
        }
        return nil
index 39597031e7fba1a76896b78a7a9f0d2905f3a286..821c0fc6269b29473923b8e482ab471a323a0c78 100644 (file)
@@ -95,7 +95,7 @@ chr2:g.[1043=];[1043_1044delinsAA]
 `)
 
        vcfout := &bytes.Buffer{}
-       code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-format", "vcf", "-i", tmpdir + "/merged.gob"}, bytes.NewReader(nil), vcfout, os.Stderr)
+       code = (&exporter{}).RunCommand("lightning export", []string{"-local", "-ref", "testdata/ref.fasta", "-output-format", "vcf", "-i", tmpdir + "/merged.gob", "-output-bed", tmpdir + "/export.bed"}, bytes.NewReader(nil), vcfout, os.Stderr)
        c.Check(code, check.Equals, 0)
        c.Check(vcfout.Len() > 0, check.Equals, true)
        c.Logf("%s", vcfout.String())
@@ -109,5 +109,17 @@ chr2       830     CCTTGTATTTTT    AA      1/0
 chr2   887     C       A       1/0
 chr2   1041    GTGG    G       1/0
 chr2   1043    GG      AA      0/1
+`)
+       bedout, err := ioutil.ReadFile(tmpdir + "/export.bed")
+       c.Check(err, check.IsNil)
+       c.Logf("%s", string(bedout))
+       c.Check(string(bedout), check.Equals, `chr1 0 248 0 1000 . 0 224
+chr1 224 372 1 500 . 248 348
+chr1 348 496 2 0 . 372 472
+chr1 472 572 3 0 . 496 572
+chr2 572 820 4 500 . 0 796
+chr2 796 944 5 0 . 820 920
+chr2 920 1068 6 1000 . 944 1044
+chr2 1044 1144 7 0 . 1068 1144
 `)
 }