From bd981a00bfb1d74cec8477d5054ee4194fb9cb7e Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Thu, 8 Oct 2020 01:57:37 -0400 Subject: [PATCH] Allow importing all-hom (reference) data from single fasta file. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- import.go | 50 +++++++++++++++++++++++-------- pipeline_test.go | 42 ++++++++++++++++++++++++++ testdata/pipeline1/input1.1.fasta | 20 +++++++++++++ testdata/pipeline1/input1.2.fasta | 20 +++++++++++++ testdata/ref.fasta | 20 +++++++++++++ 5 files changed, 140 insertions(+), 12 deletions(-) create mode 100644 pipeline_test.go create mode 100644 testdata/pipeline1/input1.1.fasta create mode 100644 testdata/pipeline1/input1.2.fasta create mode 100644 testdata/ref.fasta diff --git a/import.go b/import.go index 30e86a153f..57596fbb07 100644 --- a/import.go +++ b/import.go @@ -224,12 +224,19 @@ func (cmd *importer) loadTagLibrary() (*tagLibrary, error) { return &taglib, nil } +var ( + vcfFilenameRe = regexp.MustCompile(`\.vcf(\.gz)?$`) + fasta1FilenameRe = regexp.MustCompile(`\.1\.fa(sta)?(\.gz)?$`) + fasta2FilenameRe = regexp.MustCompile(`\.2\.fa(sta)?(\.gz)?$`) + fastaFilenameRe = regexp.MustCompile(`\.fa(sta)?(\.gz)?$`) +) + func listInputFiles(paths []string) (files []string, err error) { for _, path := range paths { if fi, err := os.Stat(path); err != nil { return nil, fmt.Errorf("%s: stat failed: %s", path, err) } else if !fi.IsDir() { - if !strings.HasSuffix(path, ".2.fasta") || strings.HasSuffix(path, ".2.fasta.gz") { + if !fasta2FilenameRe.MatchString(path) { files = append(files, path) } continue @@ -245,23 +252,27 @@ func listInputFiles(paths []string) (files []string, err error) { } sort.Strings(names) for _, name := range names { - if strings.HasSuffix(name, ".vcf") || strings.HasSuffix(name, ".vcf.gz") { + if vcfFilenameRe.MatchString(name) { files = append(files, filepath.Join(path, name)) - } else if strings.HasSuffix(name, ".1.fasta") || strings.HasSuffix(name, ".1.fasta.gz") { + } else if fastaFilenameRe.MatchString(name) && !fasta2FilenameRe.MatchString(name) { files = append(files, filepath.Join(path, name)) } } d.Close() } for _, file := range files { - if strings.HasSuffix(file, ".1.fasta") || strings.HasSuffix(file, ".1.fasta.gz") { - continue - } else if _, err := os.Stat(file + ".csi"); err == nil { - continue - } else if _, err = os.Stat(file + ".tbi"); err == nil { + if fastaFilenameRe.MatchString(file) { continue + } else if vcfFilenameRe.MatchString(file) { + if _, err := os.Stat(file + ".csi"); err == nil { + continue + } else if _, err = os.Stat(file + ".tbi"); err == nil { + continue + } else { + return nil, fmt.Errorf("%s: cannot read without .tbi or .csi index file", file) + } } else { - return nil, fmt.Errorf("%s: cannot read without .tbi or .csi index file", file) + return nil, fmt.Errorf("don't know how to handle filename %s", file) } } return @@ -277,7 +288,7 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { var phases sync.WaitGroup phases.Add(2) variants := make([][]tileVariantID, 2) - if strings.HasSuffix(infile, ".1.fasta") || strings.HasSuffix(infile, ".1.fasta.gz") { + if fasta1FilenameRe.MatchString(infile) { todo <- func() error { defer phases.Done() log.Printf("%s starting", infile) @@ -288,7 +299,7 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped) return err } - infile2 := regexp.MustCompile(`\.1\.fasta(\.gz)?$`).ReplaceAllString(infile, `.2.fasta$1`) + infile2 := fasta1FilenameRe.ReplaceAllString(infile, `.2.fa$1$2`) todo <- func() error { defer phases.Done() log.Printf("%s starting", infile2) @@ -299,7 +310,20 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped) return err } - } else { + } else if fastaFilenameRe.MatchString(infile) { + todo <- func() error { + defer phases.Done() + defer phases.Done() + log.Printf("%s starting", infile) + defer log.Printf("%s done", infile) + tseqs, err := cmd.tileFasta(tilelib, infile) + var kept, dropped int + variants[0], kept, dropped = tseqs.Variants() + variants[1] = variants[0] + log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped) + return err + } + } else if vcfFilenameRe.MatchString(infile) { for phase := 0; phase < 2; phase++ { phase := phase todo <- func() error { @@ -313,6 +337,8 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error { return err } } + } else { + panic(fmt.Sprintf("bug: unhandled filename %q", infile)) } encodeJobs.Add(1) go func() { diff --git a/pipeline_test.go b/pipeline_test.go new file mode 100644 index 0000000000..96659a2977 --- /dev/null +++ b/pipeline_test.go @@ -0,0 +1,42 @@ +package main + +import ( + "bytes" + "io" + "os" + "sync" + + "gopkg.in/check.v1" +) + +type pipelineSuite struct{} + +var _ = check.Suite(&pipelineSuite{}) + +func (s *pipelineSuite) TestImport(c *check.C) { + for _, infile := range []string{ + "testdata/pipeline1/", + "testdata/ref.fasta", + } { + c.Logf("TestImport: %s", infile) + var wg sync.WaitGroup + + statsin, importout := io.Pipe() + wg.Add(1) + go func() { + defer wg.Done() + code := (&importer{}).RunCommand("lightning import", []string{"-local=true", "-skip-ooo=true", "-output-tiles", "-tag-library", "testdata/tags", infile}, bytes.NewReader(nil), importout, os.Stderr) + c.Check(code, check.Equals, 0) + importout.Close() + }() + statsout := &bytes.Buffer{} + wg.Add(1) + go func() { + defer wg.Done() + code := (&stats{}).RunCommand("lightning stats", []string{"-local"}, statsin, statsout, os.Stderr) + c.Check(code, check.Equals, 0) + }() + wg.Wait() + os.Stdout.Write(statsout.Bytes()) + } +} diff --git a/testdata/pipeline1/input1.1.fasta b/testdata/pipeline1/input1.1.fasta new file mode 100644 index 0000000000..e81faaf1d8 --- /dev/null +++ b/testdata/pipeline1/input1.1.fasta @@ -0,0 +1,20 @@ +>chr1 +ggcgtctacctcgagaagccccgacctctgaataagatctaagaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta +ggagaactgtgctccgccttcaga +ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagtt +acacatgctagcgcgtcggggtgg +tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgttttgccgaggagatagatcatcagctgacctatagattcgtctgtc +gactctagcagagtggccagccac +aaggtttatattcagtcgaaatggacgggtcccgaacttgcacggacctaacgggactcgcctttcgtaataaccgaacctctaggccgccgcgagatca +cctcccgagccgagccacccgtca +tgatagccatcccgcacctagcgcacggaacttcgaccgatcccatattaacgtgtctcttatcctcccgctactgcgtgccttgccactcgtttactat +>chr2 +tttttatagcagtagcggttgcgataatgcgcactaaggtggccataacttagccacacagactgcgacctcggtgtcaatctttaggcgatgactagtg +gttattaataataacttatcatca +cttccggggaaactagcgtaaaaaccgccgtcgcagtataccaccttatacctgtgccactcaatacaggagttgctcagcccaagacaccaacgactaa +gctctcaaaccttgtatttttctt +atgtcctcgcttggattatggggactcgcagtaaatgacaaccgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt +aaaactgatccaaaaaaaatacaa +acaccacgtttataccctgttgagtcgccacgtaacgtattaacgtatgaacggcccggttttcttatctcgcaccgctagtttgccctggcggtcg +cctatgagtcaatcctattttcaa +gatgagtatgtataaggcggcctgtcacgtgaaacctaggatacccaggtacctagggtcatttgctccccgttccccgcggcggacatccgaatatatc diff --git a/testdata/pipeline1/input1.2.fasta b/testdata/pipeline1/input1.2.fasta new file mode 100644 index 0000000000..953af8971b --- /dev/null +++ b/testdata/pipeline1/input1.2.fasta @@ -0,0 +1,20 @@ +>chr1 +ggcgtctacctcgagaagccccgacctctgaataagatctttgaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta +ggagaactgtgctccgccttcaga +ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagtt +acacatgctagcgcgtcggggtgg +tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgttttgccgaggagatagatcatcagctgacctatagattcgtctgtc +gactctagcagagtggccagccac +aaggtttatattcagtcgaaatggacgggtcccgaacttgcacggacctaacgggactcgcctttcgtaataaccgaacctctaggccgccgcgagatca +cctcccgagccgagccacccgtca +tgatagccatcccgcacctagcgcacggaacttcgaccgatcccatattaacgtgtctcttatcctcccgctactgcgtgccttgccactcgtttactat +>chr2 +tttttatagcagtagcggttgcgataatgcgcactaaggtggccataacttagccacacagactgcgacctcggtgtcaatctttaggcgatgactagtg +gttattaataataacttatcatca +cttccggggaaactagcgtaaaaaccgccgtcgcagtataccaccttatacctgtgccactcaatacaggagttgctcagcccaagacaccaacgactaa +gctctcaaaccttgtatttttctt +atgtcctcgcttggattatggggactcgcagtaaatgacaaccgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt +aaaactgatccaaaaaaaatacaa +acaccacgtttataccctgttgagtcgccacgtaacgtattaacgtatgaacggcccggttttcttatctcgcaccgctagtttgccctggcggtcgtaa +cctatgagtcaatcctattttcaa +gatgagtatgtataaggcggcctgtcacgtgaaacctaggatacccaggtacctagggtcatttgctccccgttccccgcggcggacatccgaatatatc diff --git a/testdata/ref.fasta b/testdata/ref.fasta new file mode 100644 index 0000000000..d35147afe0 --- /dev/null +++ b/testdata/ref.fasta @@ -0,0 +1,20 @@ +>chr1 +ggcgtctacctcgagaagccccgacctctgaataagatctttgaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta +ggagaactgtgctccgccttcaga +ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagttcca +acacatgctagcgcgtcggggtgg +tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgttttgccgaggagatagatcatcagctgacctatagattcgtctgtc +gactctagcagagtggccagccac +aaggtttatattcagtcgaaatggacgggtcccgaacttgcacggacctaacgggactcgcctttcgtaataaccgaacctctaggccgccgcgagatca +cctcccgagccgagccacccgtca +tgatagccatcccgcacctagcgcacggaacttcgaccgatcccatattaacgtgtctcttatcctcccgctactgcgtgccttgccactcgtttactat +>chr2 +tttttatagcagtagcggttgcgataatgcgcactaaggtggccataacttagccacacagactgcgacctcggtgtcaatctttaggcgatgactagtg +gttattaataataacttatcatca +cttccggggaaactagcgtaaaaaccgccgtcgcagtataccaccttatacctgtgccactcaatacaggagttgctcagcccaagacaccaacgactaa +gctctcaaaccttgtatttttctt +atgtcctcgcttggattatggggactcgcagtaaatgacaaccgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt +aaaactgatccaaaaaaaatacaa +acaccacgtttataccctgttgagtcgccacgtaacgtattaacgtatgaacggcccggttttcttatctcgcaccgctagtttgccctggcggtcgtgg +cctatgagtcaatcctattttcaa +gatgagtatgtataaggcggcctgtcacgtgaaacctaggatacccaggtacctagggtcatttgctccccgttccccgcggcggacatccgaatatatc -- 2.30.2