Allow importing all-hom (reference) data from single fasta file.
author Tom Clegg <tom@tomclegg.ca>
Thu, 8 Oct 2020 05:57:37 +0000 (01:57 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Thu, 8 Oct 2020 05:57:37 +0000 (01:57 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

import.go
pipeline_test.go [new file with mode: 0644]
testdata/pipeline1/input1.1.fasta [new file with mode: 0644]
testdata/pipeline1/input1.2.fasta [new file with mode: 0644]
testdata/ref.fasta [new file with mode: 0644]

index 30e86a153f61a821b9139686e5841c8218dfe6fb..57596fbb07fb96b951ead13f41697e5f565e13d2 100644 (file)
--- a/import.go
+++ b/import.go
@@ -224,12 +224,19 @@ func (cmd *importer) loadTagLibrary() (*tagLibrary, error) {
        return &taglib, nil
 }
 
+var (
+       vcfFilenameRe    = regexp.MustCompile(`\.vcf(\.gz)?$`)
+       fasta1FilenameRe = regexp.MustCompile(`\.1\.fa(sta)?(\.gz)?$`)
+       fasta2FilenameRe = regexp.MustCompile(`\.2\.fa(sta)?(\.gz)?$`)
+       fastaFilenameRe  = regexp.MustCompile(`\.fa(sta)?(\.gz)?$`)
+)
+
 func listInputFiles(paths []string) (files []string, err error) {
        for _, path := range paths {
                if fi, err := os.Stat(path); err != nil {
                        return nil, fmt.Errorf("%s: stat failed: %s", path, err)
                } else if !fi.IsDir() {
-                       if !strings.HasSuffix(path, ".2.fasta") || strings.HasSuffix(path, ".2.fasta.gz") {
+                       if !fasta2FilenameRe.MatchString(path) {
                                files = append(files, path)
                        }
                        continue
@@ -245,23 +252,27 @@ func listInputFiles(paths []string) (files []string, err error) {
                }
                sort.Strings(names)
                for _, name := range names {
-                       if strings.HasSuffix(name, ".vcf") || strings.HasSuffix(name, ".vcf.gz") {
+                       if vcfFilenameRe.MatchString(name) {
                                files = append(files, filepath.Join(path, name))
-                       } else if strings.HasSuffix(name, ".1.fasta") || strings.HasSuffix(name, ".1.fasta.gz") {
+                       } else if fastaFilenameRe.MatchString(name) && !fasta2FilenameRe.MatchString(name) {
                                files = append(files, filepath.Join(path, name))
                        }
                }
                d.Close()
        }
        for _, file := range files {
-               if strings.HasSuffix(file, ".1.fasta") || strings.HasSuffix(file, ".1.fasta.gz") {
-                       continue
-               } else if _, err := os.Stat(file + ".csi"); err == nil {
-                       continue
-               } else if _, err = os.Stat(file + ".tbi"); err == nil {
+               if fastaFilenameRe.MatchString(file) {
                        continue
+               } else if vcfFilenameRe.MatchString(file) {
+                       if _, err := os.Stat(file + ".csi"); err == nil {
+                               continue
+                       } else if _, err = os.Stat(file + ".tbi"); err == nil {
+                               continue
+                       } else {
+                               return nil, fmt.Errorf("%s: cannot read without .tbi or .csi index file", file)
+                       }
                } else {
-                       return nil, fmt.Errorf("%s: cannot read without .tbi or .csi index file", file)
+                       return nil, fmt.Errorf("don't know how to handle filename %s", file)
                }
        }
        return
@@ -277,7 +288,7 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                var phases sync.WaitGroup
                phases.Add(2)
                variants := make([][]tileVariantID, 2)
-               if strings.HasSuffix(infile, ".1.fasta") || strings.HasSuffix(infile, ".1.fasta.gz") {
+               if fasta1FilenameRe.MatchString(infile) {
                        todo <- func() error {
                                defer phases.Done()
                                log.Printf("%s starting", infile)
@@ -288,7 +299,7 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped)
                                return err
                        }
-                       infile2 := regexp.MustCompile(`\.1\.fasta(\.gz)?$`).ReplaceAllString(infile, `.2.fasta$1`)
+                       infile2 := fasta1FilenameRe.ReplaceAllString(infile, `.2.fa$1$2`)
                        todo <- func() error {
                                defer phases.Done()
                                log.Printf("%s starting", infile2)
@@ -299,7 +310,20 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped)
                                return err
                        }
-               } else {
+               } else if fastaFilenameRe.MatchString(infile) {
+                       todo <- func() error {
+                               defer phases.Done()
+                               defer phases.Done()
+                               log.Printf("%s starting", infile)
+                               defer log.Printf("%s done", infile)
+                               tseqs, err := cmd.tileFasta(tilelib, infile)
+                               var kept, dropped int
+                               variants[0], kept, dropped = tseqs.Variants()
+                               variants[1] = variants[0]
+                               log.Printf("%s found %d unique tags plus %d repeats", infile, kept, dropped)
+                               return err
+                       }
+               } else if vcfFilenameRe.MatchString(infile) {
                        for phase := 0; phase < 2; phase++ {
                                phase := phase
                                todo <- func() error {
@@ -313,6 +337,8 @@ func (cmd *importer) tileInputs(tilelib *tileLibrary, infiles []string) error {
                                        return err
                                }
                        }
+               } else {
+                       panic(fmt.Sprintf("bug: unhandled filename %q", infile))
                }
                encodeJobs.Add(1)
                go func() {
diff --git a/pipeline_test.go b/pipeline_test.go
new file mode 100644 (file)
index 0000000..96659a2
--- /dev/null
@@ -0,0 +1,42 @@
+package main
+
+import (
+       "bytes"
+       "io"
+       "os"
+       "sync"
+
+       "gopkg.in/check.v1"
+)
+
+type pipelineSuite struct{}
+
+var _ = check.Suite(&pipelineSuite{})
+
+func (s *pipelineSuite) TestImport(c *check.C) {
+       for _, infile := range []string{
+               "testdata/pipeline1/",
+               "testdata/ref.fasta",
+       } {
+               c.Logf("TestImport: %s", infile)
+               var wg sync.WaitGroup
+
+               statsin, importout := io.Pipe()
+               wg.Add(1)
+               go func() {
+                       defer wg.Done()
+                       code := (&importer{}).RunCommand("lightning import", []string{"-local=true", "-skip-ooo=true", "-output-tiles", "-tag-library", "testdata/tags", infile}, bytes.NewReader(nil), importout, os.Stderr)
+                       c.Check(code, check.Equals, 0)
+                       importout.Close()
+               }()
+               statsout := &bytes.Buffer{}
+               wg.Add(1)
+               go func() {
+                       defer wg.Done()
+                       code := (&stats{}).RunCommand("lightning stats", []string{"-local"}, statsin, statsout, os.Stderr)
+                       c.Check(code, check.Equals, 0)
+               }()
+               wg.Wait()
+               os.Stdout.Write(statsout.Bytes())
+       }
+}
diff --git a/testdata/pipeline1/input1.1.fasta b/testdata/pipeline1/input1.1.fasta
new file mode 100644 (file)
index 0000000..e81faaf
--- /dev/null
@@ -0,0 +1,20 @@
+>chr1
+ggcgtctacctcgagaagccccgacctctgaataagatctaagaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta
+ggagaactgtgctccgccttcaga
+ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagtt
+acacatgctagcgcgtcggggtgg
+tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgttttgccgaggagatagatcatcagctgacctatagattcgtctgtc
+gactctagcagagtggccagccac
+aaggtttatattcagtcgaaatggacgggtcccgaacttgcacggacctaacgggactcgcctttcgtaataaccgaacctctaggccgccgcgagatca
+cctcccgagccgagccacccgtca
+tgatagccatcccgcacctagcgcacggaacttcgaccgatcccatattaacgtgtctcttatcctcccgctactgcgtgccttgccactcgtttactat
+>chr2
+tttttatagcagtagcggttgcgataatgcgcactaaggtggccataacttagccacacagactgcgacctcggtgtcaatctttaggcgatgactagtg
+gttattaataataacttatcatca
+cttccggggaaactagcgtaaaaaccgccgtcgcagtataccaccttatacctgtgccactcaatacaggagttgctcagcccaagacaccaacgactaa
+gctctcaaaccttgtatttttctt
+atgtcctcgcttggattatggggactcgcagtaaatgacaaccgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt
+aaaactgatccaaaaaaaatacaa
+acaccacgtttataccctgttgagtcgccacgtaacgtattaacgtatgaacggcccggttttcttatctcgcaccgctagtttgccctggcggtcg
+cctatgagtcaatcctattttcaa
+gatgagtatgtataaggcggcctgtcacgtgaaacctaggatacccaggtacctagggtcatttgctccccgttccccgcggcggacatccgaatatatc
diff --git a/testdata/pipeline1/input1.2.fasta b/testdata/pipeline1/input1.2.fasta
new file mode 100644 (file)
index 0000000..953af89
--- /dev/null
@@ -0,0 +1,20 @@
+>chr1
+ggcgtctacctcgagaagccccgacctctgaataagatctttgaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta
+ggagaactgtgctccgccttcaga
+ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagtt
+acacatgctagcgcgtcggggtgg
+tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgttttgccgaggagatagatcatcagctgacctatagattcgtctgtc
+gactctagcagagtggccagccac
+aaggtttatattcagtcgaaatggacgggtcccgaacttgcacggacctaacgggactcgcctttcgtaataaccgaacctctaggccgccgcgagatca
+cctcccgagccgagccacccgtca
+tgatagccatcccgcacctagcgcacggaacttcgaccgatcccatattaacgtgtctcttatcctcccgctactgcgtgccttgccactcgtttactat
+>chr2
+tttttatagcagtagcggttgcgataatgcgcactaaggtggccataacttagccacacagactgcgacctcggtgtcaatctttaggcgatgactagtg
+gttattaataataacttatcatca
+cttccggggaaactagcgtaaaaaccgccgtcgcagtataccaccttatacctgtgccactcaatacaggagttgctcagcccaagacaccaacgactaa
+gctctcaaaccttgtatttttctt
+atgtcctcgcttggattatggggactcgcagtaaatgacaaccgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt
+aaaactgatccaaaaaaaatacaa
+acaccacgtttataccctgttgagtcgccacgtaacgtattaacgtatgaacggcccggttttcttatctcgcaccgctagtttgccctggcggtcgtaa
+cctatgagtcaatcctattttcaa
+gatgagtatgtataaggcggcctgtcacgtgaaacctaggatacccaggtacctagggtcatttgctccccgttccccgcggcggacatccgaatatatc
diff --git a/testdata/ref.fasta b/testdata/ref.fasta
new file mode 100644 (file)
index 0000000..d35147a
--- /dev/null
@@ -0,0 +1,20 @@
+>chr1
+ggcgtctacctcgagaagccccgacctctgaataagatctttgaacatctcaagggattgtgtatcttgttgggtgtacgcgcgccagcccgcagcatta
+ggagaactgtgctccgccttcaga
+ccccttgggtaaaatgccgcgcaatatgttgattacacttgctgcccatctgaaaggtcgccttatcaatcctatgctgaatgccctctaaggagttcca
+acacatgctagcgcgtcggggtgg
+tcgccgcatgggacatgttggggtcccgtagctccggtcgatgtaggcacgcgttttgccgaggagatagatcatcagctgacctatagattcgtctgtc
+gactctagcagagtggccagccac
+aaggtttatattcagtcgaaatggacgggtcccgaacttgcacggacctaacgggactcgcctttcgtaataaccgaacctctaggccgccgcgagatca
+cctcccgagccgagccacccgtca
+tgatagccatcccgcacctagcgcacggaacttcgaccgatcccatattaacgtgtctcttatcctcccgctactgcgtgccttgccactcgtttactat
+>chr2
+tttttatagcagtagcggttgcgataatgcgcactaaggtggccataacttagccacacagactgcgacctcggtgtcaatctttaggcgatgactagtg
+gttattaataataacttatcatca
+cttccggggaaactagcgtaaaaaccgccgtcgcagtataccaccttatacctgtgccactcaatacaggagttgctcagcccaagacaccaacgactaa
+gctctcaaaccttgtatttttctt
+atgtcctcgcttggattatggggactcgcagtaaatgacaaccgtgcccgctggccctggccggaccgcgcgtccgtaagtagcgatcgagtcctgcagt
+aaaactgatccaaaaaaaatacaa
+acaccacgtttataccctgttgagtcgccacgtaacgtattaacgtatgaacggcccggttttcttatctcgcaccgctagtttgccctggcggtcgtgg
+cctatgagtcaatcctattttcaa
+gatgagtatgtataaggcggcctgtcacgtgaaacctaggatacccaggtacctagggtcatttgctccccgttccccgcggcggacatccgaatatatc