Updating readme and reorganization
author Sarah Wait Zaranek <swz@curii.com>
Thu, 18 Jun 2020 00:52:51 +0000 (00:52 +0000)
committer Ward Vandewege <ward@jhvc.com>
Thu, 18 Jun 2020 15:16:40 +0000 (11:16 -0400)
Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek <swz@curii.com>
no issue #

WGS-processing/README
WGS-processing/cwl/helper/bwamem-gatk-report-wf.cwl
WGS-processing/cwl/helper/gatk-wf-with-interval.cwl
WGS-processing/cwl/helper/getfastq.cwl
WGS-processing/cwl/helper/not-in-use/gather-array-vcf.cwl [moved from WGS-processing/cwl/helper/gather-array-vcf.cwl with 100% similarity]
WGS-processing/cwl/helper/report-wf.cwl
WGS-processing/cwl/wgs-processing-wf.cwl
WGS-processing/yml/getfastq.yml
WGS-processing/yml/wgs-processing-wf.yml [new file with mode: 0644]

index 2b5cdedfd032bbf6474c52320a80dd9f162cf1c7..5b4b9b9b5f28f9cd592fb598e9a21f3313057c82 100644 (file)
@@ -1,7 +1,24 @@
-Arvados demo showing processing of  whole genome sequencing (WGS) data. The workflow includes:
+This directory contains an Arvados demo showing processing of whole genome sequencing (WGS) data. The workflow includes:
 
 
+* Quality check of the input FASTQ files
 * Local alignment using BWA-MEM
 * Variant calling in parallel using GATK
 * Local alignment using BWA-MEM
 * Variant calling in parallel using GATK
-* Generation of HTML Report showing Comparision of Variants with ClinVar Public Archive 
+* Generation of an HTML report comparing variants against the ClinVar archive
 
 
-Workflows are written in CWL
+Workflows are written in CWL v1.1. 
+
+To run the workflow:
+
+1. cd into the cwl directory
+
+2. Run the following command:
+arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID wgs-processing-wf.cwl ../yml/YOURINPUTS.yml
+
+Subdirectories are:
+cwl - contains the CWL code for the demo
+yml - contains the YAML input files for the CWL demo code
+src - contains any source code for the demo
+docker - contains the Dockerfiles needed to re-create any required Docker images
+
+About the Demo Data:
+The WGS data used in this demo is public data made available by the Personal Genome Project. This data is from PGP-UK (https://www.personalgenomes.org.uk/).
index f601e9549eec665e8ed113103394f1bc57fd52b8..dbe5cf05f88dbdfa6e1014b98306b344f2238589 100644 (file)
@@ -43,7 +43,7 @@ outputs:
     outputSource: generate-report/report
 steps:
   fastqc:
     outputSource: generate-report/report
 steps:
   fastqc:
-    run: ./helper/fastqc.cwl
+    run: fastqc.cwl
     in:
       fastq1: fastq1
       fastq2: fastq2
     in:
       fastq1: fastq1
       fastq2: fastq2
@@ -57,13 +57,13 @@ steps:
       sample: sample
     out: [bam]
   samtools-sort:
       sample: sample
     out: [bam]
   samtools-sort:
-    run: ./helper/samtools-sort.cwl 
+    run: samtools-sort.cwl 
     in:
       bam: bwamem-samtools-view/bam
       sample: sample
     out: [sortedbam]
   mark-duplicates:
     in:
       bam: bwamem-samtools-view/bam
       sample: sample
     out: [sortedbam]
   mark-duplicates:
-    run: ./helper/mark-duplicates.cwl
+    run: mark-duplicates.cwl
     in:
       bam: samtools-sort/sortedbam
     out: [dupbam,dupmetrics]
     in:
       bam: samtools-sort/sortedbam
     out: [dupbam,dupmetrics]
@@ -73,7 +73,7 @@ steps:
       bam: mark-duplicates/dupbam
     out: [indexedbam]
   haplotypecaller:
       bam: mark-duplicates/dupbam
     out: [indexedbam]
   haplotypecaller:
-    run: ./helper/scatter-gatk-wf-with-interval.cwl 
+    run: scatter-gatk-wf-with-interval.cwl 
     in:
       reference: reference
       bam: samtools-index/indexedbam
     in:
       reference: reference
       bam: samtools-index/indexedbam
@@ -82,7 +82,7 @@ steps:
       knownsites1: knownsites
     out: [gatheredgvcf]
   generate-report:
       knownsites1: knownsites
     out: [gatheredgvcf]
   generate-report:
-    run: ./helper/report-wf.cwl
+    run: report-wf.cwl
     in:
       gvcf: haplotypecaller/gatheredgvcf
       samplename: sample
     in:
       gvcf: haplotypecaller/gatheredgvcf
       samplename: sample
index c7bd8021f8ed0ad9d9018201e281c2b04dd3c37e..4e9f456b24fec167cc3098a71560395254dd74da 100644 (file)
@@ -34,7 +34,7 @@ outputs:
 
 steps:
   basecalibrator:
 
 steps:
   basecalibrator:
-    run: ./helper/gatk-baserecalibrator-with-interval.cwl
+    run: gatk-baserecalibrator-with-interval.cwl
     in:
       bam: bam
       reference: reference
     in:
       bam: bam
       reference: reference
@@ -43,7 +43,7 @@ steps:
       intervallist: intervallist
     out: [recaltable]
   applyBQSR:
       intervallist: intervallist
     out: [recaltable]
   applyBQSR:
-    run: ./helper/gatk-applyBSQR-with-interval.cwl
+    run: gatk-applyBSQR-with-interval.cwl
     in: 
       reference: reference
       bam: bam
     in: 
       reference: reference
       bam: bam
@@ -52,7 +52,7 @@ steps:
       recaltable: basecalibrator/recaltable
     out: [recalbam]
   haplotypecaller:
       recaltable: basecalibrator/recaltable
     out: [recalbam]
   haplotypecaller:
-    run: ./helper/gatk-haplotypecaller-with-interval.cwl
+    run: gatk-haplotypecaller-with-interval.cwl
     in:
       reference: reference
       bam: applyBQSR/recalbam
     in:
       reference: reference
       bam: applyBQSR/recalbam
@@ -60,7 +60,7 @@ steps:
       intervallist: intervallist
     out: [gvcf]
   selectvariants:
       intervallist: intervallist
     out: [gvcf]
   selectvariants:
-    run: ./helper/gatk-selectvariants.cwl
+    run: gatk-selectvariants.cwl
     in: 
       gvcf: haplotypecaller/gvcf
       reference: reference
     in: 
       gvcf: haplotypecaller/gvcf
       reference: reference
index a38bc9125a60652a71c65e7f58d193a47a6663eb..c651771389f2720614f5e33fcab54e325f3a5305 100644 (file)
@@ -7,15 +7,17 @@ label: Create array of gvcfs to process
 requirements:
   InlineJavascriptRequirement: {}
 inputs:
 requirements:
   InlineJavascriptRequirement: {}
 inputs:
-  fastjdir:
+  fastqdir:
     type: Directory
     type: Directory
-    label: Input directory of fastj
+    label: Input directory of fastqs
     loadListing: 'shallow_listing' 
 outputs:
   fastq1: 
     type: File[]
   fastq2:
     type: File[]
     loadListing: 'shallow_listing' 
 outputs:
   fastq1: 
     type: File[]
   fastq2:
     type: File[]
+  sample:
+    type: string[]
 expression: |
   ${function compare(a, b) {
     var baseA = a.basename;
 expression: |
   ${function compare(a, b) {
     var baseA = a.basename;
@@ -32,8 +34,8 @@ expression: |
 
     var fastq1 = [];
     var fastq2 = [];
 
     var fastq1 = [];
     var fastq2 = [];
-    for (var i = 0; i < inputs.fastjdir.listing.length; i++) {
-      var name = inputs.fastjdir.listing[i];
+    for (var i = 0; i < inputs.fastqdir.listing.length; i++) {
+      var name = inputs.fastqdir.listing[i];
       if (name.basename.indexOf('_1.fastq.gz') != -1 ) {
         fastq1.push(name);
       }
       if (name.basename.indexOf('_1.fastq.gz') != -1 ) {
         fastq1.push(name);
       }
@@ -44,6 +46,15 @@ expression: |
   
     fastq1 = fastq1.sort(compare)
     fastq2 = fastq2.sort(compare)
   
     fastq1 = fastq1.sort(compare)
     fastq2 = fastq2.sort(compare)
+
+    var sample = [];
+
+    for (var i = 0; i < fastq1.length; i++) {
+      var name = fastq1[i].basename;
+      var samplename = name.replace(/_1.fastq.gz/,'');
+      sample.push(samplename);
+      }
+
  
  
-    return {"fastq1": fastq1, "fastq2": fastq2};
+    return {"fastq1": fastq1, "fastq2": fastq2, "sample": sample};
   }
   }
index 3403ebbe7dfb880d5a5d44b8c7a05c5c1c6a6d36..de8ab3f67ce9b20bfd392a1ce31c2300926a00a6 100644 (file)
@@ -22,21 +22,21 @@ outputs:
 
 steps:
   gvcf-to-vcf:
 
 steps:
   gvcf-to-vcf:
-    run: ./helper/gvcf-to-vcf.cwl
+    run: gvcf-to-vcf.cwl
     in:
       gvcf: gvcf
       samplename: samplename
     out: [vcf]
 
   annotate:
     in:
       gvcf: gvcf
       samplename: samplename
     out: [vcf]
 
   annotate:
-    run: ./helper/annotate-vcf.cwl
+    run: annotate-vcf.cwl
     in:
       vcf: gvcf-to-vcf/vcf
       clinvarvcf: clinvarvcf
     out: [reporttxt]
 
   generate-report:
     in:
       vcf: gvcf-to-vcf/vcf
       clinvarvcf: clinvarvcf
     out: [reporttxt]
 
   generate-report:
-    run: ./helper/generate-report.cwl
+    run: generate-report.cwl
     in:
       reportfunc: reportfunc
       sampletxt: annotate/reporttxt
     in:
       reportfunc: reportfunc
       sampletxt: annotate/reporttxt
index e43bfe5e95a08d89e2f8a033505abe3ead69366d..9d20a0e23720b28526cb5471ba6131da4fda27d1 100644 (file)
@@ -1,4 +1,4 @@
-Version: v1.1
+cwlVersion: v1.1
 class: Workflow
 
 requirements:
 class: Workflow
 
 requirements:
@@ -41,17 +41,17 @@ steps:
     run: ./helper/getfastq.cwl
     in:
       fastqdir: fastqdir
     run: ./helper/getfastq.cwl
     in:
       fastqdir: fastqdir
-    out: [fastq1, fastq2]
+    out: [fastq1, fastq2, sample]
 
   bwamem-gatk-report:
     run: ./helper/bwamem-gatk-report-wf.cwl
 
   bwamem-gatk-report:
     run: ./helper/bwamem-gatk-report-wf.cwl
-    scatter: [fastq1, fastq2]
+    scatter: [fastq1, fastq2, sample]
     scatterMethod: dotproduct
     in:
       fastq1: getfastq/fastq1
       fastq2: getfastq/fastq2
       reference: reference
     scatterMethod: dotproduct
     in:
       fastq1: getfastq/fastq1
       fastq2: getfastq/fastq2
       reference: reference
-      sample: sample
+      sample: getfastq/sample
       knownsites: knownsites
       scattercount: scattercount
       clinvarvcf: clinvarvcf
       knownsites: knownsites
       scattercount: scattercount
       clinvarvcf: clinvarvcf
index 2f61a4afa39fbe730655563dab0d516942cb2224..704a7acce8bc4f20f3317ba17aba748d95b147b8 100644 (file)
@@ -1,3 +1,3 @@
-fastjdir:
+fastqdir:
   class: Directory 
   location: keep:fd9539730452a3de5712a9df464dcd81+187497 
   class: Directory 
   location: keep:fd9539730452a3de5712a9df464dcd81+187497 
diff --git a/WGS-processing/yml/wgs-processing-wf.yml b/WGS-processing/yml/wgs-processing-wf.yml
new file mode 100644 (file)
index 0000000..8853a80
--- /dev/null
@@ -0,0 +1,31 @@
+sample: ERR1726424
+
+reference:
+  class: File
+  location: keep:a3af04432df3d71d22f2fe8be549ba96+5974/hg38.fa
+
+fastqdir:
+  class: Directory
+  location: keep:5e1324428f93c5e41aeb3d7b676de34c+186978 
+
+clinvarvcf:
+  class: File
+  location: keep:242ba3b9049aee86ab2c72db4f3b2822+223/38/clinvar.vcf.gz
+
+reportfunc:
+  class: File
+  location: ../src/annotation/generatereport.py
+
+headhtml:
+  class: File
+  location: ../src/annotation/head.html
+
+tailhtml:
+  class: File
+  location: ../src/annotation/tail.html
+
+scattercount: '32'
+
+knownsites:
+  class: File
+  location: keep:7c0b13bda857fa15d88c1039182f69d5+8052/Homo_sapiens_assembly38.known_indels.vcf.gz