From 64f689fcbaadb8fc17274f42d424c3ed72a4af37 Mon Sep 17 00:00:00 2001 From: Sarah Wait Zaranek Date: Thu, 18 Jun 2020 00:52:51 +0000 Subject: [PATCH] Updating readme and reorganization Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek no issue # --- WGS-processing/README | 23 ++++++++++++-- .../cwl/helper/bwamem-gatk-report-wf.cwl | 10 +++--- .../cwl/helper/gatk-wf-with-interval.cwl | 8 ++--- WGS-processing/cwl/helper/getfastq.cwl | 21 ++++++++++--- .../{ => not-in-use}/gather-array-vcf.cwl | 0 WGS-processing/cwl/helper/report-wf.cwl | 6 ++-- WGS-processing/cwl/wgs-processing-wf.cwl | 8 ++--- WGS-processing/yml/getfastq.yml | 2 +- WGS-processing/yml/wgs-processing-wf.yml | 31 +++++++++++++++++++ 9 files changed, 84 insertions(+), 25 deletions(-) rename WGS-processing/cwl/helper/{ => not-in-use}/gather-array-vcf.cwl (100%) create mode 100644 WGS-processing/yml/wgs-processing-wf.yml diff --git a/WGS-processing/README b/WGS-processing/README index 2b5cded..5b4b9b9 100644 --- a/WGS-processing/README +++ b/WGS-processing/README @@ -1,7 +1,24 @@ -Arvados demo showing processing of whole genome sequencing (WGS) data. The workflow includes: +This directory contains an Arvados demo showing processing of whole genome sequencing (WGS) data. The workflow includes: +* Check of fastq quality * Local alignment using BWA-MEM * Variant calling in parallel using GATK -* Generation of HTML Report showing Comparision of Variants with ClinVar Public Archive +* Generation of an HTML report comparing variants against ClinVar archive -Workflows are written in CWL +Workflows are written in CWL v1.1. + +To run the workflow: + +1. cd into cwl directory + +2. run the following: +arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID wgs-processing-wf.cwl ../yml/YOURINPUTS.yml + +Subdirectories are: +cwl - contains CWL code for the demo +yml - contains yml inputs for cwl demo code +src - contains any src code for the demo +docker - contains dockerfiles necessary to re-create any needed docker images + +About the Demo Data: +WGS Data used in this demo is public data made available by the Personal Genome Project. This data is from the PGP-UK (https://www.personalgenomes.org.uk/). diff --git a/WGS-processing/cwl/helper/bwamem-gatk-report-wf.cwl b/WGS-processing/cwl/helper/bwamem-gatk-report-wf.cwl index f601e95..dbe5cf0 100644 --- a/WGS-processing/cwl/helper/bwamem-gatk-report-wf.cwl +++ b/WGS-processing/cwl/helper/bwamem-gatk-report-wf.cwl @@ -43,7 +43,7 @@ outputs: outputSource: generate-report/report steps: fastqc: - run: ./helper/fastqc.cwl + run: fastqc.cwl in: fastq1: fastq1 fastq2: fastq2 @@ -57,13 +57,13 @@ steps: sample: sample out: [bam] samtools-sort: - run: ./helper/samtools-sort.cwl + run: samtools-sort.cwl in: bam: bwamem-samtools-view/bam sample: sample out: [sortedbam] mark-duplicates: - run: ./helper/mark-duplicates.cwl + run: mark-duplicates.cwl in: bam: samtools-sort/sortedbam out: [dupbam,dupmetrics] @@ -73,7 +73,7 @@ steps: bam: mark-duplicates/dupbam out: [indexedbam] haplotypecaller: - run: ./helper/scatter-gatk-wf-with-interval.cwl + run: scatter-gatk-wf-with-interval.cwl in: reference: reference bam: samtools-index/indexedbam @@ -82,7 +82,7 @@ steps: knownsites1: knownsites out: [gatheredgvcf] generate-report: - run: ./helper/report-wf.cwl + run: report-wf.cwl in: gvcf: haplotypecaller/gatheredgvcf samplename: sample diff --git a/WGS-processing/cwl/helper/gatk-wf-with-interval.cwl b/WGS-processing/cwl/helper/gatk-wf-with-interval.cwl index c7bd802..4e9f456 100644 --- a/WGS-processing/cwl/helper/gatk-wf-with-interval.cwl +++ b/WGS-processing/cwl/helper/gatk-wf-with-interval.cwl @@ -34,7 +34,7 @@ outputs: steps: basecalibrator: - run: ./helper/gatk-baserecalibrator-with-interval.cwl + run: gatk-baserecalibrator-with-interval.cwl in: bam: bam reference: reference @@ -43,7 +43,7 @@ steps: intervallist: intervallist out: [recaltable] applyBQSR: - run: ./helper/gatk-applyBSQR-with-interval.cwl + run: gatk-applyBSQR-with-interval.cwl in: reference: reference bam: bam @@ -52,7 +52,7 @@ steps: recaltable: basecalibrator/recaltable out: [recalbam] haplotypecaller: - run: ./helper/gatk-haplotypecaller-with-interval.cwl + run: gatk-haplotypecaller-with-interval.cwl in: reference: reference bam: applyBQSR/recalbam @@ -60,7 +60,7 @@ steps: intervallist: intervallist out: [gvcf] selectvariants: - run: ./helper/gatk-selectvariants.cwl + run: gatk-selectvariants.cwl in: gvcf: haplotypecaller/gvcf reference: reference diff --git a/WGS-processing/cwl/helper/getfastq.cwl b/WGS-processing/cwl/helper/getfastq.cwl index a38bc91..c651771 100644 --- a/WGS-processing/cwl/helper/getfastq.cwl +++ b/WGS-processing/cwl/helper/getfastq.cwl @@ -7,15 +7,17 @@ label: Create array of gvcfs to process requirements: InlineJavascriptRequirement: {} inputs: - fastjdir: + fastqdir: type: Directory - label: Input directory of fastj + label: Input directory of fastqs loadListing: 'shallow_listing' outputs: fastq1: type: File[] fastq2: type: File[] + sample: + type: string[] expression: | ${function compare(a, b) { var baseA = a.basename; @@ -32,8 +34,8 @@ expression: | var fastq1 = []; var fastq2 = []; - for (var i = 0; i < inputs.fastjdir.listing.length; i++) { - var name = inputs.fastjdir.listing[i]; + for (var i = 0; i < inputs.fastqdir.listing.length; i++) { + var name = inputs.fastqdir.listing[i]; if (name.basename.indexOf('_1.fastq.gz') != -1 ) { fastq1.push(name); } @@ -44,6 +46,15 @@ expression: | fastq1 = fastq1.sort(compare) fastq2 = fastq2.sort(compare) + + var sample = []; + + for (var i = 0; i < fastq1.length; i++) { + var name = fastq1[i].basename; + var samplename = name.replace(/_1.fastq.gz/,''); + sample.push(samplename); + } + - return {"fastq1": fastq1, "fastq2": fastq2}; + return {"fastq1": fastq1, "fastq2": fastq2, "sample": sample}; } diff --git a/WGS-processing/cwl/helper/gather-array-vcf.cwl b/WGS-processing/cwl/helper/not-in-use/gather-array-vcf.cwl similarity index 100% rename from WGS-processing/cwl/helper/gather-array-vcf.cwl rename to WGS-processing/cwl/helper/not-in-use/gather-array-vcf.cwl diff --git a/WGS-processing/cwl/helper/report-wf.cwl b/WGS-processing/cwl/helper/report-wf.cwl index 3403ebb..de8ab3f 100644 --- a/WGS-processing/cwl/helper/report-wf.cwl +++ b/WGS-processing/cwl/helper/report-wf.cwl @@ -22,21 +22,21 @@ outputs: steps: gvcf-to-vcf: - run: ./helper/gvcf-to-vcf.cwl + run: gvcf-to-vcf.cwl in: gvcf: gvcf samplename: samplename out: [vcf] annotate: - run: ./helper/annotate-vcf.cwl + run: annotate-vcf.cwl in: vcf: gvcf-to-vcf/vcf clinvarvcf: clinvarvcf out: [reporttxt] generate-report: - run: ./helper/generate-report.cwl + run: generate-report.cwl in: reportfunc: reportfunc sampletxt: annotate/reporttxt diff --git a/WGS-processing/cwl/wgs-processing-wf.cwl b/WGS-processing/cwl/wgs-processing-wf.cwl index e43bfe5..9d20a0e 100644 --- a/WGS-processing/cwl/wgs-processing-wf.cwl +++ b/WGS-processing/cwl/wgs-processing-wf.cwl @@ -1,4 +1,4 @@ -Version: v1.1 +cwlVersion: v1.1 class: Workflow requirements: @@ -41,17 +41,17 @@ steps: run: ./helper/getfastq.cwl in: fastqdir: fastqdir - out: [fastq1, fastq2] + out: [fastq1, fastq2, sample] bwamem-gatk-report: run: ./helper/bwamem-gatk-report-wf.cwl - scatter: [fastq1, fastq2] + scatter: [fastq1, fastq2, sample] scatterMethod: dotproduct in: fastq1: getfastq/fastq1 fastq2: getfastq/fastq2 reference: reference - sample: sample + sample: getfastq/sample knownsites: knownsites scattercount: scattercount clinvarvcf: clinvarvcf diff --git a/WGS-processing/yml/getfastq.yml b/WGS-processing/yml/getfastq.yml index 2f61a4a..704a7ac 100644 --- a/WGS-processing/yml/getfastq.yml +++ b/WGS-processing/yml/getfastq.yml @@ -1,3 +1,3 @@ -fastjdir: +fastqdir: class: Directory location: keep:fd9539730452a3de5712a9df464dcd81+187497 diff --git a/WGS-processing/yml/wgs-processing-wf.yml b/WGS-processing/yml/wgs-processing-wf.yml new file mode 100644 index 0000000..8853a80 --- /dev/null +++ b/WGS-processing/yml/wgs-processing-wf.yml @@ -0,0 +1,31 @@ +sample: ERR1726424 + +reference: + class: File + location: keep:a3af04432df3d71d22f2fe8be549ba96+5974/hg38.fa + +fastqdir: + class: Directory + location: keep:5e1324428f93c5e41aeb3d7b676de34c+186978 + +clinvarvcf: + class: File + location: keep:242ba3b9049aee86ab2c72db4f3b2822+223/38/clinvar.vcf.gz + +reportfunc: + class: File + location: ../src/annotation/generatereport.py + +headhtml: + class: File + location: ../src/annotation/head.html + +tailhtml: + class: File + location: ../src/annotation/tail.html + +scattercount: '32' + +knownsites: + class: File + location: keep:7c0b13bda857fa15d88c1039182f69d5+8052/Homo_sapiens_assembly38.known_indels.vcf.gz -- 2.30.2