-Arvados demo showing processing of whole genome sequencing (WGS) data. The workflow includes:
+This directory contains an Arvados demo showing processing of whole genome sequencing (WGS) data. The workflow includes:
+* Check of fastq quality
* Local alignment using BWA-MEM
* Variant calling in parallel using GATK
-* Generation of HTML Report showing Comparision of Variants with ClinVar Public Archive
+* Generation of an HTML report comparing variants against the ClinVar archive
-Workflows are written in CWL
+Workflows are written in CWL v1.1.
+
+To run the workflow:
+
+1. cd into the cwl directory
+
+2. Run the following:
+arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID wgs-processing-wf.cwl ../yml/YOURINPUTS.yml
+
+Subdirectories are:
+cwl - contains CWL code for the demo
+yml - contains YAML inputs for the CWL demo code
+src - contains any source code for the demo
+docker - contains Dockerfiles necessary to re-create any needed Docker images
+
+About the Demo Data:
+The WGS data used in this demo is public data made available by the Personal Genome Project. This data is from PGP-UK (https://www.personalgenomes.org.uk/).
outputSource: generate-report/report
steps:
fastqc:
- run: ./helper/fastqc.cwl
+ run: fastqc.cwl
in:
fastq1: fastq1
fastq2: fastq2
sample: sample
out: [bam]
samtools-sort:
- run: ./helper/samtools-sort.cwl
+ run: samtools-sort.cwl
in:
bam: bwamem-samtools-view/bam
sample: sample
out: [sortedbam]
mark-duplicates:
- run: ./helper/mark-duplicates.cwl
+ run: mark-duplicates.cwl
in:
bam: samtools-sort/sortedbam
out: [dupbam,dupmetrics]
bam: mark-duplicates/dupbam
out: [indexedbam]
haplotypecaller:
- run: ./helper/scatter-gatk-wf-with-interval.cwl
+ run: scatter-gatk-wf-with-interval.cwl
in:
reference: reference
bam: samtools-index/indexedbam
knownsites1: knownsites
out: [gatheredgvcf]
generate-report:
- run: ./helper/report-wf.cwl
+ run: report-wf.cwl
in:
gvcf: haplotypecaller/gatheredgvcf
samplename: sample
steps:
basecalibrator:
- run: ./helper/gatk-baserecalibrator-with-interval.cwl
+ run: gatk-baserecalibrator-with-interval.cwl
in:
bam: bam
reference: reference
intervallist: intervallist
out: [recaltable]
applyBQSR:
- run: ./helper/gatk-applyBSQR-with-interval.cwl
+ run: gatk-applyBSQR-with-interval.cwl
in:
reference: reference
bam: bam
recaltable: basecalibrator/recaltable
out: [recalbam]
haplotypecaller:
- run: ./helper/gatk-haplotypecaller-with-interval.cwl
+ run: gatk-haplotypecaller-with-interval.cwl
in:
reference: reference
bam: applyBQSR/recalbam
intervallist: intervallist
out: [gvcf]
selectvariants:
- run: ./helper/gatk-selectvariants.cwl
+ run: gatk-selectvariants.cwl
in:
gvcf: haplotypecaller/gvcf
reference: reference
requirements:
InlineJavascriptRequirement: {}
inputs:
- fastjdir:
+ fastqdir:
type: Directory
- label: Input directory of fastj
+ label: Input directory of fastqs
loadListing: 'shallow_listing'
outputs:
fastq1:
type: File[]
fastq2:
type: File[]
+ sample:
+ type: string[]
expression: |
${function compare(a, b) {
var baseA = a.basename;
var fastq1 = [];
var fastq2 = [];
- for (var i = 0; i < inputs.fastjdir.listing.length; i++) {
- var name = inputs.fastjdir.listing[i];
+ for (var i = 0; i < inputs.fastqdir.listing.length; i++) {
+ var name = inputs.fastqdir.listing[i];
if (name.basename.indexOf('_1.fastq.gz') != -1 ) {
fastq1.push(name);
}
fastq1 = fastq1.sort(compare)
fastq2 = fastq2.sort(compare)
+
+ // Build one sample name per read-1 file, in the same (sorted) order as
+ // fastq1, so the arrays line up element-by-element.
+ var sample = [];
+
+ for (var i = 0; i < fastq1.length; i++) {
+ var name = fastq1[i].basename;
+ // Dots are escaped so only the literal "_1.fastq.gz" text is removed;
+ // an unescaped "." would match any character and could strip the
+ // wrong part of the basename.
+ var samplename = name.replace(/_1\.fastq\.gz/,'');
+ sample.push(samplename);
+ }
+
- return {"fastq1": fastq1, "fastq2": fastq2};
+ return {"fastq1": fastq1, "fastq2": fastq2, "sample": sample};
}
steps:
gvcf-to-vcf:
- run: ./helper/gvcf-to-vcf.cwl
+ run: gvcf-to-vcf.cwl
in:
gvcf: gvcf
samplename: samplename
out: [vcf]
annotate:
- run: ./helper/annotate-vcf.cwl
+ run: annotate-vcf.cwl
in:
vcf: gvcf-to-vcf/vcf
clinvarvcf: clinvarvcf
out: [reporttxt]
generate-report:
- run: ./helper/generate-report.cwl
+ run: generate-report.cwl
in:
reportfunc: reportfunc
sampletxt: annotate/reporttxt
-Version: v1.1
+cwlVersion: v1.1
class: Workflow
requirements:
run: ./helper/getfastq.cwl
in:
fastqdir: fastqdir
- out: [fastq1, fastq2]
+ out: [fastq1, fastq2, sample]
bwamem-gatk-report:
run: ./helper/bwamem-gatk-report-wf.cwl
- scatter: [fastq1, fastq2]
+ scatter: [fastq1, fastq2, sample]
scatterMethod: dotproduct
in:
fastq1: getfastq/fastq1
fastq2: getfastq/fastq2
reference: reference
- sample: sample
+ sample: getfastq/sample
knownsites: knownsites
scattercount: scattercount
clinvarvcf: clinvarvcf
-fastjdir:
+fastqdir:
class: Directory
location: keep:fd9539730452a3de5712a9df464dcd81+187497
--- /dev/null
+# Input parameter values for a CWL workflow run.
+# NOTE(review): presumably consumed by the WGS demo workflow; confirm which
+# .cwl file this inputs file is paired with.
+# NOTE(review): `keep:` locations look like Arvados Keep collection
+# references; confirm they are readable from the target cluster.
+
+# Sample identifier.
+sample: ERR1726424
+
+# Reference genome FASTA (hg38.fa).
+reference:
+ class: File
+ location: keep:a3af04432df3d71d22f2fe8be549ba96+5974/hg38.fa
+
+# Directory holding the input fastq files.
+fastqdir:
+ class: Directory
+ location: keep:5e1324428f93c5e41aeb3d7b676de34c+186978
+
+# ClinVar VCF (clinvar.vcf.gz).
+clinvarvcf:
+ class: File
+ location: keep:242ba3b9049aee86ab2c72db4f3b2822+223/38/clinvar.vcf.gz
+
+# Python script for report generation; path is relative to this inputs file.
+reportfunc:
+ class: File
+ location: ../src/annotation/generatereport.py
+
+# HTML fragment (head.html); presumably the opening part of the generated
+# report. TODO confirm against the report step.
+headhtml:
+ class: File
+ location: ../src/annotation/head.html
+
+# HTML fragment (tail.html); presumably the closing part of the generated
+# report. TODO confirm against the report step.
+tailhtml:
+ class: File
+ location: ../src/annotation/tail.html
+
+# Quoted, so this is passed as the string "32" rather than an integer.
+scattercount: '32'
+
+# Known indels VCF (Homo_sapiens_assembly38.known_indels.vcf.gz).
+knownsites:
+ class: File
+ location: keep:7c0b13bda857fa15d88c1039182f69d5+8052/Homo_sapiens_assembly38.known_indels.vcf.gz