From 637cd97c85cda5bf86cfbfa0aa86e63a9277dd2a Mon Sep 17 00:00:00 2001
From: Sarah Wait Zaranek
Date: Thu, 11 Jun 2020 19:58:06 +0000
Subject: [PATCH] Adding modified code to select variants and gather arrays

Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek

no issue #
---
Reviewer notes (this area is ignored when the patch is applied):

* The patch text had lost its line breaks and indentation. Line structure
  was restored; YAML indentation of context and removed lines is
  reconstructed as two-space nesting and should be confirmed against the
  target tree (apply with whitespace tolerance if it differs).
* gather-vcf.cwl: compare() reads .basename, so the listing's File objects
  are now collected and sorted, and .path is taken afterwards. Previously
  path strings were sorted with that comparator, which always returned 0.
* gather-vcf-try2.cwl, gather-vcf.cwl, getgvcfs.cwl: sampleinput now starts
  as "" instead of [], so an empty listing yields a string.
* Hunk sizes and the diffstat are unchanged; the index lines still name the
  blobs of the original commit.

 cwl/helper/bwa-gatk-report-wf.cwl             | 93 +++++++++++++++++++
 cwl/helper/gather-array-vcf.cwl               | 85 +++++++++++++++++
 cwl/helper/gather-vcf-try2.cwl                | 29 ++++--
 cwl/helper/gather-vcf.cwl                     | 53 ++++++++---
 .../gatk-haplotypecaller-with-interval.cwl    |  2 +
 cwl/helper/gatk-selectvariants.cwl            |  8 +-
 cwl/helper/gatk-wf-with-interval.cwl          |  9 +-
 cwl/helper/getgvcfs.cwl                       | 37 ++++++++
 cwl/helper/samtools-index.cwl                 |  2 +-
 cwl/helper/samtools-sort.cwl                  |  7 +-
 cwl/helper/scatter-gatk-wf-with-interval.cwl  | 18 +++-
 11 files changed, 307 insertions(+), 36 deletions(-)
 create mode 100644 cwl/helper/bwa-gatk-report-wf.cwl
 create mode 100644 cwl/helper/gather-array-vcf.cwl
 create mode 100644 cwl/helper/getgvcfs.cwl

diff --git a/cwl/helper/bwa-gatk-report-wf.cwl b/cwl/helper/bwa-gatk-report-wf.cwl
new file mode 100644
index 0000000..0a4b9d2
--- /dev/null
+++ b/cwl/helper/bwa-gatk-report-wf.cwl
@@ -0,0 +1,93 @@
+cwlVersion: v1.1
+class: Workflow
+
+requirements:
+  - class: SubworkflowFeatureRequirement
+
+inputs:
+  fastq1: File
+  fastq2: File
+  reference:
+    type: File
+    secondaryFiles:
+      - .amb
+      - .ann
+      - .bwt
+      - .pac
+      - .sa
+      - .fai
+      - ^.dict
+  sample: string
+  knownsites:
+    type: File
+    secondaryFiles:
+      - .tbi
+  scattercount: string
+  clinvarvcf: File
+  reportfunc: File
+  headhtml: File
+  tailhtml: File
+
+outputs:
+  qc-html:
+    type: File[]
+    outputSource: fastqc/out-html
+  qc-zip:
+    type: File[]
+    outputSource: fastqc/out-zip
+  gvcf:
+    type: File
+    outputSource: haplotypecaller/gatheredgvcf
+  report:
+    type: File
+    outputSource: generate-report/report
+steps:
+  fastqc:
+    run: fastqc.cwl
+    in:
+      fastq1: fastq1
+      fastq2: fastq2
+    out: [out-html, out-zip]
+  bwamem-samtools-view:
+    run: bwamem-samtools-view.cwl
+    in:
+      fastq1: fastq1
+      fastq2: fastq2
+      reference: reference
+      sample: sample
+    out: [bam]
+  samtools-sort:
+    run: samtools-sort.cwl
+    in:
+      bam: bwamem-samtools-view/bam
+      sample: sample
+    out: [sortedbam]
+  mark-duplicates:
+    run: mark-duplicates.cwl
+    in:
+      bam: samtools-sort/sortedbam
+    out: [dupbam,dupmetrics]
+  samtools-index:
+    run: samtools-index.cwl
+    in:
+      bam: mark-duplicates/dupbam
+    out: [indexedbam]
+  haplotypecaller:
+    run: scatter-gatk-wf-with-interval.cwl
+    in:
+      reference: reference
+      bam: samtools-index/indexedbam
+      sample: sample
+      scattercount: scattercount
+      knownsites1: knownsites
+    out: [gatheredgvcf]
+  generate-report:
+    run: report-wf.cwl
+    in:
+      gvcf: haplotypecaller/gatheredgvcf
+      samplename: sample
+      clinvarvcf: clinvarvcf
+      reportfunc: reportfunc
+      headhtml: headhtml
+      tailhtml: tailhtml
+    out: [report]
diff --git a/cwl/helper/gather-array-vcf.cwl b/cwl/helper/gather-array-vcf.cwl
new file mode 100644
index 0000000..d566841
--- /dev/null
+++ b/cwl/helper/gather-array-vcf.cwl
@@ -0,0 +1,85 @@
+cwlVersion: v1.1
+class: CommandLineTool
+label: Gathering vcf using Picard
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+  cwltool: "http://commonwl.org/cwltool#"
+
+requirements:
+  DockerRequirement:
+    dockerPull: broadinstitute/gatk:4.1.7.0
+  ShellCommandRequirement: {}
+  InlineJavascriptRequirement: {}
+
+hints:
+  ResourceRequirement:
+    ramMin: 20000
+    coresMin: 4
+  arv:RuntimeConstraints:
+    outputDirType: keep_output_dir
+
+inputs:
+  gvcfarray:
+    type: File[]
+  sample: string
+  reference:
+    type: File
+    secondaryFiles:
+      - .amb
+      - .ann
+      - .bwt
+      - .pac
+      - .sa
+      - .fai
+      - ^.dict
+outputs:
+  gatheredgvcf:
+    type: File
+    secondaryFiles:
+      - .tbi
+    outputBinding:
+      glob: "*.g.vcf.gz"
+
+baseCommand: /gatk/gatk
+
+arguments:
+  - "--java-options"
+  - "-Xmx8G"
+  - MergeVcfs
+  - shellQuote: false
+    valueFrom: |
+      ${function compare(a, b) {
+        var baseA = a.basename;
+        var baseB = b.basename;
+
+        var comparison = 0;
+        if (baseA > baseB) {
+          comparison = 1;
+        } else if (baseA < baseB) {
+          comparison = -1;
+        }
+        return comparison;
+      }
+
+      var sortedarray = [];
+      sortedarray = inputs.gvcfarray.sort(compare)
+
+      var samples = [];
+      for (var i = 0; i < sortedarray.length; i++) {
+        var name = sortedarray[i];
+        if (name.nameext ==='.gz' ) {
+          samples.push(name.path);
+        }
+      }
+
+      var sampleinput = "";
+
+      for (var i = 0; i < samples.length; i++) {
+        var s1 = samples[i];
+        sampleinput = sampleinput + "-I " + s1 + " "
+      }
+
+      return sampleinput;
+      }
+  - prefix: "-O"
+    valueFrom: $(inputs.sample).g.vcf.gz
diff --git a/cwl/helper/gather-vcf-try2.cwl b/cwl/helper/gather-vcf-try2.cwl
index c573ffb..40a5c9d 100644
--- a/cwl/helper/gather-vcf-try2.cwl
+++ b/cwl/helper/gather-vcf-try2.cwl
@@ -19,7 +19,10 @@ hints:
     outputDirType: keep_output_dir
 
 inputs:
-  gvcfarray: File[]
+  gvcfdir:
+    type: Directory
+    label: Input directory of gvcfs
+    loadListing: 'shallow_listing'
   sample: string
   reference:
     type: File
@@ -45,12 +48,22 @@ arguments:
   - GatherVcfs
   - shellQuote: false
     valueFrom: >
-      ${
-        var cmd "";
-        for( var i = 0; i < inputs.gvcfarray.length; i++){
-          cmd += "\s echo " + "-I" + "\s" + inputs.gvcfsarray[i]
-        }
-        return cmd;
-      }
+      ${
+      var samples = [];
+      for (var i = 0; i < inputs.gvcfdir.listing.length; i++) {
+        var name = inputs.gvcfdir.listing[i];
+        if (name.nameext ==='.gz' ) {
+          samples.push(name.basename);
+        }
+      }
+      samples = samples.sort();
+      var sampleinput = "";
+
+      for (var i = 0; i < samples.length; i++) {
+        var s1 = samples[i];
+        sampleinput = sampleinput + "-I " + s1 + " "
+      }
+      return sampleinput;
+      }
   - prefix: "-O"
     valueFrom: $(inputs.sample).g.vcf.gz
diff --git a/cwl/helper/gather-vcf.cwl b/cwl/helper/gather-vcf.cwl
index 8e5b29a..cd1532c 100644
--- a/cwl/helper/gather-vcf.cwl
+++ b/cwl/helper/gather-vcf.cwl
@@ -9,10 +9,7 @@ requirements:
   DockerRequirement:
     dockerPull: broadinstitute/gatk:4.1.7.0
   ShellCommandRequirement: {}
-  InitialWorkDirRequirement:
-    listing:
-      - $(inputs.gvcf1)
-      - $(inputs.gvcf2)
+  InlineJavascriptRequirement: {}
 
 hints:
   ResourceRequirement:
@@ -22,10 +19,10 @@ hints:
     outputDirType: keep_output_dir
 
 inputs:
-  gvcf1:
-    type: File
-  gvcf2:
-    type: File
+  gvcfdir:
+    type: Directory
+    label: Input directory of gvcfs
+    loadListing: 'shallow_listing'
   sample: string
   reference:
     type: File
@@ -40,6 +37,8 @@ inputs:
 outputs:
   gatheredgvcf:
     type: File
+    secondaryFiles:
+      - .tbi
     outputBinding:
       glob: "*.g.vcf.gz"
 
@@ -48,10 +47,38 @@ baseCommand: /gatk/gatk
 arguments:
   - "--java-options"
   - "-Xmx8G"
-  - GatherVcfs
-  - "-I"
-  - $(inputs.gvcf1.basename)
-  - "-I"
-  - $(inputs.gvcf2.basename)
+  - MergeVcfs
+  - shellQuote: false
+    valueFrom: |
+      ${function compare(a, b) {
+        var baseA = a.basename;
+        var baseB = b.basename;
+
+        var comparison = 0;
+        if (baseA > baseB) {
+          comparison = 1;
+        } else if (baseA < baseB) {
+          comparison = -1;
+        }
+        return comparison;
+      }
+
+      var samples = [];
+      for (var i = 0; i < inputs.gvcfdir.listing.length; i++) {
+        var name = inputs.gvcfdir.listing[i];
+        if (name.nameext ==='.gz' ) {
+          samples.push(name);
+        }
+      }
+      samples = samples.sort(compare);
+      var sampleinput = "";
+
+      for (var i = 0; i < samples.length; i++) {
+        var s1 = samples[i].path;
+        sampleinput = sampleinput + "-I " + s1 + " "
+      }
+
+      return sampleinput;
+      }
   - prefix: "-O"
     valueFrom: $(inputs.sample).g.vcf.gz
diff --git a/cwl/helper/gatk-haplotypecaller-with-interval.cwl b/cwl/helper/gatk-haplotypecaller-with-interval.cwl
index 3e0b53d..88c78e0 100644
--- a/cwl/helper/gatk-haplotypecaller-with-interval.cwl
+++ b/cwl/helper/gatk-haplotypecaller-with-interval.cwl
@@ -40,6 +40,8 @@ inputs:
 outputs:
   gvcf:
     type: File
+    secondaryFiles:
+      - .tbi
     outputBinding:
       glob: "*vcf.gz"
 
diff --git a/cwl/helper/gatk-selectvariants.cwl b/cwl/helper/gatk-selectvariants.cwl
index 2a65e53..66b5d5c 100644
--- a/cwl/helper/gatk-selectvariants.cwl
+++ b/cwl/helper/gatk-selectvariants.cwl
@@ -35,10 +35,10 @@ inputs:
   sample: string
 
 outputs:
-  genotypegvcf:
+  filteredgvcf:
     type: File
     outputBinding:
-      glob: "*selected.g.vcf.gz"
+      glob: "*g.vcf.gz"
 
 baseCommand: /gatk/gatk
 
@@ -51,6 +51,6 @@ arguments:
   - prefix: "--remove-unused-alternates"
     valueFrom: "true"
   - prefix: "-V"
-    valueFrom: $(inputs.gvcf)
+    valueFrom: $(inputs.gvcf.path)
   - prefix: "-O"
-    valueFrom: $(inputs.sample)selected.g.vcf.gz
+    valueFrom: selected$(inputs.gvcf.basename)
diff --git a/cwl/helper/gatk-wf-with-interval.cwl b/cwl/helper/gatk-wf-with-interval.cwl
index 037accd..4e9f456 100644
--- a/cwl/helper/gatk-wf-with-interval.cwl
+++ b/cwl/helper/gatk-wf-with-interval.cwl
@@ -30,7 +30,7 @@ inputs:
 outputs:
   gvcf:
     type: File
-    outputSource: haplotypecaller/gvcf
+    outputSource: selectvariants/filteredgvcf
 
 steps:
   basecalibrator:
@@ -59,3 +59,10 @@ steps:
       sample: sample
       intervallist: intervallist
     out: [gvcf]
+  selectvariants:
+    run: gatk-selectvariants.cwl
+    in:
+      gvcf: haplotypecaller/gvcf
+      reference: reference
+      sample: sample
+    out: [filteredgvcf]
diff --git a/cwl/helper/getgvcfs.cwl b/cwl/helper/getgvcfs.cwl
new file mode 100644
index 0000000..e3a7022
--- /dev/null
+++ b/cwl/helper/getgvcfs.cwl
@@ -0,0 +1,37 @@
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+  cwltool: "http://commonwl.org/cwltool#"
+class: ExpressionTool
+cwlVersion: v1.1
+label: Create array of gvcfs to process
+requirements:
+  InlineJavascriptRequirement: {}
+inputs:
+  gvcfdir:
+    type: Directory
+    label: Input directory of gvcfs
+    loadListing: 'shallow_listing'
+outputs:
+  sampleinput:
+    type: string
+expression: |
+  ${
+    var samples = [];
+    for (var i = 0; i < inputs.gvcfdir.listing.length; i++) {
+      var name = inputs.gvcfdir.listing[i];
+      if (name.nameext ==='.gz' ) {
+        samples.push(name.basename);
+      }
+    }
+    samples = samples.sort();
+    var sampleinput = "";
+
+    for (var i = 0; i < samples.length; i++) {
+      var s1 = samples[i];
+      sampleinput = sampleinput + "-I " + s1 + " "
+    }
+
+
+    return {"sampleinput": sampleinput};
+
+  }
diff --git a/cwl/helper/samtools-index.cwl b/cwl/helper/samtools-index.cwl
index 605390a..d479fbd 100644
--- a/cwl/helper/samtools-index.cwl
+++ b/cwl/helper/samtools-index.cwl
@@ -21,7 +21,7 @@ inputs:
   bam: File
 
 outputs:
-  bam:
+  indexedbam:
     type: File
     outputBinding:
       glob: "*bam"
diff --git a/cwl/helper/samtools-sort.cwl b/cwl/helper/samtools-sort.cwl
index b583f66..02072cd 100644
--- a/cwl/helper/samtools-sort.cwl
+++ b/cwl/helper/samtools-sort.cwl
@@ -10,9 +10,6 @@ requirements:
   DockerRequirement:
     dockerPull: curii/bwa-samtools-picard
   ShellCommandRequirement: {}
-  InitialWorkDirRequirement:
-    listing:
-      - $(inputs.bam)
   ResourceRequirement:
     ramMin: 20000
     coresMin: 4
@@ -27,7 +24,7 @@ inputs:
   sample: string
 
 outputs:
-  bam:
+  sortedbam:
     type: File
     outputBinding:
       glob: "*sorted.bam"
@@ -38,7 +35,7 @@ arguments:
   - sort
   - -@
   - $(runtime.cores)
-  - $(inputs.bam.basename)
+  - $(inputs.bam.path)
   - -m
   - '2G'
   - -o
diff --git a/cwl/helper/scatter-gatk-wf-with-interval.cwl b/cwl/helper/scatter-gatk-wf-with-interval.cwl
index 80d0456..6e9f415 100644
--- a/cwl/helper/scatter-gatk-wf-with-interval.cwl
+++ b/cwl/helper/scatter-gatk-wf-with-interval.cwl
@@ -31,10 +31,12 @@ inputs:
   scattercount: string
 
 outputs:
-  gvcf:
-    type: File[]
-    outputSource: recal-haplotypecaller/gvcf
-
+  gatheredgvcf:
+    type: File
+    secondaryFiles:
+      - .tbi
+    outputSource: merge-GVCFs/gatheredgvcf
+  
 steps:
   splitintervals:
     run: gatk-splitintervals.cwl
@@ -54,3 +56,11 @@ steps:
       knownsites1: knownsites1
       intervallist: splitintervals/intervalfiles
     out: [gvcf]
+  
+  merge-GVCFs:
+    run: gather-array-vcf.cwl
+    in:
+      gvcfarray: recal-haplotypecaller/gvcf
+      sample: sample
+      reference: reference
+    out: [gatheredgvcf]
-- 
2.30.2