From: Peter Amstutz Date: Thu, 15 Sep 2022 22:39:31 +0000 (-0400) Subject: Add splitDir to take Directory input X-Git-Url: https://git.arvados.org/arvados-tutorial.git/commitdiff_plain/HEAD?hp=4292ebf16822fde6b023ca1828bcae97b6d812ce Add splitDir to take Directory input Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- diff --git a/.licenseignore b/.licenseignore index 35252fd..e617614 100644 --- a/.licenseignore +++ b/.licenseignore @@ -1 +1,2 @@ LICENSE.txt +* diff --git a/RNA-Seq/README.md b/RNA-Seq/README.md new file mode 100644 index 0000000..d4e3c22 --- /dev/null +++ b/RNA-Seq/README.md @@ -0,0 +1,26 @@ +This directory contains an Arvados demo that performs bioinformatics RNA-seq analysis. However, specific knowledge of the biology of RNA-seq is not required for this demo. For those unfamiliar with RNA-seq, it is the process of sequencing RNA present in a biological sample. From the sequence reads, we want to measure the relative numbers of different RNA molecules appearing in the sample that were produced by particular genes. This analysis is called “differential gene expression”. + +Workflows are written in CWL v1.2 (https://www.commonwl.org/). + +Subdirectories are: +* cwl - contains CWL code for the demo +* yml - contains YML inputs for cwl demo code + +To run the workflow: + +* arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID ./cwl/RNA-seq-wf.cwl ./yml/RNA-seq-wf.yml + +About the Demo: +This demo is based on a workflow in the Introduction to RNA-seq using high-performance computing (HPC) lessons developed by members of the teaching team at the Harvard Chan Bioinformatics Core (HBC). The original training, which includes additional lectures about the biology of RNA-seq, can be found here: https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2. + +The data used in this demo can be found here: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50499. Citation: Kenny PJ, Zhou H, Kim M, Skariah G et al. 
MOV10 and FMRP regulate AGO2 association with microRNA recognition elements. Cell Rep 2014 Dec 11;9(5):1729-1741. PMID: 25464849 + +Have questions about Arvados? +* https://arvados.org/ +* https://gitter.im/arvados/community + +Have questions about CWL? +* https://www.commonwl.org/ +* https://cwl.discourse.group/ +* https://gitter.im/common-workflow-language/common-workflow-language + diff --git a/RNA-Seq/cwl/RNA-seq-wf.cwl b/RNA-Seq/cwl/RNA-seq-wf.cwl new file mode 100644 index 0000000..183bca1 --- /dev/null +++ b/RNA-Seq/cwl/RNA-seq-wf.cwl @@ -0,0 +1,58 @@ +cwlVersion: v1.2 +class: Workflow +label: RNAseq workflow + +inputs: + fqdir: + type: Directory + loadListing: shallow_listing + genome: Directory + gtf: File + +steps: + splitDir: + in: + fqdir: fqdir + run: helper/splitDir.cwl + out: [fq] + + alignment: + run: helper/alignment.cwl + scatter: fq + in: + fq: splitDir/fq + genome: genome + gtf: gtf + out: [qc_html, bam_sorted_indexed] + + featureCounts: + requirements: + ResourceRequirement: + ramMin: 500 + run: helper/featureCounts.cwl + in: + counts_input_bam: alignment/bam_sorted_indexed + gtf: gtf + out: [featurecounts] + + output-subdirs: + run: helper/subdirs.cwl + in: + fq: splitDir/fq + bams: alignment/bam_sorted_indexed + qc: alignment/qc_html + out: [dirs] + +outputs: + dirs: + type: Directory[] + outputSource: output-subdirs/dirs + + featurecounts: + type: File + outputSource: featureCounts/featurecounts + +requirements: + SubworkflowFeatureRequirement: {} + ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} diff --git a/RNA-Seq/cwl/helper/STAR-Align.cwl b/RNA-Seq/cwl/helper/STAR-Align.cwl new file mode 100755 index 0000000..fabb73a --- /dev/null +++ b/RNA-Seq/cwl/helper/STAR-Align.cwl @@ -0,0 +1,251 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +hints: + DockerRequirement: + dockerPull: "quay.io/biocontainers/star:2.7.5c--0" + +inputs: + # Required Inputs + RunThreadN: + type: int + inputBinding: + prefix: 
"--runThreadN" + + GenomeDir: + type: Directory + inputBinding: + prefix: "--genomeDir" + + ForwardReads: + type: + - File + - File[] + inputBinding: + prefix: "--readFilesIn" + itemSeparator: "," + position: 1 + # If paired-end reads (like Illumina), both 1 and 2 must be provided. + ReverseReads: + type: + - "null" + - File + - File[] + inputBinding: + prefix: "" + separate: false + itemSeparator: "," + position: 2 + + # Optional Inputs + Gtf: + type: File? + inputBinding: + prefix: "--sjdbGTFfile" + + Overhang: + type: int? + inputBinding: + prefix: "--sjdbOverhang" + + OutFilterType: + type: + - "null" + - type: enum + symbols: + - Normal + - BySJout + inputBinding: + prefix: "--outFilterType" + + OutFilterIntronMotifs: + type: + - "null" + - type: enum + symbols: + - None + - RemoveNoncanonical + - RemoveNoncanonicalUnannotated + inputBinding: + prefix: "--outFilterIntronMotifs" + + OutSAMtype: + type: + - "null" + - type: enum + symbols: + - "BAM" + - "SAM" + inputBinding: + prefix: "--outSAMtype" + position: 3 + + Unsorted: + type: boolean? + inputBinding: + prefix: "Unsorted" + position: 4 + + SortedByCoordinate: + type: boolean? + inputBinding: + prefix: "SortedByCoordinate" + position: 5 + + ReadFilesCommand: + type: string? + inputBinding: + prefix: "--readFilesCommand" + + AlignIntronMin: + type: int? + inputBinding: + prefix: "--alignIntronMin" + + AlignIntronMax: + type: int? + inputBinding: + prefix: "--alignIntronMax" + + AlignMatesGapMax: + type: int? + inputBinding: + prefix: "--alignMatesGapMax" + + AlignSJoverhangMin: + type: int? + inputBinding: + prefix: "--alignSJoverhangMin" + + AlignSJDBoverhangMin: + type: int? + inputBinding: + prefix: "--alignSJDBoverhangMin" + + SeedSearchStartLmax: + type: int? 
+ inputBinding: + prefix: "--seedSearchStartLmax" + + ChimOutType: + type: + - "null" + - type: enum + symbols: + - Junctions + - SeparateSAMold + - WithinBAM + - "WithinBAM HardClip" + - "WithinBAM SoftClip" + # Bind the selected mode to --chimOutType; without an inputBinding the value was accepted but never added to the STAR command line. + inputBinding: + prefix: "--chimOutType" + + ChimSegmentMin: + type: int? 
+ inputBinding: + prefix: "--chimSegmentMin" + + ChimJunctionOverhangMin: + type: int? + inputBinding: + prefix: "--chimJunctionOverhangMin" + + OutFilterMultimapNmax: + type: int? + inputBinding: + prefix: "--outFilterMultimapNmax" + + OutFilterMismatchNmax: + type: int? + inputBinding: + prefix: "--outFilterMismatchNmax" + + OutFilterMismatchNoverLmax: + type: double? + inputBinding: + prefix: "--outFilterMismatchNoverLmax" + + OutReadsUnmapped: + type: + - "null" + - type: enum + symbols: + - None + - Fastx + inputBinding: + prefix: "--outReadsUnmapped" + + OutSAMstrandField: + type: + - "null" + - type: enum + symbols: + - None + - intronMotif + inputBinding: + prefix: "--outSAMstrandField" + + OutSAMunmapped: + type: + - "null" + - type: enum + symbols: + - None + - Within + - "Within KeepPairs" + inputBinding: + prefix: "--outSAMunmapped" + + OutSAMmapqUnique: + type: int? + inputBinding: + prefix: "--outSAMmapqUnique" + + OutSamMode: + type: + - "null" + - type: enum + symbols: + - None + - Full + - NoQS + inputBinding: + prefix: "--outSAMmode" + + LimitOutSAMoneReadBytes: + type: int? + inputBinding: + prefix: "--limitOutSAMoneReadBytes" + + OutFileNamePrefix: + type: string? 
+ inputBinding: + prefix: "--outFileNamePrefix" + + GenomeLoad: + type: + - "null" + - type: enum + symbols: + - LoadAndKeep + - LoadAndRemove + - LoadAndExit + - Remove + - NoSharedMemory + inputBinding: + prefix: "--genomeLoad" + +baseCommand: [STAR, --runMode, alignReads] + +outputs: + alignment: + type: + - File + outputBinding: + glob: "*.bam" + unmapped_reads: + type: ["null", File] + outputBinding: + glob: "Unmapped.out*" diff --git a/RNA-Seq/cwl/helper/alignment.cwl b/RNA-Seq/cwl/helper/alignment.cwl new file mode 100644 index 0000000..702ba05 --- /dev/null +++ b/RNA-Seq/cwl/helper/alignment.cwl @@ -0,0 +1,47 @@ +cwlVersion: v1.2 +class: Workflow +label: RNAseq Alignment workflow + +inputs: + fq: File + genome: Directory + gtf: File + +requirements: + StepInputExpressionRequirement: {} + +steps: + fastqc: + run: fastqc_2.cwl + in: + reads_file: fq + out: [html_file] + + STAR: + requirements: + ResourceRequirement: + ramMin: 9000 + run: STAR-Align.cwl + in: + RunThreadN: {default: 4} + GenomeDir: genome + ForwardReads: fq + OutSAMtype: {default: BAM} + SortedByCoordinate: {default: true} + OutSAMunmapped: {default: Within} + OutFileNamePrefix: {valueFrom: "$(inputs.ForwardReads.nameroot)."} + out: [alignment] + + samtools: + run: samtools_index.cwl + in: + bam_sorted: STAR/alignment + out: [bam_sorted_indexed] + +outputs: + qc_html: + type: File + outputSource: fastqc/html_file + bam_sorted_indexed: + type: File + outputSource: samtools/bam_sorted_indexed diff --git a/RNA-Seq/cwl/helper/fastqc_2.cwl b/RNA-Seq/cwl/helper/fastqc_2.cwl new file mode 100755 index 0000000..575c9b0 --- /dev/null +++ b/RNA-Seq/cwl/helper/fastqc_2.cwl @@ -0,0 +1,183 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1 + SoftwareRequirement: + packages: + fastqc: + specs: [ "http://identifiers.org/biotools/fastqc" ] + version: [ "0.11.9--hdfd78af_1", "0.11.9" ] + +inputs: + 
+ reads_file: + type: File + inputBinding: + position: 50 + doc: | + Input bam,sam,bam_mapped,sam_mapped or fastq file + + format_enum: + type: + - "null" + - type: enum + name: "format" + symbols: ['bam','sam','bam_mapped','sam_mapped','fastq'] + inputBinding: + position: 6 + prefix: '--format' + doc: | + Bypasses the normal sequence file format detection and + forces the program to use the specified format. Valid + formats are bam,sam,bam_mapped,sam_mapped and fastq + + threads: + type: int? + inputBinding: + position: 7 + prefix: '--threads' + doc: | + Specifies the number of files which can be processed + simultaneously. Each thread will be allocated 250MB of + memory so you shouldn't run more threads than your + available memory will cope with, and not more than + 6 threads on a 32 bit machine + + contaminants: + type: File? + inputBinding: + position: 8 + prefix: '--contaminants' + doc: | + Specifies a non-default file which contains the list of + contaminants to screen overrepresented sequences against. + The file must contain sets of named contaminants in the + form name[tab]sequence. Lines prefixed with a hash will + be ignored. + + adapters: + type: File? + inputBinding: + position: 9 + prefix: '--adapters' + doc: | + Specifies a non-default file which contains the list of + adapter sequences which will be explicity searched against + the library. The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash + will be ignored. + + limits: + type: File? + inputBinding: + position: 10 + prefix: '--limits' + doc: | + Specifies a non-default file which contains a set of criteria + which will be used to determine the warn/error limits for the + various modules. This file can also be used to selectively + remove some modules from the output all together. The format + needs to mirror the default limits.txt file found in the + Configuration folder. + + kmers: + type: int? 
+ inputBinding: + position: 11 + prefix: '--kmers' + doc: | + Specifies the length of Kmer to look for in the Kmer content + module. Specified Kmer length must be between 2 and 10. Default + length is 7 if not specified. + + casava: + type: boolean? + inputBinding: + position: 13 + prefix: '--casava' + doc: | + Files come from raw casava output. Files in the same sample + group (differing only by the group number) will be analysed + as a set rather than individually. Sequences with the filter + flag set in the header will be excluded from the analysis. + Files must have the same names given to them by casava + (including being gzipped and ending with .gz) otherwise they + won't be grouped together correctly. + + nofilter: + type: boolean? + inputBinding: + position: 14 + prefix: '--nofilter' + doc: | + If running with --casava then don't remove read flagged by + casava as poor quality when performing the QC analysis. + + hide_group: + type: boolean? + inputBinding: + position: 15 + prefix: '--nogroup' + doc: | + Disable grouping of bases for reads >50bp. All reports will + show data for every base in the read. WARNING: Using this + option will cause fastqc to crash and burn if you use it on + really long reads, and your plots may end up a ridiculous size. + You have been warned! + +outputs: + + zipped_file: + type: File + outputBinding: + glob: '*.zip' + html_file: + type: File + outputBinding: + glob: '*.html' + summary_file: + type: File + outputBinding: + glob: "*/summary.txt" + +baseCommand: [fastqc, --extract, --outdir, .] 
+ +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "fastqc_2" +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Tool runs FastQC from Babraham Bioinformatics diff --git a/RNA-Seq/cwl/helper/featureCounts.cwl b/RNA-Seq/cwl/helper/featureCounts.cwl new file mode 100644 index 0000000..a17163f --- /dev/null +++ b/RNA-Seq/cwl/helper/featureCounts.cwl @@ -0,0 +1,25 @@ +cwlVersion: v1.2 +class: CommandLineTool + +inputs: + gtf: File + counts_input_bam: + - File + - File[] + +baseCommand: featureCounts + +arguments: [-T, $(runtime.cores), + -a, $(inputs.gtf), + -o, featurecounts.tsv, + $(inputs.counts_input_bam)] + +outputs: + featurecounts: + type: File + outputBinding: + glob: featurecounts.tsv + +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/subread:1.5.0p3--0 diff --git a/RNA-Seq/cwl/helper/samtools_index.cwl b/RNA-Seq/cwl/helper/samtools_index.cwl new file mode 100755 index 0000000..5d0a2de --- /dev/null +++ b/RNA-Seq/cwl/helper/samtools_index.cwl @@ -0,0 +1,37 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +doc: | + Indexing BAM. 
+ +requirements: + InitialWorkDirRequirement: + listing: + - $(inputs.bam_sorted) +hints: + ResourceRequirement: + coresMin: 1 + ramMin: 20000 + DockerRequirement: + dockerPull: quay.io/biocontainers/samtools:1.14--hb421002_0 + +baseCommand: ["samtools", "index"] +arguments: + - valueFrom: -b # specifies that index is created in bai format + position: 1 + +inputs: + bam_sorted: + doc: sorted bam input file + type: File + inputBinding: + position: 2 + +outputs: + bam_sorted_indexed: + type: File + secondaryFiles: .bai + outputBinding: + glob: $(inputs.bam_sorted.basename) + diff --git a/RNA-Seq/cwl/helper/splitDir.cwl b/RNA-Seq/cwl/helper/splitDir.cwl new file mode 100644 index 0000000..e8be6c3 --- /dev/null +++ b/RNA-Seq/cwl/helper/splitDir.cwl @@ -0,0 +1,13 @@ +# Returns the listing of the input Directory as the File[] output "fq". +cwlVersion: v1.2 +class: ExpressionTool +requirements: + InlineJavascriptRequirement: {} +inputs: + fqdir: + type: Directory + # CWL v1.1+ defaults to no_listing; request the listing here so inputs.fqdir.listing is populated for the expression below. + loadListing: shallow_listing +outputs: + fq: File[] +expression: '${return {fq: inputs.fqdir.listing};}' diff --git a/RNA-Seq/cwl/helper/subdirs.cwl b/RNA-Seq/cwl/helper/subdirs.cwl new file mode 100644 index 0000000..fc4fe7d --- /dev/null +++ b/RNA-Seq/cwl/helper/subdirs.cwl @@ -0,0 +1,22 @@ +cwlVersion: v1.2 +class: ExpressionTool +requirements: + InlineJavascriptRequirement: {} +inputs: + fq: File[] + bams: File[] + qc: File[] +outputs: + dirs: Directory[] +expression: |- + ${ + var dirs = []; + for (var i = 0; i < inputs.bams.length; i++) { + dirs.push({ + "class": "Directory", + "basename": inputs.fq[i].nameroot, + "listing": [inputs.bams[i], inputs.qc[i]] + }); + } + return {"dirs": dirs}; + } diff --git a/RNA-Seq/docker/Dockerfile b/RNA-Seq/docker/Dockerfile new file mode 100644 index 0000000..e66754d --- /dev/null +++ b/RNA-Seq/docker/Dockerfile @@ -0,0 +1,14 @@ +FROM debian:10-slim 
+MAINTAINER Peter Amstutz + +RUN apt-get update -qy +RUN apt-get install -qy build-essential wget unzip zlib1g-dev + +# Install BWA 0.7.17 +RUN wget https://github.com/lh3/bwa/archive/v0.7.17.zip && \ + unzip v0.7.17 && \ + cd 
bwa-0.7.17 && \ + make && \ + cp bwa /usr/bin && \ + cd .. && \ + rm -rf bwa-0.7.17 diff --git a/RNA-Seq/yml/RNA-seq-wf.yml b/RNA-Seq/yml/RNA-seq-wf.yml new file mode 100644 index 0000000..3c3b3ae --- /dev/null +++ b/RNA-Seq/yml/RNA-seq-wf.yml @@ -0,0 +1,12 @@ +fqdir: + class: Directory + location: keep:1360b500543d1d0b041084a2f99d33b6+567/ + #location: keep:pirca-4zz18-blweknwtwyjys0i/ +genome: + class: Directory + location: keep:64e703acc21226ce99f3d88eacdacd0b+3159/hg19-STAR-index + #location: keep:pirca-4zz18-c543b5welq68g90/hg19-STAR-index +gtf: + class: File + location: keep:64e703acc21226ce99f3d88eacdacd0b+3159/chr1-hg19_genes.gtf + #location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf diff --git a/RNA-Seq/yml/alignment.yml b/RNA-Seq/yml/alignment.yml new file mode 100644 index 0000000..742b47b --- /dev/null +++ b/RNA-Seq/yml/alignment.yml @@ -0,0 +1,10 @@ +fq: + class: File + location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq +genome: + class: Directory + location: keep:pirca-4zz18-c543b5welq68g90/hg19-STAR-index +gtf: + class: File + location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf + diff --git a/WGS-processing/cwl/helper/bwamem-samtools-view.cwl b/WGS-processing/cwl/helper/bwamem-samtools-view.cwl index cf9210c..f4a3ed1 100644 --- a/WGS-processing/cwl/helper/bwamem-samtools-view.cwl +++ b/WGS-processing/cwl/helper/bwamem-samtools-view.cwl @@ -12,8 +12,8 @@ hints: keep_cache: 1024 outputDirType: keep_output_dir ResourceRequirement: - ramMin: 50000 - coresMin: 16 + ramMin: 25000 + coresMin: 8 SoftwareRequirement: packages: BWA: diff --git a/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl b/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl index 52413a6..f6d625a 100644 --- a/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl +++ b/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl @@ -54,7 +54,7 @@ outputs: format: edam:format_2572 # BAM label: Recalibrated BAM for given interval 
secondaryFiles: - - .bai + - ^.bai outputBinding: glob: "*nodups_BQSR.bam" diff --git a/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl b/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl index 4b3d1a7..0b6c102 100644 --- a/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl +++ b/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl @@ -25,7 +25,7 @@ inputs: format: edam:format_2572 # BAM label: Recalibrated BAM for given interval secondaryFiles: - - .bai + - ^.bai reference: type: File format: edam:format_1929 # FASTA