From c604c42bb92a6301b13a250582eabed509adaae1 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Mon, 25 Oct 2021 20:02:45 +0000 Subject: [PATCH 1/8] Bugfix .bai secondary file for ApplyBSQR behavior no issue # Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl | 2 +- .../cwl/helper/gatk-haplotypecaller-with-interval.cwl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl b/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl index 52413a6..f6d625a 100644 --- a/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl +++ b/WGS-processing/cwl/helper/gatk-applyBSQR-with-interval.cwl @@ -54,7 +54,7 @@ outputs: format: edam:format_2572 # BAM label: Recalibrated BAM for given interval secondaryFiles: - - .bai + - ^.bai outputBinding: glob: "*nodups_BQSR.bam" diff --git a/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl b/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl index 4b3d1a7..0b6c102 100644 --- a/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl +++ b/WGS-processing/cwl/helper/gatk-haplotypecaller-with-interval.cwl @@ -25,7 +25,7 @@ inputs: format: edam:format_2572 # BAM label: Recalibrated BAM for given interval secondaryFiles: - - .bai + - ^.bai reference: type: File format: edam:format_1929 # FASTA -- 2.30.2 From 3103cae51ea3f51ef5d4e9827490b2609ab59b3b Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 26 Oct 2021 15:26:29 -0400 Subject: [PATCH 2/8] Lower RAM request by bwa to run on 9tee4 no issue # Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- WGS-processing/cwl/helper/bwamem-samtools-view.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WGS-processing/cwl/helper/bwamem-samtools-view.cwl b/WGS-processing/cwl/helper/bwamem-samtools-view.cwl index cf9210c..f4a3ed1 100644 --- a/WGS-processing/cwl/helper/bwamem-samtools-view.cwl +++ 
b/WGS-processing/cwl/helper/bwamem-samtools-view.cwl @@ -12,8 +12,8 @@ hints: keep_cache: 1024 outputDirType: keep_output_dir ResourceRequirement: - ramMin: 50000 - coresMin: 16 + ramMin: 25000 + coresMin: 8 SoftwareRequirement: packages: BWA: -- 2.30.2 From 5b4f2a85bedfad9e72e406e3e1990afc913c9da2 Mon Sep 17 00:00:00 2001 From: swz Date: Thu, 15 Sep 2022 12:44:44 +0000 Subject: [PATCH 3/8] Adding RNA-Seq demo no issue # Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek --- .licenseignore | 1 + RNA-Seq/cwl/RNA-seq-wf.cwl | 50 ++++++++++++++++++++++++++++ RNA-Seq/cwl/helper/alignment.cwl | 48 ++++++++++++++++++++++++++ RNA-Seq/cwl/helper/featureCounts.cwl | 25 ++++++++++++++ RNA-Seq/cwl/helper/subdirs.cwl | 22 ++++++++++++ RNA-Seq/docker/Dockerfile | 14 ++++++++ 6 files changed, 160 insertions(+) create mode 100644 RNA-Seq/cwl/RNA-seq-wf.cwl create mode 100644 RNA-Seq/cwl/helper/alignment.cwl create mode 100644 RNA-Seq/cwl/helper/featureCounts.cwl create mode 100644 RNA-Seq/cwl/helper/subdirs.cwl create mode 100644 RNA-Seq/docker/Dockerfile diff --git a/.licenseignore b/.licenseignore index 35252fd..e617614 100644 --- a/.licenseignore +++ b/.licenseignore @@ -1 +1,2 @@ LICENSE.txt +* diff --git a/RNA-Seq/cwl/RNA-seq-wf.cwl b/RNA-Seq/cwl/RNA-seq-wf.cwl new file mode 100644 index 0000000..97461ce --- /dev/null +++ b/RNA-Seq/cwl/RNA-seq-wf.cwl @@ -0,0 +1,50 @@ +cwlVersion: v1.2 +class: Workflow +label: RNAseq CWL practice workflow + +inputs: + fq: File[] + genome: Directory + gtf: File + +steps: + alignment: + run: alignment.cwl + scatter: fq + in: + fq: fq + genome: genome + gtf: gtf + out: [qc_html, bam_sorted_indexed] + + featureCounts: + requirements: + ResourceRequirement: + ramMin: 500 + run: featureCounts.cwl + in: + counts_input_bam: alignment/bam_sorted_indexed + gtf: gtf + out: [featurecounts] + + ### 2. 
Organizing output files into Directories + output-subdirs: + run: subdirs.cwl + in: + fq: fq + bams: alignment/bam_sorted_indexed + qc: alignment/qc_html + out: [dirs] + +outputs: + dirs: + type: Directory[] + outputSource: output-subdirs/dirs + + featurecounts: + type: File + outputSource: featureCounts/featurecounts + +requirements: + SubworkflowFeatureRequirement: {} + ScatterFeatureRequirement: {} diff --git a/RNA-Seq/cwl/helper/alignment.cwl b/RNA-Seq/cwl/helper/alignment.cwl new file mode 100644 index 0000000..6712aae --- /dev/null +++ b/RNA-Seq/cwl/helper/alignment.cwl @@ -0,0 +1,48 @@ +cwlVersion: v1.2 +class: Workflow +label: RNAseq CWL practice workflow + +inputs: + fq: File + genome: Directory + gtf: File + +requirements: + StepInputExpressionRequirement: {} + +steps: + fastqc: + run: bio-cwl-tools/fastqc/fastqc_2.cwl + in: + reads_file: fq + out: [html_file] + + STAR: + requirements: + ResourceRequirement: + ramMin: 9000 + run: bio-cwl-tools/STAR/STAR-Align.cwl + in: + RunThreadN: {default: 4} + GenomeDir: genome + ForwardReads: fq + OutSAMtype: {default: BAM} + SortedByCoordinate: {default: true} + OutSAMunmapped: {default: Within} + ### 1. 
Expressions on step inputs + OutFileNamePrefix: {valueFrom: "$(inputs.ForwardReads.nameroot)."} + out: [alignment] + + samtools: + run: bio-cwl-tools/samtools/samtools_index.cwl + in: + bam_sorted: STAR/alignment + out: [bam_sorted_indexed] + +outputs: + qc_html: + type: File + outputSource: fastqc/html_file + bam_sorted_indexed: + type: File + outputSource: samtools/bam_sorted_indexed diff --git a/RNA-Seq/cwl/helper/featureCounts.cwl b/RNA-Seq/cwl/helper/featureCounts.cwl new file mode 100644 index 0000000..a17163f --- /dev/null +++ b/RNA-Seq/cwl/helper/featureCounts.cwl @@ -0,0 +1,25 @@ +cwlVersion: v1.2 +class: CommandLineTool + +inputs: + gtf: File + counts_input_bam: + - File + - File[] + +baseCommand: featureCounts + +arguments: [-T, $(runtime.cores), + -a, $(inputs.gtf), + -o, featurecounts.tsv, + $(inputs.counts_input_bam)] + +outputs: + featurecounts: + type: File + outputBinding: + glob: featurecounts.tsv + +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/subread:1.5.0p3--0 diff --git a/RNA-Seq/cwl/helper/subdirs.cwl b/RNA-Seq/cwl/helper/subdirs.cwl new file mode 100644 index 0000000..fc4fe7d --- /dev/null +++ b/RNA-Seq/cwl/helper/subdirs.cwl @@ -0,0 +1,22 @@ +cwlVersion: v1.2 +class: ExpressionTool +requirements: + InlineJavascriptRequirement: {} +inputs: + fq: File[] + bams: File[] + qc: File[] +outputs: + dirs: Directory[] +expression: |- + ${ + var dirs = []; + for (var i = 0; i < inputs.bams.length; i++) { + dirs.push({ + "class": "Directory", + "basename": inputs.fq[i].nameroot, + "listing": [inputs.bams[i], inputs.qc[i]] + }); + } + return {"dirs": dirs}; + } diff --git a/RNA-Seq/docker/Dockerfile b/RNA-Seq/docker/Dockerfile new file mode 100644 index 0000000..e66754d --- /dev/null +++ b/RNA-Seq/docker/Dockerfile @@ -0,0 +1,14 @@ +FROM debian:10-slim +MAINTAINER Peter Amstutz + +RUN apt-get update -qy +RUN apt-get install -qy build-essential wget unzip zlib1g-dev + +# Install BWA 0.7.17 +RUN wget 
https://github.com/lh3/bwa/archive/v0.7.17.zip && \ + unzip v0.7.17 && \ + cd bwa-0.7.17 && \ + make && \ + cp bwa /usr/bin && \ + cd .. && \ + rm -rf bwa-0.7.17 -- 2.30.2 From bec60d8f8671fedf159d9be0c02196ebaf602d53 Mon Sep 17 00:00:00 2001 From: swz Date: Thu, 15 Sep 2022 14:04:58 +0000 Subject: [PATCH 4/8] Adding helper files and updating readme no issue # Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek --- RNA-Seq/README.md | 13 ++ RNA-Seq/cwl/RNA-seq-wf.cwl | 7 +- RNA-Seq/cwl/helper/STAR-Align.cwl | 255 ++++++++++++++++++++++++++ RNA-Seq/cwl/helper/alignment.cwl | 7 +- RNA-Seq/cwl/helper/fastqc_2.cwl | 183 ++++++++++++++++++ RNA-Seq/cwl/helper/samtools_index.cwl | 42 +++++ RNA-Seq/yml/RNA-seq-wf.yml | 14 ++ RNA-Seq/yml/alignment.yml | 11 ++ 8 files changed, 524 insertions(+), 8 deletions(-) create mode 100644 RNA-Seq/README.md create mode 100755 RNA-Seq/cwl/helper/STAR-Align.cwl create mode 100755 RNA-Seq/cwl/helper/fastqc_2.cwl create mode 100755 RNA-Seq/cwl/helper/samtools_index.cwl create mode 100644 RNA-Seq/yml/RNA-seq-wf.yml create mode 100644 RNA-Seq/yml/alignment.yml diff --git a/RNA-Seq/README.md b/RNA-Seq/README.md new file mode 100644 index 0000000..25677fd --- /dev/null +++ b/RNA-Seq/README.md @@ -0,0 +1,13 @@ +This directory contains an Arvados demo that performs bioinformatics RNA-seq analysis. However, specific knowledge of the biology of RNA-seq is not required for this demo. For those unfamiliar with RNA-seq, it is the process of sequencing RNA present in a biological sample. From the sequence reads, we want to measure the relative numbers of different RNA molecules appearing in the sample that were produced by particular genes. This analysis is called “differential gene expression”. + +Workflows are written in CWL v1.2. 
(https://www.commonwl.org/) + +Subdirectories are: +* cwl - contains CWL code for the demo +* yml - contains YML inputs for cwl demo code +* docker - contains dockerfiles necessary to re-create any needed docker images + +To run the workflow: + +* arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID ./cwl/RNA-seq-wf.cwl ./yml/RNA-seq-wf.yml + diff --git a/RNA-Seq/cwl/RNA-seq-wf.cwl b/RNA-Seq/cwl/RNA-seq-wf.cwl index 97461ce..8da505c 100644 --- a/RNA-Seq/cwl/RNA-seq-wf.cwl +++ b/RNA-Seq/cwl/RNA-seq-wf.cwl @@ -9,7 +9,7 @@ inputs: steps: alignment: - run: alignment.cwl + run: helper/alignment.cwl scatter: fq in: fq: fq @@ -21,15 +21,14 @@ steps: requirements: ResourceRequirement: ramMin: 500 - run: featureCounts.cwl + run: helper/featureCounts.cwl in: counts_input_bam: alignment/bam_sorted_indexed gtf: gtf out: [featurecounts] - ### 2. Organizing output files into Directories output-subdirs: - run: subdirs.cwl + run: helper/subdirs.cwl in: fq: fq bams: alignment/bam_sorted_indexed diff --git a/RNA-Seq/cwl/helper/STAR-Align.cwl b/RNA-Seq/cwl/helper/STAR-Align.cwl new file mode 100755 index 0000000..24a640b --- /dev/null +++ b/RNA-Seq/cwl/helper/STAR-Align.cwl @@ -0,0 +1,255 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +hints: + DockerRequirement: + dockerPull: "quay.io/biocontainers/star:2.7.5c--0" + +inputs: + # Required Inputs + RunThreadN: + type: int + inputBinding: + prefix: "--runThreadN" + + GenomeDir: + type: Directory + inputBinding: + prefix: "--genomeDir" + + ForwardReads: + format: edam:format_1930 # FASTQ + type: + - File + - File[] + inputBinding: + prefix: "--readFilesIn" + itemSeparator: "," + position: 1 + # If paired-end reads (like Illumina), both 1 and 2 must be provided. + ReverseReads: + format: edam:format_1930 # FASTQ + type: + - "null" + - File + - File[] + inputBinding: + prefix: "" + separate: false + itemSeparator: "," + position: 2 + + # Optional Inputs + Gtf: + type: File? 
+ inputBinding: + prefix: "--sjdbGTFfile" + + Overhang: + type: int? + inputBinding: + prefix: "--sjdbOverhang" + + OutFilterType: + type: + - "null" + - type: enum + symbols: + - Normal + - BySJout + inputBinding: + prefix: "--outFilterType" + + OutFilterIntronMotifs: + type: + - "null" + - type: enum + symbols: + - None + - RemoveNoncanonical + - RemoveNoncanonicalUnannotated + inputBinding: + prefix: "--outFilterIntronMotifs" + + OutSAMtype: + type: + - "null" + - type: enum + symbols: + - "BAM" + - "SAM" + inputBinding: + prefix: "--outSAMtype" + position: 3 + + Unsorted: + type: boolean? + inputBinding: + prefix: "Unsorted" + position: 4 + + SortedByCoordinate: + type: boolean? + inputBinding: + prefix: "SortedByCoordinate" + position: 5 + + ReadFilesCommand: + type: string? + inputBinding: + prefix: "--readFilesCommand" + + AlignIntronMin: + type: int? + inputBinding: + prefix: "--alignIntronMin" + + AlignIntronMax: + type: int? + inputBinding: + prefix: "--alignIntronMax" + + AlignMatesGapMax: + type: int? + inputBinding: + prefix: "--alignMatesGapMax" + + AlignSJoverhangMin: + type: int? + inputBinding: + prefix: "--alignSJoverhangMin" + + AlignSJDBoverhangMin: + type: int? + inputBinding: + prefix: "--alignSJDBoverhangMin" + + SeedSearchStartLmax: + type: int? + inputBinding: + prefix: "--seedSearchStartLmax" + + ChimOutType: + type: + - "null" + - type: enum + symbols: + - Junctions + - SeparateSAMold + - WithinBAM + - "WithinBAM HardClip" + - "WithinBAM SoftClip" + + ChimSegmentMin: + type: int? + inputBinding: + prefix: "--chimSegmentMin" + + ChimJunctionOverhangMin: + type: int? + inputBinding: + prefix: "--chimJunctionOverhangMin" + + OutFilterMultimapNmax: + type: int? + inputBinding: + prefix: "--outFilterMultimapNmax" + + OutFilterMismatchNmax: + type: int? + inputBinding: + prefix: "--outFilterMismatchNmax" + + OutFilterMismatchNoverLmax: + type: double? 
+ inputBinding: + prefix: "--outFilterMismatchNoverLmax" + + OutReadsUnmapped: + type: + - "null" + - type: enum + symbols: + - None + - Fastx + inputBinding: + prefix: "--outReadsUnmapped" + + OutSAMstrandField: + type: + - "null" + - type: enum + symbols: + - None + - intronMotif + inputBinding: + prefix: "--outSAMstrandField" + + OutSAMunmapped: + type: + - "null" + - type: enum + symbols: + - None + - Within + - "Within KeepPairs" + inputBinding: + prefix: "--outSAMunmapped" + + OutSAMmapqUnique: + type: int? + inputBinding: + prefix: "--outSAMmapqUnique" + + OutSamMode: + type: + - "null" + - type: enum + symbols: + - None + - Full + - NoQS + inputBinding: + prefix: "--outSAMmode" + + LimitOutSAMoneReadBytes: + type: int? + inputBinding: + prefix: "--limitOutSAMoneReadBytes" + + OutFileNamePrefix: + type: string? + inputBinding: + prefix: "--outFileNamePrefix" + + GenomeLoad: + type: + - "null" + - type: enum + symbols: + - LoadAndKeep + - LoadAndRemove + - LoadAndExit + - Remove + - NoSharedMemory + inputBinding: + prefix: "--genomeLoad" + +baseCommand: [STAR, --runMode, alignReads] + +outputs: + alignment: + type: + - File + outputBinding: + glob: "*.bam" + unmapped_reads: + type: ["null", File] + outputBinding: + glob: "Unmapped.out*" + +$namespaces: + edam: https://edamontology.org/ +$schemas: + - https://edamontology.org/EDAM_1.18.owl diff --git a/RNA-Seq/cwl/helper/alignment.cwl b/RNA-Seq/cwl/helper/alignment.cwl index 6712aae..e2a434f 100644 --- a/RNA-Seq/cwl/helper/alignment.cwl +++ b/RNA-Seq/cwl/helper/alignment.cwl @@ -12,7 +12,7 @@ requirements: steps: fastqc: - run: bio-cwl-tools/fastqc/fastqc_2.cwl + run: fastqc_2.cwl in: reads_file: fq out: [html_file] @@ -21,7 +21,7 @@ steps: requirements: ResourceRequirement: ramMin: 9000 - run: bio-cwl-tools/STAR/STAR-Align.cwl + run: STAR-Align.cwl in: RunThreadN: {default: 4} GenomeDir: genome @@ -29,12 +29,11 @@ steps: OutSAMtype: {default: BAM} SortedByCoordinate: {default: true} OutSAMunmapped: {default: 
Within} - ### 1. Expressions on step inputs OutFileNamePrefix: {valueFrom: "$(inputs.ForwardReads.nameroot)."} out: [alignment] samtools: - run: bio-cwl-tools/samtools/samtools_index.cwl + run: samtools_index.cwl in: bam_sorted: STAR/alignment out: [bam_sorted_indexed] diff --git a/RNA-Seq/cwl/helper/fastqc_2.cwl b/RNA-Seq/cwl/helper/fastqc_2.cwl new file mode 100755 index 0000000..575c9b0 --- /dev/null +++ b/RNA-Seq/cwl/helper/fastqc_2.cwl @@ -0,0 +1,183 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1 + SoftwareRequirement: + packages: + fastqc: + specs: [ "http://identifiers.org/biotools/fastqc" ] + version: [ "0.11.9--hdfd78af_1", "0.11.9" ] + +inputs: + + reads_file: + type: File + inputBinding: + position: 50 + doc: | + Input bam,sam,bam_mapped,sam_mapped or fastq file + + format_enum: + type: + - "null" + - type: enum + name: "format" + symbols: ['bam','sam','bam_mapped','sam_mapped','fastq'] + inputBinding: + position: 6 + prefix: '--format' + doc: | + Bypasses the normal sequence file format detection and + forces the program to use the specified format. Valid + formats are bam,sam,bam_mapped,sam_mapped and fastq + + threads: + type: int? + inputBinding: + position: 7 + prefix: '--threads' + doc: | + Specifies the number of files which can be processed + simultaneously. Each thread will be allocated 250MB of + memory so you shouldn't run more threads than your + available memory will cope with, and not more than + 6 threads on a 32 bit machine + + contaminants: + type: File? + inputBinding: + position: 8 + prefix: '--contaminants' + doc: | + Specifies a non-default file which contains the list of + contaminants to screen overrepresented sequences against. + The file must contain sets of named contaminants in the + form name[tab]sequence. Lines prefixed with a hash will + be ignored. + + adapters: + type: File? 
+ inputBinding: + position: 9 + prefix: '--adapters' + doc: | + Specifies a non-default file which contains the list of + adapter sequences which will be explicitly searched against + the library. The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash + will be ignored. + + limits: + type: File? + inputBinding: + position: 10 + prefix: '--limits' + doc: | + Specifies a non-default file which contains a set of criteria + which will be used to determine the warn/error limits for the + various modules. This file can also be used to selectively + remove some modules from the output altogether. The format + needs to mirror the default limits.txt file found in the + Configuration folder. + + kmers: + type: int? + inputBinding: + position: 11 + prefix: '--kmers' + doc: | + Specifies the length of Kmer to look for in the Kmer content + module. Specified Kmer length must be between 2 and 10. Default + length is 7 if not specified. + + casava: + type: boolean? + inputBinding: + position: 13 + prefix: '--casava' + doc: | + Files come from raw casava output. Files in the same sample + group (differing only by the group number) will be analysed + as a set rather than individually. Sequences with the filter + flag set in the header will be excluded from the analysis. + Files must have the same names given to them by casava + (including being gzipped and ending with .gz) otherwise they + won't be grouped together correctly. + + nofilter: + type: boolean? + inputBinding: + position: 14 + prefix: '--nofilter' + doc: | + If running with --casava then don't remove reads flagged by + casava as poor quality when performing the QC analysis. + + hide_group: + type: boolean? + inputBinding: + position: 15 + prefix: '--nogroup' + doc: | + Disable grouping of bases for reads >50bp. All reports will + show data for every base in the read. 
WARNING: Using this + option will cause fastqc to crash and burn if you use it on + really long reads, and your plots may end up a ridiculous size. + You have been warned! + +outputs: + + zipped_file: + type: File + outputBinding: + glob: '*.zip' + html_file: + type: File + outputBinding: + glob: '*.html' + summary_file: + type: File + outputBinding: + glob: "*/summary.txt" + +baseCommand: [fastqc, --extract, --outdir, .] + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "fastqc_2" +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Tool runs FastQC from Babraham Bioinformatics diff --git a/RNA-Seq/cwl/helper/samtools_index.cwl b/RNA-Seq/cwl/helper/samtools_index.cwl new file mode 100755 index 0000000..3ae2e9b --- /dev/null +++ b/RNA-Seq/cwl/helper/samtools_index.cwl @@ -0,0 +1,42 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +doc: | + Indexing BAM. 
+ +requirements: + InitialWorkDirRequirement: + listing: + - $(inputs.bam_sorted) +hints: + ResourceRequirement: + coresMin: 1 + ramMin: 20000 + DockerRequirement: + dockerPull: quay.io/biocontainers/samtools:1.14--hb421002_0 + +baseCommand: ["samtools", "index"] +arguments: + - valueFrom: -b # specifies that index is created in bai format + position: 1 + +inputs: + bam_sorted: + doc: sorted bam input file + type: File + inputBinding: + position: 2 + +outputs: + bam_sorted_indexed: + type: File + secondaryFiles: .bai + format: edam:format_2572 # BAM + outputBinding: + glob: $(inputs.bam_sorted.basename) + +$namespaces: + edam: https://edamontology.org/ +$schemas: + - https://edamontology.org/EDAM_1.18.owl diff --git a/RNA-Seq/yml/RNA-seq-wf.yml b/RNA-Seq/yml/RNA-seq-wf.yml new file mode 100644 index 0000000..19f8c54 --- /dev/null +++ b/RNA-Seq/yml/RNA-seq-wf.yml @@ -0,0 +1,14 @@ +fq: + - class: File + location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq + format: http://edamontology.org/format_1930 + - class: File + location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_2.subset.fq + format: http://edamontology.org/format_1930 +genome: + class: Directory + location: keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index +gtf: + class: File + location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf + diff --git a/RNA-Seq/yml/alignment.yml b/RNA-Seq/yml/alignment.yml new file mode 100644 index 0000000..83d0e48 --- /dev/null +++ b/RNA-Seq/yml/alignment.yml @@ -0,0 +1,11 @@ +fq: + class: File + location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq + format: http://edamontology.org/format_1930 +genome: + class: Directory + location: keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index +gtf: + class: File + location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf + -- 2.30.2 From 72f04012089ad1a81148056e7943a0bc531b742d Mon Sep 17 00:00:00 2001 From: swz Date: Thu, 15 Sep 2022 18:03:53 +0000 Subject: [PATCH 5/8] Moving to directory no issue 
# Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek --- RNA-Seq/cwl/RNA-seq-wf.cwl | 14 ++++++++++---- RNA-Seq/cwl/helper/alignment.cwl | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/RNA-Seq/cwl/RNA-seq-wf.cwl b/RNA-Seq/cwl/RNA-seq-wf.cwl index 8da505c..07386c1 100644 --- a/RNA-Seq/cwl/RNA-seq-wf.cwl +++ b/RNA-Seq/cwl/RNA-seq-wf.cwl @@ -1,9 +1,9 @@ cwlVersion: v1.2 class: Workflow -label: RNAseq CWL practice workflow +label: RNAseq workflow inputs: - fq: File[] + fqdir: Directory genome: Directory gtf: File @@ -12,7 +12,8 @@ steps: run: helper/alignment.cwl scatter: fq in: - fq: fq + fq: + valueFrom: $(inputs.fq.listing) genome: genome gtf: gtf out: [qc_html, bam_sorted_indexed] @@ -30,7 +31,8 @@ steps: output-subdirs: run: helper/subdirs.cwl in: - fq: fq + fq: + valueFrom: $(inputs.fq.listing) bams: alignment/bam_sorted_indexed qc: alignment/qc_html out: [dirs] @@ -47,3 +49,7 @@ outputs: requirements: SubworkflowFeatureRequirement: {} ScatterFeatureRequirement: {} + +hints: + LoadListingRequirement: + loadListing: shallow_listing diff --git a/RNA-Seq/cwl/helper/alignment.cwl b/RNA-Seq/cwl/helper/alignment.cwl index e2a434f..702ba05 100644 --- a/RNA-Seq/cwl/helper/alignment.cwl +++ b/RNA-Seq/cwl/helper/alignment.cwl @@ -1,6 +1,6 @@ cwlVersion: v1.2 class: Workflow -label: RNAseq CWL practice workflow +label: RNAseq Alignment workflow inputs: fq: File -- 2.30.2 From 6093cec998ae55174ea6ca1e868be01c302c7294 Mon Sep 17 00:00:00 2001 From: swz Date: Thu, 15 Sep 2022 20:35:55 +0000 Subject: [PATCH 6/8] Updating to take a directory no issue # Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek --- RNA-Seq/cwl/RNA-seq-wf.cwl | 5 +++-- RNA-Seq/cwl/helper/STAR-Align.cwl | 7 ------- RNA-Seq/cwl/helper/samtools_index.cwl | 5 ----- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/RNA-Seq/cwl/RNA-seq-wf.cwl b/RNA-Seq/cwl/RNA-seq-wf.cwl index 07386c1..555e5e2 100644 --- a/RNA-Seq/cwl/RNA-seq-wf.cwl +++ b/RNA-Seq/cwl/RNA-seq-wf.cwl @@ -13,7 
+13,7 @@ steps: scatter: fq in: fq: - valueFrom: $(inputs.fq.listing) + valueFrom: $(inputs.fqdir.listing) genome: genome gtf: gtf out: [qc_html, bam_sorted_indexed] @@ -32,7 +32,7 @@ steps: run: helper/subdirs.cwl in: fq: - valueFrom: $(inputs.fq.listing) + valueFrom: $(inputs.fqdir.listing) bams: alignment/bam_sorted_indexed qc: alignment/qc_html out: [dirs] @@ -49,6 +49,7 @@ outputs: requirements: SubworkflowFeatureRequirement: {} ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} hints: LoadListingRequirement: diff --git a/RNA-Seq/cwl/helper/STAR-Align.cwl b/RNA-Seq/cwl/helper/STAR-Align.cwl index 24a640b..fabb73a 100755 --- a/RNA-Seq/cwl/helper/STAR-Align.cwl +++ b/RNA-Seq/cwl/helper/STAR-Align.cwl @@ -19,7 +19,6 @@ inputs: prefix: "--genomeDir" ForwardReads: - format: edam:format_1930 # FASTQ type: - File - File[] @@ -29,7 +28,6 @@ inputs: position: 1 # If paired-end reads (like Illumina), both 1 and 2 must be provided. ReverseReads: - format: edam:format_1930 # FASTQ type: - "null" - File @@ -248,8 +246,3 @@ outputs: type: ["null", File] outputBinding: glob: "Unmapped.out*" - -$namespaces: - edam: https://edamontology.org/ -$schemas: - - https://edamontology.org/EDAM_1.18.owl diff --git a/RNA-Seq/cwl/helper/samtools_index.cwl b/RNA-Seq/cwl/helper/samtools_index.cwl index 3ae2e9b..5d0a2de 100755 --- a/RNA-Seq/cwl/helper/samtools_index.cwl +++ b/RNA-Seq/cwl/helper/samtools_index.cwl @@ -32,11 +32,6 @@ outputs: bam_sorted_indexed: type: File secondaryFiles: .bai - format: edam:format_2572 # BAM outputBinding: glob: $(inputs.bam_sorted.basename) -$namespaces: - edam: https://edamontology.org/ -$schemas: - - https://edamontology.org/EDAM_1.18.owl -- 2.30.2 From 7bdffd71945b5426581f197a8afc2e2fedbf1bd4 Mon Sep 17 00:00:00 2001 From: swz Date: Thu, 15 Sep 2022 20:51:54 +0000 Subject: [PATCH 7/8] Updating for proper loadListing no issue # Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek --- RNA-Seq/README.md | 15 ++++++++++++++- 
RNA-Seq/cwl/RNA-seq-wf.cwl | 8 +++----- RNA-Seq/yml/RNA-seq-wf.yml | 12 ++++-------- RNA-Seq/yml/alignment.yml | 3 +-- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/RNA-Seq/README.md b/RNA-Seq/README.md index 25677fd..d4e3c22 100644 --- a/RNA-Seq/README.md +++ b/RNA-Seq/README.md @@ -5,9 +5,22 @@ Workflows are written in CWL v1.2. (https://www.commonwl.org/) Subdirectories are: * cwl - contains CWL code for the demo * yml - contains YML inputs for cwl demo code -* docker - contains dockerfiles necessary to re-create any needed docker images To run the workflow: * arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID ./cwl/RNA-seq-wf.cwl ./yml/RNA-seq-wf.yml +About the Demo: +This demo is based on a workflow in the Introduction to RNA-seq using high-performance computing (HPC) lessons developed by members of the teaching team at the Harvard Chan Bioinformatics Core (HBC). The original training, which includes additional lectures about the biology of RNA-seq, can be found here: https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2. + +The data used in this demo can be found here: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE50499. Citation: Kenny PJ, Zhou H, Kim M, Skariah G et al. MOV10 and FMRP regulate AGO2 association with microRNA recognition elements. Cell Rep 2014 Dec 11;9(5):1729-1741. PMID: 25464849 + +Have questions about Arvados? +* https://arvados.org/ +* https://gitter.im/arvados/community + +Have questions about CWL? 
+* https://www.commonwl.org/ +* https://cwl.discourse.group/ +* https://gitter.im/common-workflow-language/common-workflow-language + diff --git a/RNA-Seq/cwl/RNA-seq-wf.cwl b/RNA-Seq/cwl/RNA-seq-wf.cwl index 555e5e2..cb00e16 100644 --- a/RNA-Seq/cwl/RNA-seq-wf.cwl +++ b/RNA-Seq/cwl/RNA-seq-wf.cwl @@ -3,7 +3,9 @@ class: Workflow label: RNAseq workflow inputs: - fqdir: Directory + fqdir: + type: Directory + loadListing: shallow_listing genome: Directory gtf: File @@ -50,7 +52,3 @@ requirements: SubworkflowFeatureRequirement: {} ScatterFeatureRequirement: {} StepInputExpressionRequirement: {} - -hints: - LoadListingRequirement: - loadListing: shallow_listing diff --git a/RNA-Seq/yml/RNA-seq-wf.yml b/RNA-Seq/yml/RNA-seq-wf.yml index 19f8c54..828c70c 100644 --- a/RNA-Seq/yml/RNA-seq-wf.yml +++ b/RNA-Seq/yml/RNA-seq-wf.yml @@ -1,13 +1,9 @@ -fq: - - class: File - location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq - format: http://edamontology.org/format_1930 - - class: File - location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_2.subset.fq - format: http://edamontology.org/format_1930 +fqdir: + class: Directory + location: keep:pirca-4zz18-blweknwtwyjys0i/ genome: class: Directory - location: keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index + location: keep:pirca-4zz18-c543b5welq68g90/hg19-STAR-index gtf: class: File location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf diff --git a/RNA-Seq/yml/alignment.yml b/RNA-Seq/yml/alignment.yml index 83d0e48..742b47b 100644 --- a/RNA-Seq/yml/alignment.yml +++ b/RNA-Seq/yml/alignment.yml @@ -1,10 +1,9 @@ fq: class: File location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq - format: http://edamontology.org/format_1930 genome: class: Directory - location: keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index + location: keep:pirca-4zz18-c543b5welq68g90/hg19-STAR-index gtf: class: File location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf -- 2.30.2 From 
2691061efa8341166ad6518688e5e6c0fb9a8fbf Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 15 Sep 2022 18:39:31 -0400 Subject: [PATCH 8/8] Add splitDir to take Directory input Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- RNA-Seq/cwl/RNA-seq-wf.cwl | 18 +++++++++++------- RNA-Seq/cwl/helper/splitDir.cwl | 9 +++++++++ RNA-Seq/yml/RNA-seq-wf.yml | 10 ++++++---- 3 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 RNA-Seq/cwl/helper/splitDir.cwl diff --git a/RNA-Seq/cwl/RNA-seq-wf.cwl b/RNA-Seq/cwl/RNA-seq-wf.cwl index cb00e16..183bca1 100644 --- a/RNA-Seq/cwl/RNA-seq-wf.cwl +++ b/RNA-Seq/cwl/RNA-seq-wf.cwl @@ -1,21 +1,26 @@ cwlVersion: v1.2 class: Workflow -label: RNAseq workflow +label: RNAseq workflow inputs: - fqdir: - type: Directory + fqdir: + type: Directory loadListing: shallow_listing genome: Directory gtf: File steps: + splitDir: + in: + fqdir: fqdir + run: helper/splitDir.cwl + out: [fq] + alignment: run: helper/alignment.cwl scatter: fq in: - fq: - valueFrom: $(inputs.fqdir.listing) + fq: splitDir/fq genome: genome gtf: gtf out: [qc_html, bam_sorted_indexed] @@ -33,8 +38,7 @@ steps: output-subdirs: run: helper/subdirs.cwl in: - fq: - valueFrom: $(inputs.fqdir.listing) + fq: splitDir/fq bams: alignment/bam_sorted_indexed qc: alignment/qc_html out: [dirs] diff --git a/RNA-Seq/cwl/helper/splitDir.cwl b/RNA-Seq/cwl/helper/splitDir.cwl new file mode 100644 index 0000000..e8be6c3 --- /dev/null +++ b/RNA-Seq/cwl/helper/splitDir.cwl @@ -0,0 +1,9 @@ +cwlVersion: v1.2 +class: ExpressionTool +requirements: + InlineJavascriptRequirement: {} +inputs: + fqdir: Directory +outputs: + fq: File[] +expression: '${return {fq: inputs.fqdir.listing};}' diff --git a/RNA-Seq/yml/RNA-seq-wf.yml b/RNA-Seq/yml/RNA-seq-wf.yml index 828c70c..3c3b3ae 100644 --- a/RNA-Seq/yml/RNA-seq-wf.yml +++ b/RNA-Seq/yml/RNA-seq-wf.yml @@ -1,10 +1,12 @@ fqdir: class: Directory - location: keep:pirca-4zz18-blweknwtwyjys0i/ + location: 
keep:1360b500543d1d0b041084a2f99d33b6+567/ + #location: keep:pirca-4zz18-blweknwtwyjys0i/ genome: class: Directory - location: keep:pirca-4zz18-c543b5welq68g90/hg19-STAR-index + location: keep:64e703acc21226ce99f3d88eacdacd0b+3159/hg19-STAR-index + #location: keep:pirca-4zz18-c543b5welq68g90/hg19-STAR-index gtf: class: File - location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf - + location: keep:64e703acc21226ce99f3d88eacdacd0b+3159/chr1-hg19_genes.gtf + #location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf -- 2.30.2