--- /dev/null
+This directory contains an Arvados demo that performs a bioinformatics RNA-seq analysis. Specific knowledge of the biology of RNA-seq is not required to follow the demo. For those unfamiliar with it, RNA-seq is the process of sequencing the RNA present in a biological sample. From the sequence reads, we want to measure the relative abundance of the RNA molecules in the sample that were produced by particular genes. This analysis is called “differential gene expression” analysis.
+
+Workflows are written in CWL v1.2 (https://www.commonwl.org/).
+
+Subdirectories are:
+* cwl - contains the CWL code for the demo
+* yml - contains the YAML input files for the CWL demo code
+* docker - contains the Dockerfiles necessary to re-create any needed Docker images
+
+To run the workflow:
+
+* arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID ./cwl/RNA-seq-wf.cwl ./yml/RNA-seq-wf.yml
+
steps:
alignment:
- run: alignment.cwl
+ run: helper/alignment.cwl
scatter: fq
in:
fq: fq
requirements:
ResourceRequirement:
ramMin: 500
- run: featureCounts.cwl
+ run: helper/featureCounts.cwl
in:
counts_input_bam: alignment/bam_sorted_indexed
gtf: gtf
out: [featurecounts]
- ### 2. Organizing output files into Directories
output-subdirs:
- run: subdirs.cwl
+ run: helper/subdirs.cwl
in:
fq: fq
bams: alignment/bam_sorted_indexed
--- /dev/null
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+class: CommandLineTool # wraps a single STAR invocation (see baseCommand)
+
+hints:
+ DockerRequirement:
+ dockerPull: "quay.io/biocontainers/star:2.7.5c--0" # container image the tool is expected to run in (a hint, not a hard requirement)
+
+inputs:
+ # Required Inputs
+ RunThreadN: # integer forwarded as the --runThreadN value
+ type: int
+ inputBinding:
+ prefix: "--runThreadN"
+
+ GenomeDir: # directory forwarded as the --genomeDir value
+ type: Directory
+ inputBinding:
+ prefix: "--genomeDir"
+
+ ForwardReads: # a single FASTQ file or a list of them
+ format: edam:format_1930 # FASTQ
+ type:
+ - File
+ - File[]
+ inputBinding:
+ prefix: "--readFilesIn"
+ itemSeparator: "," # a File[] value is joined into one comma-separated argument
+ position: 1
+ # If paired-end reads (like Illumina), both 1 and 2 must be provided.
+ ReverseReads: # optional reverse-read FASTQ file(s); may be omitted (type includes "null")
+ format: edam:format_1930 # FASTQ
+ type:
+ - "null"
+ - File
+ - File[]
+ inputBinding:
+ prefix: "" # empty prefix combined with separate: false: the value is emitted with no option name in front of it
+ separate: false
+ itemSeparator: ","
+ position: 2 # sorts right after ForwardReads (position 1), so the value follows the --readFilesIn argument
+
+ # Optional Inputs
+ Gtf: # optional file forwarded as the --sjdbGTFfile value
+ type: File?
+ inputBinding:
+ prefix: "--sjdbGTFfile"
+
+ Overhang: # optional integer forwarded as the --sjdbOverhang value
+ type: int?
+ inputBinding:
+ prefix: "--sjdbOverhang"
+
+ OutFilterType: # optional; one of the listed symbols, forwarded as --outFilterType
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - Normal
+ - BySJout
+ inputBinding:
+ prefix: "--outFilterType"
+
+ OutFilterIntronMotifs: # optional; one of the listed symbols, forwarded as --outFilterIntronMotifs
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - None
+ - RemoveNoncanonical
+ - RemoveNoncanonicalUnannotated
+ inputBinding:
+ prefix: "--outFilterIntronMotifs"
+
+ OutSAMtype: # optional; BAM or SAM, forwarded as --outSAMtype
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - "BAM"
+ - "SAM"
+ inputBinding:
+ prefix: "--outSAMtype"
+ position: 3 # positions 3, 4 and 5 keep the two flag words below directly behind --outSAMtype
+
+ Unsorted: # boolean flag: when true the bare word "Unsorted" is emitted
+ type: boolean?
+ inputBinding:
+ prefix: "Unsorted"
+ position: 4
+
+ SortedByCoordinate: # boolean flag: when true the bare word "SortedByCoordinate" is emitted
+ type: boolean?
+ inputBinding:
+ prefix: "SortedByCoordinate"
+ position: 5
+
+ ReadFilesCommand: # optional string forwarded as the --readFilesCommand value
+ type: string?
+ inputBinding:
+ prefix: "--readFilesCommand"
+
+ AlignIntronMin: # optional integer; nothing is emitted when it is not supplied
+ type: int?
+ inputBinding:
+ prefix: "--alignIntronMin"
+
+ AlignIntronMax: # optional integer forwarded as --alignIntronMax
+ type: int?
+ inputBinding:
+ prefix: "--alignIntronMax"
+
+ AlignMatesGapMax: # optional integer forwarded as --alignMatesGapMax
+ type: int?
+ inputBinding:
+ prefix: "--alignMatesGapMax"
+
+ AlignSJoverhangMin: # optional integer forwarded as --alignSJoverhangMin
+ type: int?
+ inputBinding:
+ prefix: "--alignSJoverhangMin"
+
+ AlignSJDBoverhangMin: # optional integer forwarded as --alignSJDBoverhangMin
+ type: int?
+ inputBinding:
+ prefix: "--alignSJDBoverhangMin"
+
+ SeedSearchStartLmax: # optional integer forwarded as --seedSearchStartLmax
+ type: int?
+ inputBinding:
+ prefix: "--seedSearchStartLmax"
+
+ ChimOutType: # optional; one of the listed symbols, forwarded as --chimOutType
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - Junctions
+ - SeparateSAMold
+ - WithinBAM
+ - "WithinBAM HardClip"
+ - "WithinBAM SoftClip"
+ inputBinding: # without a binding the chosen value was accepted but never placed on the command line
+ prefix: "--chimOutType"
+
+ ChimSegmentMin: # optional integer forwarded as --chimSegmentMin
+ type: int?
+ inputBinding:
+ prefix: "--chimSegmentMin"
+
+ ChimJunctionOverhangMin: # optional integer forwarded as --chimJunctionOverhangMin
+ type: int?
+ inputBinding:
+ prefix: "--chimJunctionOverhangMin"
+
+ OutFilterMultimapNmax: # optional integer forwarded as --outFilterMultimapNmax
+ type: int?
+ inputBinding:
+ prefix: "--outFilterMultimapNmax"
+
+ OutFilterMismatchNmax: # optional integer forwarded as --outFilterMismatchNmax
+ type: int?
+ inputBinding:
+ prefix: "--outFilterMismatchNmax"
+
+ OutFilterMismatchNoverLmax: # optional floating-point value (the only double among these options)
+ type: double?
+ inputBinding:
+ prefix: "--outFilterMismatchNoverLmax"
+
+ OutReadsUnmapped: # optional; None or Fastx, forwarded as --outReadsUnmapped
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - None
+ - Fastx
+ inputBinding:
+ prefix: "--outReadsUnmapped"
+
+ OutSAMstrandField: # optional; None or intronMotif, forwarded as --outSAMstrandField
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - None
+ - intronMotif
+ inputBinding:
+ prefix: "--outSAMstrandField"
+
+ OutSAMunmapped: # optional; one of the listed symbols, forwarded as --outSAMunmapped
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - None
+ - Within
+ - "Within KeepPairs"
+ inputBinding:
+ prefix: "--outSAMunmapped"
+
+ OutSAMmapqUnique: # optional integer forwarded as --outSAMmapqUnique
+ type: int?
+ inputBinding:
+ prefix: "--outSAMmapqUnique"
+
+ OutSamMode: # note the capitalisation: the input id is OutSamMode but the option is --outSAMmode
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - None
+ - Full
+ - NoQS
+ inputBinding:
+ prefix: "--outSAMmode"
+
+ LimitOutSAMoneReadBytes: # optional integer forwarded as --limitOutSAMoneReadBytes
+ type: int?
+ inputBinding:
+ prefix: "--limitOutSAMoneReadBytes"
+
+ OutFileNamePrefix: # optional string forwarded as --outFileNamePrefix
+ type: string?
+ inputBinding:
+ prefix: "--outFileNamePrefix"
+
+ GenomeLoad: # optional; one of the listed symbols, forwarded as --genomeLoad
+ type:
+ - "null"
+ - type: enum
+ symbols:
+ - LoadAndKeep
+ - LoadAndRemove
+ - LoadAndExit
+ - Remove
+ - NoSharedMemory
+ inputBinding:
+ prefix: "--genomeLoad"
+
+baseCommand: [STAR, --runMode, alignReads] # fixed command prefix: every run invokes STAR with --runMode alignReads
+
+outputs:
+ alignment: # required: the BAM file found in the output directory
+ type:
+ - File
+ outputBinding:
+ glob: "*.bam"
+ unmapped_reads: # optional: null when no unmapped-reads file was written
+ type: ["null", File]
+ outputBinding:
+ glob: "*Unmapped.out*" # leading wildcard: STAR prepends the OutFileNamePrefix value to its output file names
+
+$namespaces:
+ edam: http://edamontology.org/ # EDAM IRIs use the http scheme; this makes edam:format_1930 equal the http://edamontology.org/format_1930 given in the input objects
+$schemas:
+ - https://edamontology.org/EDAM_1.18.owl
steps:
fastqc:
- run: bio-cwl-tools/fastqc/fastqc_2.cwl
+ run: fastqc_2.cwl
in:
reads_file: fq
out: [html_file]
requirements:
ResourceRequirement:
ramMin: 9000
- run: bio-cwl-tools/STAR/STAR-Align.cwl
+ run: STAR-Align.cwl
in:
RunThreadN: {default: 4}
GenomeDir: genome
OutSAMtype: {default: BAM}
SortedByCoordinate: {default: true}
OutSAMunmapped: {default: Within}
- ### 1. Expressions on step inputs
OutFileNamePrefix: {valueFrom: "$(inputs.ForwardReads.nameroot)."}
out: [alignment]
samtools:
- run: bio-cwl-tools/samtools/samtools_index.cwl
+ run: samtools_index.cwl
in:
bam_sorted: STAR/alignment
out: [bam_sorted_indexed]
--- /dev/null
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+class: CommandLineTool # wraps a single fastqc invocation (see baseCommand)
+
+hints:
+ DockerRequirement:
+ dockerPull: quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1 # image tag matches the first entry of the version list below
+ SoftwareRequirement:
+ packages:
+ fastqc:
+ specs: [ "http://identifiers.org/biotools/fastqc" ]
+ version: [ "0.11.9--hdfd78af_1", "0.11.9" ]
+
+inputs:
+
+ reads_file: # the only required input
+ type: File
+ inputBinding:
+ position: 50 # highest position in this tool, so the file comes after every option
+ doc: |
+ Input bam,sam,bam_mapped,sam_mapped or fastq file
+
+ format_enum: # optional; emitted as --format <symbol>
+ type:
+ - "null"
+ - type: enum
+ name: "format"
+ symbols: ['bam','sam','bam_mapped','sam_mapped','fastq']
+ inputBinding:
+ position: 6
+ prefix: '--format'
+ doc: |
+ Bypasses the normal sequence file format detection and
+ forces the program to use the specified format. Valid
+ formats are bam,sam,bam_mapped,sam_mapped and fastq
+
+ threads: # optional integer; emitted as --threads <n>
+ type: int?
+ inputBinding:
+ position: 7
+ prefix: '--threads'
+ doc: |
+ Specifies the number of files which can be processed
+ simultaneously. Each thread will be allocated 250MB of
+ memory so you shouldn't run more threads than your
+ available memory will cope with, and not more than
+ 6 threads on a 32 bit machine
+
+ contaminants: # optional file; emitted as --contaminants <path>
+ type: File?
+ inputBinding:
+ position: 8
+ prefix: '--contaminants'
+ doc: |
+ Specifies a non-default file which contains the list of
+ contaminants to screen overrepresented sequences against.
+ The file must contain sets of named contaminants in the
+ form name[tab]sequence. Lines prefixed with a hash will
+ be ignored.
+
+ adapters: # optional file; emitted as --adapters <path>
+ type: File?
+ inputBinding:
+ position: 9
+ prefix: '--adapters'
+ doc: |
+ Specifies a non-default file which contains the list of
+ adapter sequences which will be explicitly searched against
+ the library. The file must contain sets of named adapters
+ in the form name[tab]sequence. Lines prefixed with a hash
+ will be ignored.
+
+ limits: # optional file; emitted as --limits <path>
+ type: File?
+ inputBinding:
+ position: 10
+ prefix: '--limits'
+ doc: |
+ Specifies a non-default file which contains a set of criteria
+ which will be used to determine the warn/error limits for the
+ various modules. This file can also be used to selectively
+ remove some modules from the output all together. The format
+ needs to mirror the default limits.txt file found in the
+ Configuration folder.
+
+ kmers: # optional integer; emitted as --kmers <n>
+ type: int?
+ inputBinding:
+ position: 11
+ prefix: '--kmers'
+ doc: |
+ Specifies the length of Kmer to look for in the Kmer content
+ module. Specified Kmer length must be between 2 and 10. Default
+ length is 7 if not specified.
+
+ casava: # boolean flag: --casava is emitted only when true
+ type: boolean?
+ inputBinding:
+ position: 13 # position 12 is not used by any input in this tool
+ prefix: '--casava'
+ doc: |
+ Files come from raw casava output. Files in the same sample
+ group (differing only by the group number) will be analysed
+ as a set rather than individually. Sequences with the filter
+ flag set in the header will be excluded from the analysis.
+ Files must have the same names given to them by casava
+ (including being gzipped and ending with .gz) otherwise they
+ won't be grouped together correctly.
+
+ nofilter: # boolean flag: --nofilter is emitted only when true
+ type: boolean?
+ inputBinding:
+ position: 14
+ prefix: '--nofilter'
+ doc: |
+ If running with --casava then don't remove read flagged by
+ casava as poor quality when performing the QC analysis.
+
+ hide_group: # boolean flag; note the option is --nogroup, not --hide_group
+ type: boolean?
+ inputBinding:
+ position: 15
+ prefix: '--nogroup'
+ doc: |
+ Disable grouping of bases for reads >50bp. All reports will
+ show data for every base in the read. WARNING: Using this
+ option will cause fastqc to crash and burn if you use it on
+ really long reads, and your plots may end up a ridiculous size.
+ You have been warned!
+
+outputs:
+
+ zipped_file: # required: the .zip file found in the output directory
+ type: File
+ outputBinding:
+ glob: '*.zip'
+ html_file: # required: the .html file found in the output directory
+ type: File
+ outputBinding:
+ glob: '*.html'
+ summary_file: # required: summary.txt one directory below the output directory
+ type: File
+ outputBinding:
+ glob: "*/summary.txt" # NOTE(review): presumably inside the report directory unpacked by --extract; confirm
+
+baseCommand: [fastqc, --extract, --outdir, .] # --outdir . directs output to the current working directory, where the globs above look
+
+$namespaces:
+ s: http://schema.org/ # prefix used by the s:* metadata fields below
+
+$schemas:
+- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf
+
+s:name: "fastqc_2"
+s:license: http://www.apache.org/licenses/LICENSE-2.0
+
+s:creator: # authorship metadata: organization, nested departments, then the individual member
+- class: s:Organization
+ s:legalName: "Cincinnati Children's Hospital Medical Center"
+ s:location:
+ - class: s:PostalAddress
+ s:addressCountry: "USA"
+ s:addressLocality: "Cincinnati"
+ s:addressRegion: "OH"
+ s:postalCode: "45229"
+ s:streetAddress: "3333 Burnet Ave"
+ s:telephone: "+1(513)636-4200"
+ s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png"
+ s:department:
+ - class: s:Organization
+ s:legalName: "Allergy and Immunology"
+ s:department:
+ - class: s:Organization
+ s:legalName: "Barski Research Lab"
+ s:member:
+ - class: s:Person
+ s:name: Michael Kotliar
+ s:email: mailto:misha.kotliar@gmail.com
+ s:sameAs:
+ - id: http://orcid.org/0000-0002-6486-3898
+
+doc: | # tool-level description
+ Tool runs FastQC from Babraham Bioinformatics
--- /dev/null
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+class: CommandLineTool # runs "samtools index -b <bam_sorted>"
+
+doc: |
+ Indexing BAM.
+
+requirements:
+ InitialWorkDirRequirement:
+ listing:
+ - $(inputs.bam_sorted) # stages the input BAM into the working directory, where the output glob below finds it
+hints:
+ ResourceRequirement:
+ coresMin: 1
+ ramMin: 20000
+ DockerRequirement:
+ dockerPull: quay.io/biocontainers/samtools:1.14--hb421002_0
+
+baseCommand: ["samtools", "index"]
+arguments:
+ - valueFrom: -b # specifies that index is created in bai format
+ position: 1 # placed before the BAM path (position 2)
+
+inputs:
+ bam_sorted:
+ doc: sorted bam input file
+ type: File
+ inputBinding:
+ position: 2
+
+outputs:
+ bam_sorted_indexed: # the staged input BAM, returned together with its index
+ type: File
+ secondaryFiles: .bai # index expected next to the BAM as <basename>.bai
+ format: edam:format_2572 # BAM
+ outputBinding:
+ glob: $(inputs.bam_sorted.basename) # matches the BAM by its own file name
+
+$namespaces:
+ edam: https://edamontology.org/ # NOTE(review): the input objects use http://edamontology.org/ IRIs; confirm downstream consumers before changing the scheme
+$schemas:
+ - https://edamontology.org/EDAM_1.18.owl
--- /dev/null
+fq: # list form: two FASTQ files
+ - class: File
+ location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq
+ format: http://edamontology.org/format_1930 # EDAM format_1930 (FASTQ)
+ - class: File
+ location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_2.subset.fq
+ format: http://edamontology.org/format_1930 # EDAM format_1930 (FASTQ)
+genome: # directory holding the STAR index (per its name: hg19, chr1)
+ class: Directory
+ location: keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index
+gtf: # GTF annotation file
+ class: File
+ location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf
+
--- /dev/null
+fq: # single-file form: one FASTQ file rather than a list
+ class: File
+ location: keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq
+ format: http://edamontology.org/format_1930 # EDAM format_1930 (FASTQ)
+genome: # directory holding the STAR index (per its name: hg19, chr1)
+ class: Directory
+ location: keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index
+gtf: # GTF annotation file
+ class: File
+ location: keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf
+