Adding helper files and updating readme
authorswz <swz@curii.com>
Thu, 15 Sep 2022 14:04:58 +0000 (14:04 +0000)
committerswz <swz@curii.com>
Thu, 15 Sep 2022 14:04:58 +0000 (14:04 +0000)
no issue #
Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek <swz@curii.com>

RNA-Seq/README.md [new file with mode: 0644]
RNA-Seq/cwl/RNA-seq-wf.cwl
RNA-Seq/cwl/helper/STAR-Align.cwl [new file with mode: 0755]
RNA-Seq/cwl/helper/alignment.cwl
RNA-Seq/cwl/helper/fastqc_2.cwl [new file with mode: 0755]
RNA-Seq/cwl/helper/samtools_index.cwl [new file with mode: 0755]
RNA-Seq/yml/RNA-seq-wf.yml [new file with mode: 0644]
RNA-Seq/yml/alignment.yml [new file with mode: 0644]

diff --git a/RNA-Seq/README.md b/RNA-Seq/README.md
new file mode 100644 (file)
index 0000000..25677fd
--- /dev/null
@@ -0,0 +1,13 @@
+This directory contains an Arvados demo that performs a bioinformatics RNA-seq analysis. Specific knowledge of the biology of RNA-seq is not required to follow the demo. For those unfamiliar with it, RNA-seq is the process of sequencing the RNA present in a biological sample. From the sequence reads, we want to measure the relative numbers of the different RNA molecules in the sample that were produced by particular genes. This analysis is called “differential gene expression”.
+
+Workflows are written in CWL v1.2 (https://www.commonwl.org/).
+
+Subdirectories are:
+* cwl - contains the CWL code for the demo
+* yml - contains the YAML input files for the CWL demo code
+* docker - contains the Dockerfiles needed to re-create any required Docker images
+
+To run the workflow:
+
+*  arvados-cwl-runner --no-wait --project-uuid YOUR_PROJECT_UUID ./cwl/RNA-seq-wf.cwl ./yml/RNA-seq-wf.yml
+
index 97461ce437f510061ced436eaf19e9bea7155669..8da505cd8bb507ae0c6829860954ffcdccb1b61d 100644 (file)
@@ -9,7 +9,7 @@ inputs:
 
 steps:
   alignment:
-    run: alignment.cwl
+    run: helper/alignment.cwl
     scatter: fq
     in:
       fq: fq
@@ -21,15 +21,14 @@ steps:
     requirements:
       ResourceRequirement:
         ramMin: 500
-    run: featureCounts.cwl
+    run: helper/featureCounts.cwl
     in:
       counts_input_bam: alignment/bam_sorted_indexed
       gtf: gtf
     out: [featurecounts]
 
-  ### 2. Organizing output files into Directories
   output-subdirs:
-    run: subdirs.cwl
+    run: helper/subdirs.cwl
     in:
       fq: fq
       bams: alignment/bam_sorted_indexed
diff --git a/RNA-Seq/cwl/helper/STAR-Align.cwl b/RNA-Seq/cwl/helper/STAR-Align.cwl
new file mode 100755 (executable)
index 0000000..24a640b
--- /dev/null
@@ -0,0 +1,255 @@
+#!/usr/bin/env cwl-runner
+# CommandLineTool wrapper that runs `STAR --runMode alignReads` using the
+# biocontainers STAR 2.7.5c image. Optional inputs (types ending in `?` or
+# containing "null") are only added to the command line when a value is given.
+cwlVersion: v1.0
+class: CommandLineTool
+
+hints:
+  DockerRequirement:
+    dockerPull: "quay.io/biocontainers/star:2.7.5c--0"
+
+inputs:
+  # Required Inputs
+  RunThreadN:
+    type: int
+    inputBinding:
+      prefix: "--runThreadN"
+
+  GenomeDir:
+    type: Directory
+    inputBinding:
+      prefix: "--genomeDir"
+
+  # One FASTQ file, or several joined with "," into a single argument.
+  ForwardReads:
+    format: edam:format_1930  # FASTQ
+    type:
+     - File
+     - File[]
+    inputBinding:
+      prefix: "--readFilesIn"
+      itemSeparator: ","
+      position: 1
+  # If paired-end reads (like Illumina), both 1 and 2 must be provided.
+  # The empty prefix with `separate: false` emits the reverse-read path(s) as
+  # a bare argument; position 2 places it right after the --readFilesIn value.
+  ReverseReads:
+    format: edam:format_1930  # FASTQ
+    type:
+     - "null"
+     - File
+     - File[]
+    inputBinding:
+      prefix: ""
+      separate: false
+      itemSeparator: ","
+      position: 2
+
+  # Optional Inputs
+  Gtf:
+    type: File?
+    inputBinding:
+      prefix: "--sjdbGTFfile"
+
+  Overhang:
+    type: int?
+    inputBinding:
+      prefix: "--sjdbOverhang"
+
+  OutFilterType:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - Normal
+        - BySJout
+    inputBinding:
+      prefix: "--outFilterType"
+
+  OutFilterIntronMotifs:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - None
+        - RemoveNoncanonical
+        - RemoveNoncanonicalUnannotated
+    inputBinding:
+      prefix: "--outFilterIntronMotifs"
+
+  # OutSAMtype, Unsorted and SortedByCoordinate are bound at consecutive
+  # positions (3, 4, 5) so that together they form one option, e.g.
+  # `--outSAMtype BAM SortedByCoordinate`. The two booleans use the bare word
+  # as their "prefix", which is emitted only when the flag is true.
+  OutSAMtype:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - "BAM"
+        - "SAM"
+    inputBinding:
+      prefix: "--outSAMtype"
+      position: 3
+
+  Unsorted:
+    type: boolean?
+    inputBinding:
+      prefix: "Unsorted"
+      position: 4
+
+  SortedByCoordinate:
+    type: boolean?
+    inputBinding:
+      prefix: "SortedByCoordinate"
+      position: 5
+
+  ReadFilesCommand:
+    type: string?
+    inputBinding:
+      prefix: "--readFilesCommand"
+
+  AlignIntronMin:
+    type: int?
+    inputBinding:
+      prefix: "--alignIntronMin"
+
+  AlignIntronMax:
+    type: int?
+    inputBinding:
+      prefix: "--alignIntronMax"
+
+  AlignMatesGapMax:
+    type: int?
+    inputBinding:
+      prefix: "--alignMatesGapMax"
+
+  AlignSJoverhangMin:
+    type: int?
+    inputBinding:
+      prefix: "--alignSJoverhangMin"
+
+  AlignSJDBoverhangMin:
+    type: int?
+    inputBinding:
+      prefix: "--alignSJDBoverhangMin"
+
+  SeedSearchStartLmax:
+    type: int?
+    inputBinding:
+      prefix: "--seedSearchStartLmax"
+
+  ChimOutType:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - Junctions
+        - SeparateSAMold
+        - WithinBAM
+        - "WithinBAM HardClip"
+        - "WithinBAM SoftClip"
+    # Without an inputBinding a supplied value never reaches the command
+    # line; bind it like the other chim* options.
+    # NOTE(review): the two-word symbols are emitted as a single argument --
+    # confirm STAR accepts them in that form.
+    inputBinding:
+      prefix: "--chimOutType"
+
+  ChimSegmentMin:
+    type: int?
+    inputBinding:
+      prefix: "--chimSegmentMin"
+
+  ChimJunctionOverhangMin:
+    type: int?
+    inputBinding:
+      prefix: "--chimJunctionOverhangMin"
+
+  OutFilterMultimapNmax:
+    type: int?
+    inputBinding:
+      prefix: "--outFilterMultimapNmax"
+
+  OutFilterMismatchNmax:
+    type: int?
+    inputBinding:
+      prefix: "--outFilterMismatchNmax"
+
+  OutFilterMismatchNoverLmax:
+    type: double?
+    inputBinding:
+      prefix: "--outFilterMismatchNoverLmax"
+
+  OutReadsUnmapped:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - None
+        - Fastx
+    inputBinding:
+      prefix: "--outReadsUnmapped"
+
+  OutSAMstrandField:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - None
+        - intronMotif
+    inputBinding:
+      prefix: "--outSAMstrandField"
+
+  OutSAMunmapped:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - None
+        - Within
+        - "Within KeepPairs"
+    inputBinding:
+      prefix: "--outSAMunmapped"
+
+  OutSAMmapqUnique:
+    type: int?
+    inputBinding:
+      prefix: "--outSAMmapqUnique"
+
+  OutSamMode:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - None
+        - Full
+        - NoQS
+    inputBinding:
+      prefix: "--outSAMmode"
+
+  LimitOutSAMoneReadBytes:
+    type: int?
+    inputBinding:
+      prefix: "--limitOutSAMoneReadBytes"
+
+  OutFileNamePrefix:
+    type: string?
+    inputBinding:
+      prefix: "--outFileNamePrefix"
+
+  GenomeLoad:
+    type:
+     - "null"
+     - type: enum
+       symbols:
+        - LoadAndKeep
+        - LoadAndRemove
+        - LoadAndExit
+        - Remove
+        - NoSharedMemory
+    inputBinding:
+      prefix: "--genomeLoad"
+
+baseCommand: [STAR, --runMode, alignReads]
+
+outputs:
+  # Required single File collected with the "*.bam" glob.
+  # NOTE(review): when OutSAMtype is SAM or unset no *.bam file is presumably
+  # written, which would make this required output fail -- confirm.
+  alignment:
+    type:
+     - File
+    outputBinding:
+      glob: "*.bam"
+  # Optional: collected only if a file matching "Unmapped.out*" exists.
+  unmapped_reads:
+    type: ["null", File]
+    outputBinding:
+      glob: "Unmapped.out*"
+
+$namespaces:
+  edam: https://edamontology.org/
+$schemas:
+  - https://edamontology.org/EDAM_1.18.owl
index 6712aae6b83c7957840320eeaf3597fb3d717827..e2a434fe3f3852f3e1771d16f2d908b4ee059c55 100644 (file)
@@ -12,7 +12,7 @@ requirements:
 
 steps:
   fastqc:
-    run: bio-cwl-tools/fastqc/fastqc_2.cwl
+    run: fastqc_2.cwl
     in:
       reads_file: fq
     out: [html_file]
@@ -21,7 +21,7 @@ steps:
     requirements:
       ResourceRequirement:
         ramMin: 9000
-    run: bio-cwl-tools/STAR/STAR-Align.cwl
+    run: STAR-Align.cwl
     in:
       RunThreadN: {default: 4}
       GenomeDir: genome
@@ -29,12 +29,11 @@ steps:
       OutSAMtype: {default: BAM}
       SortedByCoordinate: {default: true}
       OutSAMunmapped: {default: Within}
-      ### 1. Expressions on step inputs
       OutFileNamePrefix: {valueFrom: "$(inputs.ForwardReads.nameroot)."}
     out: [alignment]
 
   samtools:
-    run: bio-cwl-tools/samtools/samtools_index.cwl
+    run: samtools_index.cwl
     in:
       bam_sorted: STAR/alignment
     out: [bam_sorted_indexed]
diff --git a/RNA-Seq/cwl/helper/fastqc_2.cwl b/RNA-Seq/cwl/helper/fastqc_2.cwl
new file mode 100755 (executable)
index 0000000..575c9b0
--- /dev/null
@@ -0,0 +1,183 @@
+#!/usr/bin/env cwl-runner
+# CommandLineTool wrapper for FastQC. The base command always passes
+# `--extract --outdir .`, so the report archive, the HTML report and the
+# extracted report directory are written to the working directory.
+cwlVersion: v1.0
+class: CommandLineTool
+
+doc: |
+  Tool runs FastQC from Babraham Bioinformatics
+
+hints:
+  DockerRequirement:
+    dockerPull: "quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1"
+  SoftwareRequirement:
+    packages:
+      fastqc:
+        specs:
+          - "http://identifiers.org/biotools/fastqc"
+        version:
+          - "0.11.9--hdfd78af_1"
+          - "0.11.9"
+
+baseCommand:
+  - "fastqc"
+  - "--extract"
+  - "--outdir"
+  - "."
+
+inputs:
+
+  # The only required input; position 50 puts it after every option below.
+  reads_file:
+    type: File
+    inputBinding:
+      position: 50
+    doc: |
+      Input bam,sam,bam_mapped,sam_mapped or fastq file
+
+  format_enum:
+    type:
+      - "null"
+      - type: enum
+        name: "format"
+        symbols:
+          - "bam"
+          - "sam"
+          - "bam_mapped"
+          - "sam_mapped"
+          - "fastq"
+    inputBinding:
+      position: 6
+      prefix: "--format"
+    doc: |
+      Bypasses the normal sequence file format detection and
+      forces the program to use the specified format.  Valid
+      formats are bam,sam,bam_mapped,sam_mapped and fastq
+
+  threads:
+    type: int?
+    inputBinding:
+      position: 7
+      prefix: "--threads"
+    doc: |
+      Specifies the number of files which can be processed
+      simultaneously.  Each thread will be allocated 250MB of
+      memory so you shouldn't run more threads than your
+      available memory will cope with, and not more than
+      6 threads on a 32 bit machine
+
+  contaminants:
+    type: File?
+    inputBinding:
+      position: 8
+      prefix: "--contaminants"
+    doc: |
+      Specifies a non-default file which contains the list of
+      contaminants to screen overrepresented sequences against.
+      The file must contain sets of named contaminants in the
+      form name[tab]sequence.  Lines prefixed with a hash will
+      be ignored.
+
+  adapters:
+    type: File?
+    inputBinding:
+      position: 9
+      prefix: "--adapters"
+    doc: |
+      Specifies a non-default file which contains the list of
+      adapter sequences which will be explicity searched against
+      the library. The file must contain sets of named adapters
+      in the form name[tab]sequence.  Lines prefixed with a hash
+      will be ignored.
+
+  limits:
+    type: File?
+    inputBinding:
+      position: 10
+      prefix: "--limits"
+    doc: |
+      Specifies a non-default file which contains a set of criteria
+      which will be used to determine the warn/error limits for the
+      various modules.  This file can also be used to selectively
+      remove some modules from the output all together.  The format
+      needs to mirror the default limits.txt file found in the
+      Configuration folder.
+
+  kmers:
+    type: int?
+    inputBinding:
+      position: 11
+      prefix: "--kmers"
+    doc: |
+      Specifies the length of Kmer to look for in the Kmer content
+      module. Specified Kmer length must be between 2 and 10. Default
+      length is 7 if not specified.
+
+  casava:
+    type: boolean?
+    inputBinding:
+      position: 13
+      prefix: "--casava"
+    doc: |
+      Files come from raw casava output. Files in the same sample
+      group (differing only by the group number) will be analysed
+      as a set rather than individually. Sequences with the filter
+      flag set in the header will be excluded from the analysis.
+      Files must have the same names given to them by casava
+      (including being gzipped and ending with .gz) otherwise they
+      won't be grouped together correctly.
+
+  nofilter:
+    type: boolean?
+    inputBinding:
+      position: 14
+      prefix: "--nofilter"
+    doc: |
+      If running with --casava then don't remove read flagged by
+      casava as poor quality when performing the QC analysis.
+
+  # Input is named hide_group but maps to FastQC's --nogroup flag.
+  hide_group:
+    type: boolean?
+    inputBinding:
+      position: 15
+      prefix: "--nogroup"
+    doc: |
+      Disable grouping of bases for reads >50bp. All reports will
+      show data for every base in the read.  WARNING: Using this
+      option will cause fastqc to crash and burn if you use it on
+      really long reads, and your plots may end up a ridiculous size.
+      You have been warned!
+
+# All three outputs are required single Files collected by glob from the
+# working directory; summary.txt is looked up one directory level down.
+outputs:
+
+  zipped_file:
+    type: File
+    outputBinding:
+      glob: "*.zip"
+  html_file:
+    type: File
+    outputBinding:
+      glob: "*.html"
+  summary_file:
+    type: File
+    outputBinding:
+      glob: "*/summary.txt"
+
+$namespaces:
+  s: http://schema.org/
+
+$schemas:
+  - "https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf"
+
+# schema.org metadata describing the tool and its author.
+s:name: "fastqc_2"
+s:license: "http://www.apache.org/licenses/LICENSE-2.0"
+
+s:creator:
+  - class: s:Organization
+    s:legalName: "Cincinnati Children's Hospital Medical Center"
+    s:location:
+      - class: s:PostalAddress
+        s:addressCountry: "USA"
+        s:addressLocality: "Cincinnati"
+        s:addressRegion: "OH"
+        s:postalCode: "45229"
+        s:streetAddress: "3333 Burnet Ave"
+        s:telephone: "+1(513)636-4200"
+    s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png"
+    s:department:
+      - class: s:Organization
+        s:legalName: "Allergy and Immunology"
+        s:department:
+          - class: s:Organization
+            s:legalName: "Barski Research Lab"
+            s:member:
+              - class: s:Person
+                s:name: "Michael Kotliar"
+                s:email: "mailto:misha.kotliar@gmail.com"
+                s:sameAs:
+                  - id: "http://orcid.org/0000-0002-6486-3898"
diff --git a/RNA-Seq/cwl/helper/samtools_index.cwl b/RNA-Seq/cwl/helper/samtools_index.cwl
new file mode 100755 (executable)
index 0000000..3ae2e9b
--- /dev/null
@@ -0,0 +1,42 @@
+#!/usr/bin/env cwl-runner
+# CommandLineTool that runs `samtools index -b <bam>` on a BAM staged into
+# the working directory and returns that BAM with its .bai secondary file.
+cwlVersion: v1.0
+class: CommandLineTool
+
+doc: |
+  Indexing BAM.
+
+hints:
+  DockerRequirement:
+    dockerPull: "quay.io/biocontainers/samtools:1.14--hb421002_0"
+  ResourceRequirement:
+    coresMin: 1
+    ramMin: 20000
+
+requirements:
+  # Stage the input BAM in the working directory; the output glob below looks
+  # for the BAM there by its basename.
+  InitialWorkDirRequirement:
+    listing:
+      - "$(inputs.bam_sorted)"
+
+inputs:
+  bam_sorted:
+    doc: "sorted bam input file"
+    type: File
+    inputBinding:
+      position: 2
+
+outputs:
+  bam_sorted_indexed:
+    type: File
+    format: edam:format_2572  # BAM
+    secondaryFiles: ".bai"
+    outputBinding:
+      glob: "$(inputs.bam_sorted.basename)"
+
+baseCommand:
+  - "samtools"
+  - "index"
+arguments:
+  # -b: specifies that index is created in bai format
+  - position: 1
+    valueFrom: "-b"
+
+$namespaces:
+  edam: https://edamontology.org/
+$schemas:
+  - https://edamontology.org/EDAM_1.18.owl
diff --git a/RNA-Seq/yml/RNA-seq-wf.yml b/RNA-Seq/yml/RNA-seq-wf.yml
new file mode 100644 (file)
index 0000000..19f8c54
--- /dev/null
@@ -0,0 +1,14 @@
+# Job inputs for cwl/RNA-seq-wf.cwl (see the README run command).
+# fq: list of FASTQ files (EDAM format_1930); the workflow scatters its
+# alignment step over this list.
+fq:
+  - class: File
+    location: "keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq"
+    format: "http://edamontology.org/format_1930"
+  - class: File
+    location: "keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_2.subset.fq"
+    format: "http://edamontology.org/format_1930"
+# genome: STAR index directory.
+genome:
+  class: Directory
+  location: "keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index"
+# gtf: gene annotation file.
+gtf:
+  class: File
+  location: "keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf"
+
diff --git a/RNA-Seq/yml/alignment.yml b/RNA-Seq/yml/alignment.yml
new file mode 100644 (file)
index 0000000..83d0e48
--- /dev/null
@@ -0,0 +1,11 @@
+# Job inputs for a single-sample run; unlike RNA-seq-wf.yml, fq is one File
+# rather than a list.
+# fq: one FASTQ file (EDAM format_1930).
+fq:
+  class: File
+  location: "keep:pirca-4zz18-blweknwtwyjys0i/Mov10_oe_1.subset.fq"
+  format: "http://edamontology.org/format_1930"
+# genome: STAR index directory.
+genome:
+  class: Directory
+  location: "keep:pirca-4zz18-c543b5welq68g90/hg19-chr1-STAR-index"
+# gtf: gene annotation file.
+gtf:
+  class: File
+  location: "keep:pirca-4zz18-c543b5welq68g90/chr1-hg19_genes.gtf"
+