From: Peter Amstutz <peter.amstutz@curii.com>
Date: Fri, 15 Jan 2021 20:08:49 +0000 (-0500)
Subject: Initial commit
X-Git-Url: https://git.arvados.org/rnaseq-cwl-training.git/commitdiff_plain/2939316d9153c733503155c7dc4e46690bcb210b

Initial commit
---

diff --git a/README b/README
new file mode 100644
index 0000000..5a995d4
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+These CWL lessons are based on "Introduction to RNA-seq using
+high-performance computing (HPC)" lessons developed by members of the
+teaching team at the Harvard Chan Bioinformatics Core (HBC) and
+obtained from
+
+https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2
+
+The original lessons are open access materials distributed under the
+terms of the Creative Commons Attribution license (CC BY 4.0), which
+permits unrestricted use, distribution, and reproduction in any
+medium, provided the original author and source are credited.
diff --git a/dockerfile/Dockerfile b/dockerfile/Dockerfile
new file mode 100644
index 0000000..69a9327
--- /dev/null
+++ b/dockerfile/Dockerfile
@@ -0,0 +1,14 @@
+# Image with the command-line tools used by the lesson scripts:
+# STAR (rna-star), FastQC, samtools and featureCounts (from subread).
+# NOTE(review): package versions are whatever debian:10 ships - confirm that is acceptable.
+FROM debian:10
+
+# update and install share one layer so the package index is never stale;
+# the apt lists are removed in that same layer so they do not end up in the image.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+      fastqc \
+      rna-star \
+      samtools \
+      subread \
+ && rm -rf /var/lib/apt/lists/*
diff --git a/scripts/genomeGenerate.sh b/scripts/genomeGenerate.sh
new file mode 100755
index 0000000..c714640
--- /dev/null
+++ b/scripts/genomeGenerate.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Build the STAR genome index from chr1.fa and its GTF annotation.
+# The index is written into the same directory that holds those inputs.
+# NOTE(review): rnaseq_analysis_on_input_file.sh reads its index from
+# rnaseq/reference_data, not unix_lesson/reference_data - confirm which is intended.
+
+ref_dir=unix_lesson/reference_data
+threads=4
+
+STAR \
+    --runThreadN "$threads" \
+    --runMode genomeGenerate \
+    --genomeDir "$ref_dir" \
+    --genomeFastaFiles "$ref_dir/chr1.fa" \
+    --sjdbGTFfile "$ref_dir/chr1-hg19_genes.gtf" \
+    --sjdbOverhang 99
diff --git a/scripts/rnaseq_analysis_on_input_file.sh b/scripts/rnaseq_analysis_on_input_file.sh
new file mode 100755
index 0000000..1996706
--- /dev/null
+++ b/scripts/rnaseq_analysis_on_input_file.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# Based on
+# https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/07_automating_workflow.html
+#
+# Takes one fastq file of RNA-Seq data, runs FastQC on it, aligns it with
+# STAR, indexes the resulting BAM and writes a featureCounts table for it.
+# All paths are relative to the current working directory.
+#
+# USAGE: bash rnaseq_analysis_on_input_file.sh <name of fastq file>
+
+# stop at the first failing step so later tools never run on missing output
+set -e
+
+# refuse to start without an input file instead of failing inside basename
+if [ -z "${1:-}" ]; then
+    echo "USAGE: $0 <name of fastq file>" >&2
+    exit 1
+fi
+
+# name of the input fastq file
+fq=$1
+
+# grab base of filename for naming outputs (strips a trailing .subset.fq)
+base=$(basename "$fq" .subset.fq)
+echo "Sample name is $base"
+
+# specify the number of cores to use
+cores=4
+
+# directory with genome reference FASTA and index files + name of the gene annotation file
+genome=rnaseq/reference_data
+gtf=rnaseq/reference_data/chr1-hg19_genes.gtf
+
+# set up output filenames and locations
+fastqc_out=rnaseq/results/fastqc
+star_out=rnaseq/results/STAR
+counts_out=rnaseq/results/counts
+align_out=$star_out/${base}_
+counts_input_bam=$star_out/${base}_Aligned.sortedByCoord.out.bam
+counts=$counts_out/${base}_featurecounts.txt
+
+# make all of the output directories
+# The -p option means mkdir will create the whole path if it
+# does not exist and refrain from complaining if it does exist
+mkdir -p "$fastqc_out" "$star_out" "$counts_out"
+
+echo "Processing file $fq"
+
+# Run FastQC, writing its report straight into the fastqc results folder
+# (the directory has to exist already, hence the mkdir above)
+fastqc -o "$fastqc_out" "$fq"
+
+# Run STAR
+STAR --runThreadN "$cores" \
+    --genomeDir "$genome" \
+    --readFilesIn "$fq" \
+    --outFileNamePrefix "$align_out" \
+    --outSAMtype BAM SortedByCoordinate \
+    --outSAMunmapped Within \
+    --outSAMattributes Standard
+
+# Create BAM index
+samtools index "$counts_input_bam"
+
+# Count mapped reads
+featureCounts -T "$cores" -s 2 -a "$gtf" -o "$counts" "$counts_input_bam"