From: Peter Amstutz
Date: Fri, 15 Jan 2021 20:08:49 +0000 (-0500)
Subject: Initial commit
X-Git-Url: https://git.arvados.org/rnaseq-cwl-training.git/commitdiff_plain/2939316d9153c733503155c7dc4e46690bcb210b

Initial commit
---

diff --git a/README b/README
new file mode 100644
index 0000000..5a995d4
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+These CWL lessons are based on "Introduction to RNA-seq using
+high-performance computing (HPC)" lessons developed by members of the
+teaching team at the Harvard Chan Bioinformatics Core (HBC) and
+obtained from
+
+https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2
+
+The original lessons are open access materials distributed under the
+terms of the Creative Commons Attribution license (CC BY 4.0), which
+permits unrestricted use, distribution, and reproduction in any
+medium, provided the original author and source are credited.
diff --git a/dockerfile/Dockerfile b/dockerfile/Dockerfile
new file mode 100644
index 0000000..69a9327
--- /dev/null
+++ b/dockerfile/Dockerfile
@@ -0,0 +1,2 @@
+FROM debian:10
+RUN apt-get update && apt-get -y --no-install-recommends install rna-star fastqc samtools subread
\ No newline at end of file
diff --git a/scripts/genomeGenerate.sh b/scripts/genomeGenerate.sh
new file mode 100755
index 0000000..c714640
--- /dev/null
+++ b/scripts/genomeGenerate.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Generate STAR genome index
+
+STAR --runThreadN 4 \
+--runMode genomeGenerate \
+--genomeDir unix_lesson/reference_data \
+--genomeFastaFiles unix_lesson/reference_data/chr1.fa \
+--sjdbGTFfile unix_lesson/reference_data/chr1-hg19_genes.gtf \
+--sjdbOverhang 99
diff --git a/scripts/rnaseq_analysis_on_input_file.sh b/scripts/rnaseq_analysis_on_input_file.sh
new file mode 100755
index 0000000..1996706
--- /dev/null
+++ b/scripts/rnaseq_analysis_on_input_file.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Based on
+# https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/07_automating_workflow.html
+#
+
+# This script takes a fastq file of RNA-Seq data, runs FastQC and outputs a counts file for it.
+# USAGE: sh rnaseq_analysis_on_input_file.sh FASTQ_FILE
+
+set -e
+
+# store the name of the input fastq file; abort with a usage message if it is missing
+fq=${1:?usage: sh rnaseq_analysis_on_input_file.sh FASTQ_FILE}
+
+# grab base of filename for naming outputs
+base=$(basename "$fq" .subset.fq)
+echo "Sample name is $base"
+
+# specify the number of cores to use
+cores=4
+
+# directory with genome reference FASTA and index files + name of the gene annotation file
+genome=rnaseq/reference_data
+gtf=rnaseq/reference_data/chr1-hg19_genes.gtf
+
+# make all of the output directories
+# The -p option means mkdir will create the whole path if it
+# does not exist and refrain from complaining if it does exist
+mkdir -p rnaseq/results/fastqc
+mkdir -p rnaseq/results/STAR
+mkdir -p rnaseq/results/counts
+
+# set up output filenames and locations
+fastqc_out=rnaseq/results/fastqc
+align_out=rnaseq/results/STAR/${base}_
+counts_input_bam=rnaseq/results/STAR/${base}_Aligned.sortedByCoord.out.bam
+counts=rnaseq/results/counts/${base}_featurecounts.txt
+
+echo "Processing file $fq"
+
+# Run FastQC and write its reports directly into the fastqc results folder
+fastqc -o "$fastqc_out" "$fq"
+
+# Run STAR
+STAR --runThreadN "$cores" --genomeDir "$genome" --readFilesIn "$fq" --outFileNamePrefix "$align_out" --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard
+
+# Create BAM index
+samtools index "$counts_input_bam"
+
+# Count mapped reads
+featureCounts -T "$cores" -s 2 -a "$gtf" -o "$counts" "$counts_input_bam"