From 2939316d9153c733503155c7dc4e46690bcb210b Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Fri, 15 Jan 2021 15:08:49 -0500
Subject: [PATCH] Initial commit

---
 README                                   | 11 +++++
 dockerfile/Dockerfile                    |  2 +
 scripts/genomeGenerate.sh                | 10 +++++
 scripts/rnaseq_analysis_on_input_file.sh | 51 ++++++++++++++++++++++++
 4 files changed, 74 insertions(+)
 create mode 100644 README
 create mode 100644 dockerfile/Dockerfile
 create mode 100755 scripts/genomeGenerate.sh
 create mode 100755 scripts/rnaseq_analysis_on_input_file.sh

diff --git a/README b/README
new file mode 100644
index 0000000..5a995d4
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+These CWL lessons are based on "Introduction to RNA-seq using
+high-performance computing (HPC)" lessons developed by members of the
+teaching team at the Harvard Chan Bioinformatics Core (HBC) and
+obtained from
+
+https://github.com/hbctraining/Intro-to-rnaseq-hpc-O2
+
+The original lessons are open access materials distributed under the
+terms of the Creative Commons Attribution license (CC BY 4.0), which
+permits unrestricted use, distribution, and reproduction in any
+medium, provided the original author and source are credited.
diff --git a/dockerfile/Dockerfile b/dockerfile/Dockerfile
new file mode 100644
index 0000000..69a9327
--- /dev/null
+++ b/dockerfile/Dockerfile
@@ -0,0 +1,2 @@
+FROM debian:10
+RUN apt-get update && apt-get -y --no-install-recommends install rna-star fastqc samtools subread
\ No newline at end of file
diff --git a/scripts/genomeGenerate.sh b/scripts/genomeGenerate.sh
new file mode 100755
index 0000000..c714640
--- /dev/null
+++ b/scripts/genomeGenerate.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Generate STAR genome index from the chr1 FASTA and GTF annotation using 4 threads
+# NOTE for review: this writes to unix_lesson/reference_data while the analysis script reads rnaseq/reference_data - confirm
+STAR --runThreadN 4 \
+--runMode genomeGenerate \
+--genomeDir unix_lesson/reference_data \
+--genomeFastaFiles unix_lesson/reference_data/chr1.fa \
+--sjdbGTFfile unix_lesson/reference_data/chr1-hg19_genes.gtf \
+--sjdbOverhang 99
diff --git a/scripts/rnaseq_analysis_on_input_file.sh b/scripts/rnaseq_analysis_on_input_file.sh
new file mode 100755
index 0000000..1996706
--- /dev/null
+++ b/scripts/rnaseq_analysis_on_input_file.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Based on
+# https://hbctraining.github.io/Intro-to-rnaseq-hpc-O2/lessons/07_automating_workflow.html
+#
+
+# This script takes a fastq file of RNA-Seq data, runs FastQC and outputs a counts file for it.
+# USAGE: bash rnaseq_analysis_on_input_file.sh FASTQ_FILE
+
+set -euo pipefail
+
+# name of the input fastq file, the script aborts with a usage message when it is missing
+fq="${1:?usage: bash rnaseq_analysis_on_input_file.sh FASTQ_FILE}"
+
+# grab base of filename for naming outputs
+base=$(basename "$fq" .subset.fq)
+echo "Sample name is $base"
+
+# specify the number of cores to use
+cores=4
+
+# directory with genome reference FASTA and index files + name of the gene annotation file
+genome=rnaseq/reference_data
+gtf=rnaseq/reference_data/chr1-hg19_genes.gtf
+
+# make all of the output directories
+# The -p option means mkdir will create the whole path if it
+# does not exist and refrain from complaining if it does exist
+mkdir -p rnaseq/results/fastqc rnaseq/results/STAR rnaseq/results/counts
+
+# set up output filenames and locations
+fastqc_out=rnaseq/results/fastqc
+align_out="rnaseq/results/STAR/${base}_"
+counts_input_bam="rnaseq/results/STAR/${base}_Aligned.sortedByCoord.out.bam"
+counts="rnaseq/results/counts/${base}_featurecounts.txt"
+
+echo "Processing file $fq"
+
+# Run FastQC and write its report into the fastqc results folder
+fastqc -o "$fastqc_out" "$fq"
+
+# Run STAR
+STAR --runThreadN "$cores" --genomeDir "$genome" --readFilesIn "$fq" \
+  --outFileNamePrefix "$align_out" --outSAMtype BAM SortedByCoordinate \
+  --outSAMunmapped Within --outSAMattributes Standard
+
+# Create BAM index
+samtools index "$counts_input_bam"
+
+# Count mapped reads
+featureCounts -T "$cores" -s 2 -a "$gtf" -o "$counts" "$counts_input_bam"
-- 
2.30.2