Commit 37bf9a7a authored by Nicolas Delhomme's avatar Nicolas Delhomme

original repos structure with README and the first runners for the pipeline

parent c4106e62
This directory will contain a description on how to reproduce the results described in our publications.
This directory is meant to contain SBATCH generic scripts only. For project specific scripts, write them in the projects/[project-name]/pipeline directory instead.
#!/bin/bash -l
#SBATCH -p core
#SBATCH -n 1
#SBATCH -t 0:30:00
#SBATCH --mail-type=ALL
## stop on error but be verbose
set -e
set -x
##
# Run fastQC
##
## Usage: sh runFastQC.sh file ouputFolder
## sanity checks
## executable
## are we on UPPMAX
if [ ! -z $SLURM_SUBMIT_DIR ]; then
module load bioinfo-tools
module load FastQC/0.10.1
## echo "Running on UPPMAX"
else
## echo "Running locally"
fastqc=`which fastqc`
if [ "$?" == "1" ]; then
echo "please install fastqc before running this script or add it to your PATH"
exit 1
fi
if [ ! -f $fastqc -a ! -x $fastqc ]; then
echo "your fastQC does not appear to be an executable file"
exit 1
fi
fi
## arguments
if [ $# != 2 ]; then
echo "This script takes two arguments: the input file and the output directory"
exit 1
fi
## input file
if [ ! -f $1 ]; then
echo "The first argument needs to be an existing fastq (optionally gz) file"
exit 1
fi
## output dir
if [ ! -d $2 ]; then
echo "The second argument needs to be an existing output directory."
fi
## start
fastqc --noextract --outdir $2 $1
#!/bin/bash -l
#SBATCH -p core -n 1
#SBATCH -t 0-01:00:00
#SBATCH --mail-type=ALL
usage() {
echo "usage: `basename $0` <fastq>
Run fastQValidator on a FASTQ file. Prints output on stdout and
exits with a non-zero exit status if the input file does not
conform to the standard.
ARGUMENTS:
fastq a FASTQ file, can be gzipped
NOTES:
fastQValidator must lie in your PATH" 1>&2
}
## stop on error
set -e
## check
if [ $# != 1 ]; then
echo "The argument should be one fastq filename" 1>&2
usage
exit 1
fi
if [ ! -f $1 ]; then
echo "The fastq filename you provided does not exist" 1>&2
usage
exit 1
fi
if ! hash fastQValidator 2>/dev/null; then
echo "fastQValidator was not found in your path" 1>&2
exit 1
fi
## we print 1000 errors, should be enough
fastQValidator --file $1 --printableErrors 1000
#!/bin/bash -l
#SBATCH -p core
#SBATCH -n 1
#SBATCH -t 8:00:00
#SBATCH --mail-type=ALL
## -A and --mail-user set in the submit job
## stop on error
set -ex
## usage
usage(){
echo >&2 \
"
Usage: runHTSeq.sh [options] <out dir> <in.bam> <in.gff>
Options:
-i precise the IDATTR
default to 'Parent', but e.g. should be 'pacid'
for the P. trichocarpa gene exon gff3 file
-s is the protocol stranded?
default to FALSE
Note:
BAM file are expected to be sorted by position
Only HTSeq 0.6+ version(s) are supported
"
exit 1
}
## Are we on UPPMAX?
if [ ! -z $SLURM_SUBMIT_DIR ]; then
## laod the modules
echo Loading modules
module load python/2.7.6
module load bioinfo-tools
module load samtools/0.1.19
else
htseq=`which htseq-count`
if [ "$?" -ne 0 ]; then
echo "error: you need to install htseq or add it to your path"
exit 1
fi
fi
## check the version
isVersion6=`htseq-count --help | grep "version 0.6" | wc -l`
if [ $isVersion6 != 1 ]; then
echo Only HTSeq version 0.6+ are supported
usage
fi
## options
IDATTR="Parent"
stranded=0
## get the options
while getopts i:s option
do
case "$option" in
i) IDATTR=$OPTARG;;
s) stranded=1;;
\?) ## unknown flag
usage;;
esac
done
shift `expr $OPTIND - 1`
## we get two dir and two files as input
if [ $# == 4 ]; then
echo "This function arguments have changed!"
usage
fi
if [ $# != 3 ]; then
echo "This function takes one directory, one bam and one gff3 file as arguments"
usage
fi
if [ ! -d $1 ]; then
echo "The first argument needs to be an existing directory"
usage
fi
if [ ! -f $2 ]; then
echo "The third argument needs to be an existing bam file"
usage
fi
nam=`basename ${2//.bam/}`
if [ ! -f $3 ]; then
echo "The forth argument needs to be an existing gff3 file"
usage
fi
## sort by id
## samtools sort -n $3 $2/${nam}-byname
## get the count table
if [ $stranded == 0 ]; then
## since we are not using strand specific, go for the union
htseq-count -f bam -r pos -m union -s no -t exon -i $IDATTR $2 $3 > $1/$nam.txt
else
htseq-count -f bam -r pos -m intersection-nonempty -s reverse -t exon -i $IDATTR $2 $3 > $1/$nam.txt
fi
## clean
## rm $2/${nam}-byname.bam
#!/bin/bash
# Preprocessing script for RNA-Seq data.
# THIS SCRIPT IS NOT TO BE RUN THROUGH SBATCH, USE BASH!
underline=`tput smul`
nounderline=`tput rmul`
bold=`tput bold`
normal=`tput sgr0`
usage() {
echo "usage: bash `basename $0` [OPTIONS] <proj> <mail> <fastq1> <fastq2> <outdir>
Run the RNA-Seq preprocessing pipeline, i.e. FastQC, Trimmomatic, Sortmerna
and STAR. You throw in a pair of fastq files and it spits out a BAM file. Sweet!
${bold}ARGUMENTS:${normal}
proj project that this should be run as
mail e-mail address where notifications will be sent
fastq1 forward fastq file
fastq2 reverse fastq file
outdir directory to save everything in
${bold}OPTIONS:${normal}
-h show this help message and exit
-s n step at which to start (see ${underline}STEPS${nounderline})
-e n step at which to end (see ${underline}STEPS${nounderline})
-g dir path to STAR reference to use (required if STAR is included
in pipeline)
-G gff gene model GFF3 file for STAR
-H gff gff3 file for ${underline}H${nounderline}TSeq, if not given, the star gff3 will be used
-t library is s${underline}t${nounderline}randed (currently only relevant for HTSeq)
-i IDATTR in GFF3 file to report counts (default: 'Parent')
${bold}STEPS:${normal}
The steps of this script are as follows:
1) fastQValidator
2) FastQC
3) SortMeRNA
4) FastQC
5) Trimmomatic
6) FastQC
7) STAR
8) HTSeq-count
${bold}NOTES:${normal}
* This script should not be run through sbatch, just use bash.
* If the output directory already exists, content may be overwritten
* If the SORTMERNA and UPSCb environmental variables don't exist, the
script will guess them. Set them yourself just to be safe." 1>&2
}
# Check the number of arguments
if [ $# -lt 5 ]; then
usage
exit 1
fi
pstart=1
pend=100
star_ref=
star_gff=
htseq_gff=
stranded=0
idattr="Parent"
# Parse the options
OPTIND=1
while getopts "hs:e:g:G:H:ti:" opt; do
case "$opt" in
h) usage; exit 1 ;;
s) pstart=$OPTARG ;;
e) pend=$OPTARG ;;
g) star_ref=$OPTARG ;;
G) star_gff=$OPTARG ;;
H) htseq_gff=$OPTARG ;;
t) stranded=1 ;;
i) idattr=$OPTARG ;;
?) usage; exit 1 ;;
esac
done
shift $((OPTIND - 1))
[[ $pstart =~ ^[0-9]+$ ]] || {
echo "ERROR: '$pstart' is not a valid start value" 1>&2
exit 1
}
[[ $pend =~ ^[0-9]+$ ]] || {
echo "ERROR: '$pend' is not a valid end value" 1>&2
exit 1
}
[[ $pend -lt $pstart ]] && {
echo "ERROR: you can't finish before you start" 1>&2
exit 1
}
# Check if STAR is included and then if the reference is set
if [ $pstart -le 7 ] && [ $pend -ge 7 ]; then
if [ -z $star_ref ]; then
echo >&2 "ERROR: You are running STAR but have not given a STAR reference"
exit 1
elif [ ! -d $star_ref ]; then
echo >&2 "ERROR: Could not find STAR reference"
exit 1
fi
if [ ! -z $star_gff ] && [ ! -f $star_gff ]; then
echo >&2 "ERROR: Could not find gff file: '$star_gff'"
exit 1
fi
fi
# Check that HTSeq will get a GFF3 file (if it's included in the pipeline)
if [ $pstart -le 8 ] && [ $pend -ge 8 ]; then
if [ -z $htseq_gff ] && [ -z $star_gff ]; then
echo >&2 "ERROR: HTSeq needs a GFF3 file"
exit 1
elif [ -z $htseq_gff ]; then
htseq_gff=$star_gff
elif [ ! -z $htseq_gff ] && [ ! -f $htseq_gff ]; then
echo >&2 "ERROR: Could not find gff file: '$htseq_gff'"
exit 1
fi
fi
# This variable holds the absolute path of this script, i.e. the
# repository's pipeline directory.
# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
PIPELINE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
set -e
proj=$1
mail=$2
if [ ! -f $3 ]; then
echo "ERROR: fastq1 is not a file: '$3'" 1>&2
usage
exit 1
fi
if [ ! -f $4 ]; then
echo "ERROR: fastq2 is not a file: '$4'" 1>&2
usage
exit 1
fi
fastq1=$3
fastq2=$4
# Sample name to use for output
s_prefix=${fastq1%_1.f*q.gz}
sname=`basename $s_prefix`
if [ ! -d `dirname $5` ]; then
echo "ERROR: could not find parent directory for output directory" 1>&2
usage
exit 1
fi
## Set up the directory structure
outdir=$5
[[ ! -d $outdir ]] && mkdir $outdir
## Use the directory name as a job identifier
JOBNAME=`basename $outdir`
## FastQC
fastqc_raw="$outdir/fastqc/raw"
[[ ! -d $fastqc_raw ]] && mkdir -p $fastqc_raw
fastqc_sortmerna="$outdir/fastqc/sortmerna"
[[ ! -d $fastqc_sortmerna ]] && mkdir $fastqc_sortmerna
fastqc_trimmomatic="$outdir/fastqc/trimmomatic"
[[ ! -d $fastqc_trimmomatic ]] && mkdir $fastqc_trimmomatic
## SortmeRNA
sortmerna="$outdir/sortmerna"
[[ ! -d $sortmerna ]] && mkdir $sortmerna
## Trimmomatic
trimmomatic="$outdir/trimmomatic"
[[ ! -d $trimmomatic ]] && mkdir $trimmomatic
## STAR
star="$outdir/star"
[[ ! -d $star ]] && mkdir $star
## HTSeq
htseq="$outdir/htseq"
[[ ! -d $htseq ]] && mkdir $htseq
## Export some variables
[[ -z $UPSCb ]] && export UPSCb=$PIPELINE_DIR/..
## I will just assume that the sortmerna data is symlinked in the repo
[[ -z $SORTMERNADIR ]] && export SORTMERNADIR=$PIPELINE_DIR/../data/sortmerna
if [ ! -e $SORTMERNADIR ]; then
echo "ERROR: could not find the sortmerna data in $SORTMERNADIR" 1&>2
exit 1
fi
## Functions ##
run_sbatch_usage() {
echo "usage: run_sbatch OPTIONS <batch script> [<script param> ...]
Run a batch script and echo the job id
OPTIONS
-e stderr (required)
-o stdout (required)
-J job name
-d dependency" 1>&2
}
cleanup() {
#jobs=$@
if [ $# -gt 0 ]; then
echo "Canceling already started jobs: $@" 1>&2
scancel $@
fi
exit 3
}
run_sbatch() {
# Start a batch script and echo the job id
if [ $# -lt 3 ]; then
run_sbatch_usage
exit 1
fi
OPTIND=1
log_path=""
out_path=""
dependency=""
while getopts "J:e:o:d:" opt; do
case "$opt" in
J) jobname=$OPTARG ;;
e) log_path=$OPTARG ;;
o) out_path=$OPTARG ;;
d) dependency=$OPTARG ;;
?) run_sbatch_usage; exit 1 ;;
esac
done
shift $((OPTIND-1))
script=$1
shift
if [ -z $jobname ]; then
jobname="${sname}.RNAPreproc.${script}"
fi
# Check that the output file paths are given
if [ -z $log_path ] || [ -z $out_path ]; then
run_sbatch_usage
exit 1
fi
# Check that the output file directories exist
if [ ! -d `dirname $log_path` ] || [ ! -d `dirname $out_path` ]; then
echo "ERROR: stderr and stdout paths must exist" 1>&2
run_sbatch_usage
exit 1
fi
sbatch_options=
if [ ! -z $dependency ]; then
sbatch_options="-d $dependency"
fi
sbatch_echo=`sbatch -A "$proj" \
-J "$jobname" \
--mail-user "$mail" \
-e "$log_path" \
-o "$out_path" \
$sbatch_options \
$PIPELINE_DIR/$script $@`
if [ $? -ne 0 ]; then
echo "ERROR: submission failed" 1>&2
cleanup ${JOBIDS[*]}
fi
echo ${sbatch_echo//[^0-9]/}
}
## End functions ##
## Job ID array
JOBIDS=()
## Run fastQValidator
if [ $pstart -le 1 ]; then
fastqv_id1=`run_sbatch \
-e $fastqc_raw/${sname}_1_validate.err \
-o $fastqc_raw/${sname}_1_validate.out \
-J ${sname}.RNAseq.FastQValidate1 \
runFastQValidator.sh $fastq1`
JOBIDS+=($fastqv_id1)
fastqv_id2=`run_sbatch \
-e $fastqc_raw/${sname}_2_validate.err \
-o $fastqc_raw/${sname}_2_validate.out \
-J ${sname}.RNAseq.FastQCValidate2 \
runFastQValidator.sh $fastq2`
JOBIDS+=($fastqv_id2)
fi
## Run FastQC. Depends on a successful run of fastQValidator
if [ $pstart -le 2 ] && [ $pend -ge 2 ]; then
dep1=
dep2=
if [ $pstart -lt 2 ]; then
dep1="-d afterok:$fastqv_id1"
dep2="-d afterok:$fastqv_id2"
fi
fqc_raw_id1=`run_sbatch \
-e $fastqc_raw/${sname}_1_fastqc.err \
-o $fastqc_raw/${sname}_1_fastqc.out \
-J ${sname}.RNAseq.FastQC.raw1 \
$dep1 runFastQC.sh $fastq1 $fastqc_raw`
JOBIDS+=($fqc_raw_id1)
fqc_raw_id2=`run_sbatch \
-e $fastqc_raw/${sname}_2_fastqc.err \
-o $fastqc_raw/${sname}_2_fastqc.out \
-J ${sname}.RNAseq.FastQC.raw2 \
$dep2 runFastQC.sh $fastq2 $fastqc_raw`
JOBIDS+=($fqc_raw_id2)
fi
#TODO: Run fastqc stats
## Run Sortmerna. Depends on a successful run of FastQC
if [ $pstart -le 3 ] && [ $pend -ge 3 ]; then
dep=
if [ $pstart -lt 3 ]; then
dep="-d afterok:$fqc_raw_id1:$fqc_raw_id2"
fi
sortmerna_id=`run_sbatch \
-e $sortmerna/${sname}_sortmerna.err \
-o $sortmerna/${sname}_sortmerna.out \
$dep \
-J ${sname}.RNAseq.SortMeRNA \
runSortmerna.sh $sortmerna $SNIC_TMP $fastq1 $fastq2`
JOBIDS+=($sortmerna_id)
fi
fastq_sort_1="$sortmerna/${sname}_sortmerna_1.fq.gz"
fastq_sort_2="$sortmerna/${sname}_sortmerna_2.fq.gz"
# Run FastQC. Depends on a successful run of SortMeRNA
if [ $pstart -le 4 ] && [ $pend -ge 4 ]; then
dep=
if [ $pstart -lt 4 ]; then
dep="-d afterok:$sortmerna_id"
elif [ ! -f $fastq_sort_1 ] || [ ! -f $fastq_sort_2 ]; then
echo >&2 "ERROR: rRNA-filtered FASTQ-files could not be found"
cleanup
fi
fqc_sort_id1=`run_sbatch \
-e $fastqc_sortmerna/${sname}_1_fastqc.err \
-o $fastqc_sortmerna/${sname}_1_fastqc.out \
-J ${sname}.RNAseq.FastQC.SortMeRNA1 \
$dep runFastQC.sh $fastq_sort_1 $fastqc_sortmerna`
JOBIDS+=($fqc_sort_id1)
fqc_sort_id2=`run_sbatch \
-e $fastqc_sortmerna/${sname}_2_fastqc.err \
-o $fastqc_sortmerna/${sname}_2_fastqc.out \
-J ${sname}.RNAseq.FastQC.SortMeRNA2 \
$dep runFastQC.sh $fastq_sort_2 $fastqc_sortmerna`
JOBIDS+=($fqc_sort_id2)
fi
## Run trimmomatic. Depends on a successful run of SortMeRNA
if [ $pstart -le 5 ] && [ $pend -ge 5 ]; then
dep=
if [ $pstart -lt 5 ]; then
# SortMeRNA has to finish, if it was started
dep="-d afterok:$sortmerna_id"
elif [ ! -f $fastq_sort_1 ] || [ ! -f $fastq_sort_2 ]; then
# If we start here, make sure the files exist
echo >&2 "ERROR: rRNA-filtered FASTQ-files could not be found"
cleanup
fi
trimmomatic_id=`run_sbatch \
-e $trimmomatic/${sname}_trimmomatic.err \
-o $trimmomatic/${sname}_trimmomatic.log \
-J ${sname}.RNAseq.Trimmomatic \
$dep \
runTrimmomatic.sh $fastq_sort_1 $fastq_sort_2 $trimmomatic SLIDINGWINDOW:7:30 MINLEN:50`
JOBIDS+=($trimmomatic_id)
fi
## Trimmed fastq file paths
fastq_trimmed_1="$trimmomatic/${sname}_sortmerna_trimmomatic_1.fq.gz"
fastq_trimmed_2="$trimmomatic/${sname}_sortmerna_trimmomatic_2.fq.gz"
# Run FastQC. Depends on a successful run of Trimmomatic
if [ $pstart -le 6 ] && [ $pend -ge 6 ]; then
dep=
if [ $pstart -lt 6 ]; then
dep="-d afterok:$trimmomatic_id"
elif [ ! -f $fastq_trimmed_1 ] || [ ! -f $fastq_trimmed_2 ]; then
echo >&2 "ERROR: rRNA-filtered FASTQ-files could not be found"
cleanup
fi
fqc_trimmed_id1=`run_sbatch \
-e $fastqc_trimmomatic/${sname}_1_fastqc.err \
-o $fastqc_trimmomatic/${sname}_1_fastqc.out \
-J ${sname}.RNAseq.FastQC.Trimmomatic1 \
$dep runFastQC.sh $fastq_trimmed_1 $fastqc_trimmomatic`
JOBIDS+=($fqc_trimmed_id1)
fqc_trimmed_id2=`run_sbatch \
-e $fastqc_trimmomatic/${sname}_2_fastqc.err \
-o $fastqc_trimmomatic/${sname}_2_fastqc.out \
-J ${sname}.RNAseq.FastQC.Trimmomatic2 \
$dep runFastQC.sh $fastq_trimmed_2 $fastqc_trimmomatic`
JOBIDS+=($fqc_trimmed_id2)
fi
# Run STAR. Depends on a successful run of Trimmomatic.
if [ $pstart -le 7 ] && [ $pend -ge 7 ]; then
dep=
if [ $pstart -lt 7 ]; then
dep="-d afterok:$trimmomatic_id"
elif [ ! -f $fastq_trimmed_1 ] || [ ! -f $fastq_trimmed_2 ]; then
echo >&2 "ERROR: rRNA-filtered FASTQ-files could not be found"
cleanup
fi
if [ ! -z $star_gff ]; then
star_id=`run_sbatch \
-e $star/${sname}_STAR.err \
-o $star/${sname}_STAR.out \
-J ${sname}.RNAseq.STAR \
$dep runSTAR.sh -o $star $fastq_trimmed_1 $fastq_trimmed_2 $star_ref $star_gff -- --outReadsUnmapped Fastx`
else
star_id=`run_sbatch \
-e $star/${sname}_STAR.err \
-o $star/${sname}_STAR.out \
-J ${sname}.RNAseq.STAR \
$dep runSTAR.sh -o $star -g $fastq_trimmed_1 $fastq_trimmed_2 $star_ref -- --outReadsUnmapped Fastx`
fi
JOBIDS+=($star_id)
fi
# Run HTSeq. Depends on a successful run of STAR
if [ $pstart -le 8 ] && [ $pend -ge 8 ]; then
# Load a proper python version
module load python/2.7.6
if hash htseq-count; then
if ! htseq-count --help 2>&1 | grep "version 0.6" >/dev/null 2>&1; then
echo >&2 "ERROR: HTSeq v0.6 or higher is required"
cleanup
fi
else
echo >&2 "ERROR: Could not find HTSeq in path"
cleanup
fi
dep=
if [ $pstart -lt 8 ]; then
dep="-d afterok:$star_id"
elif [ ! -f $star/${sname}_sortmerna_trimmomatic_STAR.bam ]; then
echo >&2 "ERROR: STAR alignment BAM file could not be found"
cleanup
fi
strand_arg=
if [ $stranded -eq 1 ]; then
strand_arg="-s"
fi
htseq_id=`run_sbatch \
-e $htseq/${sname}_HTSeq.err \
-o $htseq/${sname}_HTSeq.out \
-J ${sname}.RNAseq.HTSeq \
$dep runHTSeq.sh -i $idattr $strand_arg $htseq $star/${sname}_sortmerna_trimmomatic_STAR.bam $htseq_gff`
JOBIDS+=($htseq_id)
fi
echo "Successfully submitted ${#JOBIDS[@]} jobs for ${sname}: ${JOBIDS[@]}"
#!/bin/bash -l
## THINK OF --outStd SAM --outSAMunmapped Within to write SAM directly and keep all reads. That does not affect any of the log file to be generated
## but consider if we want that when reporting the Chimeric SAM (i.e. for merging the files, we would not want the reads to be part of the SAM...
## the good thing with outputting to SAM is that it can be readily piped into samtools -bs - | samtools sort - filename
#SBATCH -p core
#SBATCH -n 8
#SBATCH -t 0-02:00:00
#SBATCH --mail-type=ALL
#################
## Build geneModel
#################
## TODO extract that to its own script
##usage sbatch -p devel -t 1:00:00 runSTAR.sh genome.fa
#/home/davidsu/bin/STAR --runMode genomeGenerate --genomeDir $1 --genomeFastaFiles $2 --sjdbOverhang 99 --sjdbGTFfile $3 --runThreadN 8
#exit;
## stop on error and be verbose in the output
set -e -x
## exec
STAR=
### tool sanity
if [ ! -z $SLURM_SUBMIT_DIR ]; then
module load bioinfo-tools
module load samtools/0.1.19
module load star/2.3.0e
STAR=`which STAR`
else
STAR=`which STAR`
if [ $? != 0 ]; then
echo "please install STAR before running this script or add it to your PATH"
exit 1
fi