Commit 13d5f2d4 authored by Nicolas Delhomme

updated the pipeline

parent bf69cb31
Pipeline #53 skipped
......@@ -18,7 +18,7 @@ set -x
## are we on UPPMAX
if [ ! -z $SLURM_SUBMIT_DIR ]; then
module load bioinfo-tools
module load FastQC/0.10.1
module load FastQC/0.11.5
## echo "Running on UPPMAX"
else
## echo "Running locally"
......
......@@ -4,8 +4,16 @@
#SBATCH -t 0-01:00:00
#SBATCH --mail-type=ALL
## load the module if it exists
module load bioinfo-tools && module load fastQvalidator || {
if ! hash fastQValidator 2>/dev/null; then
echo "fastQValidator was not found in your path" 1>&2
exit 1
fi
}
usage() {
echo "usage: `basename $0` <fastq>
echo "usage: `basename $0` <fastq>
Run fastQValidator on a FASTQ file. Prints output on stdout and
exits with a non-zero exit status if the input file does not
......@@ -16,6 +24,8 @@ ARGUMENTS:
NOTES:
fastQValidator must lie in your PATH" 1>&2
exit 1
}
## stop on error
......@@ -23,20 +33,13 @@ set -e
## check
if [ $# != 1 ]; then
echo "The argument should be one fastq filename" 1>&2
echo "This function takes one argument: a fastq filename" 1>&2
usage
exit 1
fi
if [ ! -f $1 ]; then
echo "The fastq filename you provided does not exist" 1>&2
usage
exit 1
fi
if ! hash fastQValidator 2>/dev/null; then
echo "fastQValidator was not found in your path" 1>&2
exit 1
fi
## we print 1000 errors, should be enough
......
......@@ -20,7 +20,9 @@ echo >&2 \
for the P. trichocarpa gene exon gff3 file
-s is the protocol stranded?
default to FALSE
-a are we counting antisense transcripts?
default to FALSE, only active in combination with -s
-t Chose attribute to count in the gff3 file default is exon
Note:
BAM file are expected to be sorted by position
Only HTSeq 0.6+ version(s) are supported
......@@ -28,24 +30,11 @@ echo >&2 \
exit 1
}
## Are we on UPPMAX?
if [ ! -z $SLURM_SUBMIT_DIR ]; then
## load the modules
echo Loading modules
module load python/2.7.6
module load bioinfo-tools
module load samtools/0.1.19
else
htseq=`which htseq-count`
if [ "$?" -ne 0 ]; then
echo "error: you need to install htseq or add it to your path"
exit 1
fi
fi
echo Loading modules
module load bioinfo-tools htseq
## check the version
isVersion6=`htseq-count --help | grep "version 0.6" | wc -l`
if [ $isVersion6 != 1 ]; then
if [ `htseq-count --help | grep -c "version 0.6"` -ne 1 ]; then
echo Only HTSeq version 0.6+ are supported
usage
fi
......@@ -53,13 +42,17 @@ fi
## options
IDATTR="Parent"
stranded=0
antisense=0
t="exon"
## get the options
while getopts i:s option
while getopts ai:st: option
do
case "$option" in
a) antisense=1;;
i) IDATTR=$OPTARG;;
s) stranded=1;;
t) t=$OPTARG;;
\?) ## unknown flag
usage;;
esac
......@@ -67,12 +60,6 @@ done
shift `expr $OPTIND - 1`
## we get two dir and two files as input
if [ $# == 4 ]; then
echo "This function arguments have changed!"
usage
fi
if [ $# != 3 ]; then
echo "This function takes one directory, one bam and one gff3 file as arguments"
usage
......@@ -84,28 +71,33 @@ if [ ! -d $1 ]; then
fi
if [ ! -f $2 ]; then
echo "The third argument needs to be an existing bam file"
echo "The second argument needs to be an existing bam file"
usage
fi
nam=`basename ${2//.bam/}`
if [ ! -f $3 ]; then
echo "The forth argument needs to be an existing gff3 file"
echo "The third argument needs to be an existing gff3 file"
usage
fi
## sort by id
## samtools sort -n $3 $2/${nam}-byname
if [ $t == "CDS" ]; then
echo "Warning: the CDS option require the CDS feature to be capital in you gff3 file"
fi
## get the count table
if [ $stranded == 0 ]; then
if [ $antisense == 1 ]; then
echo "The antisense only works in conjunction with the -s option" >&2
fi
## since we are not using strand specific, go for the union
htseq-count -f bam -r pos -m union -s no -t exon -i $IDATTR $2 $3 > $1/$nam.txt
htseq-count -f bam -r pos -m union -s no -t $t -i $IDATTR $2 $3 > $1/$nam.txt
else
htseq-count -f bam -r pos -m intersection-nonempty -s reverse -t exon -i $IDATTR $2 $3 > $1/$nam.txt
## normal counting
if [ $antisense == 0 ]; then
htseq-count -f bam -r pos -m intersection-nonempty -s reverse -t $t -i $IDATTR $2 $3 > $1/$nam.txt
else
htseq-count -f bam -r pos -m intersection-nonempty -s yes -t $t -i $IDATTR $2 $3 > $1/$nam.txt
fi
fi
## clean
## rm $2/${nam}-byname.bam
#!/bin/bash
set -e
### ========================================================
# Preprocessing script for RNA-Seq data.
# THIS SCRIPT IS NOT TO BE RUN THROUGH SBATCH, USE BASH!
### ========================================================
VERSION="0.1.6"
### ========================================================
## pretty print
### ========================================================
underline=`tput smul`
nounderline=`tput rmul`
bold=`tput bold`
normal=`tput sgr0`
### ========================================================
## functions
### ========================================================
## usage
usage() {
echo "usage: bash `basename $0` [OPTIONS] <proj> <mail> <fastq1> <fastq2> <outdir>
......@@ -24,14 +35,20 @@ ${bold}ARGUMENTS:${normal}
${bold}OPTIONS:${normal}
-h show this help message and exit
-d dry-run. Check for the software dependencies, output the command lines and exit
-D Debug. On Uppmax use a devel node to try out changes during development or program updates, it's highly recommended to not run more than one step in this mode
-s n step at which to start (see ${underline}STEPS${nounderline})
-e n step at which to end (see ${underline}STEPS${nounderline})
-g dir path to STAR reference to use (required if STAR is included
in pipeline)
-k the STAR genome is available in shared memory (the STAR option ${underline}--genomeLoad LoadAndKeep${nounderline})
-l the BAM sorting memory limit for STAR in bytes (default: 10000000000)
-G gff gene model GFF3 file for STAR
-H gff gff3 file for ${underline}H${nounderline}TSeq, if not given, the star gff3 will be used
-t library is s${underline}t${nounderline}randed (currently only relevant for HTSeq)
-i IDATTR in GFF3 file to report counts (default: 'Parent')
-m the memory requirement for performing the alignment 128, 256 or 512 (128 is default, using ${bold}fat${normal} request anything larger than 128 )
-t library is s${underline}t${nounderline}randed (currently only relevant for HTSeq, for the illumina protocol, second strand cDNA using dUTP)
-a library is s${underline}t${nounderline}randed (currently only relevant for HTSeq, for the non illumina protocol e.g. first strand cDNA using dUTP)
${bold}STEPS:${normal}
The steps of this script are as follows:
......@@ -47,67 +64,308 @@ ${bold}STEPS:${normal}
${bold}NOTES:${normal}
* This script should not be run through sbatch, just use bash.
* If the output directory already exists, content may be overwritten
* If the output directory already exists, content may be overwritten.
* All tools needed must be in your PATH. You will be notified if they aren't.
* If the SORTMERNA and UPSCb environmental variables don't exist, the
script will guess them. Set them yourself just to be safe." 1>&2
script will guess them. Set them yourself just to be safe.
* If sbatch is not available, then the script does not submit the jobs
but rather print out the commands.
* The -a and -t option are mutually exclusive
" 1>&2
exit 1
}
# Check the number of arguments
if [ $# -lt 5 ]; then
usage
## check if a tool is present and is executable
## Check that a tool is present and executable.
## Echoes 0 if $1 resolves to an executable regular file on the PATH,
## 1 otherwise (missing, a builtin/function, or not executable).
## Fixed: use the portable 'command -v' instead of 'which', and quote
## the expansions so paths with spaces do not break the tests.
## NOTE: 'tool' is now local; nothing else in the script reads it.
toolCheck() {
    local tool
    tool=$(command -v -- "$1" 2>/dev/null)
    if [ -n "$tool" ] && [ -f "$tool" ] && [ -x "$tool" ]; then
        echo 0
    else
        echo 1
    fi
}
## cleanup on failed test
cleanup() {
    ## Abort the whole run with status 3; any arguments are taken to be
    ## already-submitted SLURM job ids and are cancelled first.
    [ "$#" -eq 0 ] || {
        echo "Canceling already started jobs: $@" 1>&2
        scancel "$@"
    }
    exit 3
}
### --------------------------------------------------------
## sbatch related
run_sbatch_usage() {
    # Print usage for the sbatch wrapper functions on stderr and exit
    # non-zero.
    # Fixed: removed a stray 'fi' (merge leftover) that made the whole
    # script unparseable by bash.
    echo "usage: run_sbatch OPTIONS <batch script> [<script param> ...]
Run a batch script and echo the job id
OPTIONS
-e stderr (required)
-o stdout (required)
-J job name
-d dependency
-D debug mode
-m memory" 1>&2
    exit 1
}
## Build and echo (not run) the sbatch command line for one pipeline step.
## Globals read: sname, proj, mail, PIPELINE_DIR (set by the main script).
## Globals written: jobname, log_path, out_path, dependency, memory, debug,
## script (none are declared local).
prepare_sbatch() {
# Start a batch script and echo the job id
if [ $# -lt 3 ]; then
echo "ERROR: wrong number of arguments"
run_sbatch_usage
fi
# Reset getopts state: this function is called repeatedly.
OPTIND=1
log_path=""
out_path=""
dependency=""
memory=""
debug=""
while getopts "J:e:o:d:Dm:" opt; do
case "$opt" in
J) jobname=$OPTARG ;;
e) log_path=$OPTARG ;;
o) out_path=$OPTARG ;;
d) dependency=$OPTARG ;;
# -D routes the job to a devel node with a 1h limit (UPPMAX).
D) debug="-p devel -t 1:00:00";;
m) memory=$OPTARG ;;
?) run_sbatch_usage;;
esac
done
shift $((OPTIND-1))
script=$1
shift
# Default job name: <sample>.RNAPreproc.<script>
if [ -z $jobname ]; then
jobname="${sname}.RNAPreproc.${script}"
fi
# Check that the output file paths are given
# NOTE(review): run_sbatch_usage already exits, so the following 'usage'
# call looks unreachable — confirm which of the two is intended.
if [ -z $log_path ] || [ -z $out_path ]; then
echo "ERROR: output file paths are not given"
run_sbatch_usage
usage
fi
# Check that the output file directories exist
if [ ! -d `dirname $log_path` ] || [ ! -d `dirname $out_path` ]; then
echo "ERROR: stderr and stdout paths must exist" 1>&2
run_sbatch_usage
usage
fi
# sbatch_options=
# if [ ! -z $memory ]; then
# sbatch_options=" -C $memory"
# fi
# if [ ! -z $dependency ]; then
# sbatch_options="${sbatch_options} -d $dependency"
# fi
# echo the command
# -C (constraint) and -d (dependency) are only emitted when set, via the
# ${var:+...} alternate-value expansion.
echo "sbatch -A" $proj \
"-J" $jobname "--mail-user" $mail \
"-e" $log_path "-o" $out_path \
$debug \
${memory:+"-C" "$memory"} \
${dependency:+"-d" "$dependency"} \
$PIPELINE_DIR/$script $@
}
## TODO warn about the random!!!
## Submit one prepared command line ($1, the string built by
## prepare_sbatch) and echo the resulting numeric job id.
## Globals read: dryrun, SPACEIFS, LFIFS, JOBIDS.
run_sbatch() {
if [ $dryrun -eq 1 ]; then
# Dry-run: nothing is submitted; fake a job id (see TODO above) so
# the dependency bookkeeping downstream still works.
echo $RANDOM
else
# The prepared command is space-separated; temporarily restore the
# saved space IFS so word splitting applies, then switch back to the
# line-feed IFS used by the rest of the script.
IFS=$SPACEIFS
sbatch_echo=`$1`
if [ $? -ne 0 ]; then
echo "ERROR: submission failed" 1>&2
echo "$1" 1>&2
# Cancel everything already submitted and exit 3.
cleanup ${JOBIDS[*]}
fi
IFS=$LFIFS
# sbatch prints e.g. "Submitted batch job 12345"; strip everything
# that is not a digit to keep only the job id.
echo ${sbatch_echo//[^0-9]/}
fi
}
### --------------------------------------------------------
## bash commands
prepare_bash() {
    ## Dry-run / no-sbatch counterpart of prepare_sbatch: accept the same
    ## sbatch-style options, discard their values, and echo the plain bash
    ## invocation for the step instead.
    OPTIND=1
    log_path="" out_path="" dependency="" memory="" debug=""
    while getopts "J:e:o:d:Dm:" opt; do
        case "$opt" in
            e) log_path=$OPTARG ;;
            o) out_path=$OPTARG ;;
            J) jobname=$OPTARG ;;
            d) dependency=$OPTARG ;;
            m) memory=$OPTARG ;;
            D) debug="" ;;
            ?) run_sbatch_usage ;;
        esac
    done
    shift $((OPTIND-1))
    ## first remaining argument is the script, the rest its parameters
    script=$1
    shift
    echo "bash $PIPELINE_DIR/$script $@;"
}
run_bash(){
    ## Stand-in for run_sbatch when jobs are run through bash: emit a
    ## pseudo job id so the dependency bookkeeping keeps working.
    printf '%s\n' "$RANDOM"
}
### ========================================================
## main
### ========================================================
## set the default vars
dryrun=0
debug=0
pstart=1
pend=100
pend=8
star_ref=
star_gff=
star_runner_options=
star_options=
htseq_gff=
stranded=0
ilm_stranded=0
non_ilm_stranded=0
idattr="Parent"
mem=
bam_memory=10000000000
# Parse the options
OPTIND=1
while getopts "hs:e:g:G:H:ti:" opt; do
while getopts "hdDs:e:g:G:kl:H:tai:m:" opt; do
case "$opt" in
h) usage; exit 1 ;;
h) usage;;
d) dryrun=1;;
D) debug=1;;
s) pstart=$OPTARG ;;
e) pend=$OPTARG ;;
g) star_ref=$OPTARG ;;
k) star_options="-- --genomeLoad LoadAndKeep";;
G) star_gff=$OPTARG ;;
l) bam_memory=$OPTARG ;;
H) htseq_gff=$OPTARG ;;
t) stranded=1 ;;
t) ilm_stranded=1 ;;
a) non_ilm_stranded=1;;
i) idattr=$OPTARG ;;
?) usage; exit 1 ;;
m) case "$OPTARG" in
128) mem=;;
256) mem="mem${OPTARG}GB";;
512) mem="mem${OPTARG}GB";;
fat) mem="fat";;
*) usage;;
esac ;;
?) usage;;
esac
done
shift $((OPTIND - 1))
# check for exclusive options
if [ $ilm_stranded -eq 1 ] && [ $non_ilm_stranded -eq 1 ]; then
echo "ERROR: the -a and -t option are mutually exclusive. Choose either or."
usage
fi
# check the number of arguments
if [ "$#" != "5" ]; then
echo "ERROR: this script expects 5 arguments"
usage
fi
## Do basic checks
[[ $pstart =~ ^[0-9]+$ ]] || {
echo "ERROR: '$pstart' is not a valid start value" 1>&2
exit 1
usage
}
[[ $pend =~ ^[0-9]+$ ]] || {
echo "ERROR: '$pend' is not a valid end value" 1>&2
exit 1
usage
}
[[ $pend -lt $pstart ]] && {
echo "ERROR: you can't finish before you start" 1>&2
exit 1
usage
}
echo "### ========================================
# UPSC pre-RNA-Seq pipeline v$VERSION
### ========================================
# Checking the environment for the necessary tools"
## trimmotatic is java based and we have the jar
## in the git repos, so we only need to check
## for java
## the toolList list all the necessary tools
## the toolArray (starting at 1) link the tool(s) to its respective step(s)
## starArray and htseqArray are there to simulate a nested array
toolList=(fastQValidator fastqc sortmerna java STAR samtools python htseq-count)
starArray=([0]=4 [1]=5)
htseqArray=([0]=6 [1]=7)
toolArray=([1]=0 [2]=1 [3]=2 [4]=1 [5]=3 [6]=1 [7]=${starArray[*]} [8]=${htseqArray[*]})
## a global var to stop if we miss tools
## but only after having checked them all
status=0
for ((i=$pstart;i<=$pend;i++)); do
## the nested arrays have a length of (2 x number of elements) - 1
tot=$(((${#toolArray[$i]} + 1) / 2))
for ((j=0;j<$tot;j++)); do
echo "# Checking step $i ${toolList[${toolArray[$i]:$j:$((j + 1))}]}"
if [ `toolCheck ${toolList[${toolArray[$i]:$j:$((j + 1))}]}` -eq 1 ]; then
echo >&2 "ERROR: Please install the missing tool: ${toolList[${toolArray[$i]:$j:$((j + 1))}]}."
((status+=1))
fi
done
done
if [ $status -gt 0 ]; then
exit 1;
fi
echo "### ========================================
# Preparation started on `date`
# Going from step $pstart to $pend
# Provided parameters are:
# STAR genome: $star_ref
# STAR gff3: $star_gff
# HTSeq gff3: $htseq_gff"
# Check if STAR is included and then if the reference is set
if [ $pstart -le 7 ] && [ $pend -ge 7 ]; then
if [ -z $star_ref ]; then
echo >&2 "ERROR: You are running STAR but have not given a STAR reference"
exit 1
usage
elif [ ! -d $star_ref ]; then
echo >&2 "ERROR: Could not find STAR reference"
exit 1
usage
fi
if [ ! -z $star_gff ] && [ ! -f $star_gff ]; then
echo >&2 "ERROR: Could not find gff file: '$star_gff'"
exit 1
usage
fi
fi
......@@ -115,12 +373,12 @@ fi
if [ $pstart -le 8 ] && [ $pend -ge 8 ]; then
if [ -z $htseq_gff ] && [ -z $star_gff ]; then
echo >&2 "ERROR: HTSeq needs a GFF3 file"
exit 1
usage
elif [ -z $htseq_gff ]; then
htseq_gff=$star_gff
elif [ ! -z $htseq_gff ] && [ ! -f $htseq_gff ]; then
echo >&2 "ERROR: Could not find gff file: '$htseq_gff'"
exit 1
usage
fi
fi
......@@ -129,25 +387,31 @@ fi
# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
PIPELINE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
## abort on any error
set -e
## get some vars
proj=$1
mail=$2
## check some more
if [ ! -f $3 ]; then
echo "ERROR: fastq1 is not a file: '$3'" 1>&2
usage
exit 1
usage
fi
if [ ! -f $4 ]; then
echo "ERROR: fastq2 is not a file: '$4'" 1>&2
usage
exit 1
usage
fi
fastq1=$3
fastq2=$4
## Just to make sure we avoid path issues
# fastq1=$(readlink -f "$3")
# fastq2=$(readlink -f "$4")
fastq1="$3"
fastq2="$4"
# Sample name to use for output
s_prefix=${fastq1%_1.f*q.gz}
......@@ -156,11 +420,12 @@ sname=`basename $s_prefix`
if [ ! -d `dirname $5` ]; then
echo "ERROR: could not find parent directory for output directory" 1>&2
usage
exit 1
usage
fi
## Set up the directory structure
outdir=$5
## in case outdir is .
outdir=`readlink -f $5`
[[ ! -d $outdir ]] && mkdir $outdir
## Use the directory name as a job identifier
......@@ -196,155 +461,119 @@ htseq="$outdir/htseq"
[[ -z $SORTMERNADIR ]] && export SORTMERNADIR=$PIPELINE_DIR/../data/sortmerna
if [ ! -e $SORTMERNADIR ]; then
echo "ERROR: could not find the sortmerna data in $SORTMERNADIR" 1>&2
exit 1
usage
fi
## Functions ##
## Print the run_sbatch usage text on stderr.
## NOTE(review): this variant does not exit itself; callers follow it
## with an explicit 'exit 1'.
run_sbatch_usage() {
echo "usage: run_sbatch OPTIONS <batch script> [<script param> ...]
Run a batch script and echo the job id
OPTIONS
-e stderr (required)
-o stdout (required)
-J job name
-d dependency" 1>&2
}
## Cancel any already-started SLURM jobs (arguments are job ids) via
## scancel, then abort the whole run with status 3.
cleanup() {
#jobs=$@
if [ $# -gt 0 ]; then
echo "Canceling already started jobs: $@" 1>&2
scancel $@
fi
exit 3
}
run_sbatch() {
# Start a batch script and echo the job id
if [ $# -lt 3 ]; then
run_sbatch_usage
exit 1
fi
OPTIND=1
log_path=""
out_path=""
dependency=""
while getopts "J:e:o:d:" opt; do
case "$opt" in
J) jobname=$OPTARG ;;
e) log_path=$OPTARG ;;
o) out_path=$OPTARG ;;
d) dependency=$OPTARG ;;
?) run_sbatch_usage; exit 1 ;;
esac
done
shift $((OPTIND-1))
script=$1
shift
if [ -z $jobname ]; then
jobname="${sname}.RNAPreproc.${script}"
fi
# Check that the output file paths are given
if [ -z $log_path ] || [ -z $out_path ]; then
run_sbatch_usage
exit 1
fi
# Check that the output file directories exist
if [ ! -d `dirname $log_path` ] || [ ! -d `dirname $out_path` ]; then
echo "ERROR: stderr and stdout paths must exist" 1>&2
run_sbatch_usage
exit 1
fi
## final setup
## check for sbatch, return 1 if no sbatch, 0 otherwise
CMD=sbatch
if [ `toolCheck $CMD` -eq 1 ]; then
CMD=bash
fi
sbatch_options=
if [ ! -z $dependency ]; then
sbatch_options="-d $dependency"
fi
## setup the tmp dir
[[ -z $SNIC_TMP ]] && export SNIC_TMP=/tmp
sbatch_echo=`sbatch -A "$proj" \
-J "$jobname" \
--mail-user "$mail" \
-e "$log_path" \
-o "$out_path" \
$sbatch_options \
$PIPELINE_DIR/$script $@`
if [ $? -ne 0 ]; then
echo "ERROR: submission failed" 1>&2
cleanup ${JOBIDS[*]}
fi
echo "### ========================================"
echo ${sbatch_echo//[^0-9]/}
}
## End functions ##
## Check if we dry run
if [ $dryrun -eq 1 ]; then
echo "# This is a dry-run! Printing command lines only"
fi
echo "# Using $CMD as a job submitter"
### --------------------------------------------------------
## prepare the job array
debug_var=
if [ $debug -eq 1 ]; then
echo "# Running in debug mode, devel node will be used on uppmax, it's highly recommended to not run more than one step in this mode"
debug_var="-D"
fi
## Job ID array
JOBIDS=()
## Job CMD array
JOBCMDS=()
## Internal field separator
## We need to change the input field separator
## to be line feed and not space for some of the
## array, but this might fail command execution
SPACEIFS=$IFS
LFIFS=$'\n'
IFS=$LFIFS
## TODO WE NEED A single prepare and run function
## that contains the logic
## Run fastQValidator
if [ $pstart -le 1 ]; then
fastqv_id1=`run_sbatch \
echo "# Preparing step 1"