Commit 37bf9a7a authored by Nicolas Delhomme's avatar Nicolas Delhomme

original repos structure with README and the first runners for the pipeline

parent c4106e62
This directory will contain a description of how to reproduce the results described in our publications.
This directory is meant to contain SBATCH generic scripts only. For project specific scripts, write them in the projects/[project-name]/pipeline directory instead.
#!/bin/bash -l
#SBATCH -p core
#SBATCH -n 1
#SBATCH -t 0:30:00
#SBATCH --mail-type=ALL
## stop on error but be verbose
set -e
set -x

##
# Run fastQC
##
## Usage: sh runFastQC.sh file outputFolder
##   file          an existing fastq file (optionally gzipped)
##   outputFolder  an existing directory that receives the FastQC report

## sanity checks
## executable
## are we on UPPMAX
if [ -n "$SLURM_SUBMIT_DIR" ]; then
  ## Running on UPPMAX: the tool is provided by the module system
  module load bioinfo-tools
  module load FastQC/0.10.1
else
  ## Running locally: fastqc has to be found in the PATH.
  ## The lookup is the `if` condition itself, otherwise `set -e` would abort
  ## the script on a failed lookup before the message below is printed.
  if ! fastqc=$(command -v fastqc); then
    echo "please install fastqc before running this script or add it to your PATH"
    exit 1
  fi
  ## reject anything that is not a regular file OR is not executable
  if [ ! -f "$fastqc" ] || [ ! -x "$fastqc" ]; then
    echo "your fastQC does not appear to be an executable file"
    exit 1
  fi
fi

## arguments
if [ $# -ne 2 ]; then
  echo "This script takes two arguments: the input file and the output directory"
  exit 1
fi

## input file
if [ ! -f "$1" ]; then
  echo "The first argument needs to be an existing fastq (optionally gz) file"
  exit 1
fi

## output dir
if [ ! -d "$2" ]; then
  echo "The second argument needs to be an existing output directory."
  ## stop here instead of letting fastqc run against a missing directory
  exit 1
fi

## start
fastqc --noextract --outdir "$2" "$1"
#!/bin/bash -l
#SBATCH -p core -n 1
#SBATCH -t 0-01:00:00
#SBATCH --mail-type=ALL
## Print the help text on stderr.
## Does not exit: the caller decides on the exit status.
usage() {
  ## "$0" is quoted so that a script path containing spaces is handled
  cat 1>&2 <<EOF
usage: $(basename "$0") <fastq>
Run fastQValidator on a FASTQ file. Prints output on stdout and
exits with a non-zero exit status if the input file does not
conform to the standard.
ARGUMENTS:
fastq a FASTQ file, can be gzipped
NOTES:
fastQValidator must lie in your PATH
EOF
}
## stop on error
set -e

## check: exactly one argument is expected
if [ $# -ne 1 ]; then
  echo "The argument should be one fastq filename" 1>&2
  usage
  exit 1
fi

## check: the argument is an existing file (quoted to survive spaces)
if [ ! -f "$1" ]; then
  echo "The fastq filename you provided does not exist" 1>&2
  usage
  exit 1
fi

## check: the validator is available
if ! command -v fastQValidator > /dev/null 2>&1; then
  echo "fastQValidator was not found in your path" 1>&2
  exit 1
fi

## we print 1000 errors, should be enough
fastQValidator --file "$1" --printableErrors 1000
#!/bin/bash -l
#SBATCH -p core
#SBATCH -n 1
#SBATCH -t 8:00:00
#SBATCH --mail-type=ALL
## -A and --mail-user set in the submit job
## abort on the first failing command and trace every command as it runs
set -e -x
## usage
## Print the help text on stderr and terminate the script with status 1.
usage() {
  ## quoted delimiter: the text is emitted verbatim, nothing is expanded
  cat >&2 <<'EOF'

Usage: runHTSeq.sh [options] <out dir> <in.bam> <in.gff>
Options:
-i precise the IDATTR
default to 'Parent', but e.g. should be 'pacid'
for the P. trichocarpa gene exon gff3 file
-s is the protocol stranded?
default to FALSE
Note:
BAM file are expected to be sorted by position
Only HTSeq 0.6+ version(s) are supported

EOF
  exit 1
}
## Are we on UPPMAX?
if [ -n "$SLURM_SUBMIT_DIR" ]; then
  ## load the modules
  echo Loading modules
  module load python/2.7.6
  module load bioinfo-tools
  module load samtools/0.1.19
else
  ## Local run: htseq-count has to be in the PATH. The lookup is the `if`
  ## condition so that a failure prints the hint instead of tripping `set -e`.
  if ! command -v htseq-count > /dev/null; then
    echo "error: you need to install htseq or add it to your path"
    exit 1
  fi
fi

## check the version
## grep -c exits non-zero when nothing matches; `|| true` keeps that from
## aborting the script so that the explanatory message below is printed.
isVersion6=$(htseq-count --help | grep -c "version 0.6" || true)
if [ "$isVersion6" -ne 1 ]; then
  echo Only HTSeq version 0.6+ are supported
  usage
fi

## options
IDATTR="Parent"
stranded=0

## get the options
while getopts i:s option; do
  case "$option" in
    i) IDATTR=$OPTARG ;;
    s) stranded=1 ;;
    \?) ## unknown flag
      usage ;;
  esac
done
shift $((OPTIND - 1))

## we get one dir and two files as input
## (four arguments is reported separately with a dedicated message)
if [ $# -eq 4 ]; then
  echo "This function arguments have changed!"
  usage
fi

if [ $# -ne 3 ]; then
  echo "This function takes one directory, one bam and one gff3 file as arguments"
  usage
fi

if [ ! -d "$1" ]; then
  echo "The first argument needs to be an existing directory"
  usage
fi

if [ ! -f "$2" ]; then
  echo "The second argument needs to be an existing bam file"
  usage
fi
## output name: the bam file name without its .bam extension
nam=$(basename "${2//.bam/}")

if [ ! -f "$3" ]; then
  echo "The third argument needs to be an existing gff3 file"
  usage
fi

## sort by id
## samtools sort -n $3 $2/${nam}-byname

## get the count table
if [ "$stranded" -eq 0 ]; then
  ## since we are not using strand specific, go for the union
  htseq-count -f bam -r pos -m union -s no -t exon -i "$IDATTR" "$2" "$3" > "$1/$nam.txt"
else
  htseq-count -f bam -r pos -m intersection-nonempty -s reverse -t exon -i "$IDATTR" "$2" "$3" > "$1/$nam.txt"
fi

## clean
## rm $2/${nam}-byname.bam
This diff is collapsed.
#!/bin/bash -l
## THINK OF --outStd SAM --outSAMunmapped Within to write SAM directly and keep all reads. That does not affect any of the log file to be generated
## but consider if we want that when reporting the Chimeric SAM (i.e. for merging the files, we would not want the reads to be part of the SAM...
## the good thing with outputting to SAM is that it can be readily piped into samtools -bs - | samtools sort - filename
#SBATCH -p core
#SBATCH -n 8
#SBATCH -t 0-02:00:00
#SBATCH --mail-type=ALL
#################
## Build geneModel
#################
## TODO extract that to its own script
##usage sbatch -p devel -t 1:00:00 runSTAR.sh genome.fa
#/home/davidsu/bin/STAR --runMode genomeGenerate --genomeDir $1 --genomeFastaFiles $2 --sjdbOverhang 99 --sjdbGTFfile $3 --runThreadN 8
#exit;
## stop on error and be verbose in the output
set -e -x

## exec
STAR=

### tool sanity
if [ -n "$SLURM_SUBMIT_DIR" ]; then
  ## on UPPMAX the tools are provided by the module system
  module load bioinfo-tools
  module load samtools/0.1.19
  module load star/2.3.0e
  STAR=$(command -v STAR)
else
  ## Local run: STAR and samtools have to be found in the PATH.
  ## Each lookup is the `if` condition itself, otherwise `set -e` would abort
  ## the script on a failed lookup before the hint is printed.
  if ! STAR=$(command -v STAR); then
    echo "please install STAR before running this script or add it to your PATH"
    exit 1
  fi
  ## reject anything that is not a regular file OR is not executable
  if [ ! -f "$STAR" ] || [ ! -x "$STAR" ]; then
    echo "your STAR does not appear to be an executable file"
    exit 1
  fi
  if ! samtools=$(command -v samtools); then
    echo "please install samtools before running this script or add it to your PATH"
    exit 1
  fi
  if [ ! -f "$samtools" ] || [ ! -x "$samtools" ]; then
    echo "your samtools does not appear to be an executable file"
    exit 1
  fi
fi

##########
# Run star
##########
## defaults; the command line options parsed further down can override them
INTRONMAX=11000
OUT_DIR=$(pwd)
GFF=1
SINGLE=0
PROC=8
## usage
## Print the help text on stderr and terminate the script with status 1.
usage() {
  ## quoted delimiter: the text is emitted verbatim, nothing is expanded
  cat >&2 <<'EOF'

Usage: runSTAR.sh [option] <fwd file> <rv file> <genome dir> <gene model gff3> [--] [additional STAR arguments]
Options:
-e STAR executable
-g if there is no gff file
-m max intron length
-o outdir
-p number of threads to be used (default: 8)
-s if there is no reverse file
Notes:
-- is a special argument that stop the command line scanning for the script options.
It is necessary if you want to precised additional - non-default - STAR arguments.

EOF
  exit 1
}
## get the options
while getopts e:gm:o:sp: option; do
  case "$option" in
    e) STAR=$OPTARG ;;
    g) GFF=0 ;;
    m) INTRONMAX=$OPTARG ;;
    o) OUT_DIR=$OPTARG ;;
    p) PROC=$OPTARG ;;
    s) SINGLE=1 ;;
    \?) ## unknown flag
      usage ;;
  esac
done
shift $((OPTIND - 1))

## check the arguments
## the STAR executable (possibly given with -e) must be a regular file AND
## be executable
if [ ! -f "$STAR" ] || [ ! -x "$STAR" ]; then
  echo "your STAR does not appear to be an executable file"
  exit 1
fi

## number of expected positional arguments and the fastq suffix that is
## stripped from the forward file name to build the output prefix
ARGS=4
if [ "$SINGLE" -eq 1 ]; then
  ARGS=$((ARGS - 1))
  FIND=".f*q.gz"
else
  FIND="_1.f*q.gz"
fi

if [ "$GFF" -eq 0 ]; then
  ARGS=$((ARGS - 1))
fi

if [ $# -lt "$ARGS" ]; then
  echo "This script needs 2 arguments without GFF and for SE data; 3 for either and 4 for none of these two conditions."
  usage
fi

## forward reads (usage exits, so the code below only runs on success)
if [ ! -f "$1" ]; then
  echo "The forward fastq file: $1 does not exist"
  usage
fi
in1=$1
shift

## reverse reads, paired-end only
if [ "$SINGLE" -eq 0 ]; then
  if [ ! -f "$1" ]; then
    echo "The reverse fastq file: $1 does not exist"
    usage
  fi
  in2=$1
  shift
fi

## genome index directory
if [ ! -d "$1" ]; then
  echo "The genome directory: $1 does not exist"
  usage
fi
genome=$1
shift

## gene model, unless -g was given
if [ "$GFF" -eq 1 ]; then
  if [ ! -f "$1" ]; then
    echo "The gene model gff3 file: $1 does not exists"
    usage
  fi
  gff3=$1
  shift
fi

## do we have more arguments
## drop the -- separator, but only if it really is one: the separator is
## optional and a genuine STAR argument must not be discarded
if [ $# -ne 0 ] && [ "$1" = "--" ]; then
  shift
fi

## output name
## the suffix is stripped from the file name only, so that a matching
## directory component cannot truncate the path
in1_base=$(basename "$in1")
uz3="$OUT_DIR/${in1_base//$FIND/}"

## start star
## one command line for all four SE/PE x with/without gff combinations
reads=("$in1")
if [ "$SINGLE" -eq 0 ]; then
  reads+=("$in2")
fi
gene_model=()
if [ "$GFF" -eq 1 ]; then
  gene_model=(--sjdbGTFfile "$gff3")
fi
"$STAR" --genomeDir "$genome" --readFilesIn "${reads[@]}" --runThreadN "$PROC" \
  --alignIntronMax "$INTRONMAX" --outSAMstrandField intronMotif "${gene_model[@]}" \
  --readFilesCommand zcat --outFileNamePrefix "$uz3" "$@"

## save the logs
mkdir -p "${uz3}_logs"
mv "${uz3}"Log.* "${uz3}_logs"
mv "${uz3}"SJ* "${uz3}_logs"

## convert sam to bam
## pipefail: a failing `samtools view` must stop the script before the SAM
## file is removed below
set -o pipefail
samtools view -Sb "${uz3}Aligned.out.sam" | samtools sort - "${uz3}_STAR"
samtools index "${uz3}_STAR.bam"

## clean
rm "${uz3}Aligned.out.sam"

## TODO modify to print SAM to stdout to pipe into samtools, add the mate1,2 generation and compress these
#!/bin/bash -l
#SBATCH -p node
## for large files
## we don't need the proc but the mem
## we could give that as param
#SBATCH -n 16
## time too for large files
#SBATCH -t 12:00:00
#SBATCH --mail-type=ALL
## mail-user and A have to be set in the submit script
## stop on error, be verbose and extend the commands
set -e -x

## option defaults
## KEEP: keep the rRNA; useMtSSU: also filter against mtSSU;
## UNPAIRED: single end data; PROC: number of threads
KEEP=0
useMtSSU=1
UNPAIRED=0
PROC=16
## local run
## replaced by checking for the SORTMERNADIR - see below
## if [ -z $SLURM_SUBMIT_DIR ]; then
## SLURM_SUBMIT_DIR=`pwd`
## fi
## usage
## Print the help text on stderr and terminate the script with status 1.
## Reads the global PROC to report the default number of threads.
usage() {
  ## unquoted delimiter: $PROC is expanded in the text
  cat >&2 <<EOF

Usage: runSortmerna.sh [option] <out dir> <tmp dir> <forward fastq.gz> <reverse fastq.gz>
Options:
-k keep the rRNA
-m do not run against mtSSU
-p number of threads to be used (default $PROC)
-u single end data (in that case only the forward fastq is needed)
Note:
1) The SORTMERNADIR environment variable needs to be set
2) Only SortMeRna version 1.9 is supported

EOF
  exit 1
}
## get the options
while getopts kmp:u option; do
  case "$option" in
    k) KEEP=1 ;;
    m) useMtSSU=0 ;;
    p) PROC=$OPTARG ;;
    u) UNPAIRED=1 ;;
    \?) ## unknown flag
      usage ;;
  esac
done
shift $((OPTIND - 1))

##
echo Setting up

## set some env var
## this location is not in Git anymore!
## it has to be downloaded by the user
## check the ethylene-insensitive project submitter to see
## how to set that up
if [ -z "$SORTMERNADIR" ]; then
  echo You need to set your SORTMERNADIR environment variable
  usage
fi

## set the dbs
## kept in an array so that every database path is passed as exactly one
## argument, whatever characters SORTMERNADIR contains
db_dir="$SORTMERNADIR/rRNA_databases"
dbs=(
  "$db_dir/rfam-5s-database-id98.fasta"
  "$db_dir/rfam-5.8s-database-id98.fasta"
  "$db_dir/silva-bac-16s-database-id85.fasta"
  "$db_dir/silva-euk-18s-database-id95.fasta"
  "$db_dir/silva-bac-23s-database-id98.fasta"
  "$db_dir/silva-euk-28s-database-id98.fasta"
)
if [ "$useMtSSU" -eq 1 ]; then
  dbs+=("$db_dir/mtSSU_UCLUST-95-identity.fasta")
fi
## 6 databases, 7 with mtSSU
dbNum=${#dbs[@]}

##
echo Checking

## we get two dir and two files as input (one file for single end data)
if [ "$UNPAIRED" -eq 0 ]; then
  if [ $# -ne 4 ]; then
    echo "This function takes two directories and two files as arguments"
    usage
  fi
else
  if [ $# -ne 3 ]; then
    echo "This function takes two directories and one file as argument"
    usage
  fi
fi

if [ ! -d "$1" ]; then
  echo "The first argument needs to be an existing directory"
  usage
fi

if [ ! -d "$2" ]; then
  echo "The second argument needs to be an existing directory"
  usage
fi

##
echo Gunzipping

## unzip the files
if [ ! -f "$3" ]; then
  echo "The third argument needs to be an existing fastq.gz file"
  usage
fi
f1=$(basename "${3//.gz/}")

if [ "$UNPAIRED" -eq 0 ]; then
  if [ ! -f "$4" ]; then
    echo "The forth argument needs to be an existing fastq.gz file"
    usage
  fi
  f2=$(basename "${4//.gz/}")
fi

## decompress them into the tmp dir, unless already there
if [ ! -f "$2/$f1" ]; then
  gunzip -c "$3" > "$2/$f1"
fi
if [ "$UNPAIRED" -eq 0 ]; then
  if [ ! -f "$2/$f2" ]; then
    gunzip -c "$4" > "$2/$f2"
  fi
fi

## interleave them
fm=$(basename "${3//.f*q.gz/}")
if [ "$UNPAIRED" -eq 0 ]; then
  isVersion9=$(sortmerna --version | grep "version 1.9" | wc -l)
  if [ "$isVersion9" -ne 1 ]; then
    echo Only SortMeRna version 1.9 is supported
    usage
  else
    merge-paired-reads.sh "$2/$f1" "$2/$f2" "$2/$fm"
  fi
fi

##
if [ "$UNPAIRED" -eq 0 ]; then
  echo Pre-cleaning
  rm -f "$2/$f1" "$2/$f2"
fi

##
echo Sorting

## output base name
## PE
if [ "$UNPAIRED" -eq 0 ]; then
  fo=$(basename "${3//_[12].f*q.gz/_sortmerna}")
else
  fo=$(basename "${3//.f*q.gz/_sortmerna}")
fi

## check the options
opt=()
if [ "$KEEP" -eq 1 ]; then
  opt=(--bydbs --accept "$2/${fo}_rRNA")
fi

## PE: the filtered reads go to the tmp dir to be de-interleaved
## SE: the filtered reads go directly to the out dir
if [ "$UNPAIRED" -eq 0 ]; then
  sortmerna -n "$dbNum" --db "${dbs[@]}" --I "$2/$fm" --other "$2/$fo" --log "$1/$fo" -a "$PROC" -v --paired-in "${opt[@]}"
else
  sortmerna -n "$dbNum" --db "${dbs[@]}" --I "$2/$f1" --other "$1/$fo" --log "$1/$fo" -a "$PROC" -v "${opt[@]}"
fi

## deinterleave it
if [ "$UNPAIRED" -eq 0 ]; then
  ## sortmerna get confused by dots in the filenames
  if [ ! -f "$2/$fo.fastq" ]; then
    mv "$2/$fo".* "$2/$fo.fastq"
  fi
  unmerge-paired-reads.sh "$2/$fo.fastq" "$1/${fo}_1.fq" "$1/${fo}_2.fq"
fi

## rm the tmp
echo Post-Cleaning
if [ "$UNPAIRED" -eq 0 ]; then
  rm -f "$2/$fm" "$2/$fo.fastq"
else
  rm -f "$2/$f1"
fi

## deinterleave the rest if needed
if [ "$KEEP" -eq 1 ]; then
  if [ "$UNPAIRED" -eq 0 ]; then
    ## bash (not sh) is required: the inline script relies on the
    ## ${var//pattern/replacement} expansion, which is not POSIX.
    ## In the inline script $0 is the found file and $1 the out dir.
    find "$2" -name "${fo}_rRNA*" -print0 | xargs -0 -I {} -P 6 bash -c 'unmerge-paired-reads.sh "$0" "$1/$(basename "${0//.fastq/_1.fq}")" "$1/$(basename "${0//.fastq/_2.fq}")"' {} "$1"
  fi
fi

## keep that as a reminder if that happens again
## sortmerna get confused by the dots as well...
## echo Validating
## NOTE(review): the test looks for a .fastq file while the rename target is
## .fq, and the glob may also match the --log output written to the same
## directory - confirm against the files SortMeRna 1.9 really produces.
if [ "$UNPAIRED" -eq 1 ]; then
  if [ ! -f "$1/$fo.fastq" ]; then
    mv "$1/$fo".* "$1/$fo.fq"
  fi
fi

##
echo Gzipping

## compress the output files
find "$1" -name "${fo}*.fq" -print0 | xargs -0 -I {} -P 8 gzip -f {}
#printf "%s\0%s" $1/${fo}_1.fq $1/${fo}_2.fq | xargs -0 -I {} -P 2 gzip -f {}

##
echo Done
#!/bin/bash -l
#SBATCH -p node
#SBATCH -n 16
#SBATCH -t 3-00:00:00
#SBATCH --mail-type=ALL
## stop as soon as a command fails
set -o errexit
## usage
## Print the help text on stderr and terminate the script with status 1.
## Reads $0 (script name) and the UPSCb environment variable for the text.
usage() {
  ## unquoted delimiter: $0 and $UPSCb are expanded in the text
  cat >&2 <<EOF
Usage:
Paired end: $0 <fwd fastq file> <rev fastq file> <output dir> [trimming options]
Single end: $0 -s <fastq file> <output dir> [trimming options]
Options:
-c clipping file and settings
-p number of threads to use
-q use illumina quality (+64 offset), default to sanger now (+33 offset)!
-s single end reads
-t add a trim log (defaut no trimlog anymore)
-v verbose output
Trimming options:
Trimming defaults to 'SLIDINGWINDOW:5:20 MINLEN:50'
If you change the default, you need to provide the COMPLETE trimming option again!!!
e.g. to use a 30 quality threshold for the sliding window, provide: SLIDINGWINDOW:5:30 MINLEN:50.
Clipping defaults to 'ILLUMINACLIP:"$UPSCb/data/TruSeq3-PE-2.fa":2:30:10'
Notes:
The UPSCb Environment Variable needs to be set to your Git UPSCb checkout dir.

EOF
  exit 1
}
## check env var
if [ -z "$UPSCb" ]; then
  echo "The UPSCb environment variable needs to be set."
  usage
fi

## the adapter file shipped in the checkout doubles as a sanity check
if [ ! -f "$UPSCb/data/TruSeq3-PE-2.fa" ]; then
  echo "Either your UPSCb env. var. is not set correctly or your checkout is too old."
  usage
fi

## options
clip=
single_end=0
quality="-phred33"
thread=16
trimlog=0
verbose=0

while getopts "c:p:qstv" opt; do
  case "$opt" in
    c) clip=$OPTARG ;;
    p) thread=$OPTARG ;;
    q) quality="-phred64" ;;
    s) single_end=1 ;;
    t) trimlog=1 ;;
    v) verbose=1 ;;
    \?) usage ;;
  esac
done
shift $((OPTIND - 1))

if [ "$verbose" -eq 1 ]; then
  echo "Options are to use $thread CPUs"
  if [ "$single_end" -eq 0 ]; then
    echo "for paired-end trimming."
  else
    echo "for single-end trimming."
  fi
fi

## check the arguments
## paired end needs at least 3 of them, single end at least 2
if { [ "$single_end" -eq 0 ] && [ $# -lt 3 ]; } || { [ "$single_end" -eq 1 ] && [ $# -lt 2 ]; }; then
  echo "Given the provided option, the number of argument is incorrect."
  usage
fi

## the clip default
## quoted: a -c value made of several space separated settings must not
## break the test
if [ -z "$clip" ]; then
  if [ "$single_end" -eq 0 ]; then
    clip="ILLUMINACLIP:$UPSCb/data/TruSeq3-PE-2.fa:2:30:10"
  else
    clip="ILLUMINACLIP:$UPSCb/data/TruSeq3-SE.fa:2:30:10"
  fi
fi

## the trim default
trim="SLIDINGWINDOW:5:20 MINLEN:50"

## check file 1
if [ ! -f "$1" ]; then
  echo "The first argument must be the valid file name of the forward fastq file."
  usage
fi
fwd=$1
shift

if [ "$verbose" -eq 1 ]; then
  echo "Forward file is $fwd"
fi

## create the pattern
## i.e. the forward file name without directory and without fastq suffix;
## the suffix is removed from the file name only
fwd_base=$(basename "$fwd")
if [ "$single_end" -eq 0 ]; then
  pattern=${fwd_base//_1.f*q.gz/}
else
  pattern=${fwd_base//.f*q.gz/}
fi
<