Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
U
UPSCb-public
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Administrator
UPSCb-public
Commits
13d5f2d4
Commit
13d5f2d4
authored
May 19, 2016
by
Nicolas Delhomme
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated the pipeline
parent
bf69cb31
Pipeline
#53
skipped
Changes
6
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
765 additions
and
399 deletions
+765
-399
runFastQC.sh
pipeline/runFastQC.sh
+1
-1
runFastQValidator.sh
pipeline/runFastQValidator.sh
+12
-9
runHTSeq.sh
pipeline/runHTSeq.sh
+26
-34
runRNASeqPreprocessing.sh
pipeline/runRNASeqPreprocessing.sh
+449
-186
runSTAR.sh
pipeline/runSTAR.sh
+144
-123
runSortmerna.sh
pipeline/runSortmerna.sh
+133
-46
No files found.
pipeline/runFastQC.sh
View file @
13d5f2d4
...
...
@@ -18,7 +18,7 @@ set -x
## are we on UPPMAX
if
[
!
-z
$SLURM_SUBMIT_DIR
]
;
then
module load bioinfo-tools
module load FastQC/0.1
0.1
module load FastQC/0.1
1.5
## echo "Running on UPPMAX"
else
## echo "Running locally"
...
...
pipeline/runFastQValidator.sh
View file @
13d5f2d4
...
...
@@ -4,8 +4,16 @@
#SBATCH -t 0-01:00:00
#SBATCH --mail-type=ALL
## load the module if it exists
module load bioinfo-tools
&&
module load fastQvalidator
||
{
if
!
hash
fastQValidator 2>/dev/null
;
then
echo
"fastQValidator was not found in your path"
1>&2
exit
1
fi
}
usage
()
{
echo
"usage:
`
basename
$0
`
<fastq>
echo
"usage:
`
basename
$0
`
<fastq>
Run fastQValidator on a FASTQ file. Prints output on stdout and
exits with a non-zero exit status if the input file does not
...
...
@@ -16,6 +24,8 @@ ARGUMENTS:
NOTES:
fastQValidator must lie in your PATH"
1>&2
exit
1
}
## stop on error
...
...
@@ -23,20 +33,13 @@ set -e
## check
if
[
$#
!=
1
]
;
then
echo
"Th
e argument should be one
fastq filename"
1>&2
echo
"Th
is function takes one argument: a
fastq filename"
1>&2
usage
exit
1
fi
if
[
!
-f
$1
]
;
then
echo
"The fastq filename you provided does not exist"
1>&2
usage
exit
1
fi
if
!
hash
fastQValidator 2>/dev/null
;
then
echo
"fastQValidator was not found in your path"
1>&2
exit
1
fi
## we print 1000 errors, should be enough
...
...
pipeline/runHTSeq.sh
View file @
13d5f2d4
...
...
@@ -20,7 +20,9 @@ echo >&2 \
for the P. trichocarpa gene exon gff3 file
-s is the protocol stranded?
default to FALSE
-a are we counting antisense transcripts?
default to FALSE, only active in combination with -s
-t Chose attribute to count in the gff3 file default is exon
Note:
BAM file are expected to be sorted by position
Only HTSeq 0.6+ version(s) are supported
...
...
@@ -28,24 +30,11 @@ echo >&2 \
exit
1
}
## Are we on UPPMAX?
if
[
!
-z
$SLURM_SUBMIT_DIR
]
;
then
## laod the modules
echo
Loading modules
module load python/2.7.6
module load bioinfo-tools
module load samtools/0.1.19
else
htseq
=
`
which htseq-count
`
if
[
"
$?
"
-ne
0
]
;
then
echo
"error: you need to install htseq or add it to your path"
exit
1
fi
fi
echo
Loading modules
module load bioinfo-tools htseq
## check the version
isVersion6
=
`
htseq-count
--help
|
grep
"version 0.6"
|
wc
-l
`
if
[
$isVersion6
!=
1
]
;
then
if
[
`
htseq-count
--help
|
grep
-c
"version 0.6"
`
-ne
1
]
;
then
echo
Only HTSeq version 0.6+ are supported
usage
fi
...
...
@@ -53,13 +42,17 @@ fi
## options
IDATTR
=
"Parent"
stranded
=
0
antisense
=
0
t
=
"exon"
## get the options
while
getopts
i:s
option
while
getopts
ai:st:
option
do
case
"
$option
"
in
a
)
antisense
=
1
;;
i
)
IDATTR
=
$OPTARG
;;
s
)
stranded
=
1
;;
t
)
t
=
$OPTARG
;;
\?
)
## unknown flag
usage
;;
esac
...
...
@@ -67,12 +60,6 @@ done
shift
`
expr
$OPTIND
- 1
`
## we get two dir and two files as input
if
[
$#
==
4
]
;
then
echo
"This function arguments have changed!"
usage
fi
if
[
$#
!=
3
]
;
then
echo
"This function takes one directory, one bam and one gff3 file as arguments"
usage
...
...
@@ -84,28 +71,33 @@ if [ ! -d $1 ]; then
fi
if
[
!
-f
$2
]
;
then
echo
"The
thir
d argument needs to be an existing bam file"
echo
"The
secon
d argument needs to be an existing bam file"
usage
fi
nam
=
`
basename
${
2
//.bam/
}
`
if
[
!
-f
$3
]
;
then
echo
"The
forth
argument needs to be an existing gff3 file"
echo
"The
third
argument needs to be an existing gff3 file"
usage
fi
## sort by id
## samtools sort -n $3 $2/${nam}-byname
if
[
$t
==
"CDS"
]
;
then
echo
"Warning: the CDS option require the CDS feature to be capital in you gff3 file"
fi
## get the count table
if
[
$stranded
==
0
]
;
then
if
[
$antisense
==
1
]
;
then
echo
"The antisense only works in conjunction with the -s option"
>
&2
fi
## since we are not using strand specific, go for the union
htseq-count
-f
bam
-r
pos
-m
union
-s
no
-t
exon
-i
$IDATTR
$2
$3
>
$1
/
$nam
.txt
htseq-count
-f
bam
-r
pos
-m
union
-s
no
-t
$t
-i
$IDATTR
$2
$3
>
$1
/
$nam
.txt
else
htseq-count
-f
bam
-r
pos
-m
intersection-nonempty
-s
reverse
-t
exon
-i
$IDATTR
$2
$3
>
$1
/
$nam
.txt
## normal counting
if
[
$antisense
==
0
]
;
then
htseq-count
-f
bam
-r
pos
-m
intersection-nonempty
-s
reverse
-t
$t
-i
$IDATTR
$2
$3
>
$1
/
$nam
.txt
else
htseq-count
-f
bam
-r
pos
-m
intersection-nonempty
-s
yes
-t
$t
-i
$IDATTR
$2
$3
>
$1
/
$nam
.txt
fi
fi
## clean
## rm $2/${nam}-byname.bam
pipeline/runRNASeqPreprocessing.sh
View file @
13d5f2d4
This diff is collapsed.
Click to expand it.
pipeline/runSTAR.sh
View file @
13d5f2d4
#!/bin/bash -l
## THINK OF --outStd SAM --outSAMunmapped Within to write SAM directly and keep all reads. That does not affect any of the log file to be generated
## but consider if we want that when reporting the Chimeric SAM (i.e. for merging the files, we would not want the reads to be part of the SAM...
## the good thing with outputting to SAM is that it can be readily piped into samtools -bs - | samtools sort - filename
#SBATCH -p core
#SBATCH -n 8
#SBATCH -t 0-02:00:00
#SBATCH -p node
#SBATCH -n 16
#SBATCH -t 0-12:00:00
#SBATCH --mail-type=ALL
#################
## Build geneModel
#################
## TODO extract that to its own script
##usage sbatch -p devel -t 1:00:00 runSTAR.sh genome.fa
#/home/davidsu/bin/STAR --runMode genomeGenerate --genomeDir $1 --genomeFastaFiles $2 --sjdbOverhang 99 --sjdbGTFfile $3 --runThreadN 8
#exit;
# -p node is needed to accept the -C memory configuration
## stop on error and be verbose in the output
set
-e
-x
## load the modules
module load bioinfo-tools star/2.4.0f1 samtools
## exec
STAR
=
### tool sanity
if
[
!
-z
$SLURM_SUBMIT_DIR
]
;
then
module load bioinfo-tools
module load samtools/0.1.19
module load star/2.3.0e
STAR
=
`
which STAR
`
else
STAR
=
`
which STAR
`
if
[
$?
!=
0
]
;
then
echo
"please install STAR before running this script or add it to your PATH"
exit
1
fi
if
[
!
-f
$STAR
-a
!
-x
$STAR
]
;
then
echo
"your STAR does not appear to be an executable file"
exit
1
fi
samtools
=
`
which samtools
`
if
[
$?
!=
0
]
;
then
echo
"please install samtools before running this script or add it to your PATH"
exit
1
fi
if
[
!
-f
$samtools
-a
!
-x
$samtools
]
;
then
echo
"your samtools does not appear to be an executable file"
exit
1
fi
fi
##########
# Run star
##########
## vars
INTRONMAX
=
11000
OUT_DIR
=
`
pwd
`
GFF
=
1
GFF
=
SINGLE
=
0
PROC
=
8
PROC
=
16
FORMAT
=
"gtf"
LIMIT
=
10000000000
## additional options for STAR
OPTIONS
=
"--outSAMstrandField intronMotif --readFilesCommand zcat --outSAMmapqUnique 254 --quantMode TranscriptomeSAM --outFilterMultimapNmax 100 --outReadsUnmapped Fastx --chimSegmentMin 1 --outSAMtype BAM SortedByCoordinate --outWigType bedGraph"
## usage
usage
(){
echo
>
&2
\
"
Usage:
runSTAR.sh [option] <fwd file> <rv file> <genome dir> <gene model gff3
> [--] [additional STAR arguments]
Usage:
$0
[option] <out dir> <genome dir> <fwd file> <rv file
> [--] [additional STAR arguments]
Options:
-e STAR executable
-g if there is no gff file
-m max intron length
-o outdir
-p number of threads to be used (default: 8)
-s if there is no reverse file
-f the gtf/gff3 file format (default gtf)
-g the path to a gtf/gff3 file
-l the BAM sorting memory limit (
$LIMIT
)
-m the max intron length (
$INTRONMAX
)
-p number of threads to be used (default: 16)
-q set for Illumina +64 Phred score
-s if there is no reverse
-n no default option
Notes:
The number of arguments is only 3 when -s is set.
-- is a special argument that stop the command line scanning for the script options.
It is necessary if you want to precised additional - non-default - STAR arguments.
When the format is gff3, the exon-transcript relationship assumes a 'Parent' keylink.
"
exit
1
}
## get the options
while
getopts
e:gm:o:sp:
option
while
getopts
f:g:l:m:np:qs
option
do
case
"
$option
"
in
e
)
STAR
=
$OPTARG
;;
g
)
GFF
=
0
;;
m
)
INTRONMAX
=
$OPTARG
;;
o
)
OUT_DIR
=
$OPTARG
;;
f
)
FORMAT
=
$OPTARG
;;
g
)
GFF
=
$OPTARG
;;
l
)
LIMIT
=
$OPTARG
;;
m
)
INTRONMAX
=
$OPTARG
;;
n
)
OPTIONS
=
""
;;
p
)
PROC
=
$OPTARG
;;
q
)
OPTIONS
=
"
$OPTIONS
--outQSconversionAdd -31"
;;
s
)
SINGLE
=
1
;;
\?
)
## unknown flag
\?
)
## unknown flag
usage
;;
esac
done
shift
`
expr
$OPTIND
- 1
`
##
check the argument
s
if
[
!
-z
$STAR
-a
!
-f
$STAR
-a
!
-x
$STAR
]
;
then
echo
"your STAR does not appear to be an executable file"
exit
1
##
update the option
s
## dirty if loop to accomodate for v2.3.*
if
[
"
$OPTIONS
"
!=
""
]
;
then
OPTIONS
=
"
$OPTIONS
--limitBAMsortRAM
$LIMIT
"
fi
## check the arguments
echo
"Parsing the arguments"
ARGS
=
4
if
[
$SINGLE
==
1
]
;
then
let
"ARGS =
$ARGS
- 1"
FIND
=
".f*
q
.gz"
FIND
=
".f*.gz"
else
FIND
=
"_1.f*q.gz"
fi
if
[
$GFF
==
0
]
;
then
let
"ARGS =
$ARGS
- 1"
FIND
=
"_[1,2].f*q.gz"
fi
## checkthe number of args
if
[
$#
-lt
$ARGS
]
;
then
echo
"This script needs
2 arguments without GFF and for SE data; 3 for either and 4 for none of these two conditions
."
echo
"This script needs
3 or 4 arguments for SE or PE data, respectively
."
usage
fi
## get the out dir
outdir
=
$1
shift
## check the genome dir
if
[
!
-d
$1
]
;
then
echo
"The genome directory:
$1
does not exist"
usage
else
genome
=
$1
shift
fi
## Check if the first file exists
if
[
!
-f
$1
]
;
then
echo
"The forward fastq file:
$1
does not exist"
usage
usage
else
in1
=
$1
shift
fwd
=
$1
shift
fi
## Check if the second file exists
if
[
$SINGLE
==
0
]
;
then
if
[
!
-f
$1
]
;
then
echo
"The reverse fastq file:
$1
does not exist"
usage
else
in2
=
$1
shift
rev
=
$1
shift
fi
fi
if
[
!
-d
$1
]
;
then
echo
"The genome directory:
$1
does not exist"
usage
## if gff is set check if it exists
if
[
!
-z
$GFF
]
&&
[
!
-f
$GFF
]
;
then
echo
"The gene model gtf/gff3 file:
$GFF
does not exists"
usage
else
genome
=
$1
shift
if
[
!
-z
$GFF
]
;
then
OPTIONS
=
"--sjdbGTFfile
$GFF
$OPTIONS
"
fi
fi
if
[
$GFF
==
1
]
;
then
if
[
!
-f
$1
]
;
then
echo
"The gene model gff3 file:
$1
does not exists"
usage
else
gff3
=
$1
shift
fi
fi
## if format is set
case
$FORMAT
in
gff3
)
OPTIONS
=
"
$OPTIONS
--sjdbGTFtagExonParentTranscript Parent"
;;
gff
)
OPTIONS
=
"
$OPTIONS
--sjdbGTFtagExonParentTranscript Parent"
;;
gtf
);;
#nothing to do
*
)
echo
"There are only 2 supported format, gtf or gff3"
usage
;;
esac
## do we have more arguments
if
[
$#
!=
0
]
;
then
...
...
@@ -164,34 +152,67 @@ if [ $# != 0 ]; then
shift
fi
## output name
uz3
=
$OUT_DIR
/
`
basename
${
in1
//
$FIND
/
}
`
## create the output dir
echo
"Processing"
if
[
!
-d
$outdir
]
;
then
mkdir
-p
$outdir
fi
## output prefix
bnam
=
`
basename
${
fwd
//
$FIND
/
}
`
fnam
=
$outdir
/
$bnam
## start STAR
echo
"Aligning"
if
[
$SINGLE
==
1
]
;
then
STAR
--genomeDir
$genome
--readFilesIn
$fwd
--runThreadN
$PROC
--alignIntronMax
$INTRONMAX
--outFileNamePrefix
$fnam
$OPTIONS
$@
else
STAR
--genomeDir
$genome
--readFilesIn
$fwd
$rev
--runThreadN
$PROC
--alignIntronMax
$INTRONMAX
--outFileNamePrefix
$fnam
$OPTIONS
$@
fi
## save the log
echo
"Logging"
mkdir
-p
${
fnam
}
_logs
mv
${
fnam
}
Log.
*
${
fnam
}
_logs
## save the junctions
mkdir
-p
${
fnam
}
_junctions
mv
${
fnam
}
SJ
*
${
fnam
}
_junctions
mv
${
fnam
}
Chimeric.out.junction
${
fnam
}
_junctions
## save the wig
echo
"Wiggling"
mkdir
-p
${
fnam
}
_bedgraphs
mv
${
fnam
}
Signal.
*
.bg
${
fnam
}
_bedgraphs
## start star
if
[
$SINGLE
==
1
-a
$GFF
==
0
]
;
then
$STAR
--genomeDir
$genome
--readFilesIn
$in1
--runThreadN
$PROC
--alignIntronMax
$INTRONMAX
--outSAMstrandField
intronMotif
--readFilesCommand
zcat
--outFileNamePrefix
$uz3
$@
## rename the output
echo
"Renaming"
mv
${
fnam
}
Aligned.sortedByCoord.out.bam
${
fnam
}
_STAR.bam
if
[
$SINGLE
==
0
]
;
then
mv
${
fnam
}
Unmapped.out.mate1
${
fnam
}
_Unmapped_1.fq
mv
${
fnam
}
Unmapped.out.mate2
${
fnam
}
_Unmapped_2.fq
else
if
[
$SINGLE
==
1
-o
$GFF
==
0
]
;
then
if
[
$GFF
==
0
]
;
then
$STAR
--genomeDir
$genome
--readFilesIn
$in1
$in2
--runThreadN
$PROC
--alignIntronMax
$INTRONMAX
--outSAMstrandField
intronMotif
--readFilesCommand
zcat
--outFileNamePrefix
$uz3
$@
else
$STAR
--genomeDir
$genome
--readFilesIn
$in1
--runThreadN
$PROC
--alignIntronMax
$INTRONMAX
--outSAMstrandField
intronMotif
--sjdbGTFfile
$gff3
--readFilesCommand
zcat
--outFileNamePrefix
$uz3
$@
fi
else
$STAR
--genomeDir
$genome
--readFilesIn
$in1
$in2
--runThreadN
$PROC
--alignIntronMax
$INTRONMAX
--outSAMstrandField
intronMotif
--sjdbGTFfile
$gff3
--readFilesCommand
zcat
--outFileNamePrefix
$uz3
$@
fi
mv
${
fnam
}
Unmapped.out.mate1
${
fnam
}
_Unmapped.fq
fi
## save the logs
mkdir
-p
${
uz3
}
_logs
mv
${
uz3
}
Log.
*
${
uz3
}
_logs
mv
${
uz3
}
SJ
*
${
uz3
}
_logs
mv
${
fnam
}
Aligned.toTranscriptome.out.bam
${
fnam
}
_STAR_Transcriptome.bam
## compress files (we would only need 2 CPUS, but what if PROC is set to 1)
find
$outdir
-name
"
${
bnam
}
_Unmapped*.fq"
-print0
| xargs
-P
$PROC
-0
-I
{}
gzip
-f
{}
## sort the transcriptome bam and rename
samtools
sort
-@ 16
-n
${
fnam
}
_STAR_Transcriptome.bam
${
fnam
}
_STAR_Transcriptome.sorted
rm
${
fnam
}
_STAR_Transcriptome.bam
mv
${
fnam
}
_STAR_Transcriptome.sorted.bam
${
fnam
}
_STAR_Transcriptome.bam
## convert sam to bam
samtools view
-Sb
${
uz3
}
Aligned.out.sam | samtools
sort
-
${
uz3
}
_STAR
samtools index
${
uz3
}
_STAR.bam
## convert the chimeric sam to bam
samtools view
-Sb
${
fnam
}
Chimeric.out.sam | samtools
sort
-@ 16 -
${
fnam
}
_STAR_Chimeric
## clean
rm
${
uz3
}
Aligned.out.sam
## index the BAMs
echo
"Indexing"
printf
"%s
\0
%s"
${
fnam
}
_STAR.bam
${
fnam
}
_STAR_Chimeric.bam | xargs
-P
$PROC
-0
-I
{}
samtools index
{}
## TODO modify to print SAM to stdout to pipe into samtools, add the mate1,2 generation and compress these
## cleanup
echo
"Cleaning"
rm
${
fnam
}
Chimeric.out.sam
rm
-rf
${
fnam
}
_STARtmp/
pipeline/runSortmerna.sh
View file @
13d5f2d4
...
...
@@ -16,16 +16,11 @@ set -e
set
-x
## check the options if any
KEEP
=
0
useMtSSU
=
1
KEEP
=
1
useMtSSU
=
0
UNPAIRED
=
0
PROC
=
16
## local run
## replaced by checking for the SORTMERNADIR - see below
## if [ -z $SLURM_SUBMIT_DIR ]; then
## SLURM_SUBMIT_DIR=`pwd`
## fi
DBS
=
## usage
usage
(){
...
...
@@ -34,24 +29,73 @@ echo >&2 \
Usage: runSortmerna.sh [option] <out dir> <tmp dir> <forward fastq.gz> <reverse fastq.gz>
Options:
-k keep the rRNA
-m do not run against mtSSU
-p number of threads to be used (default
$PROC
)
-d define your dbs (semi-colon separated)
-k drop the rRNA (only for v1.9, default to keep them)
-m run against mtSSU in addition (only for v1.9)
-p number of threads to be used (default
$PROC
)
-u single end data (in that case only the forward fastq is needed)
Note:
1) The SORTMERNADIR environment variable needs to be set
2) Only SortMeRna version 1.9 is supported
2) Only SortMeRna version 1.9 and 2.x are supported (2.x is default)
3) -m is not applicable if -d is set
"
exit
1
}
## load the module
module load bioinfo-tools
## Does not work on uppmax - umea has an empty result
## while uppmax is verbose.
## avail=$( module avail sortmerna 2>&1 > /dev/null)
## avail=`echo $avail | tr -d [:blank:]`
## if [ ! -z $avail ]; then
## module load sortmerna
## sortmerna --version
##fi
## record the SORTMERNADIR if it exists
STOREENV
=
if
[
!
-z
$SORTMERNADIR
]
;
then
STOREENV
=
$SORTMERNADIR
fi
## try to load or echo
module load sortmerna
||
{
echo
"No sortmerna as module"
## then check for availability
tool
=
`
which sortmerna 2>/dev/null
`
if
[
!
-z
$tool
]
&&
[
-f
$tool
]
&&
[
-x
$tool
]
;
then
echo
"sortmerna available"
else
echo
"ERROR: INSTALL SortMeRna"
usage
fi
}
# restore the env if it existed
if
[
!
-z
$STOREENV
]
;
then
export
SORTMERNADIR
=
$STOREENV
fi
## check for sortmerna version
is1dot9
=
`
sortmerna
--version
2>&1 |
grep
version |
grep
1.9 |
wc
-c
`
is2dotx
=
`
sortmerna
--version
2>&1 |
grep
"version 2."
|
wc
-c
`
if
[
$is1dot9
==
0
]
&&
[
$is2dotx
==
0
]
;
then
echo
"Only version 1.9 and 2.x are supported"
usage
fi
## get the options
while
getopts
kmp:u option
while
getopts
d:
kmp:u option
do
case
"
$option
"
in
k
)
KEEP
=
1
;;
m
)
useMtSSU
=
0
;;
d
)
DBS
=
$OPTARG
;;
k
)
KEEP
=
0
;;
m
)
useMtSSU
=
1
;;
p
)
PROC
=
$OPTARG
;;
u
)
UNPAIRED
=
1
;;
\?
)
## unknown flag
...
...
@@ -73,23 +117,46 @@ if [ -z $SORTMERNADIR ]; then
usage
fi
## set the dbs
db5s
=
$SORTMERNADIR
/rRNA_databases/rfam-5s-database-id98.fasta
db58s
=
$SORTMERNADIR
/rRNA_databases/rfam-5.8s-database-id98.fasta
db16s
=
$SORTMERNADIR
/rRNA_databases/silva-bac-16s-database-id85.fasta
db18s
=
$SORTMERNADIR
/rRNA_databases/silva-euk-18s-database-id95.fasta
db23s
=
$SORTMERNADIR
/rRNA_databases/silva-bac-23s-database-id98.fasta
db28s
=
$SORTMERNADIR
/rRNA_databases/silva-euk-28s-database-id98.fasta
dbNum
=
6
dbs
=
"
$db5s
$db58s
$db16s
$db18s
$db23s
$db28s
"
if
[
$useMtSSU
==
1
]
;
then
## set the default dbs
if
[
!
-z
$DBS
]
;
then
dbs
=
${
DBS
//;/
}
dbNum
=
`
echo
$DBS
|
awk
-F
";"
'{print NF}'
`
else
if
[
$is2dotx
!=
0
]
;
then
db5s
=
$SORTMERNADIR
/rRNA_databases/rfam-5s-database-id98.fasta,
$SORTMERNADIR
/automata/rfam-5s-database-id98
db58s
=
$SORTMERNADIR
/rRNA_databases/rfam-5.8s-database-id98.fasta,
$SORTMERNADIR
/automata/rfam-5.8s-database-id98
db16sa
=
$SORTMERNADIR
/rRNA_databases/silva-arc-16s-id95.fasta,
$SORTMERNADIR
/automata/silva-arc-16s-database-id95
db16s
=
$SORTMERNADIR
/rRNA_databases/silva-bac-16s-id90.fasta,
$SORTMERNADIR
/automata/silva-bac-16s-database-id90
db18s
=
$SORTMERNADIR
/rRNA_databases/silva-euk-18s-id95.fasta,
$SORTMERNADIR
/automata/silva-euk-18s-database-id95
db23sa
=
$SORTMERNADIR
/rRNA_databases/silva-arc-23s-id98.fasta,
$SORTMERNADIR
/automata/silva-arc-23s-database-id98
db23s
=
$SORTMERNADIR
/rRNA_databases/silva-bac-23s-id98.fasta,
$SORTMERNADIR
/automata/silva-bac-23s-database-id98
db28s
=
$SORTMERNADIR
/rRNA_databases/silva-euk-28s-id98.fasta,
$SORTMERNADIR
/automata/silva-euk-28s-database-id98
dbs
=
"
$db5s
:
$db58s
:
$db16sa
:
$db16s
:
$db18s
:
$db23sa
:
$db23s
:
$db28s
"
#if [ ! -f $SORTMERNADIR/automata/rfam-5s-database-id98.stats ]; then
# echo "No indexes found, creating indexes in folder $SORTMERNADIR/automata"
# indexdb_rna --ref $dbs
#fi
else
db5s
=
$SORTMERNADIR
/rRNA_databases/rfam-5s-database-id98.fasta
db58s
=
$SORTMERNADIR
/rRNA_databases/rfam-5.8s-database-id98.fasta
db16sa
=
$SORTMERNADIR
/rRNA_databases/silva-arc-16s-database-id95.fasta
db16s
=
$SORTMERNADIR
/rRNA_databases/silva-bac-16s-database-id85.fasta
db18s
=
$SORTMERNADIR
/rRNA_databases/silva-euk-18s-database-id95.fasta
db23sa
=
$SORTMERNADIR
/rRNA_databases/silva-arc-23s-database-id98.fasta
db23s
=
$SORTMERNADIR
/rRNA_databases/silva-bac-23s-database-id98.fasta
db28s
=
$SORTMERNADIR
/rRNA_databases/silva-euk-28s-database-id98.fasta
dbNum
=
8
dbs
=
"
$db5s
$db58s
$db16sa
$db16s
$db18s
$db23sa
$db23s
$db28s
"
fi
## Add the mtSSU
if
[
$is1dot9
!=
0
]
&&
[
$useMtSSU
==
1
]
;
then
mtSSU
=
$SORTMERNADIR
/rRNA_databases/mtSSU_UCLUST-95-identity.fasta
dbs
=
"
$db5s
$db58s
$db16s
$db18s
$db23s
$db28s
$mtSSU
"
dbNum
=
7
dbs
=
"
$dbs
$mtSSU
"
dbNum
=
9
fi
fi
##
echo
Checking
...
...
@@ -147,19 +214,15 @@ fi
## interleave them
fm
=
`
basename
${
3
//.f*q.gz/
}
`
if
[
$UNPAIRED
==
0
]
;
then
isVersion9
=
`
sortmerna
--version
|
grep
"version 1.9"
|
wc
-l
`
if
[
$isVersion9
!=
1
]
;
then
echo
Only SortMeRna version 1.9 is supported
usage
else
merge-paired-reads.sh
$2
/
$f1
$2
/
$f2
$2
/
$fm
fi
merge-paired-reads.sh
$2
/
$f1
$2
/
$f2
$2
/
$fm
fi
##
if
[
$UNPAIRED
==
0
]
;
then
echo
Pre-cleaning
rm
-f
$2
/
$f1
$2
/
$f2
else
echo
"TODO: Cleaning needs implementing for single end sequencing"
fi
##
...
...
@@ -173,28 +236,45 @@ else
fi
## check the options
opt
=
if
[
$KEEP
-eq
1
]
;
then
opt
=
"--bydbs --accept
$2
/
${
fo
}
_rRNA"
opt
=
"-a
$PROC
"
if
[
$KEEP
==
1
]
&&
[
$is1dot9
!=
0
]
;
then
opt
=
"
$opt
--bydbs --accept
$2
/
${
fo
}
_rRNA"
fi
## run
if
[
$UNPAIRED
==
0
]
;
then
sortmerna
-n
$dbNum
--db
$dbs
--I
$2
/
$fm
--other
$2
/
$fo
--log
$1
/
$fo
-a
$PROC
-v
--paired-in
$opt
if
[
$is2dotx
!=
0
]
;
then
sortmerna
--ref
$dbs
--reads
$2
/
$fm
--other
$2
/
$fo
--log
--paired_in
--fastx
$opt
--sam
--num_alignments
1
--aligned
$2
/
${
fo
}
_rRNA
else
sortmerna
-n
$dbNum
--db
$dbs
--I
$2
/
$fm
--other
$2
/
$fo
--log
$1
/
$fo
--paired-in
$opt
fi
else
sortmerna
-n
$dbNum
--db
$dbs
--I
$2
/
$f1
--other
$1
/
$fo
--log
$1
/
$fo
-a
$PROC
-v
$opt
if
[
$is2dotx
!=
0
]
;
then
sortmerna
--ref
$dbs
--reads
$2
/
$f1
--other
$1
/
$fo
--log
$opt
--sam
--fastx
--num_alignments
1
--aligned
$2
/
${
fo
}
_rRNA
else
sortmerna
-n
$dbNum
--db
$dbs
--I
$2
/
$f1
--other
$1
/
$fo
--log
$1
/
$fo
$opt
fi
fi
## deinterleave it
if
[
$UNPAIRED
==
0
]
;
then
## sortmerna get confused by dots in the filenames
if
[
!
-f
$2
/
$fo
.fastq
]
;
then
mv
$2
/
$fo
.
*
$2
/
$fo
.fastq
mv
$2
/
$fo
.
*
$2
/
$fo
.fastq
fi
unmerge-paired-reads.sh
$2
/
$fo
.fastq
$1
/
${
fo
}
_1.fq
$1
/
${
fo
}
_2.fq
fi
##
rm the tm
p
##
cleanu
p
echo
Post-Cleaning
if
[
$is2dotx
!=
0
]
;
then
## mv the rRNA, fastq and log back
mv
$2
/
${
fo
}
_rRNA.
*
$1
fi
## rm the tmp
if
[
$UNPAIRED
==
0
]
;
then
rm
-f
$2
/
$fm
$2
/
$fo
.fastq
else
...
...
@@ -202,7 +282,7 @@ else
fi
## deinterleave the rest if needed
if
[
$KEEP
-eq
1
]
;
then
if
[
$KEEP
==
1
]
;
then
if
[
$UNPAIRED
==
0
]
;
then
find
$2
-name
"
${
fo
}
_rRNA*"
-print0
| xargs