runSortmerna.sh 8.23 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
#!/bin/bash -l
#SBATCH -p node
## for large files
## we don't need the proc but the mem
## we could give that as param
#SBATCH -n 16
## time too for large files
#SBATCH -t 12:00:00
#SBATCH --mail-type=ALL
## mail-user and A have to be set in the submit script

## stop on error
set -e

## be verbose and extend the commands
set -x

## check the options if any
Nicolas Delhomme's avatar
Nicolas Delhomme committed
19 20
KEEP=1
useMtSSU=0
21 22
UNPAIRED=0
PROC=16
Nicolas Delhomme's avatar
Nicolas Delhomme committed
23
DBS=
24 25 26 27 28 29 30 31

## usage
usage(){
echo >&2 \
"
	Usage: runSortmerna.sh [option] <out dir> <tmp dir> <forward fastq.gz> <reverse fastq.gz>
	
	Options:
Nicolas Delhomme's avatar
Nicolas Delhomme committed
32 33 34 35
                -d define your dbs (semi-colon separated)
                -k drop the rRNA (only for v1.9, default to keep them)
                -m run against mtSSU in addition (only for v1.9)
                -p number of threads to be used (default $PROC)
36 37 38 39
                -u single end data (in that case only the forward fastq is needed)

         Note:
               1) The SORTMERNADIR environment variable needs to be set
Nicolas Delhomme's avatar
Nicolas Delhomme committed
40 41
               2) Only SortMeRna version 1.9 and 2.x are supported (2.x is default)
               3) -m is not applicable if -d is set
42 43 44 45
"
	exit 1
}

Nicolas Delhomme's avatar
Nicolas Delhomme committed
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
## load the module
module load bioinfo-tools

## Does not work on uppmax - umea has an empty result
## while uppmax is verbose.
## avail=$( module avail sortmerna 2>&1 > /dev/null)
## avail=`echo $avail | tr -d [:blank:]`
## if [ ! -z $avail ]; then
##  module load sortmerna
##  sortmerna --version
##fi

## record the SORTMERNADIR if it exists
STOREENV=
if [ ! -z $SORTMERNADIR ]; then
  STOREENV=$SORTMERNADIR
fi

## try to load or echo
module load sortmerna || {
  echo "No sortmerna as module"

  ## then check for availability
  tool=`which sortmerna 2>/dev/null`
  if [ ! -z $tool ] && [ -f $tool ] && [ -x $tool ]; then
    echo "sortmerna available"
  else
    echo "ERROR: INSTALL SortMeRna"
    usage
  fi
}

# restore the env if it existed
if [ ! -z $STOREENV ]; then
  export SORTMERNADIR=$STOREENV
fi

## check for sortmerna version
is1dot9=`sortmerna --version 2>&1 | grep version | grep 1.9 | wc -c`
is2dotx=`sortmerna --version 2>&1 | grep "version 2." | wc -c`

if [ $is1dot9 == 0 ] && [ $is2dotx  == 0 ]; then
  echo "Only version 1.9 and 2.x are supported"
  usage
fi

92
## get the options
Nicolas Delhomme's avatar
Nicolas Delhomme committed
93
while getopts d:kmp:u option
94 95
do
        case "$option" in
Nicolas Delhomme's avatar
Nicolas Delhomme committed
96 97 98
      d) DBS=$OPTARG;;
	    k) KEEP=0;;
	    m) useMtSSU=1;;
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
	    p) PROC=$OPTARG;;
	    u) UNPAIRED=1;;
		\?) ## unknown flag
		usage;;
        esac
done
shift `expr $OPTIND - 1`

##
echo Setting up

## set some env var
## this location is not in Git anymore!
## it has to be downloaded by the user
## check the ethylene-insensitive project submitter to see
## how to set that up
if [ -z $SORTMERNADIR ]; then
    echo You need to set your SORTMERNADIR environment variable
    usage
fi

Nicolas Delhomme's avatar
Nicolas Delhomme committed
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
## set the default dbs
if [ ! -z $DBS ]; then
  dbs=${DBS//;/ }
  dbNum=`echo $DBS | awk -F";" '{print NF}'`
else
  if [ $is2dotx != 0 ]; then
    db5s=$SORTMERNADIR/rRNA_databases/rfam-5s-database-id98.fasta,$SORTMERNADIR/automata/rfam-5s-database-id98
    db58s=$SORTMERNADIR/rRNA_databases/rfam-5.8s-database-id98.fasta,$SORTMERNADIR/automata/rfam-5.8s-database-id98
    db16sa=$SORTMERNADIR/rRNA_databases/silva-arc-16s-id95.fasta,$SORTMERNADIR/automata/silva-arc-16s-database-id95
    db16s=$SORTMERNADIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNADIR/automata/silva-bac-16s-database-id90
    db18s=$SORTMERNADIR/rRNA_databases/silva-euk-18s-id95.fasta,$SORTMERNADIR/automata/silva-euk-18s-database-id95
    db23sa=$SORTMERNADIR/rRNA_databases/silva-arc-23s-id98.fasta,$SORTMERNADIR/automata/silva-arc-23s-database-id98
    db23s=$SORTMERNADIR/rRNA_databases/silva-bac-23s-id98.fasta,$SORTMERNADIR/automata/silva-bac-23s-database-id98
    db28s=$SORTMERNADIR/rRNA_databases/silva-euk-28s-id98.fasta,$SORTMERNADIR/automata/silva-euk-28s-database-id98
    dbs="$db5s:$db58s:$db16sa:$db16s:$db18s:$db23sa:$db23s:$db28s"
  #if [ ! -f $SORTMERNADIR/automata/rfam-5s-database-id98.stats ]; then
  #  echo "No indexes found, creating indexes in folder $SORTMERNADIR/automata"
  #  indexdb_rna --ref $dbs
  #fi
  else
    db5s=$SORTMERNADIR/rRNA_databases/rfam-5s-database-id98.fasta
    db58s=$SORTMERNADIR/rRNA_databases/rfam-5.8s-database-id98.fasta
    db16sa=$SORTMERNADIR/rRNA_databases/silva-arc-16s-database-id95.fasta
    db16s=$SORTMERNADIR/rRNA_databases/silva-bac-16s-database-id85.fasta
    db18s=$SORTMERNADIR/rRNA_databases/silva-euk-18s-database-id95.fasta
    db23sa=$SORTMERNADIR/rRNA_databases/silva-arc-23s-database-id98.fasta
    db23s=$SORTMERNADIR/rRNA_databases/silva-bac-23s-database-id98.fasta
    db28s=$SORTMERNADIR/rRNA_databases/silva-euk-28s-database-id98.fasta
    dbNum=8
    dbs="$db5s $db58s $db16sa $db16s $db18s $db23sa $db23s $db28s"
  fi

  ## Add the mtSSU
  if [ $is1dot9 != 0 ] && [ $useMtSSU == 1 ]; then
154
    mtSSU=$SORTMERNADIR/rRNA_databases/mtSSU_UCLUST-95-identity.fasta
Nicolas Delhomme's avatar
Nicolas Delhomme committed
155 156 157
    dbs="$dbs $mtSSU"
    dbNum=9
  fi
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
fi

##
echo Checking

## we get two dir and two files as input
if [ $UNPAIRED == 0 ]; then
    if [ $# != 4 ]; then
	echo "This function takes two directories and two files as arguments"
	usage
    fi
else
    if [ $# != 3 ]; then
	echo "This function takes two directories and one file as argument"
	usage
    fi
fi

if [ ! -d $1 ]; then
    echo "The first argument needs to be an existing directory"
    usage
fi

if [ ! -d $2 ]; then
    echo "The second argument needs to be an existing directory"
    usage
fi

## 
echo Gunzipping

## unzip the files
if [ ! -f $3 ]; then
    echo "The third argument needs to be an existing fastq.gz file"
    usage
fi
f1=`basename ${3//.gz/}`

if [ $UNPAIRED == 0 ]; then
    if [ ! -f $4 ]; then
	echo "The forth argument needs to be an existing fastq.gz file"
	usage
    fi
    f2=`basename ${4//.gz/}`
fi

## decompress them
if [ ! -f $2/$f1 ]; then
    gunzip -c $3 > $2/$f1
fi
if [ $UNPAIRED == 0 ]; then
    if [ ! -f $2/$f2 ]; then
	gunzip -c $4 > $2/$f2
    fi
fi

## interleave them
fm=`basename ${3//.f*q.gz/}`
if [ $UNPAIRED == 0 ]; then
Nicolas Delhomme's avatar
Nicolas Delhomme committed
217
  merge-paired-reads.sh $2/$f1 $2/$f2 $2/$fm
218 219 220 221 222 223
fi

##
if [ $UNPAIRED == 0 ]; then
    echo Pre-cleaning
    rm -f $2/$f1 $2/$f2
Nicolas Delhomme's avatar
Nicolas Delhomme committed
224 225
else
    echo "TODO: Cleaning needs implementing for single end sequencing"
226 227 228 229 230 231 232 233 234 235 236 237 238
fi

##
echo Sorting

## PE
if [ $UNPAIRED == 0 ]; then
    fo=`basename ${3//_[1,2].f*q.gz/_sortmerna}`
else
    fo=`basename ${3//.f*q.gz/_sortmerna}`
fi

## check the options
Nicolas Delhomme's avatar
Nicolas Delhomme committed
239 240 241 242
opt="-a $PROC"

if [ $KEEP == 1 ] && [ $is1dot9 != 0 ]; then
  opt="$opt --bydbs --accept $2/${fo}_rRNA"
243 244
fi 

Nicolas Delhomme's avatar
Nicolas Delhomme committed
245
## run
246
if [ $UNPAIRED == 0 ]; then
Nicolas Delhomme's avatar
Nicolas Delhomme committed
247 248 249 250 251
  if [ $is2dotx != 0 ]; then
    sortmerna --ref $dbs --reads $2/$fm --other $2/$fo --log --paired_in --fastx $opt --sam --num_alignments 1 --aligned $2/${fo}_rRNA
  else
    sortmerna -n $dbNum --db $dbs --I $2/$fm --other $2/$fo --log $1/$fo --paired-in $opt
  fi  
252
else
Nicolas Delhomme's avatar
Nicolas Delhomme committed
253 254 255 256 257
  if [ $is2dotx != 0 ]; then
    sortmerna --ref $dbs --reads $2/$f1 --other $1/$fo --log $opt --sam --fastx --num_alignments 1 --aligned $2/${fo}_rRNA
  else
    sortmerna -n $dbNum --db $dbs --I $2/$f1 --other $1/$fo --log $1/$fo $opt
  fi
258 259 260 261 262 263
fi

## deinterleave it
if [ $UNPAIRED == 0 ]; then
    ## sortmerna get confused by dots in the filenames
    if [ ! -f $2/$fo.fastq ]; then
Nicolas Delhomme's avatar
Nicolas Delhomme committed
264
	    mv $2/$fo.* $2/$fo.fastq
265 266 267 268
    fi
    unmerge-paired-reads.sh $2/$fo.fastq $1/${fo}_1.fq $1/${fo}_2.fq
fi

Nicolas Delhomme's avatar
Nicolas Delhomme committed
269
## cleanup
270
echo Post-Cleaning
Nicolas Delhomme's avatar
Nicolas Delhomme committed
271 272 273 274 275 276 277

if [ $is2dotx != 0 ]; then
  ## mv the rRNA, fastq and log back
  mv $2/${fo}_rRNA.* $1
fi

## rm the tmp
278 279 280 281 282 283 284
if [ $UNPAIRED == 0 ]; then
    rm -f $2/$fm $2/$fo.fastq
else
    rm -f $2/$f1
fi

## deinterleave the rest if needed
Nicolas Delhomme's avatar
Nicolas Delhomme committed
285
if [ $KEEP == 1 ]; then
286 287 288 289 290 291 292 293
    if [ $UNPAIRED == 0 ]; then
	find $2 -name "${fo}_rRNA*" -print0 | xargs -0 -I {} -P 6 sh -c 'unmerge-paired-reads.sh $0 $1/`basename ${0//.fastq/_1.fq}` $1/`basename ${0//.fastq/_2.fq}`' {} $1
    fi
fi

## keep that as a reminder if that happens again
## sortmerna get confused by the dots as well...
## echo Validating
Nicolas Delhomme's avatar
Nicolas Delhomme committed
294
if [ $UNPAIRED == 1 ]; then
295
    if [ ! -f $1/$fo.fastq ]; then
Nicolas Delhomme's avatar
Nicolas Delhomme committed
296
      mv $1/$fo.* $1/$fo.fq
297 298 299 300 301 302 303 304
    fi
fi

## 
echo Gzipping

## compress the output files
find $1 -name "${fo}*.fq" -print0 | xargs -0 -I {} -P 8 gzip -f {}
Nicolas Delhomme's avatar
Nicolas Delhomme committed
305 306 307 308 309 310 311
if [ $is2dotx != 0 ]; then
  find $1 -name "${fo}_rRNA.[f,s]*" -print0 | xargs -0 -I {} -P 8 gzip -f {}
fi

## TODO if unpaired, then the input file is still called fastq and hence not compressed
## FIXME

312 313 314 315
#printf "%s\0%s" $1/${fo}_1.fq $1/${fo}_2.fq | xargs -0 -I {} -P 2 gzip -f {}

##
echo Done