diff --git a/.gitignore b/.gitignore index 9cf9d3a104fe2875bd8bb09bc4f4a6851ae82766..0c6fac7ecb6d74c28ec079a484cee6ca067c08e6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,7 @@ .Rhistory .RData .DS_Store +*.bam +*.bam.bai +*.rda +*.gtf.gz diff --git a/tutorial/easyRNASeq/annotation-manipulation-example.R b/tutorial/easyRNASeq/annotation-manipulation-example.R new file mode 100644 index 0000000000000000000000000000000000000000..19586027ffd6a2cbe01434eea82f5a85be3071ab --- /dev/null +++ b/tutorial/easyRNASeq/annotation-manipulation-example.R @@ -0,0 +1,109 @@ +#' --- +#' title: "Annotation manipulation example" +#' author: "Nicolas Delhomme" +#' date: "`r Sys.Date()`" +#' output: +#' html_document: +#' toc: true +#' number_sections: true +#' --- +#' +#' # Aim +#' The aim is to get the sequence names in the annotation and BAM files' header +#' in sync to overcome the easyRNASeq error +#' ```{r easyRNASeq error, echo=FALSE} +#' message("There is no common genomic references between your BAM files and the provided annotation. Fix one or the other.") +#' ``` +#' +#' # Setup +#' Load the libraries +library(easyRNASeq) +suppressPackageStartupMessages(library(IRanges)) +suppressPackageStartupMessages(library(GenomeInfoDb)) +suppressPackageStartupMessages(library(genomeIntervals)) +library(pander) + +#' Source an helper file +source("https://microasp.upsc.se/root/upscb-public/raw/master/src/R/createSyntheticTranscripts.R") + +#' Download the annotation file (gtf) EnsEMBL: +#' +#' ftp://ftp.ensembl.org/pub/current_gtf/homo_sapiens/Homo_sapiens.GRCh38.80.gtf.gz +#' +#' Note that here, I use the local copy I have downloaded and copied into my +#' current directory (where this script is). + +#' # Overview +#' +#' 1. First, we will process the annotation file to create synthetic transcripts +#' +#' 2. Second, we will edit the genomic reference names to match those in the +#' BAM files + +#' # Process +#' ## Synthetic transcripts creation +#' This function takes a gtf or gff3 _filename_ as input. +#' +#' The _input_ parameter defines the file format (default to gff3). +#' +#' The _feature_ parameter defines which feature to look for in the provided +#' file. Commonly mRNA for gff3 and transcript for gtf. It defaults to mRNA. +#' Several parameter can ge given as argument. +#' +#' The _output_ paramter defines the type of object that is returned. +#' It can generate a **Genome_intervals** or a __GRanges__ class of objects. +#' The former can be saved as a gff3 using the writeGff3 function from the +#' genomeIntervals package (loaded). The latter can be saved as an RData object +#' and/or be used directly in the construction of an AnnotParam. +gAnnot <- createSyntheticTranscripts( + filename="Homo_sapiens.GRCh38.80.gtf.gz", + input="gtf", + feature="transcript", + output="GRanges") + +#' This created the synthetic transcripts. We now change the sequence names +#' to match those in the BAM files (which are prepended with a 'chr' string). +seqlevels(gAnnot) <- paste("chr",seqlevels(gAnnot),sep="") + +#' We also 'rescue' the mitochondria. The other sequence names (haplotypes) +#' would require a manual curation to be added. Right now, we should have the +#' 22 autosomes, the sexual chromosomes and the mitochondria. +seqlevels(gAnnot)[seqlevels(gAnnot)=="chrMT"] <- "chrM" + +#' ## Export +#' Save the object for later re-use +save(gAnnot, file="Homo_sapiens.GRCh38.80-synthetic-transcripts.rda") + +#' ## Summarization +#' ### Set the params +#' Here it has been tricky. The flag in the BAM files says that the +#' record are Paired-end reads, but a closer look revealed that they +#' are not. An example record is: +#' +#' FCC64Y1ACXX:1:2311:16630:3201#GCCAATAT 81 chr10 60021 0 49M * 0 0 GCATCGGGGTGCTCTGGTTTTGTTGTTGTTATTTCTGAATGACATTTAC hiihhiiiiiiiiiiiiihdhiiiihhfiiihihhheggggeeeeebb_ NM:i:0 MD:Z:49 +#' +#' where the 7 to 9th columns represent the mate alignment, which is in all +#' occurences empty (* for the sequence name, 0 for the pos and 0 for the +#' insert size) +#' +#' To remediate that, we set the argument paired to 'FALSE' and we need +#' to call 'simpleRNASeq' with the 'override' argument set to 'TRUE' to +#' avoid the parameter auto-detection. +param <- RnaSeqParam(annotParam=AnnotParam(datasource=gAnnot), + bamParam=BamParam(paired=FALSE), + countBy="gene") + +#' ### Get the BAM files +#' Here again, I use the provided BAM files, which are now in my script +#' directory +bamFiles <- getBamFileList( + filenames=dir(".",pattern=".*.bam$",full.names=TRUE)) + +#' ### Run +sexp <- simpleRNASeq(bamFiles=bamFiles,param=param,override=TRUE,verbose=TRUE) + +#' ### Check +pander(colSums(assay(sexp))) + +#' # Session Info +sessionInfo() diff --git a/tutorial/easyRNASeq/annotation-manipulation-example.html b/tutorial/easyRNASeq/annotation-manipulation-example.html new file mode 100644 index 0000000000000000000000000000000000000000..61b1fa7242acee84f66f0409dff19c9d46beec59 --- /dev/null +++ b/tutorial/easyRNASeq/annotation-manipulation-example.html @@ -0,0 +1,308 @@ + + + + + + + + + + + + + + +Annotation manipulation example + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ +
+ +
+

1 Aim

+

The aim is to get the sequence names in the annotation and BAM files’ header in sync to overcome the easyRNASeq error

+
## There is no common genomic references between your BAM files and the provided annotation. Fix one or the other.
+
+
+

2 Setup

+

Load the libraries

+
library(easyRNASeq)
+suppressPackageStartupMessages(library(IRanges))
+suppressPackageStartupMessages(library(GenomeInfoDb))
+suppressPackageStartupMessages(library(genomeIntervals))
+library(pander)
+

Source an helper file

+
source("https://microasp.upsc.se/root/upscb-public/raw/master/src/R/createSyntheticTranscripts.R")
+

Download the annotation file (gtf) EnsEMBL:

+

ftp://ftp.ensembl.org/pub/current_gtf/homo_sapiens/Homo_sapiens.GRCh38.80.gtf.gz

+

Note that here, I use the local copy I have downloaded and copied into my current directory (where this script is). # Overview

+
    +
  1. First, we will process the annotation file to create synthetic transcripts

  2. +
  3. Second, we will edit the genomic reference names to match those in the BAM files # Process ## Synthetic transcripts creation This function takes a gtf or gff3 filename as input.

  4. +
+

The input parameter defines the file format (default to gff3).

+

The feature parameter defines which feature to look for in the provided file. Commonly mRNA for gff3 and transcript for gtf. It defaults to mRNA. Several parameter can ge given as argument.

+

The output paramter defines the type of object that is returned. It can generate a Genome_intervals or a GRanges class of objects. The former can be saved as a gff3 using the writeGff3 function from the genomeIntervals package (loaded). The latter can be saved as an RData object and/or be used directly in the construction of an AnnotParam.

+
gAnnot <- createSyntheticTranscripts(
+  filename="Homo_sapiens.GRCh38.80.gtf.gz",
+  input="gtf",
+  feature="transcript",
+  output="GRanges")
+

This created the synthetic transcripts. We now change the sequence names to match those in the BAM files (which are prepended with a ‘chr’ string).

+
seqlevels(gAnnot) <- paste("chr",seqlevels(gAnnot),sep="")
+

We also ‘rescue’ the mitochondria. The other sequence names (haplotypes) would require a manual curation to be added. Right now, we should have the 22 autosomes, the sexual chromosomes and the mitochondria.

+
seqlevels(gAnnot)[seqlevels(gAnnot)=="chrMT"] <- "chrM"
+
+

2.1 Export

+

Save the object for later re-use

+
save(gAnnot, file="Homo_sapiens.GRCh38.80-synthetic-transcripts.rda")
+
+
+

2.2 Summarization

+
+

2.2.1 Set the params

+

Here it has been tricky. The flag in the BAM files says that the record are Paired-end reads, but a closer look revealed that they are not. An example record is:

+

FCC64Y1ACXX:1:2311:16630:3201#GCCAATAT 81 chr10 60021 0 49M * 0 0 GCATCGGGGTGCTCTGGTTTTGTTGTTGTTATTTCTGAATGACATTTAC hiihhiiiiiiiiiiiiihdhiiiihhfiiihihhheggggeeeeebb_ NM:i:0 MD:Z:49

+

where the 7 to 9th columns represent the mate alignment, which is in all occurences empty (* for the sequence name, 0 for the pos and 0 for the insert size)

+

To remediate that, we set the argument paired to ‘FALSE’ and we need to call ‘simpleRNASeq’ with the ‘override’ argument set to ‘TRUE’ to avoid the parameter auto-detection.

+
param <- RnaSeqParam(annotParam=AnnotParam(datasource=gAnnot),
+                     bamParam=BamParam(paired=FALSE),
+                     countBy="gene")
+
+
+

2.2.2 Get the BAM files

+

Here again, I use the provided BAM files, which are now in my script directory

+
bamFiles <- getBamFileList(
+  filenames=dir(".",pattern=".*.bam$",full.names=TRUE))
+
+
+

2.2.3 Run

+
sexp <- simpleRNASeq(bamFiles=bamFiles,param=param,override=TRUE,verbose=TRUE)
+
## ==========================
+## simpleRNASeq version 2.4.7
+## ==========================
+## Creating a SummarizedExperiment.
+## ==========================
+## Processing the alignments.
+## ==========================
+## Pre-processing 2 BAM files.
+## Validating the BAM files.
+## Extracted 93 reference sequences information.
+## Extracting parameter from excerpt01.bam
+## Extracting parameter from excerpt02.bam
+## Found 0 single-end BAM files.
+## Found 2 paired-end BAM files.
+## Bam file: excerpt01.bam has reads of length 49bp
+## Bam file: excerpt02.bam has reads of length 49bp
+
## Warning in FUN(X[[i]], ...): Bam file: excerpt01.bam is considered
+## unstranded.
+
## Warning in FUN(X[[i]], ...): Bam file: excerpt01.bam Strandedness could not
+## be determined using 38351 regions spanning 6330948 bp on either strand at a
+## 90% cutoff; 52.13 percent appear to be stranded.
+
## Warning in FUN(X[[i]], ...): Bam file: excerpt02.bam is considered
+## unstranded.
+
## Warning in FUN(X[[i]], ...): Bam file: excerpt02.bam Strandedness could not
+## be determined using 40047 regions spanning 6556751 bp on either strand at a
+## 90% cutoff; 52.21 percent appear to be stranded.
+
## Warning in simpleRNASeq(bamFiles = bamFiles, param = param, override =
+## TRUE, : You provided an incorrect BAM parameter; 'paired' should be set to
+## 'TRUE'.
+
## Warning in simpleRNASeq(bamFiles = bamFiles, param = param, override =
+## TRUE, : You have chosen to override the detected parameters. Hope you know
+## what you are doing. Contact me if you think the parameter detection failed.
+
## Streaming excerpt01.bam
+## Read 999906 reads
+## Streaming excerpt02.bam
+## Read 999906 reads
+## Bam file: excerpt01.bam has 999906 reads.
+## Bam file: excerpt02.bam has 999906 reads.
+## ==========================
+## Processing the annotation
+## ==========================
+## Validating the annotation source
+## No validation performed at that stage
+## Fetching the annotation
+## Using the provided annotation as such
+## ==========================
+## Sanity checking
+## ==========================
+## ==========================
+## Creating the count table
+## ==========================
+## Using 1 CPU core
+## Streaming excerpt01.bam
+## The data is single-end
+## The data is unstranded; overlapping features will be ignored.
+## The summarization is by 'read' and the mode is: Union.
+## Processing 999906 reads
+## Done with excerpt01.bam
+## Streaming excerpt02.bam
+## The data is single-end
+## The data is unstranded; overlapping features will be ignored.
+## The summarization is by 'read' and the mode is: Union.
+## Processing 999906 reads
+## Done with excerpt02.bam
+## ==========================
+## Returning a
+##       SummarizedExperiment
+## ==========================
+
+
+

2.2.4 Check

+
pander(colSums(assay(sexp)))
+ ++++ + + + + + + + + + + + + +
excerpt01.bamexcerpt02.bam
4627947402
+
+
+
+
+

3 Session Info

+
sessionInfo()
+
## R version 3.2.1 (2015-06-18)
+## Platform: x86_64-apple-darwin13.4.0 (64-bit)
+## Running under: OS X 10.10.4 (Yosemite)
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## attached base packages:
+## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
+## [8] methods   base     
+## 
+## other attached packages:
+## [1] pander_0.5.2           genomeIntervals_1.24.1 intervals_0.15.0      
+## [4] GenomeInfoDb_1.4.1     IRanges_2.2.5          S4Vectors_0.6.1       
+## [7] BiocGenerics_0.14.0    easyRNASeq_2.4.7      
+## 
+## loaded via a namespace (and not attached):
+##  [1] Rcpp_0.11.6             formatR_1.2            
+##  [3] RColorBrewer_1.1-2      futile.logger_1.4.1    
+##  [5] XVector_0.8.0           bitops_1.0-6           
+##  [7] futile.options_1.0.0    tools_3.2.1            
+##  [9] zlibbioc_1.14.0         biomaRt_2.24.0         
+## [11] digest_0.6.8            annotate_1.46.0        
+## [13] evaluate_0.7            RSQLite_1.0.0          
+## [15] lattice_0.20-31         DBI_0.3.1              
+## [17] yaml_2.1.13             DESeq_1.20.0           
+## [19] hwriter_1.3.2           genefilter_1.50.0      
+## [21] stringr_1.0.0           knitr_1.10.5           
+## [23] Biostrings_2.36.1       locfit_1.5-9.1         
+## [25] LSD_3.0                 grid_3.2.1             
+## [27] Biobase_2.28.0          AnnotationDbi_1.30.1   
+## [29] XML_3.98-1.3            survival_2.38-3        
+## [31] BiocParallel_1.2.7      rmarkdown_0.7          
+## [33] limma_3.24.12           latticeExtra_0.6-26    
+## [35] edgeR_3.10.2            geneplotter_1.46.0     
+## [37] lambda.r_1.1.7          magrittr_1.5           
+## [39] Rsamtools_1.20.4        htmltools_0.2.6        
+## [41] splines_3.2.1           GenomicAlignments_1.4.1
+## [43] GenomicRanges_1.20.5    ShortRead_1.26.0       
+## [45] xtable_1.7-4            stringi_0.5-5          
+## [47] RCurl_1.95-4.7
+
+ + +
+ + + + + + + +