Commit 8181df09 authored by Nicolas Delhomme's avatar Nicolas Delhomme

Corrected the bibliography, extended the R script and knitted the HTML.

parent 5f6fda09
#' ---
#' title: "RNA-Seq tutorial"
#' author: "Nicolas Delhomme, Bastian Schiffthaler"
#' date: "01 October 2014"
#' date: "`r Sys.Date()`"
#' output:
#' BiocStyle::html_document:
#' toc: true
#' number_sections: true
#' bibliography: Robinson-Delhomme-et-al_BMC-Plant-Biology_2014.bib
#' bibliography: ~/Git/UPSCb-public/tutorial/Robinson-Delhomme-et-al_BMC-Plant-Biology_2014.bib
#' ---
#' ```{r set up, echo=FALSE}
#' knitr::opts_knit$set(root.dir="~/Git/UPSCb-public")
#' ```
#' # Introduction
#' This tutorial introduces an RNA-Seq differential expression analysis performed
#' using R and Bioconductor[@ref:R, @Gentleman:2004p2013].
......@@ -40,25 +44,25 @@
#' ## Setup - loading the libraries
suppressPackageStartupMessages(library(DESeq))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(easyRNASeq))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(scatterplot3d))
#suppressPackageStartupMessages(library(arrayQualityMetrics))
suppressPackageStartupMessages(library(VennDiagram))
#suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(LSD))
#suppressPackageStartupMessages(library(RnaSeqTutorial))
source("src/R/plot.multidensity.R")
source("src/R/volcanoPlot.R")
#' ## Processing the data
#' ### Reading in the data
#' First we read the count files produced by HTSeq[@Anders:2014p6365] into a matrix.
#' The DESeq2[@Love:2014p6358] package now actually has a function to ease that process: **DESeqDataSetFromHTSeqCount**.
#' Here we just process the samples in parallel using mclapply instead.
# Read every file matching the pattern into a list of data frames (one
# read.delim() call per file, run through mclapply with mc.cores=2), then name
# the list elements after the file names with the "_..._STAR.txt" part removed.
# NOTE(review): this view is a rendered diff, so the two `res <- mclapply(`
# lines and the two `names(res) <-` lines look like the pre- and post-change
# versions of the same statement (only the directory differs) -- confirm and
# keep a single version of each in the real script.
# NOTE(review): names(res) comes from a second dir() call; it lines up with
# `res` only if both calls list the same directory in the same order.
# NOTE(review): the character class [2,3] also matches a literal comma;
# [23] is presumably what was intended -- confirm.
res <- mclapply(dir("data/htseq",pattern="^[2,3].*_STAR\\.txt",
res <- mclapply(dir("data/htseq-count",pattern="^[2,3].*_STAR\\.txt",
full.names=TRUE),function(fil){
read.delim(fil,header=FALSE,stringsAsFactors=FALSE)
},mc.cores=2)
names(res) <- gsub("_.*_STAR\\.txt","",dir("HTSeq",pattern="^[2,3].*_STAR\\.txt"))
names(res) <- gsub("_.*_STAR\\.txt","",dir("data/htseq-count",pattern="^[2,3].*_STAR\\.txt"))
#' Then we extract the additional information that HTSeq writes at the end of every
#' file detailing the number of reads that were not taken into account while
......@@ -73,7 +77,7 @@ count.table <- do.call(cbind,lapply(res,"[",2))[-sel,]
# Label the count table: columns take the names of `res`, rows take the first
# column of the first element of `res` with the `sel` entries removed
# (`sel` is defined outside this view).
colnames(count.table) <- names(res)
rownames(count.table) <- res[[1]][,1][-sel]
#' ### the HTSeq stat lines
#' ### The HTSeq stat lines
#' Here we aggregate the information about how many
#' reads aligned together with the information gathered above.
# Column-bind the second column of every element of `res` and keep only the
# rows indexed by `sel` (defined outside this view), i.e. the rows that the
# count.table construction drops with [-sel,].
count.stats <- do.call(cbind,lapply(res,"[",2))[sel,]
......@@ -332,10 +336,8 @@ cdsFull = estimateDispersions( cdsFull )
plotDispLSD(cdsFull)
#' Next, we fit both models: one considering the date only and one considering both date and sex.
#' _Note that the pragma dev.null <- capture.output(...) below is not necessary, it
#' is just used for pretty printing of the HTML doc you are reading._
# Fit two models on cdsFull with fitNbinomGLMs: fit1 uses count ~ date + sex,
# fit0 uses count ~ date. Warnings raised during fitting are suppressed.
# NOTE(review): this view is a rendered diff; the two capture.output() lines
# appear to be the removed version and the two plain assignments their
# replacement -- confirm and keep only one pair in the real script.
# NOTE(review): inside capture.output(), `fit1 = ...` is a named argument, not
# an assignment, so those lines presumably never create fit1/fit0 in the
# calling environment -- confirm against ?capture.output.
dev.null <- capture.output(fit1 = suppressWarnings(fitNbinomGLMs( cdsFull, count ~ date + sex )))
dev.null <- capture.output(fit0 = suppressWarnings(fitNbinomGLMs( cdsFull, count ~ date)))
fit1 = suppressWarnings(fitNbinomGLMs( cdsFull, count ~ date + sex ))
fit0 = suppressWarnings(fitNbinomGLMs( cdsFull, count ~ date))
#' For the rest of the analysis, we ignore the genes that did not converge in the
#' previous step
......@@ -591,4 +593,4 @@ sessionInfo()
# lapply(split(sex,date),table)
# lapply(split(sexR,date),table)
#
#' # References
\ No newline at end of file
#' # References
%\BibtexIndexEntry{02. Introduction}
@Manual{ref:R,
title = {R: A Language and Environment for Statistical Computing},
author = {{R Development Core Team}},
......@@ -12,30 +10,23 @@ url = {http://www.R-project.org},
@article{Gentleman:2004p2013,
author = {Robert C Gentleman and others},
journal = {Genome Biology 2010 11:202},
journal = {Genome Biology},
title = {Bioconductor: open software development for computational biology and bioinformatics},
abstract = {The Bioconductor project is an initiative for the collaborative creation of extensible software for computational biology and bioinformatics. The goals of the project include: fostering collaborative development and widespread use of innovative software, reducing barriers to entry into interdisciplinary scientific research, and promoting the achievement of remote reproducibility of research results. We describe details of our aims and methods, identify current challenges, compare Bioconductor to other open bioinformatics projects, and provide working examples.},
affiliation = {Department of Biostatistical Science, Dana-Farber Cancer Institute, 44 Binney St, Boston, MA 02115, USA. rgentlem@jimmy.harvard.edu},
number = {10},
pages = {R80},
volume = {5},
year = {2004},
month = {Jan}
}
@article{Robinson:2014p6362,
author = {Kathryn Robinson and Nicolas Delhomme and Niklas M{\"a}hler and Bastian Schiffthaler and Jenny {\"O}nskog and Benedicte Albrectsen and P{\"a}r Ingvarsson and Torgeir Hvidsten and Stefan Jansson and Nathaniel Street},
author = {Nicolas Delhomme and Kathryn Robinson and Niklas M{\"a}hler and Bastian Schiffthaler and Jenny {\"O}nskog and Benedicte Albrectsen and P{\"a}r Ingvarsson and Torgeir Hvidsten and Stefan Jansson and Nathaniel Street},
journal = {BMC Plant Biology},
title = {
Populus tremula (European aspen) shows no evidence of sexual dimorphism},
abstract = {Evolutionary theory suggests that males and females may evolve sexually dimorphic phenotypic and biochemical traits concordant with each sex having different optimal strategies of resource investment to maximise reproductive success and fitness. Such sexual dimorphism would result in sex biased gene expression patterns in non-floral organs for autosomal genes associated with the control and development of such phenotypic traits.},
title = {Populus tremula (European aspen) shows no evidence of sexual dimorphism},
number = {1},
pages = {276--276},
volume = {14},
year = {2014},
month = {Oct},
doi = {10.1186/s12870-014-0276-5},
URL = {http://www.biomedcentral.com/1471-2229/14/276/abstract},
}
@article{Anders:2014p6365,
......@@ -52,7 +43,7 @@ title = {Moderated estimation of fold change and dispersion for RNA-Seq data wit
year = {2014}
}
@article{Pakull
@article{Pakull,
author={B. Pakull and B. Kersten and J. Lüneburg and M. Fladung},
journal = "Plant Biology",
title = {A simple PCR-based marker to determine sex in aspen},
......@@ -61,45 +52,30 @@ year = {2014}
@article{Anders:2010p1659,
author = {Simon Anders and Wolfgang Huber},
journal = {Genome Biology 2010 11:202},
journal = {Genome Biology},
title = {Differential expression analysis for sequence count data},
abstract = {ABSTRACT: High-throughput sequencing assays such as RNA-Seq, ChIP-Seq or barcode counting provide quantitative readouts in the form of count data. To infer differential signal in such data correctly and with good statistical power, estimation of data variability throughout the dynamic range and a suitable error model are required. We propose a method based on the negative binomial distribution, with variance and mean linked by local regression and present an implementation, DESeq, as an R/Bioconductor package.},
number = {10},
pages = {R106},
volume = {11},
year = {2010},
month = {Oct}
}
@article{Robinson:2010p775,
author = {Mark D Robinson and others},
journal = {Bioinformatics},
title = {edgeR: a Bioconductor package for differential expression analysis of digital gene expression data},
abstract = {SUMMARY: It is expected that emerging digital gene expression (DGE) technologies will overtake microarray technologies in the near future for many functional genomics applications. One of the fundamental data analysis tasks, especially for gene expression studies, involves determining whether there is evidence that counts for a transcript or exon are significantly different across experimental conditions. edgeR is a Bioconductor software package for examining differential expression of replicated count data. An overdispersed Poisson model is used to account for both biological and technical variability. Empirical Bayes methods are used to moderate the degree of overdispersion across transcripts, improving the reliability of inference. The methodology can be used even with the most minimal levels of replication, provided at least one phenotype or experimental condition is replicated. The software may have other applications beyond sequencing data, such as proteome peptide count data. AVAILABILITY: The package is freely available under the LGPL licence from the Bioconductor web site (http://bioconductor.org).},
affiliation = {Cancer Program, Garvan Institute of Medical Research, 384 Victoria Street, Darlinghurst, NSW 2010, Australia. mrobinson@wehi.edu.au},
number = {1},
pages = {139--40},
volume = {26},
year = {2010},
month = {Jan}
}
@article{Soneson:2013p5778,
author = {Charlotte Soneson and Mauro Delorenzi},
journal = {BMC Bioinformatics},
title = {A comparison of methods for differential expression analysis of RNA-seq data},
abstract = {BACKGROUND: Finding genes that are differentially expressed between conditions is an integral part of understanding the molecular basis of phenotypic variation. In the past decades, DNA microarrays have been used extensively to quantify the abundance of mRNA corresponding to different genes, and more recently high-throughput sequencing of cDNA (RNA-seq) has emerged as a powerful competitor. As the cost of sequencing decreases, it is conceivable that the use of RNA-seq for differential expression analysis will increase rapidly. To exploit the possibilities and address the challenges posed by this relatively new type of data, a number of software packages have been developed especially for differential expression analysis of RNA-seq data.
RESULTS: We conducted an extensive comparison of eleven methods for differential expression analysis of RNA-seq data. All methods are freely available within the R framework and take as input a matrix of counts, i.e. the number of reads mapping to each genomic feature of interest in each of a number of samples. We evaluate the methods based on both simulated data and real RNA-seq data.
CONCLUSIONS: Very small sample sizes, which are still common in RNA-seq experiments, impose problems for all evaluated methods and any results obtained under such conditions should be interpreted with caution. For larger sample sizes, the methods combining a variance-stabilizing transformation with the 'limma' method for differential expression analysis perform well under many different conditions, as does the nonparametric SAMseq method.},
affiliation = {Bioinformatics Core Facility, SIB Swiss Institute of Bioinformatics, Lausanne, Switzerland. Charlotte.Soneson@isb-sib.ch},
pages = {91},
volume = {14},
year = {2013},
month = {Jan},
language = {eng},
doi = {10.1186/1471-2105-14-91},
pii = {1471-2105-14-91},
pmid = {23497356},
}
\ No newline at end of file
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment