diff --git a/manuscripts/Robinson2014/Robinson2014-rnaseq-analysis.html b/manuscripts/Robinson2014/Robinson2014-rnaseq-analysis.html new file mode 100644 index 0000000000000000000000000000000000000000..909863bda4a6fea4c50161df774fa62e38cdcbb5 --- /dev/null +++ b/manuscripts/Robinson2014/Robinson2014-rnaseq-analysis.html @@ -0,0 +1,892 @@ + + + + + + + +Sexual dimorphism DE analysis + + + + + + + + + + + + + + + + + +

Sexual dimorphism DE analysis

+ +

delhomme — May 26, 2014, 9:50 PM

+ +
### ==============================
+## load the necessary libraries
+### ==============================
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(RColorBrewer))
+suppressPackageStartupMessages(library(vsn))
+suppressPackageStartupMessages(library(scatterplot3d))
+suppressPackageStartupMessages(library(arrayQualityMetrics))
+
+ +
Creating a generic function for 'hist' from package 'graphics' in package 'affyPLM'
+
+ +
suppressPackageStartupMessages(library(VennDiagram))
+suppressPackageStartupMessages(library(gplots))
+
+source("~/Git/UPSCb/src/R/plot.multidensity.R")
+source("~/Git/UPSCb/src/R/densityPlot.R")
+source("~/Git/UPSCb/src/R/plotMA.R")
+source("~/Git/UPSCb/src/R/volcanoPlot.R")
+
+### ==============================
+## set the working dir
+### ==============================
+# setwd("/mnt/picea/projects/aspseq/sex")
+setwd("/Users/delhomme/Desktop/tmp/sex")
+
+### ==============================
+## read the HTSeq files in a matrix
+### ==============================
+# res <- mclapply(dir("HTSeq/Ptrichocarpa",pattern="^[2,3].*_STAR\\.txt",full.names=TRUE),function(fil){
+res <- mclapply(dir(".",pattern="^[2,3].*_STAR\\.txt",full.names=TRUE),function(fil){
+  read.delim(fil,header=FALSE,stringsAsFactors=FALSE)
+},mc.cores=2)
+# names(res) <- gsub("_.*_STAR\\.txt","",dir("HTSeq/Ptrichocarpa",pattern="^[2,3].*_STAR\\.txt"))
+names(res) <- gsub("_.*_STAR\\.txt","",dir(".",pattern="^[2,3].*_STAR\\.txt"))
+
+### ==============================
+## get the count table
+### ==============================
+addInfo <- c("__no_feature","__ambiguous",
+             "__too_low_aQual","__not_aligned",
+             "__alignment_not_unique")
+sel <- match(addInfo,res[[1]][,1])
+count.table <- do.call(cbind,lapply(res,"[",2))[-sel,]
+colnames(count.table) <- names(res)
+rownames(count.table) <- res[[1]][,1][-sel]
+write.csv(count.table,"analysis/sex-raw-unormalised-data.csv")
+
+### ==============================
+## extract the HTSeq stat lines
+### ==============================
+count.stats <- do.call(cbind,lapply(res,"[",2))[sel,]
+colnames(count.stats) <- names(res)
+rownames(count.stats) <- sub("__","",addInfo)
+count.stats <- rbind(count.stats,aligned=colSums(count.table))
+count.stats <- count.stats[rowSums(count.stats) > 0,]
+
+## as percentages
+apply(count.stats,2,function(co){round(co*100/sum(co))})
+
+ +
                     202 207 213.1 221 226.1 229 229.1 235 236 239 244 303
+no_feature             3   3     3   2     3   3     3   3   3   3   3   3
+ambiguous              2   2     2   2     2   2     2   2   2   2   2   2
+alignment_not_unique  13  13    12  13    12  12    12  12  13  14  12  12
+aligned               82  82    83  83    83  83    83  83  82  81  83  83
+                     305.3 309.1 310.3 337.1 349.2
+no_feature               3     3     3     3     3
+ambiguous                2     2     2     2     2
+alignment_not_unique    12    12    13    12    14
+aligned                 83    83    82    83    81
+
+ +
### ==============================
+## plot the stats
+### ==============================
+pal=brewer.pal(6,"Dark2")[1:nrow(count.stats)]
+mar <- par("mar")
+par(mar=c(7.1,5.1,4.1,2.1))
+barplot(as.matrix(count.stats),col=pal,beside=TRUE,las=2,main="read proportion",
+        ylim=range(count.stats) + c(0,2e+7))
+legend("top",fill=pal,legend=rownames(count.stats),bty="n",cex=0.8)
+
+ +

plot of chunk unnamed-chunk-1

+ +
par(mar=mar)
+
+### ==============================
+## 11.4% of the genes are not expressed
+## out of a total of 41335 genes
+### ==============================
+sel <- rowSums(count.table) == 0
+sprintf("%s percent",round(sum(sel) * 100/ nrow(count.table),digits=1))
+
+ +
[1] "14.2 percent"
+
+ +
sprintf("of %s genes are not expressed",nrow(count.table))
+
+ +
[1] "of 41335 genes are not expressed"
+
+ +
### ==============================
+## display the per-gene mean expression
+## i.e. the mean raw count of every 
+## gene across samples is calculated
+## and displayed on a log10 scale
+### ==============================
+plot(density(log10(rowMeans(count.table))),col=pal[1],
+     main="mean raw counts distribution",
+     xlab="mean raw counts (log10)")
+
+ +

plot of chunk unnamed-chunk-1

+ +
### ==============================
+## The same is done for the individual
+## samples colored by sample type
+### ==============================
+pal=brewer.pal(8,"Dark2")
+plot.multidensity(log10(count.table),col=rep(pal,each=3),
+                  legend.x="topright",legend.cex=0.5,
+                  main="sample raw counts distribution",
+                  xlab="per gene raw counts (log10)")
+
+ +

plot of chunk unnamed-chunk-1

+ +
### ==============================
+## For visualization, the data is
+## submitted to a variance stabilization
+## transformation using DESeq2. The 
+## dispersion is estimated independently
+## of the sample type
+### ==============================
+samples <- read.csv("~/Git/UPSCb/projects/sex/doc/sex-samples.csv")
+conditions <- names(res)
+sex <- samples$sex[match(names(res),samples$sample)]
+
+## correct the sex based on sex determination gene
+sex[names(res) == "226.1"] <- "M"
+
+## get the date
+date <- factor(samples$date[match(names(res),samples$sample)])
+
+## create the dds object
+dds <- DESeqDataSetFromMatrix(
+  countData = count.table,
+  colData = data.frame(condition=conditions,
+                       sex=sex,
+                       date=date),
+  design = ~ condition)
+
+### ==============================
+## check the size factors
+## no big variation, can go for vsd
+## over rld
+### ==============================
+dds <- estimateSizeFactors(dds)
+sizes <- sizeFactors(dds)
+names(sizes) <- names(res)
+sizes
+
+ +
   202    207  213.1    221  226.1    229  229.1    235    236    239 
+0.7835 1.0659 0.8751 0.8125 0.9131 0.9292 1.8947 1.5191 1.0922 1.0645 
+   244    303  305.3  309.1  310.3  337.1  349.2 
+0.9019 0.9966 0.6491 0.7094 1.0976 0.7977 1.9518 
+
+ +
## do the vsd
+colData(dds)$condition <- factor(colData(dds)$condition,
+                                 levels=unique(conditions))
+vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
+vst <- assay(vsd)
+colnames(vst) <- colnames(count.table)
+vst <- vst - min(vst)
+write.csv(vst,"analysis/sex-library-size-normalized_variance-stabilized_data.csv")
+
+### ==============================
+## Visualize the corrected mean - sd
+## relationship. It is fairly linear,
+## meaning we can assume homoscedasticity.
+## the slight initial trend / bump is
+## due to genes having few counts in
+## a few subset of the samples and hence 
+## having a higher variability. This is
+## expected.
+### ==============================
+meanSdPlot(assay(vsd)[rowSums(count.table)>0,], ylim = c(0,2.5))
+
+ +

plot of chunk unnamed-chunk-1

+ +
### ==============================
+## perform a Principal Component
+## Analysis (PCA) of the data
+## to do a quick quality assessment
+## i.e. replicate should cluster
+## and the first dimensions should 
+## be explainable by biological means.
+### ==============================
+pc <- prcomp(t(vst))
+percent <- round(summary(pc)$importance[2,]*100)
+smpls <- conditions
+
+## color by date and sex
+sex.cols<-c("pink","lightblue")
+sex.names<-c(F="Female",M="Male")
+
+## is the separation sex?
+scatterplot3d(pc$x[,1],
+              pc$x[,2],
+              pc$x[,3],
+              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
+              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
+              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
+              color=sex.cols[as.integer(factor(sex))],
+              pch=19)
+legend("bottomright",pch=c(NA,15,15),col=c(NA,sex.cols[1:2]),
+       legend=c("Color:",sex.names[levels(factor(sex))]))
+
+ +

plot of chunk unnamed-chunk-1

+ +
par(mar=mar)
+
+## or date?
+dat <- date
+scatterplot3d(pc$x[,1],
+              pc$x[,2],
+              pc$x[,3],
+              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
+              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
+              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
+              color=pal[as.integer(factor(dat))],
+              pch=19)
+legend("topleft",pch=rep(c(19,23),each=10),col=rep(pal,2),legend=levels(factor(dat)),bty="n")
+
+ +

plot of chunk unnamed-chunk-1

+ +
par(mar=mar)
+
+### final plot
+### color = sex and sample = date
+scatterplot3d(pc$x[,1],
+              pc$x[,2],
+              pc$x[,3],
+              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
+              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
+              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
+              color=sex.cols[as.integer(factor(sex))],
+              pch=c(19,17)[as.integer(factor(dat))])
+legend("bottomright",pch=c(NA,15,15),col=c(NA,sex.cols[1:2]),
+       legend=c("Color:",sex.names[levels(factor(sex))]))
+legend("topleft",pch=c(NA,21,24),col=c(NA,1,1),
+       legend=c("Symbol:",levels(factor(dat))),cex=0.85)
+
+ +

plot of chunk unnamed-chunk-1

+ +
par(mar=mar)
+
+### ==============================
+## and the first two dims
+### ==============================
+plot(pc$x[,1],  
+     pc$x[,2],
+     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
+     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
+     col=sex.cols[as.integer(factor(sex))],
+     pch=c(19,17)[as.integer(factor(dat))],
+     main="Principal Component Analysis",sub="variance stabilized counts")
+legend("bottomleft",pch=c(NA,15,15),col=c(NA,sex.cols[1:2]),
+       legend=c("Color:",sex.names[levels(factor(sex))]))
+legend("topright",pch=c(NA,21,24),col=c(NA,1,1),
+       legend=c("Symbol:",levels(factor(dat))),cex=0.85)
+text(pc$x[,1],  
+     pc$x[,2],
+     labels=conditions,cex=.5,adj=-1)
+
+ +

plot of chunk unnamed-chunk-1

+ +
### ==============================
+## and the 2nd and 3rd dims
+### ==============================
+plot(pc$x[,2],
+     pc$x[,3],
+     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
+     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
+     col=sex.cols[as.integer(factor(sex))],
+     pch=c(19,17)[as.integer(factor(dat))],     
+     main="Principal Component Analysis",sub="variance stabilized counts")
+legend("bottomleft",pch=c(NA,15,15),col=c(NA,sex.cols[1:2]),
+       legend=c("Color:",sex.names[levels(factor(sex))]))
+legend("topright",pch=c(NA,21,24),col=c(NA,1,1),
+       legend=c("Symbol:",levels(factor(dat))),cex=0.85)
+
+ +

plot of chunk unnamed-chunk-1

+ +
### ==============================
+## plot 
+### ==============================
+## do the rld
+rld <- rlogTransformation(dds)
+rlt <- assay(rld)
+colnames(rlt) <- colnames(count.table)
+write.csv(rlt,"analysis/sex-library-size-normalized_regularized-log-transformed_data.csv")
+
+## get the PCA from RLT data
+rpc <- prcomp(t(rlt))
+percent <- round(summary(rpc)$importance[2,]*100)
+
+## and plot the first 2 dim
+plot(rpc$x[,1],
+     rpc$x[,2],
+     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
+     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
+     col=sex.cols[as.integer(factor(sex))],
+     pch=c(19,17)[as.integer(factor(dat))],
+     main="Principal Component Analysis",sub="variance stabilized counts")
+legend("bottomleft",pch=c(NA,15,15),col=c(NA,sex.cols[1:2]),
+       legend=c("Color:",sex.names[levels(factor(sex))]))
+legend("topright",pch=c(NA,21,24),col=c(NA,1,1),
+       legend=c("Symbol:",levels(factor(dat))),cex=0.85)
+text(pc$x[,1],  
+     pc$x[,2],
+     labels=conditions,cex=.5,adj=1)
+
+ +

plot of chunk unnamed-chunk-1

+ +
par(mar=mar)
+
+### ==============================
+## heatmap and hc (samples and samples x genes)
+### ==============================
+#heatmap.2(vst, col=redgreen(75), scale="row", ColSideColors=patientcolors,
+#          key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
+
+### ==============================
+## arrayQualityMetrics
+### ==============================
+#suppressMessages(arrayQualityMetrics(ExpressionSet(assayData=vst),
+#                                     outdir="analysis/aQM"))
+
+### ==============================
+## do the Diff. Exp. -- with corrected sex
+### ==============================
+design(dds) <- ~date+sex
+dds <- DESeq(dds)
+
+ +
using pre-existing size factors
+estimating dispersions
+gene-wise dispersion estimates
+mean-dispersion relationship
+final dispersion estimates
+fitting model and testing
+
+ +
plotDispEsts(dds)
+
+ +

plot of chunk unnamed-chunk-1

+ +
res <- results(dds,contrast=c("sex","F","M"))
+alpha=0.01
+plotMA(res,alpha=alpha)
+
+ +

plot of chunk unnamed-chunk-1

+ +
volcanoPlot(res,alpha=alpha)
+
+ +
Loading required package: LSD
+Loading required package: MASS
+Loading required package: gtools
+Loading required package: colorRamps
+Loading required package: schoolmath
+Loading required package: ellipse
+
+Attaching package: 'ellipse'
+
+The following object is masked from 'package:VennDiagram':
+
+    ellipse
+
+ +

plot of chunk unnamed-chunk-1

+ +
hist(res$padj,breaks=seq(0,1,.01))
+
+ +

plot of chunk unnamed-chunk-1

+ +
sum(res$padj<alpha,na.rm=TRUE)
+
+ +
[1] 15
+
+ +
UmAspG <- rownames(res[order(res$padj,na.last=TRUE),][1:sum(res$padj<alpha,na.rm=TRUE),])
+
+### ==============================
+## do the Diff. Exp. -- with the original sample
+### ==============================
+sex[conditions=="226.1"] <- "F"
+dds <- DESeqDataSetFromMatrix(
+  countData = count.table,
+  colData = data.frame(condition=conditions,
+                       sex=sex,
+                       date=date),
+  design = ~ date+sex)
+
+dds <- DESeq(dds)
+
+ +
estimating size factors
+estimating dispersions
+gene-wise dispersion estimates
+mean-dispersion relationship
+final dispersion estimates
+fitting model and testing
+
+ +
plotDispEsts(dds)
+
+ +

plot of chunk unnamed-chunk-1

+ +
res.orig <- results(dds,contrast=c("sex","F","M"))
+plotMA(res.orig,alpha=alpha)
+
+ +