Commit c419cb63 authored by Nicolas Delhomme's avatar Nicolas Delhomme

added gtf support

parent 02dfdfbc
"createSyntheticTranscripts" <- function(gff3, "createSyntheticTranscripts" <- function(filename,
output = c("Genome_intervals","GRanges"), input = c("gff3","gtf"),
features = c("mRNA", "tRNA", "miRNA")) { features = c("mRNA", "miRNA", "tRNA", "transcript"),
require(genomeIntervals) output = c("Genome_intervals","GRanges")) {
require(S4Vectors)
#' load libraries
stopifnot(require(genomeIntervals))
stopifnot(require(S4Vectors))
stopifnot(require(IRanges))
stopifnot(require(easyRNASeq))
#' first check
stopifnot(file.exists(filename))
#' get the values
input <- match.arg(input)
features <- match.arg(features, several.ok = TRUE) features <- match.arg(features, several.ok = TRUE)
output <- match.arg(output) output <- match.arg(output)
gff <- readGff3(gff3) #' define some global variables
relation <- switch(input,
"gff3"=list(ID="ID",Parent="Parent"),
"gtf"=list(ID="transcript_id",Parent="gene_id"))
#' read the gff3/gtf file
dat <- readGff3(filename)
## get the gene <-> pacid map #' If gtf, reformat the attributes and drop the double quotes
if(input=="gtf"){
dat$gffAttributes <- gsub("\"","",easyRNASeq:::.convertGffToGtfAttributes(dat$gffAttributes))
}
## get the gene <-> mRNA/transcript map
# This is mRNA IDs and their parents (genes) # This is mRNA IDs and their parents (genes)
sel <- gff$type %in% features sel <- dat$type %in% features
idMap <- data.frame(type = gff[sel]$type,
getGffAttribute(gff[sel],"ID"), # That step would not necessary for gtf, but it is easier to implement in a
getGffAttribute(gff[sel],"Parent")) # similar way for both format
idMap <- data.frame(type = dat[sel]$type,
getGffAttribute(dat[sel],relation$ID),
getGffAttribute(dat[sel],relation$Parent))
## extract the exons and group by gene ID ## extract the exons and group by gene ID
sel <- gff$type == "exon" sel <- dat$type == "exon"
## we can drop multiple Parents (i.e. comma separated Parent values as we are ## we can drop multiple Parents (i.e. comma separated Parent values as we are
## collapsing them anyway) ## collapsing them anyway)
mRnaID <- sub(",.*","",getGffAttribute(gff[sel],"Parent")) mRnaID <- sub(",.*","",getGffAttribute(dat[sel],switch(input,
"gff3"=relation$Parent,
"gtf"=relation$ID)))
## avoid unwanted features ## avoid unwanted features
rngs <- IRanges::IRanges(start = gff[sel, 1], rngs <- IRanges(start = dat[sel, 1],
end = gff[sel, 2])[mRnaID %in% idMap$ID] end = dat[sel, 2])[mRnaID %in% idMap[,relation$ID]]
## create a set of synthetic exons ## create a set of synthetic exons
rngList <- IRanges::reduce( rngList <- IRanges::reduce(
IRanges::split(rngs, IRanges::split(rngs,
idMap[match(mRnaID[mRnaID %in% idMap$ID], idMap[match(mRnaID[mRnaID %in% idMap[,relation$ID]],
idMap$ID),"Parent"])) idMap[,relation$ID]),relation$Parent]))
## export the gene, exon and features as gff3 ## export the gene, exon and features as gff3
## create the new gff object ## create the new gff object
## select the gene ## select the gene
sel <- gff$type == "gene" sel <- dat$type == "gene"
## create the gene gff ## create the gene gff
geneID <- getGffAttribute(gff[sel],"ID") geneID <- getGffAttribute(dat[sel],switch(input,
geneGff <- gff[sel][geneID %in% idMap$Parent] "gff3"=relation$ID,
"gtf"=relation$Parent))
geneGff <- dat[sel][geneID %in% idMap[,relation$Parent]]
if (input == "gtf"){
geneGff$gffAttributes <- sub(relation$Parent,"ID",geneGff$gffAttributes)
}
## create gffs for each feature ## create gffs for each feature
featureGff <- Reduce(c, lapply(features, function(f) { featureGff <- Reduce(c, lapply(features, function(f) {
fGff <- gff[sel][geneID %in% idMap$Parent[idMap$type == f]] f.sel <- geneID %in% idMap[,relation$Parent][idMap$type == f]
fGff <- dat[sel][f.sel]
fGff$type <- f fGff$type <- f
fGff$gffAttributes <- paste(paste( fGff$gffAttributes <- paste("ID=",
sub(";", ".0;", fGff$gffAttributes), "0;Parent=", sep="."), getGffAttribute(fGff,relation$Parent),
geneID[geneID %in% idMap$Parent[idMap$type == f]], sep="") ".0;Parent=",
getGffAttribute(fGff,relation$Parent),
sep="")
fGff fGff
})) }))
## create the exon gff ## create the exon gff
rngList <- rngList[match(geneID[geneID %in% idMap$Parent], names(rngList))] rngList <- rngList[match(geneID[geneID %in% idMap[,relation$Parent]], names(rngList))]
exonNumber <- elementLengths(rngList) exonNumber <- elementLengths(rngList)
exonGff <- gff[rep(which(sel)[geneID %in% idMap$Parent], exonNumber)] exonGff <- dat[rep(which(sel)[geneID %in% idMap[,relation$Parent]], exonNumber)]
exonGff[,1] <- IRanges::unlist(start(rngList)) exonGff[,1] <- IRanges::unlist(start(rngList))
exonGff[,2] <- IRanges::unlist(end(rngList)) exonGff[,2] <- IRanges::unlist(end(rngList))
exonID <- sapply(exonNumber, ":", 1) exonID <- sapply(exonNumber, ":", 1)
sel <- geneGff$strand == "+" sel <- geneGff$strand == "+"
exonID[sel] <- sapply(exonID[sel], rev) exonID[sel] <- sapply(exonID[sel], rev)
ID <- getGffAttribute(exonGff, "ID") ID <- getGffAttribute(exonGff, switch(input,"gff3"=relation$ID,"gtf"=relation$Parent))
exonGff$gffAttributes <- paste0("ID=", paste(ID, "exon", unlist(exonID, use.names=FALSE), sep="."), exonGff$gffAttributes <- paste0("ID=", paste(ID, "exon", unlist(exonID, use.names=FALSE), sep="."),
";Name=", paste(ID, "exon", unlist(exonID, use.names=FALSE), sep="."), ";Name=", paste(ID, "exon", unlist(exonID, use.names=FALSE), sep="."),
";Parent=", paste(ID,"0",sep = ".")) ";Parent=", paste(ID,"0",sep = "."))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment