-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Mehdi Fazel
committed
Mar 16, 2017
1 parent
29700f1
commit 78f437d
Showing
11 changed files
with
435 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
^.*\.Rproj$ | ||
^\.Rproj\.user$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
.Rproj.user | ||
.Rhistory | ||
.RData |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Package: tRegs | ||
Type: Package | ||
Title: Signatures of TF regulatory activiy | ||
Version: 1.0 | ||
Date: 2017-03-16 | ||
Author: Mehdi Fazel Najafabadi, Mario Medvedovic | ||
Maintainer: Mehdi Fazel Najafabadi <[email protected]> | ||
Description: The package provides tools for constructing gene-specific TF binding scores from ChIP-seq data (TREG binding scores) and for constructing TREG signatures of TF regulatory activity by integrating TREG binding scores with suitable matched differential gene expression profiles. | ||
Depends: | ||
R (>= 3.3.1), | ||
GenomicFeatures (>= 1.24.5), | ||
org.Hs.eg.db, | ||
org.Mm.eg.db, | ||
org.Rn.eg.db | ||
License: GPL (>= 2) | ||
Encoding: UTF-8 | ||
LazyData: true | ||
RoxygenNote: 6.0.1 | ||
Packaged: 2017-03-16 10:18:15 UTC; Mehdi Fazel Najafabadi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(annotateChipSeqPeaks) | ||
export(chipSeqWeightedSum) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
|
||
#' A function to match each peak to associated genes in terms of RefID. | ||
#' | ||
#' @description This function allows you to annotate chip-sequencing reads to a reference. | ||
#' It is intended to search all peaks within up and down | ||
#' distances for each gene. Users could specify their own platforms | ||
#' or the function will download platform from UCSC as default. | ||
#' @param chip.seq A matrix listing all CHIP-Seq peaks in which the chromosome | ||
#' ID for all peaks are listed in the first column, peaks' start | ||
#' positions are listed in the second column, peaks' end | ||
#' position are listed in the third column, peaks' ID are listed | ||
#' in the forth column and the fifth column lists peaks' score. | ||
#' @param transcriptDB This is Transcript definition table created by | ||
#' makeTranscriptDbFromUCSC or makeTxDbFromUCSC functions. | ||
#' @param distanceRange The up- and downstream window aroung gene TSS to be used | ||
#' in associating peaks to a gene. The default is c(-1e+06,1e+06). | ||
#' | ||
#' @return This function returns a matrix in which the first column contains | ||
#' the RefSeqID, the second column contains Entrez ID , the third | ||
#' column contains chromosome ID, the fourth column contains the | ||
#' absolute distance from each peak to transcription start site and | ||
#' the fifth column contains chip-seq score. | ||
#' | ||
#' @details This function is intended to search all peaks within up and down distances for each gene. | ||
#' | ||
#' @author Mehdi Fazel-Najafabadi, Mario Medvedovic | ||
#' | ||
#' @export | ||
#' @examples | ||
#' ## not run | ||
#' data(erAlpha) | ||
#' refTable <- GenomicFeatures::makeTxDbFromUCSC(genome="hg19",tablename="refGene") | ||
#' ChipSeq <- annotateChipSeqPeaks(chip.seq=erAlpha[[3]], transcriptDB=refTable, distanceRange=c(-1e+06,1e+06)) | ||
#' ## not run | ||
|
||
annotateChipSeqPeaks <- function(chip.seq, transcriptDB=NULL, distanceRange=c(-1e+06,1e+06)) { | ||
colnames(chip.seq) <- c("Chromosome","Start","End","Des","Score") | ||
if(is.null(transcriptDB) | class(transcriptDB) != "TxDb") stop("Please provide refGenome platform\n") else { | ||
genomeDB <- transcriptDB | ||
} | ||
getRefSeqs <- function(reftable, keys="TXNAME", columns=c("GENEID","TXNAME"), keytype="TXNAME") { | ||
refSeqs <- select(reftable, keys(reftable, keys), columns=columns ,keytype=keytype) | ||
refSeqs | ||
} | ||
refSeqs <- getRefSeqs(genomeDB) | ||
# importFrom GenomicFeatures transcripts | ||
# importFrom IRanges IRanges findOverlaps NCList values | ||
allchr <- unique(chip.seq[,1]) | ||
exps = matrix(NA,ncol=5) | ||
colnames(exps) <- c("RefID","GeneID","Chromosome","TSSDist","Score") | ||
for(chr in allchr) | ||
{ | ||
cat("Annotating ",chr,"\n") | ||
genome <- GenomicFeatures::transcripts(genomeDB, filter = list(tx_name=refSeqs[,"TXNAME"],tx_chrom=chr)) | ||
genome.starts <- start(genome) | ||
genome.ends <- end(genome) | ||
genome.strands <- as.vector(strand(genome)) | ||
genome.0.starts <- genome.starts | ||
genome.0.ends <- genome.ends | ||
genome.0.starts[genome.strands=="+"] <- genome.starts[genome.strands=="+"]+distanceRange[1] | ||
genome.0.starts[genome.0.starts<0] <- 0 | ||
genome.0.ends[genome.strands=="+"] <- genome.starts[genome.strands=="+"]+distanceRange[2] | ||
genome.0.starts[genome.strands=="-"] <- genome.ends[genome.strands=="-"]+distanceRange[1] | ||
genome.0.starts[genome.0.starts<0] <- 0 | ||
genome.0.ends[genome.strands=="-"] <- genome.ends[genome.strands=="-"]+distanceRange[2] | ||
gene.id <- refSeqs$GENEID[match(as.character(IRanges::values(genome)[,"tx_name"]), refSeqs$TXNAME)] | ||
|
||
tree <- IRanges::NCList(IRanges::IRanges(genome.0.starts,genome.0.ends)) | ||
temp <- chip.seq[chip.seq$Chromosome==chr,] | ||
tables <- as.matrix(IRanges::findOverlaps(query=IRanges::IRanges(temp$Start,temp$End),tree,type="within",select="all")) | ||
distances <- genome.starts[tables[,2]] | ||
distances[is.element(tables[,2],which(genome.strands=="-"))] <- genome.ends[tables[is.element(tables[,2],which(genome.strands=="-")),2]] | ||
starts1 <- temp[tables[,1],"Start"] | ||
starts2 <- temp[tables[,1],"End"] | ||
distances <- as.integer(abs((starts1+starts2)/2-distances)) | ||
tables<-cbind(tables,Dist=distances,Score=temp[tables[,1],"Score"]) | ||
rowID <- paste(IRanges::values(genome)[tables[,2],"tx_name"], IRanges::values(genome)[tables[,2],"tx_id"],tables[,3],tables[,1],sep="-") | ||
if(length(rowID)>length(unique(rowID))) cat(chr,"\n") | ||
tables <- data.frame(RefID=IRanges::values(genome)[tables[,2],"tx_name"],GeneID=gene.id[tables[,2]],Chromosome=as.character(rep(chr,dim(tables)[1])), | ||
TSSDist=tables[,3],Score=tables[,4],row.names=rowID,stringsAsFactors=FALSE) | ||
exps <- rbind(exps,tables) | ||
rm(tables) | ||
} | ||
exps <- exps[!is.na(exps[,1]),] | ||
exps | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
|
||
#' A function to calculate binding strengh by summing weighted chip-seq peaks. | ||
#' | ||
#' @description This function is intended to calculate chip-seq binding score and | ||
#' associated probability for each gene. The input chip-seq list | ||
#' contains all peaks found in up/down stream within certain range | ||
#' whose weights contribute to final score is assigned based on the | ||
#' its distance to transcription start site. | ||
#' @param ChipList A matrix contains all peaks for each genes with each row | ||
#' represent one peak and the first column contains ref-seq ID, | ||
#' second column contain entrez ID, third column contains | ||
#' chromosome ID, fourth column contains distance from the | ||
#' center of the peak to transcription start site and the fifth | ||
#' column contains the peak's strength. | ||
#' @param verbose If TRUE, prints out results of every iteration. | ||
#' @param genome Transcript definition table created by | ||
#' makeTranscriptDbFromUCSC function and used to create ChipList | ||
#' | ||
#' @return This function returns a matrix with first column contains the | ||
#' entrez ID, the second column contains the score and the third | ||
#' column contains the associated probability. | ||
#' | ||
#' @details This function is intended to calculate chip-seq binding score and | ||
#' associated probability for each gene. | ||
#' | ||
#' @author Mehdi Fazel-Najafabadi, Mario Medvedovic | ||
#' | ||
#' @references ... | ||
#' @seealso ... | ||
#' @keywords annotation | ||
# importFrom GenomicFeatures transcripts | ||
# importFrom IRanges IRanges findOverlaps | ||
#' @export | ||
#' @examples | ||
#' ## not run | ||
#' data(erAlpha) | ||
#' refTable <- GenomicFeatures::makeTxDbFromUCSC(genome="hg19",tablename="refGene") | ||
#' ChipSeq <- annotateChipSeqPeaks(chip.seq=erAlpha[[3]], transcriptDB=refTable, distanceRange=c(-1e+06,1e+06)) | ||
#' tregBindingERalpha <- chipSeqWeightedSum(ChipSeq, verbose=TRUE, genome=refTable) | ||
#' ## not run | ||
|
||
chipSeqWeightedSum <- function(ChipList,verbose=FALSE, genome=NULL) { | ||
emExpUniform <- function(data,p=0.1, lamda=0.1, steps=1000,stopCond=0.01) { | ||
maxValue <- max(data) | ||
minValue <- min(data) | ||
likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue))) | ||
likelihood2 <- 9999 | ||
if(verbose)cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\n") | ||
iter <- 1 | ||
while(abs(likelihood2-likelihood1) > stopCond & iter < steps) | ||
{ | ||
if(iter > 1) likelihood1 <- likelihood2 | ||
tao <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue)) | ||
p <- sum(tao)/length(data) | ||
lamda <- sum(tao)/sum(tao*data) | ||
likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue))) | ||
iter <- iter+1 | ||
if(verbose)cat(iter,likelihood2,p,lamda,"\n") | ||
} | ||
res <- list(p=p,lamda=lamda) | ||
res | ||
} | ||
|
||
emExpNormal <- function(data,p=0.1,lamda=0.1,mean=2.0,var=1.0,steps=1000,stopCond=0.01) { | ||
likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))) | ||
likelihood2 <- 9999 | ||
if(verbose) cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\tMean\tVariance\n") | ||
iter <- 1 | ||
while(abs(likelihood2-likelihood1) > stopCond & iter < steps) | ||
{ | ||
if(iter > 1) likelihood1 <- likelihood2 | ||
tao1 <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))) | ||
tao2 <- 1-tao1 | ||
p <- sum(tao1)/length(data) | ||
lamda <- sum(tao1)/sum(tao1*data) | ||
mean <- sum(tao2*data)/sum(tao2) | ||
var <- sum(tao2*(data-mean)^2)/sum(tao2) | ||
likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))) | ||
iter <- iter+1 | ||
if(verbose)cat(iter,likelihood2,p,lamda,mean,var,"\n") | ||
} | ||
res <- list(p=p,lamda=lamda,mean=mean,var=var) | ||
res | ||
} | ||
|
||
emExpNormal.prior <- function(data,p=0.1,lamda=0.1,mean=2.0,var=1.0,steps=20,stopCond=0.01,background=3,probs=0.5) { | ||
if(background <= 0) background <- 0.1 | ||
lamda <- -log(probs)/background | ||
mean <- background+1 | ||
likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))) | ||
likelihood2 <- 9999 | ||
if(verbose) cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\tMean\tVariance\n") | ||
iter <- 1 | ||
res <- list() | ||
while(abs(likelihood2 - likelihood1) > stopCond & iter < steps) { | ||
if(iter > 1) likelihood1 <- likelihood2 | ||
tao1 <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))) | ||
tao2 <- 1-tao1 | ||
p <- sum(tao1)/length(data) | ||
mean <- sum(tao2*data)/sum(tao2) | ||
var <- sum(tao2*(data-mean)^2)/sum(tao2) | ||
likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))) | ||
res[[iter]] <- list(p=p, lamda=lamda, mean=mean, var=var) | ||
if(verbose) cat(iter,likelihood2,p,lamda,mean,var,"\n") | ||
if (likelihood2 == Inf) return(res[[iter-1]]) | ||
iter <- iter+1 | ||
} | ||
res <- list(p=p, lamda=lamda, mean=mean, var=var) | ||
res | ||
} | ||
|
||
estimate.background <- function(data,para) { | ||
minDist <- min(as.numeric(data[,"TSSDist"])) | ||
maxDist <- max(as.numeric(data[,"TSSDist"])) | ||
|
||
windowsize <- table(data[,"RefID"]) | ||
windowsize <- max(windowsize) | ||
windowsize <- as.integer((maxDist-minDist)/windowsize) | ||
res <- 0 | ||
iter <- minDist | ||
bin.start <- seq(minDist,maxDist-windowsize,by=windowsize) | ||
bin.end <- seq(windowsize,maxDist,by=windowsize) | ||
bins <- IRanges::IRanges(start=bin.start,end=bin.end) | ||
query <- IRanges::IRanges(as.numeric(data[,"TSSDist"]),as.numeric(data[,"TSSDist"])+0.5) | ||
tables <- as.matrix(IRanges::findOverlaps(query,bins)) | ||
bin <- unique(tables[,2]) | ||
peaks <- unlist(sapply(bin,function(x) mean(as.numeric(data[tables[tables[,2]==x,1],"Score"])))) | ||
Weights <- para$p*para$lamda*exp(-para$lamda*(bin.start+windowsize/2)) | ||
Weights <- Weights/(Weights+(1-para$p)/(maxDist-minDist)) | ||
res <- Weights*peaks | ||
res <- sum(res) | ||
|
||
res | ||
} | ||
|
||
WeightedSum <- function(data,TSSDist.Bound=10000) { | ||
data <- data[as.numeric(data[,"TSSDist"])<=TSSDist.Bound,] | ||
para <- emExpUniform(as.numeric(data[,"TSSDist"]),p=0.1,lamda=0.1) | ||
if(verbose) cat("Estimate background...") | ||
uniform <- estimate.background(data,para) | ||
if(verbose) cat(log(uniform+1),"\n") | ||
Weights <- para$p*para$lamda*exp(-para$lamda*as.numeric(data[,"TSSDist"])) | ||
Weights <- Weights/(Weights+(1-para$p)/(max(as.numeric(data[,"TSSDist"]))-min(as.numeric(data[,"TSSDist"])))) | ||
data <- cbind(data,WeightScore=Weights*as.numeric(data[,"Score"])) | ||
Weighted.Sum <- matrix(0,nrow=length(unique(data[,"RefID"])),ncol=3) | ||
colnames(Weighted.Sum) <- c("RefID","EntrezID","Score") | ||
if(verbose) cat("Summarize Chip.Seq peaks...\n") | ||
temp <- split(data,data[,"RefID"]) | ||
Weighted.Sum[,1] <- names(temp) | ||
Weighted.Sum[,2] <- sapply(temp,function(x)x[1,"GeneID"]) | ||
Weighted.Sum[,3] <- sapply(temp,function(x) sum(x[,"WeightScore"])) | ||
rownames(Weighted.Sum) <- names(temp) | ||
Weighted.Sum <- data.frame(Weighted.Sum,stringsAsFactors=F) | ||
|
||
res <- list(Score=Weighted.Sum,Uniform=uniform) | ||
res | ||
} | ||
|
||
WeightedSum.Prob <- function(data,prob=0.005,prior=TRUE,log=TRUE) { | ||
data.2 <- data[[1]] | ||
if(log) { | ||
data.2[,"Score"] <- log(as.numeric(data.2[,"Score"])+1) | ||
backgrounds <- log(data[[2]]+1) | ||
} else backgrounds <- data[[2]] | ||
|
||
if(prior) {para <- emExpNormal.prior(as.numeric(data.2[,"Score"]), background=backgrounds,probs=prob) | ||
} else para <- emExpNormal(as.numeric(data.2[,"Score"])) | ||
Prob <- (1-para$p)*dnorm(as.numeric(data.2[,"Score"]), mean=para$mean,sd=sqrt(para$var)) | ||
Prob <- Prob/(Prob+para$p*para$lamda*exp(-para$lamda*as.numeric(data.2[,"Score"]))) | ||
data.2 <- cbind(data.2, Prob=Prob) | ||
rownames(data.2) <- data.2[,1] | ||
data.2 <- data.2[,-1] | ||
data.2 | ||
} | ||
getRefSeqs <- function(reftable, keys="TXNAME", columns=c("GENEID","TXNAME"), keytype="TXNAME") { | ||
refSeqs <- select(reftable, keys(reftable, keys), columns=columns ,keytype=keytype) | ||
refSeqs | ||
} | ||
|
||
if(verbose) cat("Sum Chip-seq peaks...\n") | ||
Chip.Weighted <- WeightedSum(ChipList,TSSDist.Bound=1000000) | ||
if(verbose) cat("Get binding probs...\n") | ||
Chip.Weighted.Prob <- WeightedSum.Prob(Chip.Weighted, prob=0.001, prior=TRUE) | ||
if(!is.null(genome)){ | ||
print("Assigning zeros to non-bound genes") | ||
refseqs <- getRefSeqs(genome) | ||
matchingAllReseqs <- match(refseqs[,1],rownames(Chip.Weighted.Prob)) | ||
Chip.Weighted.Prob <- Chip.Weighted.Prob[matchingAllReseqs,] | ||
Chip.Weighted.Prob$Score[is.na(matchingAllReseqs)] <- 0 | ||
Chip.Weighted.Prob$Prob[is.na(matchingAllReseqs)] <- 0 | ||
Chip.Weighted.Prob$EntrezID <- refseqs[, 2] | ||
rownames(Chip.Weighted.Prob) <- refseqs[, 1] | ||
} | ||
Chip.Weighted.Prob | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
# tR | ||
# tRegs | ||
A package for annotating chip-seq data and creating TREG signatures |
Binary file not shown.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.