Skip to content

Commit

Permalink
Initial version
Browse files Browse the repository at this point in the history
  • Loading branch information
Mehdi Fazel committed Mar 16, 2017
1 parent 29700f1 commit 78f437d
Show file tree
Hide file tree
Showing 11 changed files with 435 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
19 changes: 19 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Package: tRegs
Type: Package
Title: Signatures of TF regulatory activiy
Version: 1.0
Date: 2017-03-16
Author: Mehdi Fazel Najafabadi, Mario Medvedovic
Maintainer: Mehdi Fazel Najafabadi <[email protected]>
Description: The package provides tools for constructing gene-specific TF binding scores from ChIP-seq data (TREG binding scores) and for constructing TREG signatures of TF regulatory activity by integrating TREG binding scores with suitable matched differential gene expression profiles.
Depends:
R (>= 3.3.1),
GenomicFeatures (>= 1.24.5),
org.Hs.eg.db,
org.Mm.eg.db,
org.Rn.eg.db
License: GPL (>= 2)
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.0.1
Packaged: 2017-03-16 10:18:15 UTC; Mehdi Fazel Najafabadi
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Generated by roxygen2: do not edit by hand

export(annotateChipSeqPeaks)
export(chipSeqWeightedSum)
86 changes: 86 additions & 0 deletions R/annotateChipSeqPeaks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@

#' A function to match each peak to associated genes in terms of RefID.
#'
#' @description This function allows you to annotate chip-sequencing reads to a reference.
#' It is intended to search all peaks within up and down
#' distances for each gene. Users could specify their own platforms
#' or the function will download platform from UCSC as default.
#' @param chip.seq A matrix listing all CHIP-Seq peaks in which the chromosome
#' ID for all peaks are listed in the first column, peaks' start
#' positions are listed in the second column, peaks' end
#' position are listed in the third column, peaks' ID are listed
#' in the forth column and the fifth column lists peaks' score.
#' @param transcriptDB This is Transcript definition table created by
#' makeTranscriptDbFromUCSC or makeTxDbFromUCSC functions.
#' @param distanceRange The up- and downstream window aroung gene TSS to be used
#' in associating peaks to a gene. The default is c(-1e+06,1e+06).
#'
#' @return This function returns a matrix in which the first column contains
#' the RefSeqID, the second column contains Entrez ID , the third
#' column contains chromosome ID, the fourth column contains the
#' absolute distance from each peak to transcription start site and
#' the fifth column contains chip-seq score.
#'
#' @details This function is intended to search all peaks within up and down distances for each gene.
#'
#' @author Mehdi Fazel-Najafabadi, Mario Medvedovic
#'
#' @export
#' @examples
#' ## not run
#' data(erAlpha)
#' refTable <- GenomicFeatures::makeTxDbFromUCSC(genome="hg19",tablename="refGene")
#' ChipSeq <- annotateChipSeqPeaks(chip.seq=erAlpha[[3]], transcriptDB=refTable, distanceRange=c(-1e+06,1e+06))
#' ## not run

annotateChipSeqPeaks <- function(chip.seq, transcriptDB=NULL, distanceRange=c(-1e+06,1e+06)) {
colnames(chip.seq) <- c("Chromosome","Start","End","Des","Score")
if(is.null(transcriptDB) | class(transcriptDB) != "TxDb") stop("Please provide refGenome platform\n") else {
genomeDB <- transcriptDB
}
getRefSeqs <- function(reftable, keys="TXNAME", columns=c("GENEID","TXNAME"), keytype="TXNAME") {
refSeqs <- select(reftable, keys(reftable, keys), columns=columns ,keytype=keytype)
refSeqs
}
refSeqs <- getRefSeqs(genomeDB)
# importFrom GenomicFeatures transcripts
# importFrom IRanges IRanges findOverlaps NCList values
allchr <- unique(chip.seq[,1])
exps = matrix(NA,ncol=5)
colnames(exps) <- c("RefID","GeneID","Chromosome","TSSDist","Score")
for(chr in allchr)
{
cat("Annotating ",chr,"\n")
genome <- GenomicFeatures::transcripts(genomeDB, filter = list(tx_name=refSeqs[,"TXNAME"],tx_chrom=chr))
genome.starts <- start(genome)
genome.ends <- end(genome)
genome.strands <- as.vector(strand(genome))
genome.0.starts <- genome.starts
genome.0.ends <- genome.ends
genome.0.starts[genome.strands=="+"] <- genome.starts[genome.strands=="+"]+distanceRange[1]
genome.0.starts[genome.0.starts<0] <- 0
genome.0.ends[genome.strands=="+"] <- genome.starts[genome.strands=="+"]+distanceRange[2]
genome.0.starts[genome.strands=="-"] <- genome.ends[genome.strands=="-"]+distanceRange[1]
genome.0.starts[genome.0.starts<0] <- 0
genome.0.ends[genome.strands=="-"] <- genome.ends[genome.strands=="-"]+distanceRange[2]
gene.id <- refSeqs$GENEID[match(as.character(IRanges::values(genome)[,"tx_name"]), refSeqs$TXNAME)]

tree <- IRanges::NCList(IRanges::IRanges(genome.0.starts,genome.0.ends))
temp <- chip.seq[chip.seq$Chromosome==chr,]
tables <- as.matrix(IRanges::findOverlaps(query=IRanges::IRanges(temp$Start,temp$End),tree,type="within",select="all"))
distances <- genome.starts[tables[,2]]
distances[is.element(tables[,2],which(genome.strands=="-"))] <- genome.ends[tables[is.element(tables[,2],which(genome.strands=="-")),2]]
starts1 <- temp[tables[,1],"Start"]
starts2 <- temp[tables[,1],"End"]
distances <- as.integer(abs((starts1+starts2)/2-distances))
tables<-cbind(tables,Dist=distances,Score=temp[tables[,1],"Score"])
rowID <- paste(IRanges::values(genome)[tables[,2],"tx_name"], IRanges::values(genome)[tables[,2],"tx_id"],tables[,3],tables[,1],sep="-")
if(length(rowID)>length(unique(rowID))) cat(chr,"\n")
tables <- data.frame(RefID=IRanges::values(genome)[tables[,2],"tx_name"],GeneID=gene.id[tables[,2]],Chromosome=as.character(rep(chr,dim(tables)[1])),
TSSDist=tables[,3],Score=tables[,4],row.names=rowID,stringsAsFactors=FALSE)
exps <- rbind(exps,tables)
rm(tables)
}
exps <- exps[!is.na(exps[,1]),]
exps
}
195 changes: 195 additions & 0 deletions R/chipSeqWeightedSum.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@

#' A function to calculate binding strengh by summing weighted chip-seq peaks.
#'
#' @description This function is intended to calculate chip-seq binding score and
#' associated probability for each gene. The input chip-seq list
#' contains all peaks found in up/down stream within certain range
#' whose weights contribute to final score is assigned based on the
#' its distance to transcription start site.
#' @param ChipList A matrix contains all peaks for each genes with each row
#' represent one peak and the first column contains ref-seq ID,
#' second column contain entrez ID, third column contains
#' chromosome ID, fourth column contains distance from the
#' center of the peak to transcription start site and the fifth
#' column contains the peak's strength.
#' @param verbose If TRUE, prints out results of every iteration.
#' @param genome Transcript definition table created by
#' makeTranscriptDbFromUCSC function and used to create ChipList
#'
#' @return This function returns a matrix with first column contains the
#' entrez ID, the second column contains the score and the third
#' column contains the associated probability.
#'
#' @details This function is intended to calculate chip-seq binding score and
#' associated probability for each gene.
#'
#' @author Mehdi Fazel-Najafabadi, Mario Medvedovic
#'
#' @references ...
#' @seealso ...
#' @keywords annotation
# importFrom GenomicFeatures transcripts
# importFrom IRanges IRanges findOverlaps
#' @export
#' @examples
#' ## not run
#' data(erAlpha)
#' refTable <- GenomicFeatures::makeTxDbFromUCSC(genome="hg19",tablename="refGene")
#' ChipSeq <- annotateChipSeqPeaks(chip.seq=erAlpha[[3]], transcriptDB=refTable, distanceRange=c(-1e+06,1e+06))
#' tregBindingERalpha <- chipSeqWeightedSum(ChipSeq, verbose=TRUE, genome=refTable)
#' ## not run

chipSeqWeightedSum <- function(ChipList,verbose=FALSE, genome=NULL) {
emExpUniform <- function(data,p=0.1, lamda=0.1, steps=1000,stopCond=0.01) {
maxValue <- max(data)
minValue <- min(data)
likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue)))
likelihood2 <- 9999
if(verbose)cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\n")
iter <- 1
while(abs(likelihood2-likelihood1) > stopCond & iter < steps)
{
if(iter > 1) likelihood1 <- likelihood2
tao <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue))
p <- sum(tao)/length(data)
lamda <- sum(tao)/sum(tao*data)
likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue)))
iter <- iter+1
if(verbose)cat(iter,likelihood2,p,lamda,"\n")
}
res <- list(p=p,lamda=lamda)
res
}

emExpNormal <- function(data,p=0.1,lamda=0.1,mean=2.0,var=1.0,steps=1000,stopCond=0.01) {
likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
likelihood2 <- 9999
if(verbose) cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\tMean\tVariance\n")
iter <- 1
while(abs(likelihood2-likelihood1) > stopCond & iter < steps)
{
if(iter > 1) likelihood1 <- likelihood2
tao1 <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))
tao2 <- 1-tao1
p <- sum(tao1)/length(data)
lamda <- sum(tao1)/sum(tao1*data)
mean <- sum(tao2*data)/sum(tao2)
var <- sum(tao2*(data-mean)^2)/sum(tao2)
likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
iter <- iter+1
if(verbose)cat(iter,likelihood2,p,lamda,mean,var,"\n")
}
res <- list(p=p,lamda=lamda,mean=mean,var=var)
res
}

emExpNormal.prior <- function(data,p=0.1,lamda=0.1,mean=2.0,var=1.0,steps=20,stopCond=0.01,background=3,probs=0.5) {
if(background <= 0) background <- 0.1
lamda <- -log(probs)/background
mean <- background+1
likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
likelihood2 <- 9999
if(verbose) cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\tMean\tVariance\n")
iter <- 1
res <- list()
while(abs(likelihood2 - likelihood1) > stopCond & iter < steps) {
if(iter > 1) likelihood1 <- likelihood2
tao1 <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))
tao2 <- 1-tao1
p <- sum(tao1)/length(data)
mean <- sum(tao2*data)/sum(tao2)
var <- sum(tao2*(data-mean)^2)/sum(tao2)
likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
res[[iter]] <- list(p=p, lamda=lamda, mean=mean, var=var)
if(verbose) cat(iter,likelihood2,p,lamda,mean,var,"\n")
if (likelihood2 == Inf) return(res[[iter-1]])
iter <- iter+1
}
res <- list(p=p, lamda=lamda, mean=mean, var=var)
res
}

estimate.background <- function(data,para) {
minDist <- min(as.numeric(data[,"TSSDist"]))
maxDist <- max(as.numeric(data[,"TSSDist"]))

windowsize <- table(data[,"RefID"])
windowsize <- max(windowsize)
windowsize <- as.integer((maxDist-minDist)/windowsize)
res <- 0
iter <- minDist
bin.start <- seq(minDist,maxDist-windowsize,by=windowsize)
bin.end <- seq(windowsize,maxDist,by=windowsize)
bins <- IRanges::IRanges(start=bin.start,end=bin.end)
query <- IRanges::IRanges(as.numeric(data[,"TSSDist"]),as.numeric(data[,"TSSDist"])+0.5)
tables <- as.matrix(IRanges::findOverlaps(query,bins))
bin <- unique(tables[,2])
peaks <- unlist(sapply(bin,function(x) mean(as.numeric(data[tables[tables[,2]==x,1],"Score"]))))
Weights <- para$p*para$lamda*exp(-para$lamda*(bin.start+windowsize/2))
Weights <- Weights/(Weights+(1-para$p)/(maxDist-minDist))
res <- Weights*peaks
res <- sum(res)

res
}

WeightedSum <- function(data,TSSDist.Bound=10000) {
data <- data[as.numeric(data[,"TSSDist"])<=TSSDist.Bound,]
para <- emExpUniform(as.numeric(data[,"TSSDist"]),p=0.1,lamda=0.1)
if(verbose) cat("Estimate background...")
uniform <- estimate.background(data,para)
if(verbose) cat(log(uniform+1),"\n")
Weights <- para$p*para$lamda*exp(-para$lamda*as.numeric(data[,"TSSDist"]))
Weights <- Weights/(Weights+(1-para$p)/(max(as.numeric(data[,"TSSDist"]))-min(as.numeric(data[,"TSSDist"]))))
data <- cbind(data,WeightScore=Weights*as.numeric(data[,"Score"]))
Weighted.Sum <- matrix(0,nrow=length(unique(data[,"RefID"])),ncol=3)
colnames(Weighted.Sum) <- c("RefID","EntrezID","Score")
if(verbose) cat("Summarize Chip.Seq peaks...\n")
temp <- split(data,data[,"RefID"])
Weighted.Sum[,1] <- names(temp)
Weighted.Sum[,2] <- sapply(temp,function(x)x[1,"GeneID"])
Weighted.Sum[,3] <- sapply(temp,function(x) sum(x[,"WeightScore"]))
rownames(Weighted.Sum) <- names(temp)
Weighted.Sum <- data.frame(Weighted.Sum,stringsAsFactors=F)

res <- list(Score=Weighted.Sum,Uniform=uniform)
res
}

WeightedSum.Prob <- function(data,prob=0.005,prior=TRUE,log=TRUE) {
data.2 <- data[[1]]
if(log) {
data.2[,"Score"] <- log(as.numeric(data.2[,"Score"])+1)
backgrounds <- log(data[[2]]+1)
} else backgrounds <- data[[2]]

if(prior) {para <- emExpNormal.prior(as.numeric(data.2[,"Score"]), background=backgrounds,probs=prob)
} else para <- emExpNormal(as.numeric(data.2[,"Score"]))
Prob <- (1-para$p)*dnorm(as.numeric(data.2[,"Score"]), mean=para$mean,sd=sqrt(para$var))
Prob <- Prob/(Prob+para$p*para$lamda*exp(-para$lamda*as.numeric(data.2[,"Score"])))
data.2 <- cbind(data.2, Prob=Prob)
rownames(data.2) <- data.2[,1]
data.2 <- data.2[,-1]
data.2
}
getRefSeqs <- function(reftable, keys="TXNAME", columns=c("GENEID","TXNAME"), keytype="TXNAME") {
refSeqs <- select(reftable, keys(reftable, keys), columns=columns ,keytype=keytype)
refSeqs
}

if(verbose) cat("Sum Chip-seq peaks...\n")
Chip.Weighted <- WeightedSum(ChipList,TSSDist.Bound=1000000)
if(verbose) cat("Get binding probs...\n")
Chip.Weighted.Prob <- WeightedSum.Prob(Chip.Weighted, prob=0.001, prior=TRUE)
if(!is.null(genome)){
print("Assigning zeros to non-bound genes")
refseqs <- getRefSeqs(genome)
matchingAllReseqs <- match(refseqs[,1],rownames(Chip.Weighted.Prob))
Chip.Weighted.Prob <- Chip.Weighted.Prob[matchingAllReseqs,]
Chip.Weighted.Prob$Score[is.na(matchingAllReseqs)] <- 0
Chip.Weighted.Prob$Prob[is.na(matchingAllReseqs)] <- 0
Chip.Weighted.Prob$EntrezID <- refseqs[, 2]
rownames(Chip.Weighted.Prob) <- refseqs[, 1]
}
Chip.Weighted.Prob
}
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
# tR
# tRegs
A package for annotating chip-seq data and creating TREG signatures
Binary file added data/erAlpha.rda
Binary file not shown.
48 changes: 48 additions & 0 deletions man/annotateChipSeqPeaks.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 78f437d

Please sign in to comment.