Initial version

uc-bd2k · Mar 16, 2017 · 78f437d · 78f437d
1 parent 29700f1
commit 78f437d
Show file tree

Hide file tree

Showing 11 changed files with 435 additions and 1 deletion.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,19 @@
+Package: tRegs
+Type: Package
+Title: Signatures of TF regulatory activiy
+Version: 1.0
+Date: 2017-03-16
+Author: Mehdi Fazel Najafabadi, Mario Medvedovic
+Maintainer: Mehdi Fazel Najafabadi <[email protected]>
+Description: The package provides tools for constructing gene-specific TF binding scores from ChIP-seq data (TREG binding scores) and for constructing TREG signatures of TF regulatory activity by integrating TREG binding scores with suitable matched differential gene expression profiles.
+Depends:
+    R (>= 3.3.1),
+	GenomicFeatures (>= 1.24.5),
+	org.Hs.eg.db,
+	org.Mm.eg.db,
+	org.Rn.eg.db
+License: GPL (>= 2)
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1
+Packaged: 2017-03-16 10:18:15 UTC; Mehdi Fazel Najafabadi
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,4 @@
+# Generated by roxygen2: do not edit by hand
+
+export(annotateChipSeqPeaks)
+export(chipSeqWeightedSum)
diff --git a/R/annotateChipSeqPeaks.R b/R/annotateChipSeqPeaks.R
@@ -0,0 +1,86 @@
+
+#' A function to match each peak to associated genes in terms of RefID.
+#'
+#' @description This function allows you to annotate chip-sequencing reads to a reference.
+#'	It is intended to search all peaks within up and down
+#'	distances for each gene. Users could specify their own platforms
+#'	or the function will download platform from UCSC as default.
+#' @param chip.seq A matrix listing all CHIP-Seq peaks in which the chromosome
+#'		ID for all peaks are listed in the first column, peaks' start
+#'		positions are listed in the second column, peaks' end
+#'		position are listed in the third column, peaks' ID are listed
+#'		in the forth column and the fifth column lists peaks' score.
+#' @param transcriptDB This is Transcript definition table created by
+#'		makeTranscriptDbFromUCSC or makeTxDbFromUCSC functions.
+#' @param distanceRange The up- and downstream window aroung gene TSS to be used
+#'		in associating peaks to a gene. The default is c(-1e+06,1e+06).
+#'
+#' @return This function returns a matrix in which the first column contains
+#'	the RefSeqID, the second column contains Entrez ID , the third
+#'	column contains chromosome ID, the fourth column contains the
+#'	absolute distance from each peak to transcription start site and
+#'	the fifth column contains chip-seq score.
+#'
+#' @details This function is intended to search all peaks within up and down distances for each gene.
+#'
+#' @author Mehdi Fazel-Najafabadi, Mario Medvedovic
+#'
+#' @export
+#' @examples
+#' ## not run
+#' data(erAlpha)
+#' refTable <- GenomicFeatures::makeTxDbFromUCSC(genome="hg19",tablename="refGene")
+#' ChipSeq <- annotateChipSeqPeaks(chip.seq=erAlpha[[3]], transcriptDB=refTable, distanceRange=c(-1e+06,1e+06))
+#' ## not run
+
+annotateChipSeqPeaks <- function(chip.seq, transcriptDB=NULL, distanceRange=c(-1e+06,1e+06)) {
+	colnames(chip.seq) <- c("Chromosome","Start","End","Des","Score")
+	if(is.null(transcriptDB) | class(transcriptDB) != "TxDb") stop("Please provide refGenome platform\n") else {
+	    genomeDB <- transcriptDB
+	}
+	getRefSeqs <- function(reftable, keys="TXNAME", columns=c("GENEID","TXNAME"), keytype="TXNAME") {
+	refSeqs <- select(reftable, keys(reftable, keys), columns=columns ,keytype=keytype)
+	refSeqs
+	}
+	refSeqs <- getRefSeqs(genomeDB)
+# importFrom GenomicFeatures transcripts 
+# importFrom IRanges IRanges findOverlaps NCList values
+	allchr <- unique(chip.seq[,1])
+	exps = matrix(NA,ncol=5)
+ 	colnames(exps) <- c("RefID","GeneID","Chromosome","TSSDist","Score")
+	for(chr in allchr)
+	{
+	  cat("Annotating ",chr,"\n")
+	  genome <- GenomicFeatures::transcripts(genomeDB, filter = list(tx_name=refSeqs[,"TXNAME"],tx_chrom=chr))
+	  genome.starts <- start(genome)
+	  genome.ends <- end(genome)
+	  genome.strands <- as.vector(strand(genome))
+	  genome.0.starts <- genome.starts
+	  genome.0.ends <- genome.ends
+	  genome.0.starts[genome.strands=="+"] <- genome.starts[genome.strands=="+"]+distanceRange[1]
+	  genome.0.starts[genome.0.starts<0] <- 0
+	  genome.0.ends[genome.strands=="+"] <- genome.starts[genome.strands=="+"]+distanceRange[2]
+	  genome.0.starts[genome.strands=="-"] <- genome.ends[genome.strands=="-"]+distanceRange[1]
+	  genome.0.starts[genome.0.starts<0] <- 0
+	  genome.0.ends[genome.strands=="-"] <- genome.ends[genome.strands=="-"]+distanceRange[2]
+	  gene.id <- refSeqs$GENEID[match(as.character(IRanges::values(genome)[,"tx_name"]), refSeqs$TXNAME)]
+
+	  tree <- IRanges::NCList(IRanges::IRanges(genome.0.starts,genome.0.ends))
+	  temp <- chip.seq[chip.seq$Chromosome==chr,]
+	  tables <- as.matrix(IRanges::findOverlaps(query=IRanges::IRanges(temp$Start,temp$End),tree,type="within",select="all"))
+	  distances <- genome.starts[tables[,2]]
+	  distances[is.element(tables[,2],which(genome.strands=="-"))] <- genome.ends[tables[is.element(tables[,2],which(genome.strands=="-")),2]]
+	  starts1 <- temp[tables[,1],"Start"]
+	  starts2 <- temp[tables[,1],"End"]
+	  distances <- as.integer(abs((starts1+starts2)/2-distances))
+	  tables<-cbind(tables,Dist=distances,Score=temp[tables[,1],"Score"])
+	  rowID <- paste(IRanges::values(genome)[tables[,2],"tx_name"], IRanges::values(genome)[tables[,2],"tx_id"],tables[,3],tables[,1],sep="-")
+	  if(length(rowID)>length(unique(rowID))) cat(chr,"\n")
+ 	  tables <- data.frame(RefID=IRanges::values(genome)[tables[,2],"tx_name"],GeneID=gene.id[tables[,2]],Chromosome=as.character(rep(chr,dim(tables)[1])),
+	    TSSDist=tables[,3],Score=tables[,4],row.names=rowID,stringsAsFactors=FALSE)
+	  exps <- rbind(exps,tables)
+	  rm(tables)
+	}
+	exps <- exps[!is.na(exps[,1]),]
+	exps
+}
diff --git a/R/chipSeqWeightedSum.R b/R/chipSeqWeightedSum.R
@@ -0,0 +1,195 @@
+
+#' A function to calculate binding strengh by summing weighted chip-seq peaks.
+#'
+#' @description This function is intended to calculate chip-seq binding score and
+#'	associated probability for each gene. The input chip-seq list
+#'	contains all peaks found in up/down stream within certain range
+#'	whose weights contribute to final score is assigned based on the
+#'	its distance to transcription start site.
+#' @param ChipList A matrix contains all peaks for each genes with each row
+#'		represent one peak and the first column contains ref-seq ID,
+#'		second column contain entrez ID, third column contains
+#'		chromosome ID, fourth column contains distance from the
+#'		center of the peak to transcription start site and the fifth
+#'		column contains the peak's strength.
+#' @param verbose If TRUE, prints out results of every iteration.
+#' @param genome Transcript definition table created by
+#'		makeTranscriptDbFromUCSC function and used to create ChipList
+#'
+#' @return This function returns a matrix with first column contains the
+#'	entrez ID, the second column contains the score and the third
+#'	column contains the associated probability.
+#'
+#' @details This function is intended to calculate chip-seq binding score and
+#'	associated probability for each gene.
+#'
+#' @author Mehdi Fazel-Najafabadi, Mario Medvedovic
+#'
+#' @references ...
+#' @seealso ...
+#' @keywords annotation
+# importFrom GenomicFeatures transcripts
+# importFrom IRanges IRanges findOverlaps
+#' @export
+#' @examples
+#' ## not run
+#' data(erAlpha)
+#' refTable <- GenomicFeatures::makeTxDbFromUCSC(genome="hg19",tablename="refGene")
+#' ChipSeq <- annotateChipSeqPeaks(chip.seq=erAlpha[[3]], transcriptDB=refTable, distanceRange=c(-1e+06,1e+06))
+#' tregBindingERalpha <- chipSeqWeightedSum(ChipSeq, verbose=TRUE, genome=refTable)
+#' ## not run
+
+chipSeqWeightedSum <- function(ChipList,verbose=FALSE, genome=NULL) {
+	emExpUniform <- function(data,p=0.1, lamda=0.1, steps=1000,stopCond=0.01) {
+		maxValue <- max(data)
+		minValue <- min(data)
+		likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue)))
+		likelihood2 <- 9999
+		if(verbose)cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\n")
+		iter <- 1
+		while(abs(likelihood2-likelihood1) > stopCond & iter < steps)
+		{
+			if(iter > 1) likelihood1 <- likelihood2
+			tao <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue))
+			p <- sum(tao)/length(data)
+			lamda <- sum(tao)/sum(tao*data)
+			likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)/(maxValue-minValue)))
+			iter <- iter+1
+			if(verbose)cat(iter,likelihood2,p,lamda,"\n")
+		}
+		res <- list(p=p,lamda=lamda)
+		res
+	} 
+
+    emExpNormal <- function(data,p=0.1,lamda=0.1,mean=2.0,var=1.0,steps=1000,stopCond=0.01) {
+	      likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
+	      likelihood2 <- 9999
+	     if(verbose) cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\tMean\tVariance\n")
+	      iter <- 1
+	      while(abs(likelihood2-likelihood1) > stopCond & iter < steps)
+	      {
+		      if(iter > 1) likelihood1 <- likelihood2
+		      tao1 <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))
+		      tao2 <- 1-tao1
+		      p <- sum(tao1)/length(data)
+		      lamda <- sum(tao1)/sum(tao1*data)
+		      mean <- sum(tao2*data)/sum(tao2)
+		      var <- sum(tao2*(data-mean)^2)/sum(tao2)
+		      likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
+		      iter <- iter+1
+		      if(verbose)cat(iter,likelihood2,p,lamda,mean,var,"\n")
+	      }
+	      res <- list(p=p,lamda=lamda,mean=mean,var=var)
+	      res
+      } 
+
+    emExpNormal.prior <- function(data,p=0.1,lamda=0.1,mean=2.0,var=1.0,steps=20,stopCond=0.01,background=3,probs=0.5) {
+	    if(background <= 0) background <- 0.1
+	    lamda <- -log(probs)/background
+	    mean <- background+1
+	    likelihood1 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
+	    likelihood2 <- 9999
+	    if(verbose) cat("Log likelihood ",likelihood1, "\nIteration\tLikelihood\tWeights\tLamda\tMean\tVariance\n")
+	    iter <- 1
+	    res <- list()
+	    while(abs(likelihood2 - likelihood1) > stopCond & iter < steps) {
+		    if(iter > 1) likelihood1 <- likelihood2
+		    tao1 <- p*lamda*exp(-lamda*data)/(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var)))
+		    tao2 <- 1-tao1
+		    p <- sum(tao1)/length(data)
+		    mean <- sum(tao2*data)/sum(tao2)
+		    var <- sum(tao2*(data-mean)^2)/sum(tao2)
+		    likelihood2 <- sum(log(p*lamda*exp(-lamda*data)+(1-p)*dnorm(data,mean=mean,sd=sqrt(var))))
+		    res[[iter]] <- list(p=p, lamda=lamda, mean=mean, var=var)
+		    if(verbose) cat(iter,likelihood2,p,lamda,mean,var,"\n")
+		    if (likelihood2 == Inf) return(res[[iter-1]])
+		    iter <- iter+1
+	    }
+	    res <- list(p=p, lamda=lamda, mean=mean, var=var)
+	    res
+    } 
+
+    estimate.background <- function(data,para) {
+	    minDist <- min(as.numeric(data[,"TSSDist"]))
+	    maxDist <- max(as.numeric(data[,"TSSDist"]))
+
+ 	    windowsize <- table(data[,"RefID"])
+	    windowsize <- max(windowsize)
+	    windowsize <- as.integer((maxDist-minDist)/windowsize)
+	    res <- 0
+	    iter <- minDist
+	    bin.start <- seq(minDist,maxDist-windowsize,by=windowsize)
+	    bin.end <- seq(windowsize,maxDist,by=windowsize)
+	    bins <- IRanges::IRanges(start=bin.start,end=bin.end)
+	    query <- IRanges::IRanges(as.numeric(data[,"TSSDist"]),as.numeric(data[,"TSSDist"])+0.5)
+	    tables <- as.matrix(IRanges::findOverlaps(query,bins))
+	    bin <- unique(tables[,2])
+	    peaks <- unlist(sapply(bin,function(x) mean(as.numeric(data[tables[tables[,2]==x,1],"Score"]))))
+	    Weights <- para$p*para$lamda*exp(-para$lamda*(bin.start+windowsize/2))
+	    Weights <- Weights/(Weights+(1-para$p)/(maxDist-minDist))
+	    res <- Weights*peaks
+	    res <- sum(res)
+
+	    res
+    }
+
+    WeightedSum <- function(data,TSSDist.Bound=10000) {
+	    data <- data[as.numeric(data[,"TSSDist"])<=TSSDist.Bound,]
+	    para <- emExpUniform(as.numeric(data[,"TSSDist"]),p=0.1,lamda=0.1)
+	    if(verbose) cat("Estimate background...")
+	    uniform <- estimate.background(data,para)
+ 	    if(verbose) cat(log(uniform+1),"\n")
+	    Weights <- para$p*para$lamda*exp(-para$lamda*as.numeric(data[,"TSSDist"]))
+	    Weights <- Weights/(Weights+(1-para$p)/(max(as.numeric(data[,"TSSDist"]))-min(as.numeric(data[,"TSSDist"]))))
+	    data <- cbind(data,WeightScore=Weights*as.numeric(data[,"Score"]))
+  	    Weighted.Sum <- matrix(0,nrow=length(unique(data[,"RefID"])),ncol=3)
+ 	    colnames(Weighted.Sum) <- c("RefID","EntrezID","Score")
+	    if(verbose) cat("Summarize Chip.Seq peaks...\n")
+	    temp <- split(data,data[,"RefID"])
+	    Weighted.Sum[,1] <- names(temp)
+ 	    Weighted.Sum[,2] <- sapply(temp,function(x)x[1,"GeneID"])
+ 	    Weighted.Sum[,3] <- sapply(temp,function(x) sum(x[,"WeightScore"]))
+	    rownames(Weighted.Sum) <- names(temp)
+	    Weighted.Sum <- data.frame(Weighted.Sum,stringsAsFactors=F)
+
+	    res <- list(Score=Weighted.Sum,Uniform=uniform)
+	    res
+    }
+
+    WeightedSum.Prob <- function(data,prob=0.005,prior=TRUE,log=TRUE) {
+	data.2 <- data[[1]]
+	if(log) {
+		data.2[,"Score"] <- log(as.numeric(data.2[,"Score"])+1)
+		backgrounds <- log(data[[2]]+1)
+	} else backgrounds <- data[[2]]
+
+	if(prior) {para <- emExpNormal.prior(as.numeric(data.2[,"Score"]), background=backgrounds,probs=prob)
+	} else para <- emExpNormal(as.numeric(data.2[,"Score"]))
+	Prob <- (1-para$p)*dnorm(as.numeric(data.2[,"Score"]), mean=para$mean,sd=sqrt(para$var))
+	Prob <- Prob/(Prob+para$p*para$lamda*exp(-para$lamda*as.numeric(data.2[,"Score"])))
+ 	data.2 <- cbind(data.2, Prob=Prob)
+	rownames(data.2) <- data.2[,1]
+	data.2 <- data.2[,-1]
+	data.2
+      }
+	getRefSeqs <- function(reftable, keys="TXNAME", columns=c("GENEID","TXNAME"), keytype="TXNAME") {
+	refSeqs <- select(reftable, keys(reftable, keys), columns=columns ,keytype=keytype)
+	refSeqs
+	}
+
+    if(verbose) cat("Sum Chip-seq peaks...\n")
+    Chip.Weighted <- WeightedSum(ChipList,TSSDist.Bound=1000000)
+    if(verbose) cat("Get binding probs...\n")
+    Chip.Weighted.Prob <- WeightedSum.Prob(Chip.Weighted, prob=0.001, prior=TRUE)
+    if(!is.null(genome)){
+      print("Assigning zeros to non-bound genes")
+ 	refseqs <- getRefSeqs(genome)
+      matchingAllReseqs <- match(refseqs[,1],rownames(Chip.Weighted.Prob))
+      Chip.Weighted.Prob <- Chip.Weighted.Prob[matchingAllReseqs,]
+      Chip.Weighted.Prob$Score[is.na(matchingAllReseqs)] <- 0
+      Chip.Weighted.Prob$Prob[is.na(matchingAllReseqs)] <- 0
+      Chip.Weighted.Prob$EntrezID <- refseqs[, 2]
+     rownames(Chip.Weighted.Prob) <- refseqs[, 1]
+    }
+    Chip.Weighted.Prob
+}
diff --git a/README.md b/README.md
@@ -1 +1,2 @@
-# tR
+# tRegs
+ A package for annotating chip-seq data and creating TREG signatures
diff --git a/data/erAlpha.rda b/data/erAlpha.rda
diff --git a/man/annotateChipSeqPeaks.Rd b/man/annotateChipSeqPeaks.Rd