Skip to content
Merged

Dev #12

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: TFBlearner
Title: Functionality for training TF-specific classifiers to predict TF bindings based on ATAC-seq data.
Version: 0.1.1
Version: 0.1.2
Authors@R:
person("Emanuel", "Sonder", , "emanuel.sonder@hest.ethz.ch", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-4788-9508"))
Expand Down
41 changes: 33 additions & 8 deletions R/contextTfFeatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ contextTfFeatures <- function(mae,
motifPath <- subset(colData(maeSub[[MOTIFEXP]]), get(MOTIFNAMECOL)==tf)$origin
baseDir <- metadata(colData(maeSub[[MOTIFEXP]]))[[BASEDIRCOL]]
motifRanges <- readRDS(file.path(baseDir, motifPath))
motifRanges$motif_id <- tfName

if(addLabels){
colDataChIP <- as.data.table(colData(mae[[CHIPEXP]]))
Expand All @@ -114,6 +115,37 @@ contextTfFeatures <- function(mae,
names(labels) <- contexts
}

addArgs <- list(...)
if("profiles" %in% names(addArgs)){
if(is.null(insertionProfile)){
insertionProfile <- addArgs[["profiles"]]}
else{
warning("Provided duplicated argument, profiles (via ...) and insertionProfile.
Using insertionProfile.")
}
}

# pre-compute insertion-profile on training data
if(is.null(insertionProfile) & "Weighted_Inserts" %in% features){
message("No insertion-profile provided, pre-computing on training data")

trainCols <- unique(subset(sampleMap(mae), get(ISTRAINCOL))$colname)
labContexts <- getContexts(mae, tfName=tfName, which="Both")
trainContexts <- intersect(trainCols, labContexts)

atacTrainFragFilePaths <- unlist(subset(colData(mae[[ATACEXP]]),
get(annoCol) %in% trainContexts)$origin)
baseDir <- metadata(colData(mae[[ATACEXP]]))[[BASEDIRCOL]]
atacTrainFragPaths <- file.path(baseDir, atacTrainFragFilePaths)

ins <- getInsertionProfiles(atacTrainFragPaths, motifRanges)
insertionProfile <- list(ins$insertProfiles)
names(insertionProfile) <- tfName
}
else if(!("Weighted_Inserts" %in% features)){
insertionProfile <- NULL
}

# loop over contexts to get the features
message("Get insert features")
labels <- labels[contexts] # ensure ordering
Expand All @@ -123,20 +155,13 @@ contextTfFeatures <- function(mae,
threads, BPPARAM, ...){
data.table::setDTthreads(threads)

calcProfile <- FALSE
if("Weighted_Inserts" %in% features & is.null(profile)){
calcProfile <- TRUE
}
else if("Weighted_Inserts" %in% features & !is.null(profile)){
message("Using pre-computed insertion-profiles")
}

atacFrag <- atacFrag[names(atacFrag)==context]

addArgs <- list(...)
addArgs <- addArgs[names(addArgs) %in% c("margin", "shift",
"subSample","symmetric",
"stranded")]
calcProfile <- FALSE
args <- c(list(atacData=atacFrag, motifRanges=motifRanges,
profiles=profile, calcProfile=calcProfile, BPPARAM=BPPARAM),
addArgs)
Expand Down
20 changes: 11 additions & 9 deletions R/getFeatureMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,34 +13,36 @@
return(mat)
}

.robustNormalization <- function(mat){
qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col")
.robustNormalization <- function(mat, naRm=TRUE){
qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col", naRm=naRm)
Matrix::Matrix(t(t(sweep(mat, 2, qs[2,], "-"))/max((qs[3,]-qs[1,]),1e-5)))
}

.minMaxNormalization <- function(mat, useMax=FALSE){
.minMaxNormalization <- function(mat, useMax=FALSE, naRm=TRUE){
if(useMax){
qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col")
qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col", naRm=naRm)
}
else{
qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col")
qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col", naRm=naRm)
}
Matrix::Matrix(t(t(mat)/max((qs[2,]-qs[1,]),1e-5)))
}

.contextNormalization <- function(mat, method=c("robust", "min-max",
"column", "none")){
"column", "none"),
naRm=TRUE){

method <- match.arg(method, choices=c("robust", "min-max",
"column", "none"))
if(method=="column"){
normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat), rep(1e-5, nrow(mat))))
normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat, na.rm=naRm),
rep(1e-5, nrow(mat))))
}
else if(method=="min-max"){
normMat <- .minMaxNormalization(mat)
normMat <- .minMaxNormalization(mat, naRm=naRm)
}
else if(method=="robust"){
normMat <- .robustNormalization(mat)
normMat <- .robustNormalization(mat, naRm=naRm)
}
else if(method=="none")
{
Expand Down
11 changes: 10 additions & 1 deletion R/getInsertionProfiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ getInsertionProfiles <- function(atacData,
motifData[,motif_id:=1L]
}

if(!calcProfile & !is.null(profiles) &
length(setdiff(motifData$motif_id, names(profiles)))>0){
warning("Not all motif-ranges have an insertion-profile provided.
If wished to use a pre-computed profile provide one for all the motifs specified in the motifRanges arg.
Switching to computing profiles for all (calcProfile=TRUE).")
calcProfile <- TRUE
}

margin <- as.integer(margin)
if(margin>0){
motifMarginRanges <- as.data.table(GenomicRanges::resize(motifRanges,
Expand Down Expand Up @@ -180,6 +188,7 @@ getInsertionProfiles <- function(atacData,
atacFrag <- split(atacFrag, by="chr")

if(calcProfile){
message("Computing insertion-profiles")
atacProfiles <- BiocParallel::bpmapply(function(md, af, stranded, shiftLeft){
atacInserts <- .getInsertsPos(af, md, stranded, shiftLeft)
atacProfile <- atacInserts[,.(pos_count_global=.N),
Expand Down Expand Up @@ -238,7 +247,7 @@ getInsertionProfiles <- function(atacData,
else{
atacProfiles <- profiles
if(is.list(atacProfiles)){
message("Using precomputed profiles")
message("Skipped insertion-profiles computation. Using provided pre-computed ones")
atacProfiles <- rbindlist(atacProfiles, idcol="motif_id")}
}

Expand Down
12 changes: 6 additions & 6 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
return(mat)
}

.marginMax <- function(mat, margin=c("row", "col")){
.marginMax <- function(mat, margin=c("row", "col"), naRm=TRUE){

margin <- match.arg(margin, choices=c("col", "row"))
if(margin=="row"){fun <- MatrixGenerics::rowMaxs}
Expand All @@ -20,11 +20,11 @@
if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){
mat <- as.matrix(mat)
}
marginMax <- fun(mat)
marginMax <- fun(mat, na.rm=naRm)
return(marginMax)
}

.marginSum <- function(mat, margin=c("row", "col")){
.marginSum <- function(mat, margin=c("row", "col"), naRm=TRUE){

margin <- match.arg(margin, choices=c("col", "row"))
if(margin=="row"){fun <- MatrixGenerics::rowSums}
Expand All @@ -33,19 +33,19 @@
if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){
mat <- as.matrix(mat)
}
marginMax <- fun(mat)
marginMax <- fun(mat, na.rm=naRm)
return(marginMax)
}

.marginQuant <- function(mat, probs, margin=c("row", "col")){
.marginQuant <- function(mat, probs, margin=c("row", "col"), naRm=TRUE){

margin <- match.arg(margin, choices=c("col", "row"))
if(margin=="row"){fun <- MatrixGenerics::rowQuantiles}
else{fun <- MatrixGenerics::colQuantiles}
if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){
mat <- as.matrix(mat)
}
marginQuant <- t(fun(mat, probs=probs))
marginQuant <- t(fun(mat, probs=probs, na.rm=naRm))

return(marginQuant)
}
Expand Down
18 changes: 9 additions & 9 deletions R/predict.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,18 @@ predictTfBinding <- function(models,
npDt, chunk, contextColName){
data.table::setDTthreads(floor(numThreads/nWorker))

allFeats <- listFeatures()
colsToRemove <- unlist(subset(allFeats,
!included_in_training)$feature_matrix_column_names)
colsToRemove <- c(colsToRemove, LABELCOLNAME, contextColName, CSCORECOLNAME)
if(name==MODELALLNAME){
colsToRemove <- setdiff(colsToRemove, CSCORECOLNAME)
# get feature order, to ensure the same order of features
modText <- model$save_model_to_string()
featOrder <- unlist(tstrsplit(modText, split="\n", keep=8))
if(!any(grepl("feature_names", featOrder))){
stop("Feature names could not be retrieved from the provided model.")
}
featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" "))
Comment on lines 58 to 63
Copy link

Copilot AI Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The feature order extraction relies on hard-coded line number (keep=8) from the model string representation, which is fragile and may break if the model's string format changes. Consider adding error handling to verify that feature names were successfully extracted, or use a more robust method to access feature names if available in the lightgbm API.

Suggested change
modText <- model$save_model_to_string()
featOrder <- unlist(tstrsplit(modText, split="\n", keep=8))
featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" "))
featOrder <- model$feature_name()

Copilot uses AI. Check for mistakes.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

model$feature_name() does not seem to exist in the R-API of lightgbm

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one could still check that the "feature_names=" is present in that row

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed, see commit 2ef3949](2ef3949)


if(!is.null(chunk) & is.numeric(chunk)){
predDts <- lapply(npDt, function(indDt){
pred <- predict(model, as.matrix(data[indDt$ind, !(colnames(data) %in% colsToRemove)]))
predDt <- data.table(pred=pred)
preds <- predict(model, as.matrix(data[indDt$ind,featOrder]))
predDt <- data.table(pred=preds)

if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)]
predDt
Expand All @@ -74,7 +74,7 @@ predictTfBinding <- function(models,
predMat <- Matrix::Matrix(as.matrix(predDt))
}
else{
preds <- predict(model, as.matrix(data[,!(colnames(data) %in% colsToRemove)]))
preds <- predict(model, as.matrix(data[,featOrder]))
predDt <- data.table(pred=preds)
if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)]
predMat <- Matrix::Matrix(as.matrix(predDt))
Expand Down
9 changes: 8 additions & 1 deletion R/tfFeatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@
chIPMat <- .binMat(chIPMat, threshold=0)
}

message(paste("Applying nonnegative matrix factorization on a matrix of dimension",
nrow(chIPMat),"x", ncol(chIPMat), collapse=" "))
message(paste("Requested rank", nPatterns))

if(is.null(aggFun)){
chIPMat <- as(chIPMat, "CsparseMatrix")
nmfRes <- suppressMessages(RcppML::nmf(chIPMat, k=nPatterns,
Expand Down Expand Up @@ -360,7 +364,10 @@ tfFeatures <- function(mae,
# assay-matrices
atacMat <- .convertToMatrix(assays(mae[[ATACEXP]])[[TOTALOVERLAPSFEATNAME]])
colnames(atacMat) <- colnames(mae[[ATACEXP]])
whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName)
whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName &
!(mae[[CHIPEXP]][["combination"]] %in%
subset(sampleMap(mae), get(ISTESTCOL) & assay==CHIPEXP)$colname))

chIPMat <- as(assays(mae[[CHIPEXP]])[[PEAKASSAY]][,whichCol],"CsparseMatrix")
colnames(chIPMat) <- paste(colData(mae[[CHIPEXP]])[whichCol,annoCol],
colData(mae[[CHIPEXP]])[whichCol,TFNAMECOL],
Expand Down
96 changes: 96 additions & 0 deletions tests/testthat/test-contextTfFeatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,99 @@ test_that("Error if features have not been computed for provided TF", {
tfName="JUN"
expect_error(contextTfFeatures(maeTest, tfName=tfName))
})

test_that("Using precomputed profile", {
experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
profile <- data.table(rel_pos=-200:200)
profile[,w:=1/nrow(profile)]
profile <- list("CTCF"=profile)

contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=profile)
expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=profile),
message="No insertion-profile provided, pre-computing on training data")
expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=profile),
message="Computing insertion-profiles")
expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=profile),
regexp="Skipped insertion-profiles computation. Using provided pre-computed ones")
expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.margin_tfMotif_1)), 0)
expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.within_tfMotif_1)), 0)

expect_no_warning(contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=profile))
expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=profile),
message="Computing insertion-profiles")
})

test_that("Warning when using precomputed profile - not maching the motifRanges by name", {
experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
profile <- data.table(rel_pos=-200:200)
profile[,w:=1/nrow(profile)]
profile <- list("ATF2"=profile)

expect_warning(contextTfFeatures(maeTest, tfName="CTCF",
whichCol="OnlyTrain",
insertionProfile=profile),
regexp="*Not all motif-ranges*")
expect_message(suppressWarnings(contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=profile)),
regexp="Computing insertion-profiles")
})

test_that("Using precomputed profile - add provided via ... (profiles) arg", {
experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
profile <- data.table(rel_pos=-200:200)
profile[,w:=1/nrow(profile)]
profile <- list("CTCF"=profile)

expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
profiles=profile),
message="No insertion-profile provided, pre-computing on training data")
expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
profiles=profile),
message="Computing insertion-profiles")
expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF",
profiles=profile),
regexp="Skipped insertion-profiles computation. Using provided pre-computed ones")
})

test_that("Warning when using precomputed profile - via ... (profiles) and insertionProfile arg",{
experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
profile <- data.table(rel_pos=-200:200)
profile[,w:=1/nrow(profile)]
profile <- list("CTCF"=profile)

expect_warning(contextTfFeatures(maeTest, tfName="CTCF",
profiles=profile,
insertionProfile=profile),
regexp="*Provided duplicated argument*")
})

test_that("Using message when no pre-computed profile provided", {
experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL

expect_message(contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=NULL),
regexp="No insertion-profile provided, pre-computing on training data")
expect_message(contextTfFeatures(maeTest, tfName="CTCF",
insertionProfile=NULL),
regexp="Computing insertion-profiles")
})

test_that("Weighted_Inserts not in features", {
experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL

expect_no_message(contextTfFeatures(maeTest, features="Inserts",
tfName="CTCF"),
message="No insertion-profile provided, pre-computing on training data")
expect_no_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF",
features="Inserts"),
message="Computing insertion-profiles")

expect_equal(sum(grepl("weightedInserts", names(assays(maeTest[["contextTfFeat"]])))),
0)
})
21 changes: 21 additions & 0 deletions tests/testthat/test-getFeatureMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,24 @@ test_that("Feature Matrix: correct features are normalized", {
normedCols <- colnames(fm)[grepl(NORMEDAFFIX,colnames(fm))]
expect_length(setdiff(normedCols, colNamesAll), 0)
})

test_that("Feature Matrix: normalization does not fail with NA containing columns", {

randMat <- matrix(runif(1e3), ncol=10, nrow=100)
randMat[sample(1:length(randMat),100)] <- NA

robNormed <- .robustNormalization(randMat)
expect_equal(which(is.na(robNormed)), which(is.na(randMat)))
expect_equal(which(is.na(robNormed)), which(is.na(randMat)))
expect_equal(sum(!is.na(robNormed)), sum(!is.na(randMat)))

minMaxNormed <- .minMaxNormalization(randMat)
expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat)))
expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat)))
expect_equal(sum(!is.na(minMaxNormed)), sum(!is.na(randMat)))

colNormed <- .contextNormalization(randMat, method="column")
expect_equal(which(is.na(colNormed)), which(is.na(randMat)))
expect_equal(which(is.na(colNormed)), which(is.na(randMat)))
expect_equal(sum(!is.na(colNormed)), sum(!is.na(randMat)))
})
Loading