ETHZ-INS · emsonder · Oct 28, 2025 · Oct 24, 2025 · Oct 24, 2025 · Oct 24, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TFBlearner
 Title: Functionality for training TF-specific classifiers to predict TF bindings based on ATAC-seq data.
-Version: 0.1.1
+Version: 0.1.2
 Authors@R: 
     person("Emanuel", "Sonder", , "emanuel.sonder@hest.ethz.ch", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-4788-9508"))

diff --git a/R/contextTfFeatures.R b/R/contextTfFeatures.R
@@ -99,6 +99,7 @@ contextTfFeatures <- function(mae,
   motifPath <- subset(colData(maeSub[[MOTIFEXP]]), get(MOTIFNAMECOL)==tf)$origin
   baseDir <- metadata(colData(maeSub[[MOTIFEXP]]))[[BASEDIRCOL]]
   motifRanges <- readRDS(file.path(baseDir, motifPath))
+  motifRanges$motif_id <- tfName
 
   if(addLabels){
     colDataChIP <- as.data.table(colData(mae[[CHIPEXP]]))
@@ -114,6 +115,37 @@ contextTfFeatures <- function(mae,
     names(labels) <- contexts
   }
 
+  addArgs <- list(...)
+  if("profiles" %in% names(addArgs)){
+    if(is.null(insertionProfile)){
+      insertionProfile <- addArgs[["profiles"]]}
+    else{
+      warning("Provided duplicated argument, profiles (via ...) and insertionProfile.
+               Using insertionProfile.")
+    }
+  }
+
+  # pre-compute insertion-profile on training data
+  if(is.null(insertionProfile) & "Weighted_Inserts" %in% features){
+    message("No insertion-profile provided, pre-computing on training data")
+
+    trainCols <- unique(subset(sampleMap(mae), get(ISTRAINCOL))$colname)
+    labContexts <- getContexts(mae, tfName=tfName, which="Both")
+    trainContexts <- intersect(trainCols, labContexts)
+
+    atacTrainFragFilePaths <- unlist(subset(colData(mae[[ATACEXP]]),
+                                            get(annoCol) %in% trainContexts)$origin)
+    baseDir <- metadata(colData(mae[[ATACEXP]]))[[BASEDIRCOL]]
+    atacTrainFragPaths <- file.path(baseDir, atacTrainFragFilePaths)
+
+    ins <- getInsertionProfiles(atacTrainFragPaths, motifRanges)
+    insertionProfile <- list(ins$insertProfiles)
+    names(insertionProfile) <- tfName
+  }
+  else if(!("Weighted_Inserts" %in% features)){
+    insertionProfile <- NULL
+  }
+
   # loop over contexts to get the features
   message("Get insert features")
   labels <- labels[contexts] # ensure ordering
@@ -123,20 +155,13 @@ contextTfFeatures <- function(mae,
                            threads, BPPARAM, ...){
     data.table::setDTthreads(threads)
 
-    calcProfile <- FALSE
-    if("Weighted_Inserts" %in% features & is.null(profile)){
-      calcProfile <- TRUE
-    }
-    else if("Weighted_Inserts" %in% features & !is.null(profile)){
-      message("Using pre-computed insertion-profiles")
-    }
-
     atacFrag <- atacFrag[names(atacFrag)==context]
 
     addArgs <- list(...)
     addArgs <- addArgs[names(addArgs) %in% c("margin", "shift",
                                              "subSample","symmetric",
                                              "stranded")]
+    calcProfile <- FALSE
     args <- c(list(atacData=atacFrag, motifRanges=motifRanges,
                    profiles=profile, calcProfile=calcProfile, BPPARAM=BPPARAM),
               addArgs)

diff --git a/R/getFeatureMatrix.R b/R/getFeatureMatrix.R
@@ -13,34 +13,36 @@
   return(mat)
 }
 
-.robustNormalization <- function(mat){
-   qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col")
+.robustNormalization <- function(mat, naRm=TRUE){
+   qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col", naRm=naRm)
    Matrix::Matrix(t(t(sweep(mat, 2, qs[2,], "-"))/max((qs[3,]-qs[1,]),1e-5)))
 }
 
-.minMaxNormalization <- function(mat, useMax=FALSE){
+.minMaxNormalization <- function(mat, useMax=FALSE, naRm=TRUE){
   if(useMax){
-    qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col")
+    qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col", naRm=naRm)
   }
   else{
-    qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col")
+    qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col", naRm=naRm)
   }
   Matrix::Matrix(t(t(mat)/max((qs[2,]-qs[1,]),1e-5)))
 }
 
 .contextNormalization <- function(mat, method=c("robust", "min-max",
-                                                "column", "none")){
+                                                "column", "none"),
+                                  naRm=TRUE){
 
   method <- match.arg(method, choices=c("robust", "min-max",
                                         "column", "none"))
   if(method=="column"){
-    normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat), rep(1e-5, nrow(mat))))
+    normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat, na.rm=naRm),
+                                             rep(1e-5, nrow(mat))))
   }
   else if(method=="min-max"){
-    normMat <- .minMaxNormalization(mat)
+    normMat <- .minMaxNormalization(mat, naRm=naRm)
   }
   else if(method=="robust"){
-    normMat <- .robustNormalization(mat)
+    normMat <- .robustNormalization(mat, naRm=naRm)
   }
   else if(method=="none")
   {

diff --git a/R/getInsertionProfiles.R b/R/getInsertionProfiles.R
@@ -116,6 +116,14 @@ getInsertionProfiles <- function(atacData,
     motifData[,motif_id:=1L]
   }
 
+  if(!calcProfile & !is.null(profiles) &
+     length(setdiff(motifData$motif_id, names(profiles)))>0){
+      warning("Not all motif-ranges have an insertion-profile provided.
+      If wished to use a pre-computed profile provide one for all the motifs specified in the motifRanges arg.
+      Switching to computing profiles for all (calcProfile=TRUE).")
+    calcProfile <- TRUE
+  }
+
   margin <- as.integer(margin)
   if(margin>0){
     motifMarginRanges <- as.data.table(GenomicRanges::resize(motifRanges,
@@ -180,6 +188,7 @@ getInsertionProfiles <- function(atacData,
   atacFrag <- split(atacFrag, by="chr")
 
   if(calcProfile){
+   message("Computing insertion-profiles")
    atacProfiles <- BiocParallel::bpmapply(function(md, af, stranded, shiftLeft){
       atacInserts <- .getInsertsPos(af, md, stranded, shiftLeft)
       atacProfile <- atacInserts[,.(pos_count_global=.N),
@@ -238,7 +247,7 @@ getInsertionProfiles <- function(atacData,
   else{
     atacProfiles <- profiles
     if(is.list(atacProfiles)){
-      message("Using precomputed profiles")
+      message("Skipped insertion-profiles computation. Using provided pre-computed ones")
       atacProfiles <- rbindlist(atacProfiles, idcol="motif_id")}
   }
 

diff --git a/R/helpers.R b/R/helpers.R
@@ -11,7 +11,7 @@
   return(mat)
 }
 
-.marginMax <- function(mat, margin=c("row", "col")){
+.marginMax <- function(mat, margin=c("row", "col"), naRm=TRUE){
 
   margin <- match.arg(margin, choices=c("col", "row"))
   if(margin=="row"){fun <- MatrixGenerics::rowMaxs}
@@ -20,11 +20,11 @@
   if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){
     mat <- as.matrix(mat)
   }
-  marginMax <- fun(mat)
+  marginMax <- fun(mat, na.rm=naRm)
   return(marginMax)
 }
 
-.marginSum <- function(mat, margin=c("row", "col")){
+.marginSum <- function(mat, margin=c("row", "col"), naRm=TRUE){
 
   margin <- match.arg(margin, choices=c("col", "row"))
   if(margin=="row"){fun <- MatrixGenerics::rowSums}
@@ -33,19 +33,19 @@
   if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){
     mat <- as.matrix(mat)
   }
-  marginMax <- fun(mat)
+  marginMax <- fun(mat, na.rm=naRm)
   return(marginMax)
 }
 
-.marginQuant <- function(mat, probs, margin=c("row", "col")){
+.marginQuant <- function(mat, probs, margin=c("row", "col"), naRm=TRUE){
 
   margin <- match.arg(margin, choices=c("col", "row"))
   if(margin=="row"){fun <- MatrixGenerics::rowQuantiles}
   else{fun <- MatrixGenerics::colQuantiles}
   if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){
     mat <- as.matrix(mat)
   }
-  marginQuant <- t(fun(mat, probs=probs))
+  marginQuant <- t(fun(mat, probs=probs, na.rm=naRm))
 
   return(marginQuant)
 }

diff --git a/R/predict.R b/R/predict.R
@@ -54,18 +54,18 @@ predictTfBinding <- function(models,
                              npDt, chunk, contextColName){
     data.table::setDTthreads(floor(numThreads/nWorker))
 
-    allFeats <- listFeatures()
-    colsToRemove <- unlist(subset(allFeats,
-                                  !included_in_training)$feature_matrix_column_names)
-    colsToRemove <- c(colsToRemove, LABELCOLNAME, contextColName, CSCORECOLNAME)
-    if(name==MODELALLNAME){
-      colsToRemove <- setdiff(colsToRemove, CSCORECOLNAME)
+    # get feature order, to ensure the same order of features
+    modText <- model$save_model_to_string()
+    featOrder <- unlist(tstrsplit(modText, split="\n", keep=8))
+    if(!any(grepl("feature_names", featOrder))){
+      stop("Feature names could not be retrieved from the provided model.")
     }
+    featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" "))
-    modText <- model$save_model_to_string()
-    featOrder <- unlist(tstrsplit(modText, split="\n", keep=8))
-    featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" "))
+    featOrder <- model$feature_name()
-    modText <- model$save_model_to_string()
-    featOrder <- unlist(tstrsplit(modText, split="\n", keep=8))
-    featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" "))
+    featOrder <- model$feature_name()
 
     if(!is.null(chunk) & is.numeric(chunk)){
     predDts <- lapply(npDt, function(indDt){
-      pred  <- predict(model, as.matrix(data[indDt$ind, !(colnames(data) %in% colsToRemove)]))
-      predDt <- data.table(pred=pred)
+      preds  <- predict(model, as.matrix(data[indDt$ind,featOrder]))
+      predDt <- data.table(pred=preds)
 
       if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)]
       predDt
@@ -74,7 +74,7 @@ predictTfBinding <- function(models,
       predMat <- Matrix::Matrix(as.matrix(predDt))
     }
     else{
-      preds  <- predict(model, as.matrix(data[,!(colnames(data) %in% colsToRemove)]))
+      preds  <- predict(model, as.matrix(data[,featOrder]))
       predDt <- data.table(pred=preds)
       if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)]
       predMat <- Matrix::Matrix(as.matrix(predDt))

diff --git a/R/tfFeatures.R b/R/tfFeatures.R
@@ -61,6 +61,10 @@
       chIPMat <- .binMat(chIPMat, threshold=0)
     }
 
+    message(paste("Applying nonnegative matrix factorization on a matrix of dimension",
+            nrow(chIPMat),"x", ncol(chIPMat), collapse=" "))
+    message(paste("Requested rank", nPatterns))
+
     if(is.null(aggFun)){
       chIPMat <- as(chIPMat, "CsparseMatrix")
       nmfRes <- suppressMessages(RcppML::nmf(chIPMat, k=nPatterns,
@@ -360,7 +364,10 @@ tfFeatures <- function(mae,
   # assay-matrices
   atacMat <- .convertToMatrix(assays(mae[[ATACEXP]])[[TOTALOVERLAPSFEATNAME]])
   colnames(atacMat) <- colnames(mae[[ATACEXP]])
-  whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName)
+  whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName &
+                    !(mae[[CHIPEXP]][["combination"]] %in%
+                    subset(sampleMap(mae), get(ISTESTCOL) & assay==CHIPEXP)$colname))
+
   chIPMat <- as(assays(mae[[CHIPEXP]])[[PEAKASSAY]][,whichCol],"CsparseMatrix")
   colnames(chIPMat) <- paste(colData(mae[[CHIPEXP]])[whichCol,annoCol],
                              colData(mae[[CHIPEXP]])[whichCol,TFNAMECOL],

diff --git a/tests/testthat/test-contextTfFeatures.R b/tests/testthat/test-contextTfFeatures.R
@@ -54,3 +54,99 @@ test_that("Error if features have not been computed for provided TF", {
   tfName="JUN"
   expect_error(contextTfFeatures(maeTest, tfName=tfName))
 })
+
+test_that("Using precomputed profile", {
+  experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
+  profile <- data.table(rel_pos=-200:200)
+  profile[,w:=1/nrow(profile)]
+  profile <- list("CTCF"=profile)
+
+  contextTfFeatures(maeTest, tfName="CTCF",
+                    insertionProfile=profile)
+  expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
+                                   insertionProfile=profile),
+                    message="No insertion-profile provided, pre-computing on training data")
+  expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
+                                      insertionProfile=profile),
+                    message="Computing insertion-profiles")
+  expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF",
+                                   insertionProfile=profile),
+                 regexp="Skipped insertion-profiles computation. Using provided pre-computed ones")
+  expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.margin_tfMotif_1)), 0)
+  expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.within_tfMotif_1)), 0)
+
+  expect_no_warning(contextTfFeatures(maeTest, tfName="CTCF",
+                                      insertionProfile=profile))
+  expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
+                                      insertionProfile=profile),
+                    message="Computing insertion-profiles")
+})
+
+test_that("Warning when using precomputed profile - not maching the motifRanges by name", {
+  experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
+  profile <- data.table(rel_pos=-200:200)
+  profile[,w:=1/nrow(profile)]
+  profile <- list("ATF2"=profile)
+
+  expect_warning(contextTfFeatures(maeTest, tfName="CTCF",
+                                   whichCol="OnlyTrain",
+                                   insertionProfile=profile),
+                 regexp="*Not all motif-ranges*")
+  expect_message(suppressWarnings(contextTfFeatures(maeTest, tfName="CTCF",
+                                  insertionProfile=profile)),
+                 regexp="Computing insertion-profiles")
+})
+
+test_that("Using precomputed profile - add provided via ... (profiles) arg", {
+  experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
+  profile <- data.table(rel_pos=-200:200)
+  profile[,w:=1/nrow(profile)]
+  profile <- list("CTCF"=profile)
+
+  expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
+                                   profiles=profile),
+                 message="No insertion-profile provided, pre-computing on training data")
+  expect_no_message(contextTfFeatures(maeTest, tfName="CTCF",
+                                      profiles=profile),
+                    message="Computing insertion-profiles")
+  expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF",
+                                              profiles=profile),
+                 regexp="Skipped insertion-profiles computation. Using provided pre-computed ones")
+})
+
+test_that("Warning when using precomputed profile -  via ... (profiles) and insertionProfile arg",{
+  experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
+  profile <- data.table(rel_pos=-200:200)
+  profile[,w:=1/nrow(profile)]
+  profile <- list("CTCF"=profile)
+
+  expect_warning(contextTfFeatures(maeTest, tfName="CTCF",
+                                   profiles=profile,
+                                   insertionProfile=profile),
+                 regexp="*Provided duplicated argument*")
+})
+
+test_that("Using message when no pre-computed profile provided", {
+  experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
+
+  expect_message(contextTfFeatures(maeTest, tfName="CTCF",
+                                   insertionProfile=NULL),
+                 regexp="No insertion-profile provided, pre-computing on training data")
+  expect_message(contextTfFeatures(maeTest, tfName="CTCF",
+                                   insertionProfile=NULL),
+                 regexp="Computing insertion-profiles")
+})
+
+test_that("Weighted_Inserts not in features", {
+  experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL
+
+  expect_no_message(contextTfFeatures(maeTest, features="Inserts",
+                                   tfName="CTCF"),
+                    message="No insertion-profile provided, pre-computing on training data")
+  expect_no_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF",
+                                                 features="Inserts"),
+                    message="Computing insertion-profiles")
+
+  expect_equal(sum(grepl("weightedInserts", names(assays(maeTest[["contextTfFeat"]])))),
+               0)
+})
diff --git a/tests/testthat/test-getFeatureMatrix.R b/tests/testthat/test-getFeatureMatrix.R
@@ -208,3 +208,24 @@ test_that("Feature Matrix: correct features are normalized", {
   normedCols <- colnames(fm)[grepl(NORMEDAFFIX,colnames(fm))]
   expect_length(setdiff(normedCols, colNamesAll), 0)
 })
+
+test_that("Feature Matrix: normalization does not fail with NA containing columns", {
+
+  randMat <- matrix(runif(1e3), ncol=10, nrow=100)
+  randMat[sample(1:length(randMat),100)] <- NA
+
+  robNormed <- .robustNormalization(randMat)
+  expect_equal(which(is.na(robNormed)), which(is.na(randMat)))
+  expect_equal(which(is.na(robNormed)), which(is.na(randMat)))
+  expect_equal(sum(!is.na(robNormed)), sum(!is.na(randMat)))
+
+  minMaxNormed <- .minMaxNormalization(randMat)
+  expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat)))
+  expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat)))
+  expect_equal(sum(!is.na(minMaxNormed)), sum(!is.na(randMat)))
+
+  colNormed <- .contextNormalization(randMat, method="column")
+  expect_equal(which(is.na(colNormed)), which(is.na(randMat)))
+  expect_equal(which(is.na(colNormed)), which(is.na(randMat)))
+  expect_equal(sum(!is.na(colNormed)), sum(!is.na(randMat)))
+})