diff --git a/DESCRIPTION b/DESCRIPTION index 52693ae..bbce521 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TFBlearner Title: Functionality for training TF-specific classifiers to predict TF bindings based on ATAC-seq data. -Version: 0.1.1 +Version: 0.1.2 Authors@R: person("Emanuel", "Sonder", , "emanuel.sonder@hest.ethz.ch", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4788-9508")) diff --git a/R/contextTfFeatures.R b/R/contextTfFeatures.R index 74e95b0..cf6b88d 100644 --- a/R/contextTfFeatures.R +++ b/R/contextTfFeatures.R @@ -99,6 +99,7 @@ contextTfFeatures <- function(mae, motifPath <- subset(colData(maeSub[[MOTIFEXP]]), get(MOTIFNAMECOL)==tf)$origin baseDir <- metadata(colData(maeSub[[MOTIFEXP]]))[[BASEDIRCOL]] motifRanges <- readRDS(file.path(baseDir, motifPath)) + motifRanges$motif_id <- tfName if(addLabels){ colDataChIP <- as.data.table(colData(mae[[CHIPEXP]])) @@ -114,6 +115,37 @@ contextTfFeatures <- function(mae, names(labels) <- contexts } + addArgs <- list(...) + if("profiles" %in% names(addArgs)){ + if(is.null(insertionProfile)){ + insertionProfile <- addArgs[["profiles"]]} + else{ + warning("Provided duplicated argument, profiles (via ...) and insertionProfile. + Using insertionProfile.") + } + } + + # pre-compute insertion-profile on training data + if(is.null(insertionProfile) & "Weighted_Inserts" %in% features){ + message("No insertion-profile provided, pre-computing on training data") + + trainCols <- unique(subset(sampleMap(mae), get(ISTRAINCOL))$colname) + labContexts <- getContexts(mae, tfName=tfName, which="Both") + trainContexts <- intersect(trainCols, labContexts) + + atacTrainFragFilePaths <- unlist(subset(colData(mae[[ATACEXP]]), + get(annoCol) %in% trainContexts)$origin) + baseDir <- metadata(colData(mae[[ATACEXP]]))[[BASEDIRCOL]] + atacTrainFragPaths <- file.path(baseDir, atacTrainFragFilePaths) + + ins <- getInsertionProfiles(atacTrainFragPaths, motifRanges) + insertionProfile <- list(ins$insertProfiles) + names(insertionProfile) <- tfName + } + else if(!("Weighted_Inserts" %in% features)){ + insertionProfile <- NULL + } + # loop over contexts to get the features message("Get insert features") labels <- labels[contexts] # ensure ordering @@ -123,20 +155,13 @@ contextTfFeatures <- function(mae, threads, BPPARAM, ...){ data.table::setDTthreads(threads) - calcProfile <- FALSE - if("Weighted_Inserts" %in% features & is.null(profile)){ - calcProfile <- TRUE - } - else if("Weighted_Inserts" %in% features & !is.null(profile)){ - message("Using pre-computed insertion-profiles") - } - atacFrag <- atacFrag[names(atacFrag)==context] addArgs <- list(...) addArgs <- addArgs[names(addArgs) %in% c("margin", "shift", "subSample","symmetric", "stranded")] + calcProfile <- FALSE args <- c(list(atacData=atacFrag, motifRanges=motifRanges, profiles=profile, calcProfile=calcProfile, BPPARAM=BPPARAM), addArgs) diff --git a/R/getFeatureMatrix.R b/R/getFeatureMatrix.R index 9d277cf..f0752db 100644 --- a/R/getFeatureMatrix.R +++ b/R/getFeatureMatrix.R @@ -13,34 +13,36 @@ return(mat) } -.robustNormalization <- function(mat){ - qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col") +.robustNormalization <- function(mat, naRm=TRUE){ + qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col", naRm=naRm) Matrix::Matrix(t(t(sweep(mat, 2, qs[2,], "-"))/max((qs[3,]-qs[1,]),1e-5))) } -.minMaxNormalization <- function(mat, useMax=FALSE){ +.minMaxNormalization <- function(mat, useMax=FALSE, naRm=TRUE){ if(useMax){ - qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col") + qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col", naRm=naRm) } else{ - qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col") + qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col", naRm=naRm) } Matrix::Matrix(t(t(mat)/max((qs[2,]-qs[1,]),1e-5))) } .contextNormalization <- function(mat, method=c("robust", "min-max", - "column", "none")){ + "column", "none"), + naRm=TRUE){ method <- match.arg(method, choices=c("robust", "min-max", "column", "none")) if(method=="column"){ - normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat), rep(1e-5, nrow(mat)))) + normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat, na.rm=naRm), + rep(1e-5, nrow(mat)))) } else if(method=="min-max"){ - normMat <- .minMaxNormalization(mat) + normMat <- .minMaxNormalization(mat, naRm=naRm) } else if(method=="robust"){ - normMat <- .robustNormalization(mat) + normMat <- .robustNormalization(mat, naRm=naRm) } else if(method=="none") { diff --git a/R/getInsertionProfiles.R b/R/getInsertionProfiles.R index 12d4fcc..4d377a6 100644 --- a/R/getInsertionProfiles.R +++ b/R/getInsertionProfiles.R @@ -116,6 +116,14 @@ getInsertionProfiles <- function(atacData, motifData[,motif_id:=1L] } + if(!calcProfile & !is.null(profiles) & + length(setdiff(motifData$motif_id, names(profiles)))>0){ + warning("Not all motif-ranges have an insertion-profile provided. + If wished to use a pre-computed profile provide one for all the motifs specified in the motifRanges arg. + Switching to computing profiles for all (calcProfile=TRUE).") + calcProfile <- TRUE + } + margin <- as.integer(margin) if(margin>0){ motifMarginRanges <- as.data.table(GenomicRanges::resize(motifRanges, @@ -180,6 +188,7 @@ getInsertionProfiles <- function(atacData, atacFrag <- split(atacFrag, by="chr") if(calcProfile){ + message("Computing insertion-profiles") atacProfiles <- BiocParallel::bpmapply(function(md, af, stranded, shiftLeft){ atacInserts <- .getInsertsPos(af, md, stranded, shiftLeft) atacProfile <- atacInserts[,.(pos_count_global=.N), @@ -238,7 +247,7 @@ getInsertionProfiles <- function(atacData, else{ atacProfiles <- profiles if(is.list(atacProfiles)){ - message("Using precomputed profiles") + message("Skipped insertion-profiles computation. Using provided pre-computed ones") atacProfiles <- rbindlist(atacProfiles, idcol="motif_id")} } diff --git a/R/helpers.R b/R/helpers.R index 8446ae8..73e34f6 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -11,7 +11,7 @@ return(mat) } -.marginMax <- function(mat, margin=c("row", "col")){ +.marginMax <- function(mat, margin=c("row", "col"), naRm=TRUE){ margin <- match.arg(margin, choices=c("col", "row")) if(margin=="row"){fun <- MatrixGenerics::rowMaxs} @@ -20,11 +20,11 @@ if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){ mat <- as.matrix(mat) } - marginMax <- fun(mat) + marginMax <- fun(mat, na.rm=naRm) return(marginMax) } -.marginSum <- function(mat, margin=c("row", "col")){ +.marginSum <- function(mat, margin=c("row", "col"), naRm=TRUE){ margin <- match.arg(margin, choices=c("col", "row")) if(margin=="row"){fun <- MatrixGenerics::rowSums} @@ -33,11 +33,11 @@ if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){ mat <- as.matrix(mat) } - marginMax <- fun(mat) + marginMax <- fun(mat, na.rm=naRm) return(marginMax) } -.marginQuant <- function(mat, probs, margin=c("row", "col")){ +.marginQuant <- function(mat, probs, margin=c("row", "col"), naRm=TRUE){ margin <- match.arg(margin, choices=c("col", "row")) if(margin=="row"){fun <- MatrixGenerics::rowQuantiles} @@ -45,7 +45,7 @@ if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){ mat <- as.matrix(mat) } - marginQuant <- t(fun(mat, probs=probs)) + marginQuant <- t(fun(mat, probs=probs, na.rm=naRm)) return(marginQuant) } diff --git a/R/predict.R b/R/predict.R index 5e5500e..12f9790 100644 --- a/R/predict.R +++ b/R/predict.R @@ -54,18 +54,18 @@ predictTfBinding <- function(models, npDt, chunk, contextColName){ data.table::setDTthreads(floor(numThreads/nWorker)) - allFeats <- listFeatures() - colsToRemove <- unlist(subset(allFeats, - !included_in_training)$feature_matrix_column_names) - colsToRemove <- c(colsToRemove, LABELCOLNAME, contextColName, CSCORECOLNAME) - if(name==MODELALLNAME){ - colsToRemove <- setdiff(colsToRemove, CSCORECOLNAME) + # get feature order, to ensure the same order of features + modText <- model$save_model_to_string() + featOrder <- unlist(tstrsplit(modText, split="\n", keep=8)) + if(!any(grepl("feature_names", featOrder))){ + stop("Feature names could not be retrieved from the provided model.") } + featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" ")) if(!is.null(chunk) & is.numeric(chunk)){ predDts <- lapply(npDt, function(indDt){ - pred <- predict(model, as.matrix(data[indDt$ind, !(colnames(data) %in% colsToRemove)])) - predDt <- data.table(pred=pred) + preds <- predict(model, as.matrix(data[indDt$ind,featOrder])) + predDt <- data.table(pred=preds) if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)] predDt @@ -74,7 +74,7 @@ predictTfBinding <- function(models, predMat <- Matrix::Matrix(as.matrix(predDt)) } else{ - preds <- predict(model, as.matrix(data[,!(colnames(data) %in% colsToRemove)])) + preds <- predict(model, as.matrix(data[,featOrder])) predDt <- data.table(pred=preds) if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)] predMat <- Matrix::Matrix(as.matrix(predDt)) diff --git a/R/tfFeatures.R b/R/tfFeatures.R index 501ddf0..b1fd582 100644 --- a/R/tfFeatures.R +++ b/R/tfFeatures.R @@ -61,6 +61,10 @@ chIPMat <- .binMat(chIPMat, threshold=0) } + message(paste("Applying nonnegative matrix factorization on a matrix of dimension", + nrow(chIPMat),"x", ncol(chIPMat), collapse=" ")) + message(paste("Requested rank", nPatterns)) + if(is.null(aggFun)){ chIPMat <- as(chIPMat, "CsparseMatrix") nmfRes <- suppressMessages(RcppML::nmf(chIPMat, k=nPatterns, @@ -360,7 +364,10 @@ tfFeatures <- function(mae, # assay-matrices atacMat <- .convertToMatrix(assays(mae[[ATACEXP]])[[TOTALOVERLAPSFEATNAME]]) colnames(atacMat) <- colnames(mae[[ATACEXP]]) - whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName) + whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName & + !(mae[[CHIPEXP]][["combination"]] %in% + subset(sampleMap(mae), get(ISTESTCOL) & assay==CHIPEXP)$colname)) + chIPMat <- as(assays(mae[[CHIPEXP]])[[PEAKASSAY]][,whichCol],"CsparseMatrix") colnames(chIPMat) <- paste(colData(mae[[CHIPEXP]])[whichCol,annoCol], colData(mae[[CHIPEXP]])[whichCol,TFNAMECOL], diff --git a/tests/testthat/test-contextTfFeatures.R b/tests/testthat/test-contextTfFeatures.R index fe727a9..2daa5e6 100644 --- a/tests/testthat/test-contextTfFeatures.R +++ b/tests/testthat/test-contextTfFeatures.R @@ -54,3 +54,99 @@ test_that("Error if features have not been computed for provided TF", { tfName="JUN" expect_error(contextTfFeatures(maeTest, tfName=tfName)) }) + +test_that("Using precomputed profile", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("CTCF"=profile) + + contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile) + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + message="No insertion-profile provided, pre-computing on training data") + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + message="Computing insertion-profiles") + expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") + expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.margin_tfMotif_1)), 0) + expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.within_tfMotif_1)), 0) + + expect_no_warning(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile)) + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + message="Computing insertion-profiles") +}) + +test_that("Warning when using precomputed profile - not maching the motifRanges by name", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("ATF2"=profile) + + expect_warning(contextTfFeatures(maeTest, tfName="CTCF", + whichCol="OnlyTrain", + insertionProfile=profile), + regexp="*Not all motif-ranges*") + expect_message(suppressWarnings(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile)), + regexp="Computing insertion-profiles") +}) + +test_that("Using precomputed profile - add provided via ... (profiles) arg", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("CTCF"=profile) + + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile), + message="No insertion-profile provided, pre-computing on training data") + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile), + message="Computing insertion-profiles") + expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile), + regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") +}) + +test_that("Warning when using precomputed profile - via ... (profiles) and insertionProfile arg",{ + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("CTCF"=profile) + + expect_warning(contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile, + insertionProfile=profile), + regexp="*Provided duplicated argument*") +}) + +test_that("Using message when no pre-computed profile provided", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + + expect_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=NULL), + regexp="No insertion-profile provided, pre-computing on training data") + expect_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=NULL), + regexp="Computing insertion-profiles") +}) + +test_that("Weighted_Inserts not in features", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + + expect_no_message(contextTfFeatures(maeTest, features="Inserts", + tfName="CTCF"), + message="No insertion-profile provided, pre-computing on training data") + expect_no_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", + features="Inserts"), + message="Computing insertion-profiles") + + expect_equal(sum(grepl("weightedInserts", names(assays(maeTest[["contextTfFeat"]])))), + 0) +}) diff --git a/tests/testthat/test-getFeatureMatrix.R b/tests/testthat/test-getFeatureMatrix.R index 768dd8a..232c237 100644 --- a/tests/testthat/test-getFeatureMatrix.R +++ b/tests/testthat/test-getFeatureMatrix.R @@ -208,3 +208,24 @@ test_that("Feature Matrix: correct features are normalized", { normedCols <- colnames(fm)[grepl(NORMEDAFFIX,colnames(fm))] expect_length(setdiff(normedCols, colNamesAll), 0) }) + +test_that("Feature Matrix: normalization does not fail with NA containing columns", { + + randMat <- matrix(runif(1e3), ncol=10, nrow=100) + randMat[sample(1:length(randMat),100)] <- NA + + robNormed <- .robustNormalization(randMat) + expect_equal(which(is.na(robNormed)), which(is.na(randMat))) + expect_equal(which(is.na(robNormed)), which(is.na(randMat))) + expect_equal(sum(!is.na(robNormed)), sum(!is.na(randMat))) + + minMaxNormed <- .minMaxNormalization(randMat) + expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat))) + expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat))) + expect_equal(sum(!is.na(minMaxNormed)), sum(!is.na(randMat))) + + colNormed <- .contextNormalization(randMat, method="column") + expect_equal(which(is.na(colNormed)), which(is.na(randMat))) + expect_equal(which(is.na(colNormed)), which(is.na(randMat))) + expect_equal(sum(!is.na(colNormed)), sum(!is.na(randMat))) +}) diff --git a/tests/testthat/test-insertionProfiles.R b/tests/testthat/test-insertionProfiles.R index 918be5d..4162ad4 100644 --- a/tests/testthat/test-insertionProfiles.R +++ b/tests/testthat/test-insertionProfiles.R @@ -83,7 +83,7 @@ test_that("Using precomputed profiles", { symmetric=TRUE, profiles=profiles, margin=10), - regexp="Using precomputed profiles") + regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") profiles <- rbindlist(profiles, idcol="motif_id") expect_equal(insRes$insertProfiles, profiles) }) @@ -121,3 +121,24 @@ test_that("Simplified output format", { DEVFEATNAME)) expect_identical(motifCoords, rowRanges(insRes)) }) + +test_that("Warning when using precomputed profile - not maching the motifRanges by name", { + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("ATF2"=profile) + + motifRanges <- readRDS(exampleMotif[["JUN"]]) + motifRanges$motif_id <- "JUN" + + atacData <- fread(exampleATAC$K562) + expect_warning(getInsertionProfiles(atacData, + motifRanges=motifRanges, + profiles=profile, + calcProfile=FALSE), + regexp="Not all motif-ranges have an insertion-profile provided") + expect_message(suppressWarnings(getInsertionProfiles(atacData, + motifRanges=motifRanges, + profiles=profile, + calcProfile=FALSE)), + regexp="Computing insertion-profiles") +}) diff --git a/tests/testthat/test-predictTfBinding.R b/tests/testthat/test-predictTfBinding.R index 25bddd2..895711a 100644 --- a/tests/testthat/test-predictTfBinding.R +++ b/tests/testthat/test-predictTfBinding.R @@ -45,8 +45,15 @@ test_that("Predictions: sparsification",{ expect_no_error(preds <- predictTfBinding(modTest, fmTest, sparsify=TRUE)) }) - test_that("Predictions: chunking",{ preds <- NULL expect_no_error(preds <- predictTfBinding(modTest, fmTest, chunk=10)) }) + +# TODO: rethink this test, as lightgbm will not throw an error with mixed-up feature order. +# Meaning the case here does not check the feature order just that correct number of columns is removed +test_that("Feature order matches",{ + featCols <- sample(colnames(fmTest), replace=FALSE) + expect_no_error(predictTfBinding(modTest, fmTest[,featCols])) + expect_no_warning(predictTfBinding(modTest, fmTest[,featCols])) +}) diff --git a/tests/testthat/test-tfFeatures.R b/tests/testthat/test-tfFeatures.R index c4f960b..8ee4f60 100644 --- a/tests/testthat/test-tfFeatures.R +++ b/tests/testthat/test-tfFeatures.R @@ -66,3 +66,12 @@ test_that("No cofactors provided for co-binding features", { expect_warning(tfFeatures(maeTest2, tfName="JUN", tfCofactors=NULL), regexp="provide cofactor names") }) + +test_that("None of the testing ChIP-seq datasets used for binding pattern features", { + experiments(maeTest)[[TFFEAT]] <- NULL + experiments(maeTest2)[[TFFEAT]] <- NULL + expect_message(tfFeatures(maeTest, tfName="JUN", tfCofactors="CTCF"), + regexp="dimension 100 x 1") + expect_message(tfFeatures(maeTest2, tfName="JUN", tfCofactors="CTCF"), + regexp="dimension 100 x 1") +})