Merge pull request #108 from hpages/master

Repair and optimize PsiNorm()
YosefLab · Feb 19, 2025 · 120e114 · 120e114
2 parents 92a6cd8 + 39212eb
commit 120e114
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 18 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: scone
-Version: 1.31.0
+Version: 1.31.1
 Title: Single Cell Overview of Normalized Expression data
 Description: SCONE is an R package for comparing and ranking the performance of
 	different normalization schemes for single-cell RNA-seq and other 
@@ -44,7 +44,8 @@ Imports:
   MatrixGenerics,
   SingleCellExperiment,
   DelayedMatrixStats,
-  sparseMatrixStats
+  sparseMatrixStats,
+  SparseArray (>= 1.7.6)
 Suggests:
   BiocStyle,
   DT,
@@ -59,6 +60,7 @@ Suggests:
   scRNAseq,
   shiny,
   testthat,
+  DelayedArray,
   visNetwork,
   doParallel,
   batchtools,

diff --git a/NAMESPACE b/NAMESPACE
@@ -45,13 +45,15 @@ import(SingleCellExperiment)
 import(SummarizedExperiment)
 import(gplots)
 import(methods)
+importClassesFrom(SparseArray,SparseMatrix)
 importClassesFrom(SummarizedExperiment,SummarizedExperiment)
 importFrom(DelayedMatrixStats,colMins)
 importFrom(DelayedMatrixStats,colSums2)
 importFrom(MatrixGenerics,colMins)
 importFrom(MatrixGenerics,colSums2)
 importFrom(RColorBrewer,brewer.pal)
 importFrom(RUVSeq,RUVg)
+importFrom(SparseArray,nzwhich)
 importFrom(aroma.light,normalizeQuantileRank.matrix)
 importFrom(boot,inv.logit)
 importFrom(boot,logit)

diff --git a/R/SCONE_DEFAULTS.R b/R/SCONE_DEFAULTS.R
@@ -154,7 +154,8 @@ SCRAN_FN = function(ei){
 #' eo <- PSINORM_FN(ei)
 #'
 PSINORM_FN = function(ei){
-  inv_sf <- pareto.MLE(ei+1)
-  eo = t(t(ei) / mean(inv_sf) * inv_sf)
+  tei <- t(ei)
+  sf <- compute_transposed_PsiNorm_sf(tei)
+  eo <- t(mat_v_div(tei, mean(1 / sf) * sf))
   return(eo)
 }
diff --git a/R/pareto_norm.R b/R/pareto_norm.R
@@ -54,22 +54,79 @@ setMethod(
 #' @export
 #' @importFrom DelayedMatrixStats colSums2 colMins
 #' @importFrom sparseMatrixStats colSums2 colMins
+#' @importClassesFrom SparseArray SparseMatrix
+#' @importFrom SparseArray nzwhich
 setMethod(
   f = "PsiNorm",
   signature = signature(x = "ANY"),
   definition = function(x){
-    inv_sf <- pareto.MLE(x+1)
-    t(t(x) * inv_sf)
+    ## Uses only two transpostions (instead of four in original implementation).
+    ## Preserves sparsity.
+    tx <- t(x)
+    sf <- compute_transposed_PsiNorm_sf(tx)
+    t(mat_v_div(tx, sf))
 })
 
-# can we simplify the function and assume min is always 1?
-pareto.MLE <- function(sce) {
-  n <- nrow(sce)
-  m <- MatrixGenerics::colMins(sce)
-  a <- n/MatrixGenerics::colSums2(t(t(log(sce)) - log(m)))
-  return(a)
+## Preserves sparsity.
+compute_transposed_PsiNorm_sf <- function(tx) {
+  ## Temporary workaround until operations from the Math group (e.g. log1p)
+  ## are supported on a SparseMatrix object of type "integer".
+  ## See https://github.com/Bioconductor/SparseArray/blob/devel/TODO
+  if (type(tx) != "double")
+    type(tx) <- "double"
+  x2 <- log1p(tx)
+  m2 <- MatrixGenerics::rowMins(x2)
+  x2 <- mat_v_sub(x2, m2)
+  MatrixGenerics::rowSums2(x2) / ncol(tx)
 }
 
+## Preserves sparsity.
 computePsiNormSF <- function(x) {
-  1/pareto.MLE(x+1)
+  compute_transposed_PsiNorm_sf(t(x))
 }
+
+.build_linear_index <- function(nr, nc, i) {
+  stopifnot(is.integer(nr), length(nr) == 1L,
+            is.integer(nc), length(nc) == 1L, is.integer(i))
+  rep.int(nr * (seq_len(nc) - 1L), rep.int(length(i), nc)) + i
+}
+
+## Implements optimized substraction between a matrix-like object 'mat' and
+## an ordinary vector 'v' with mostly zeros that accepts a SparseMatrix
+## object (in which case it also returns a SparseMatrix object). Note that
+## the SparseArray package does not support this operation out-of-the-box at
+## the moment.
+mat_v_sub <- function(mat, v) {
+  mat_dim <- dim(mat)
+  stopifnot(length(mat_dim) == 2L, is.vector(v), length(v) == mat_dim[[1L]])
+  nzidx <- nzwhich(v)  # indices of nonzero values in 'v'
+  if (length(nzidx) == 0L)
+    return(mat)  # no-op
+  if (!is(mat, "SparseMatrix"))
+    return(mat - v)
+  ## Works well if 'nzidx' is short (i.e. 'v' contains mostly zeros).
+  Lidx <- .build_linear_index(mat_dim[[1L]], mat_dim[[2L]], nzidx)
+  mat[Lidx] <- mat[Lidx] - v[nzidx]
+  mat
+}
+
+## Implements division between a matrix-like object 'mat' and an ordinary
+## vector 'v' with mostly nonzero/non-NA values that accepts a SparseMatrix
+## object (in which case it also returns a SparseMatrix object). Note that
+## the SparseArray package only supports this operation when 'v' contains
+## no zeros or NAs at the moment.
+mat_v_div <- function(mat, v) {
+  mat_dim <- dim(mat)
+  stopifnot(length(mat_dim) == 2L, is.vector(v), length(v) == mat_dim[[1L]])
+  idx <- which(is.na(v) | v == 0)
+  if (length(idx) == 0L || !is(mat, "SparseMatrix"))
+    return(mat / v)
+  ## Works well if 'idx' is short (i.e. if 'v' contains very few zeros/NAs).
+  v2 <- v
+  v2[idx] <- 1L
+  mat <- mat / v2
+  Lidx <- .build_linear_index(mat_dim[[1L]], mat_dim[[2L]], idx)
+  mat[Lidx] <- mat[Lidx] / v[idx]
+  mat
+}
+
diff --git a/tests/testthat/test-pareto_norm.R b/tests/testthat/test-pareto_norm.R
@@ -1,14 +1,33 @@
 test_that("PSiNorm works with all input classes", {
-  m <- matrix(c(1,0,2,0,2,9,3,0), ncol=2)
-
+  m <- matrix(0L, nrow=4, ncol=5)
+  m[c(1,3) , 1] <- 1:2
+  m[1:3, 3] <- c(2L, 9L, 3L)
+  m[ , 4] <- 2:1
+  m[2, 5] <- 1L
+
   out1 <- PsiNorm(m)
-
+
+  svt <- as(m, "SVT_SparseMatrix")
+  out_svt <- PsiNorm(svt)
+  expect_true(is(out_svt, "SVT_SparseMatrix"))
+  expect_equal(out1, as.matrix(out_svt))
+
+  dm1 <- DelayedArray::DelayedArray(m)
+  out_dm1 <- PsiNorm(dm1)
+  expect_true(is(out_dm1, "DelayedMatrix"))
+  expect_equal(out1, as.matrix(out_dm1))
+
+  dm2 <- DelayedArray::DelayedArray(svt)
+  out_dm2 <- PsiNorm(dm2)
+  expect_true(is(out_dm2, "DelayedMatrix"))
+  expect_equal(out1, as.matrix(out_dm2))
+
   se <- SummarizedExperiment(m)
   sce <- SingleCellExperiment(m)
-  
+
   out_se <- PsiNorm(se)
   expect_equal(out1, assay(out_se, "PsiNorm"))
-  
+
   out_sce <- PsiNorm(sce, whichAssay = 1)
   out2 <- t(t(m)/sizeFactors(out_sce))
   expect_equal(out1, out2)