Merge pull request #141 from ncborcherding/dev

ncborcherding · web-flow · commit 5cb4eeadbc1e · 2025-01-24T10:30:27.000-06:00
v2.2.3
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: escape
 Title: Easy single cell analysis platform for enrichment
-Version: 2.2.2
+Version: 2.2.3
 Authors@R: c(
     person(given = "Nick", family = "Borcherding", role = c("aut", "cre"), email = "ncborch@gmail.com"),
     person(given = "Jared", family = "Andrews", role = c("aut"), email = "jared.andrews07@gmail.com"),
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# escape VERSION 2.2.3
+
+## UNDERLYING CHANGES
+
+* fixed handling of *groups* parameter and data splitting in ```escape.matrix()```
+* improved efficiency of internal ```.split_data.matrix()```
+
 # escape VERSION 2.2.2
 
 ## UNDERLYING CHANGES
diff --git a/R/runEscape.R b/R/runEscape.R
@@ -70,39 +70,42 @@ escape.matrix <- function(input.data,
                 length(splits), 'times.'))
     split.data <- .split_data.matrix(matrix=cnts, chunk.size=groups)
     
+    all_gene_sets <- names(egc) # Collect all gene set names
     
     for (i in seq_along(splits)) {
-          last <- min(ncol(cnts), i+groups-1)
-          if(method == "GSVA") {
-              parameters <- .gsva.setup(split.data[[i]], egc)
-          } else if (method == "ssGSEA") {
-              parameters <- .ssGSEA.setup(split.data[[i]], egc)
-          }
-          if(method %in% c("ssGSEA", "GSVA")) {
-              a <- suppressWarnings(gsva(param = parameters, 
-                        verbose = FALSE,
-                        BPPARAM = BPPARAM,
-                        ...))
-          } else if(method == "UCell") {
-              a <- t(suppressWarnings(
-                ScoreSignatures_UCell(matrix = split.data[[i]], 
-                                      features=egc,
-                                      name = NULL,
-                                      BPPARAM = BPPARAM,
-                                      ...)))
-          } else if (method == "AUCell") {
-            rankings <- AUCell_buildRankings(split.data[[i]],
-                                             plotStats = FALSE,
-                                             verbose = FALSE)
-            a <- assay(AUCell_calcAUC(geneSets = egc,
-                                     rankings,
-                                     normAUC = TRUE,
-                                     aucMaxRank = ceiling(0.2 * nrow(split.data[[i]])),
-                                     verbose = FALSE,
-                                     ...))
-             
-          }
-          scores[[i]] <- a
+      if (method == "GSVA") {
+        parameters <- .gsva.setup(split.data[[i]], egc)
+      } else if (method == "ssGSEA") {
+        parameters <- .ssGSEA.setup(split.data[[i]], egc)
+      }
+      if (method %in% c("ssGSEA", "GSVA")) {
+        a <- suppressWarnings(gsva(param = parameters, 
+                                   verbose = FALSE,
+                                   BPPARAM = BPPARAM,
+                                   ...))
+      } else if (method == "UCell") {
+        a <- t(suppressWarnings(
+          ScoreSignatures_UCell(matrix = split.data[[i]], 
+                                features = egc,
+                                name = NULL,
+                                BPPARAM = BPPARAM,
+                                ...)))
+      } else if (method == "AUCell") {
+        rankings <- AUCell_buildRankings(split.data[[i]],
+                                         plotStats = FALSE,
+                                         verbose = FALSE)
+        a <- assay(AUCell_calcAUC(geneSets = egc,
+                                  rankings,
+                                  normAUC = TRUE,
+                                  aucMaxRank = ceiling(0.2 * nrow(split.data[[i]])),
+                                  verbose = FALSE,
+                                  ...))
+      }
+      
+      # Ensure consistent row names (all_gene_sets) across splits
+      a <- as.data.frame(a)
+      a <- a[match(all_gene_sets, rownames(a), nomatch = NA), , drop = FALSE]
+      scores[[i]] <- a
     }
     scores <- do.call(cbind, scores)
     output <- t(as.matrix(scores))
diff --git a/R/utils.R b/R/utils.R
@@ -133,22 +133,16 @@ is_seurat_or_se_object <- function(obj) {
 }
 
 #split data matrix into cell chunks
-#stole this from https://github.com/carmonalab/UCell
-.split_data.matrix <- function(matrix, chunk.size=1000) {
+#modified this from https://github.com/carmonalab/UCell
+.split_data.matrix <- function(matrix, chunk.size = 1000) {
   ncols <- dim(matrix)[2]
-  nchunks <- (ncols-1) %/% chunk.size + 1
+  nchunks <- ceiling(ncols / chunk.size)  # Total number of chunks
   
-  split.data <- list()
-  min <- 1
+  split.data <- vector("list", nchunks)  # Preallocate list for efficiency
   for (i in seq_len(nchunks)) {
-    if (i == nchunks-1) {  #make last two chunks of equal size
-      left <- ncols-(i-1)*chunk.size
-      max <- min+round(left/2)-1
-    } else {
-      max <- min(i*chunk.size, ncols)
-    }
-    split.data[[i]] <- matrix[,min:max]
-    min <- max+1    #for next chunk
+    min <- (i - 1) * chunk.size + 1
+    max <- min(i * chunk.size, ncols)
+    split.data[[i]] <- matrix[, min:max, drop = FALSE]  # Ensure consistent structure
   }
   return(split.data)
 }
diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd
@@ -162,6 +162,7 @@ ggplot(data = as.data.frame(enrichment.scores),
   theme_classic() + 
   theme(axis.title = element_blank())
 ```
+
 Multi-core support for all methods is available through [BiocParallel](https://bioconductor.org/packages/release/bioc/html/BiocParallel.html). To add more cores, pass the argument **BPPARAM** to ```escape.matrix()```. Here we will use ```SnowParam()``` for its support across platforms and explicitly request 2 workers (or cores).
 
 ```{r tidy=FALSE, eval=FALSE}
@@ -176,7 +177,6 @@ enrichment.scores <- escape.matrix(pbmc_small,
 
 Alternatively, we can use ```runEscape()``` to calculate the enrichment score and directly attach the output to a single-cell object. The additional parameter for ```runEscape()``` is **new.assay.name**, which is used to save the enrichment scores as a custom assay in the single-cell object. 
 
-
 ```{r tidy = FALSE}
 pbmc_small <- runEscape(pbmc_small, 
                         method = "ssGSEA",
@@ -209,8 +209,6 @@ Although we glossed over the normalization that can be used in ```escape.matrix(
 
 There can be inherent bias in enrichment values due to dropout in single-cell expression data. Cells with larger numbers of features and counts will likely have higher enrichment values. ```performNormalization()``` will normalize the enrichment values by calculating the number of genes expressed in each gene set and cell. This is similar to the normalization in classic GSEA, and the normalized values will be stored in a new assay. 
 
-
-
 ```{r}
 pbmc_small <- performNormalization(sc.data = pbmc_small, 
                                    assay = "escape.ssGSEA",