Merge pull request #166 from toobiwankenobi/geyserSummary

ncborcherding · web-flow · commit 41cbe68852ff · 2025-06-11T10:44:04.000-05:00
fix color.by bug and introduce summarise.by feature for geyserEnrichment
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: escape
 Title: Easy single cell analysis platform for enrichment
-Version: 2.5.4
+Version: 2.5.5
 Authors@R: c(
     person(given = "Nick", family = "Borcherding", role = c("aut", "cre"), email = "ncborch@gmail.com"),
     person(given = "Jared", family = "Andrews", role = c("aut"), email = "jared.andrews07@gmail.com"),
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# 2.5.5  (2025-06-11)
+
+## Bug fix & enhanced functionality
+* Enable ```color.by``` for both metadata columns and features (other gene sets)
+* Introduce ```summarise.by``` argument for ```geyserEnrichment()``` 
+* Enable scaling if ```color.by``` is another gene set. Enable scaling for ```dgCMatrix```
+
 # 2.5.4  (2025-06-05)
 
 ## Bug fixes
diff --git a/R/geyserEnrichment.R b/R/geyserEnrichment.R
@@ -21,6 +21,10 @@
 #'   *`"group"`* – natural sort of group labels;
 #'   *`NULL`* – keep original ordering.
 #' @param facet.by Optional metadata column used to facet the plot.
+#' @param summarise.by Optional metadata column used to summarise data.
+#' @param summary.stat Optional method used to summarize expression within each
+#'   group defined by \code{summarise.by}. One of: \code{"mean"} (default),
+#'   \code{"median"}, \code{"max"}, \code{"sum"}, or \code{"geometric"}.
 #' @param scale Logical; if `TRUE` scores are centered/scaled (Z‑score) prior
 #' to plotting.
 #' @param palette Character. Any palette from \code{\link[grDevices]{hcl.pals}}.
@@ -50,6 +54,8 @@ geyserEnrichment <- function(input.data,
                              order.by  = NULL,
                              scale     = FALSE,
                              facet.by  = NULL,
+                             summarise.by = NULL,
+                             summary.stat   = "mean",
                              palette   = "inferno") {
   ## ---- 0) Sanity checks -----------------------------------------------------
   if (missing(gene.set) || length(gene.set) != 1L)
@@ -61,24 +67,69 @@ geyserEnrichment <- function(input.data,
   if (identical(color.by, "group"))
     color.by <- group.by
   
-  ## ---- 1) Build tidy data.frame -------------------------------------------
+  if (!is.null(summarise.by) && (identical(summarise.by, group.by) || 
+      identical(summarise.by, facet.by)))
+    stop("'summarise.by' cannot be the same as 'group.by' or 'facet.by'. 
+         Please choose a different metadata column.")
+  
+  # ---- 1) helper to match summary function -------------------------
+  summary_fun <- .match_summary_fun(summary.stat)
+  
+  ## ---- 2) Build tidy data.frame -------------------------------------------
   enriched <- .prepData(input.data, assay, gene.set, group.by,
-                        split.by = NULL, facet.by = facet.by)
+                        split.by = summarise.by, facet.by = facet.by, color.by = color.by)
+  
+  # Define all grouping variables that must be metadata columns
+  grouping_vars <- unique(c(summarise.by, group.by, facet.by))
+  
+  # Determine if color.by is a feature
+  all_features <- rownames(.cntEval(input.data, assay = assay, type = "data"))
+  
+  # Determine if color.by is a feature
+  is_feature_color <- !is.null(color.by) &&
+    (color.by %in% all_features)
+  
+  ## Optionally summarise data with **base aggregate()** ----------------------
+  if (!is.null(summarise.by)) {
+    
+    # add color.by to summarise_vars if it is a feature, otherwise add to grouping_vars
+    summarise_vars <- unique(c(gene.set, if (is_feature_color) color.by))
+    grouping_vars <- unique(c(grouping_vars, if (!is_feature_color) color.by))
+    
+    # Perform aggregation
+    enriched <- aggregate(enriched[summarise_vars],
+                          by = enriched[grouping_vars],
+                          FUN = summary_fun,
+                          simplify = TRUE)
+  }
   
   ## Optionally Z‑transform ----------------------------------------------------
-  if (scale)
-    enriched[[gene.set]] <- as.numeric(scale(enriched[[gene.set]]))
+  if (scale) {
+    # scale() returns a 1-column matrix (with scaled:* attributes); flatten it
+    # so the data.frame column stays a plain numeric vector, as before this change
+    enriched[[gene.set]] <- as.numeric(scale(as.numeric(enriched[[gene.set]])))
+    # Also scale color.by if it's a feature (same flattening applies)
+    if (is_feature_color) {
+      enriched[[color.by]] <- as.numeric(scale(enriched[[color.by]]))
+    }
+  }
   
   ## Optionally reorder groups -------------------------------------------------
   if (!is.null(order.by))
     enriched <- .orderFunction(enriched, order.by, group.by)
   
-  ## ---- 2) Plot --------------------------------------------------------------
-  plt <- ggplot(enriched, aes(x = .data[[group.by]],
-                              y = .data[[gene.set]],
-                              colour = .data[[color.by]])) +
+  ## ---- 3) Plot --------------------------------------------------------------
+  if (!is.null(color.by))  # colour mapping requested: include it in aes()
+    plt <- ggplot(enriched, aes(x = .data[[group.by]],
+                                y = .data[[gene.set]],
+                                group = .data[[group.by]],
+                                colour = .data[[color.by]]))
+  else  # no colour mapping; `group` must still sit inside aes() to be an aesthetic
+    plt <- ggplot(enriched, aes(x = .data[[group.by]],
+                                y = .data[[gene.set]],
+                                group = .data[[group.by]]))
+
     # Raw points --------------------------------------------------------------
-  geom_jitter(width = 0.25, size = 1.5, alpha = 0.6, na.rm = TRUE) +
+  plt <- plt + geom_jitter(width = 0.25, size = 1.5, alpha = 0.6, na.rm = TRUE) +
     
     # White base interval + median point -------------------------------------
   stat_pointinterval(interval_size_range = c(2, 3), fatten_point = 1.4,
@@ -97,10 +148,11 @@ geyserEnrichment <- function(input.data,
     theme(legend.direction = "horizontal",
           legend.position  = "bottom")
   
-  ## ---- 3) Colour scale ------------------------------------------------------
-  plt <- .colorby(enriched, plt, color.by, palette, type = "color")
+  ## ---- 4) Colour scale ------------------------------------------------------
+  if (!is.null(color.by)) 
+    plt <- .colorby(enriched, plt, color.by, palette, type = "color")
   
-  ## ---- 4) Facetting ---------------------------------------------------------
+  ## ---- 5) Facetting ---------------------------------------------------------
   if (!is.null(facet.by))
     plt <- plt + facet_grid(as.formula(paste(".~", facet.by)))
   
diff --git a/R/gseaEnrichment.R b/R/gseaEnrichment.R
@@ -178,7 +178,7 @@ gseaEnrichment <- function(input.data,
     ggplot2::geom_step(linewidth = 0.8) +
     ggplot2::geom_hline(yintercept = 0) + 
     ggplot2::scale_colour_manual(values = cols, name = NULL) +
-    ggplot2::labs(y = "Running Enrichment Score") +
+    ggplot2::labs(y = paste0(gene.set.use, "\nRunning Enrichment Score")) +
     ggplot2::theme_classic() +
     ggplot2::theme(axis.title.x = element_blank(),
                    axis.text.x  = element_blank(),
@@ -194,7 +194,7 @@ gseaEnrichment <- function(input.data,
           axis.text.y  = element_blank(),
           axis.ticks.y = element_blank(),
           panel.border = element_rect(fill = NA, colour = "black", linewidth = 0.5))
-   
-  p_top / p_mid + patchwork::plot_layout(heights = c(3, 0.4))
+  
+  patchwork::wrap_plots(p_top, p_mid, ncol = 1, heights = c(3, 0.4))
 }
 
diff --git a/R/heatmapEnrichment.R b/R/heatmapEnrichment.R
@@ -17,9 +17,9 @@
 #' @param facet.by Optional metadata column used to facet the plot.
 #' @param scale If \code{TRUE}, Z‑transforms each gene‑set column **after**
 #' summarization.
-#' @param summary.stat Method used to summarize expression within each
-#* group: one of `"mean"` (default), `"median"`, `"max"`,
-#*`"sum"`, or `"geometric"`
+#' @param summary.stat Optional method used to summarize expression within each
+#'   group. One of: \code{"mean"} (default), \code{"median"}, \code{"max"}, 
+#'   \code{"sum"}, or \code{"geometric"}.
 #' @param palette Character. Any palette from \code{\link[grDevices]{hcl.pals}}.
 #'
 #' @return A \code{ggplot2} object.
@@ -47,30 +47,15 @@ heatmapEnrichment <- function(input.data,
                               palette        = "inferno")
 {
   # ---------- 1. helper to match summary function -------------------------
-  .match_summary_fun <- function(fun) {
-    if (is.function(fun)) return(fun)
-    if (!is.character(fun) || length(fun) != 1)
-      stop("'summary.stat' must be a single character keyword or a function")
-    kw <- tolower(fun)
-    fn <- switch(kw,
-                 mean      = base::mean,
-                 median    = stats::median,
-                 sum       = base::sum,
-                 sd        = stats::sd,
-                 max       = base::max,
-                 min       = base::min,
-                 geometric = function(x) exp(mean(log(x + 1e-6))),
-                 stop("Unsupported summary keyword: ", fun))
-    fn
-  }
   summary_fun <- .match_summary_fun(summary.stat)
   
   # ---------- 2. pull / tidy data -----------------------------------------
   if (is.null(group.by)) group.by <- "ident"
   df <- .prepData(input.data, assay, gene.set.use,
                   group.by = group.by,
                   split.by = NULL,
-                  facet.by = facet.by)
+                  facet.by = facet.by, 
+                  color.by = NULL)
   
   # Which columns contain gene-set scores?
   if (identical(gene.set.use, "all"))
diff --git a/R/ridgeEnrichment.R b/R/ridgeEnrichment.R
@@ -61,7 +61,7 @@ ridgeEnrichment <- function(input.data,
   
   ## ---- 1  build long data.frame ---------------------------------------
   df <- .prepData(input.data, assay, gene.set.use, group.by,
-                  split.by = NULL, facet.by = facet.by)
+                  split.by = NULL, facet.by = facet.by, color.by = color.by)
   
   ## optional scaling (Z-transform per gene-set) -------------------------
   if (scale)
diff --git a/R/scatterEnrichment.R b/R/scatterEnrichment.R
@@ -69,7 +69,8 @@ scatterEnrichment <- function(input.data,
   gene.set <- c(x.axis, y.axis)
   
   ## ---- 1  Assemble long data-frame -----------------------------------------
-  enriched <- .prepData(input.data, assay, gene.set, group.by, NULL, facet.by)
+  enriched <- .prepData(input.data, assay, gene.set, group.by, NULL, facet.by,
+                        color.by = NULL)
   
   if (scale) {
     enriched[, gene.set] <- apply(enriched[, gene.set, drop = FALSE], 2, scale)
diff --git a/R/splitEnrichment.R b/R/splitEnrichment.R
@@ -56,7 +56,8 @@ splitEnrichment <- function(input.data,
   if (is.null(group.by)) group.by <- "ident"
   
   # Prepare tidy data with relevant metadata columns
-  enriched <- .prepData(input.data, assay, gene.set.use, group.by, split.by, facet.by)
+  enriched <- .prepData(input.data, assay, gene.set.use, group.by, split.by, 
+                        facet.by, color.by = NULL)
   
   # Determine the number of levels in the splitting variable
   split.levels <- unique(enriched[[split.by]])
diff --git a/R/utils.R b/R/utils.R
@@ -41,40 +41,80 @@
 #  DATA.frame BUILDERS ---------------------------------------------------------
 # -----------------------------------------------------------------------------
 .makeDFfromSCO <- function(input.data, assay = "escape", gene.set = NULL,
-                           group.by = NULL, split.by = NULL, facet.by = NULL) {
+                           group.by = NULL, split.by = NULL, facet.by = NULL, color.by = NULL) {
   if (is.null(assay))
     stop("Please provide assay name")
-  cols <- unique(c(group.by, split.by, facet.by))
+  # Builds a data.frame of gene.set values (one row per column of the assay matrix) plus requested metadata
+  # Pull the assay matrix (its rownames are the available features) and the metadata
   cnts <- .cntEval(input.data, assay = assay, type = "data")
+  features <- rownames(cnts)
+  meta <- .grabMeta(input.data)
+  meta.cols <- colnames(meta)
   
-  if (length(gene.set) == 1 && gene.set == "all")
-    gene.set <- rownames(cnts)
+  # Every variable the caller asked for (NULL arguments drop out of c())
+  cols <- unique(c(group.by, split.by, facet.by, color.by))
   
-  meta <- .grabMeta(input.data)
-  meta <- meta[, cols, drop = FALSE]
+  # Fail early unless each requested variable is a metadata column or a feature
+  bad.cols <- cols[!(cols %in% meta.cols | cols %in% features)]
+  if (length(bad.cols) > 0) {
+    stop("The following variables are not found in either metadata or features: ", paste(bad.cols, collapse = ", "))
+  }
+  
+  # Classify color.by; NOTE(review): is_meta_color is not referenced again in this function
+  is_feature_color <- !is.null(color.by) && color.by %in% features
+  is_meta_color <- !is.null(color.by) && color.by %in% meta.cols
   
+  # Keep only the requested variables that are metadata columns
+  meta <- meta[, intersect(cols, meta.cols), drop = FALSE]
+  
+  # gene.set = "all" expands to every feature of the assay
+  if (length(gene.set) == 1 && gene.set == "all") {
+    gene.set <- features
+  }
+  
+  # Bind values to metadata; a single gene set is a vector, several are transposed into columns
   if (length(gene.set) == 1) {
     df <- cbind(value = cnts[gene.set, ], meta)
     colnames(df)[1] <- gene.set
   } else {
     df <- cbind(Matrix::t(cnts[gene.set, , drop = FALSE]), meta)
   }
-  df
+  # If color.by is also a metadata column, the assignment below overwrites that column
+  # Add color.by feature values when it is a feature not already included via gene.set
+  if (is_feature_color && !(color.by %in% gene.set)) {
+    df[[color.by]] <- cnts[color.by, ]
+  }
+  
+  return(df)
 }
 
-.prepData <- function(input.data, assay, gene.set, group.by, split.by, facet.by) {
+# Return the tidy data.frame used for plotting, from a single-cell object or a plain data.frame
+.prepData <- function(input.data, assay, gene.set, group.by, split.by, facet.by, color.by = NULL) {
   if (.is_seurat_or_sce(input.data)) {
-    df <- .makeDFfromSCO(input.data, assay, gene.set, group.by, split.by, facet.by)
+    df <- .makeDFfromSCO(input.data, assay, gene.set, group.by, split.by, facet.by, color.by)
+    # color.by defaults to NULL so callers that predate the argument keep working
+    if (identical(gene.set, "all")) {
+      meta_cols <- c(group.by, split.by, facet.by)
+      # NOTE(review): gene.set is "all" here, so color.by is always excluded when present in df; gene.set is not used after this
+      non_gene_color <- if (!is.null(color.by) && color.by %in% colnames(df) && !(color.by %in% gene.set)) color.by else NULL
+      gene.set <- setdiff(colnames(df), c(meta_cols, non_gene_color))
+    }
+    
+  } else {
+    all.cols <- unique(c(if (!identical(gene.set, "all")) gene.set, group.by, split.by, facet.by, color.by))
+    missing.cols <- setdiff(all.cols, colnames(input.data))
+    if (length(missing.cols) > 0) {
+      stop("The following columns are missing in the input data: ", paste(missing.cols, collapse = ", "))
+    }
+    # The "all" keyword is left out of the check above: it is a selector, not a column name
     if (identical(gene.set, "all")) {
-      gene.set <- setdiff(colnames(df), c(group.by, split.by, facet.by))
+      gene.set <- setdiff(colnames(input.data), c(group.by, split.by, facet.by, color.by))
     }
-  } else {                               # assume plain data.frame / matrix
-    if (identical(gene.set, "all"))
-      gene.set <- setdiff(colnames(input.data), c(group.by, split.by, facet.by))
-    df <- input.data[, c(gene.set, group.by, split.by, facet.by), drop = FALSE]
+    
+    df <- input.data[, unique(c(gene.set, group.by, split.by, facet.by, color.by)), drop = FALSE]
   }
-  colnames(df) <- c(gene.set, group.by, split.by, facet.by)
-  df
+  
+  return(df)
 }
 
 # -----------------------------------------------------------------------------
@@ -443,4 +483,24 @@ utils::globalVariables(c(
   "gene.set.query", "index"
 ))
 
+# Resolve summary.stat (a keyword or a function) to the function used for aggregation
+.match_summary_fun <- function(fun) {
+  if (is.function(fun)) return(fun)  # user-supplied functions pass through untagged
+  if (!is.character(fun) || length(fun) != 1)
+    stop("'summary.stat' must be a single character keyword or a function")
+  kw <- tolower(fun)  # keywords are matched case-insensitively
+  fn <- switch(kw,
+               mean      = base::mean,
+               median    = stats::median,
+               sum       = function(x, ...) base::sum(x, ...),
+               sd        = stats::sd,
+               max       = function(x, ...) base::max(x, ...),
+               min       = function(x, ...) base::min(x, ...),
+               geometric = function(x) exp(mean(log(x + 1e-6))),
+               stop("Unsupported summary keyword: ", fun))
+  # sum/max/min are wrapped in closures above because they are primitives: tagging
+  # a primitive directly would modify (or be rejected for) the shared base object
+  attr(fn, "keyword") <- kw
+  fn
+}
 
diff --git a/man/geyserEnrichment.Rd b/man/geyserEnrichment.Rd
diff --git a/man/heatmapEnrichment.Rd b/man/heatmapEnrichment.Rd
diff --git a/tests/testthat/test-splitEnrichment.R b/tests/testthat/test-splitEnrichment.R
@@ -70,7 +70,8 @@ test_that("order.by = 'mean' reorders x-axis levels by descending mean", {
     gene.set   = "Tcells",
     group.by   = "ident",
     split.by   = "groups",
-    facet.by   = NULL
+    facet.by   = NULL,
+    color.by   = NULL
   )
   
   expected <- enr %>%