2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: tidybulk
Title: Brings transcriptomics to the tidyverse
Version: 2.1.0
Version: 2.1.1
Authors@R: c(person("Stefano", "Mangiola", email = "[email protected]",
role = c("aut", "cre")),
person("Maria", "Doyle", email = "[email protected]",
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -96,6 +96,7 @@ importFrom(ggplot2,scale_x_continuous)
importFrom(ggplot2,scale_y_continuous)
importFrom(lifecycle,deprecate_soft)
importFrom(lifecycle,deprecate_warn)
importFrom(lifecycle,is_present)
importFrom(magrittr,"%$%")
importFrom(magrittr,divide_by)
importFrom(magrittr,equals)
@@ -127,6 +128,7 @@ importFrom(rlang,enquo)
importFrom(rlang,inform)
importFrom(rlang,is_quosure)
importFrom(rlang,quo)
importFrom(rlang,quo_get_expr)
importFrom(rlang,quo_is_missing)
importFrom(rlang,quo_is_null)
importFrom(rlang,quo_is_symbol)
180 changes: 136 additions & 44 deletions R/scale_abundance.R
@@ -4,9 +4,10 @@
#'
#' @description scale_abundance() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or a `SummarizedExperiment` (more convenient if abstracted to a tibble with library(tidySummarizedExperiment)) and scales transcript abundance, compensating for sequencing depth (e.g., with the TMM algorithm; Robinson and Oshlack, doi.org/10.1186/gb-2010-11-3-r25).
#'
#' @importFrom rlang enquo quo_name
#' @importFrom rlang enquo quo_name quo_is_null
#' @importFrom stats median
#' @importFrom SummarizedExperiment assays colnames
#' @importFrom lifecycle is_present deprecate_warn
#'
#' @name scale_abundance
#'
@@ -16,6 +17,7 @@
#' @param reference_sample A character string. The name of the reference sample. If NULL, the sample with the highest total read count is selected as the reference.
#' @param .subset_for_scaling A gene-wise quosure condition used to filter rows (features/genes) of the dataset before scaling factors are computed.
#' @param suffix A character string to append to the scaled abundance column name. Default is "_scaled".
#' @param chunk_sample_size An integer indicating how many samples to process per chunk. Default is `Inf` (no chunking). For HDF5-backed data or large datasets, set to a finite value (e.g., 50) to enable memory-efficient chunked processing with BiocParallel parallelization.
#'
#' @param reference_selection_function DEPRECATED. Please use reference_sample.
#' @param ... Further arguments.
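A minimal usage sketch of the new chunked interface (the object `se`, its assay, and the worker count are illustrative assumptions, not part of this diff):

    # Optionally register a parallel backend first; the default is serial
    BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))

    # Scale in chunks of 50 samples to bound memory use (e.g., HDF5-backed data)
    se_scaled <- scale_abundance(se, method = "TMM", chunk_sample_size = 50)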
@@ -62,17 +64,20 @@
setGeneric("scale_abundance", function(.data,


abundance = assayNames(.data)[1],
abundance = SummarizedExperiment::assayNames(.data)[1],
method = "TMM",
reference_sample = NULL,
.subset_for_scaling = NULL,
suffix = "_scaled",
chunk_sample_size = Inf,
# DEPRECATED
reference_selection_function = NULL,
...,
.abundance = NULL)
standardGeneric("scale_abundance"))



#' @importFrom magrittr multiply_by
#' @importFrom magrittr divide_by
#' @importFrom SummarizedExperiment assays
@@ -92,15 +97,19 @@ setGeneric("scale_abundance", function(.data,
#' @importFrom SummarizedExperiment "rowData<-"
#' @importFrom magrittr "%$%"
#' @importFrom SummarizedExperiment "assays<-"
#' @importFrom rlang enquo quo_is_null quo_get_expr
#' @importFrom lifecycle is_present deprecate_warn
#' @importFrom methods is
#'
.scale_abundance_se = function(.data,


abundance = assayNames(.data)[1],
abundance = SummarizedExperiment::assayNames(.data)[1],
method = "TMM",
reference_sample = NULL,
.subset_for_scaling = NULL,
suffix = "_scaled",
chunk_sample_size = Inf,
# DEPRECATED
reference_selection_function = NULL,
...,
@@ -123,7 +132,7 @@ setGeneric("scale_abundance", function(.data,


# DEPRECATION OF reference function
if (is_present(reference_selection_function) & !is.null(reference_selection_function)) {
if (is_present(reference_selection_function) && !is.null(reference_selection_function)) {

# Signal the deprecation to the user
deprecate_warn("1.1.8", "tidybulk::scale_abundance(reference_selection_function = )", details = "The argument reference_selection_function is now deprecated please use reference_sample. By default the reference selection function is max()")
@@ -134,14 +143,18 @@ setGeneric("scale_abundance", function(.data,
if(!is.null(reference_sample) && !reference_sample %in% (.data |> colnames()))
stop("tidybulk says: your reference sample is not among the samples in your data frame")

# Force evaluation of .subset_for_scaling to avoid S4 dispatch issues with default arguments
.subset_for_scaling <- force(.subset_for_scaling)
.subset_for_scaling = enquo(.subset_for_scaling)


.data_filtered =
filter_if_abundant_were_identified(.data)

# Only apply filtering if a non-NULL subset condition was provided
subset_expr <- rlang::quo_get_expr(.subset_for_scaling)
has_subset <- !is.null(subset_expr) && !identical(subset_expr, quote(NULL))

if (!quo_is_null(.subset_for_scaling))
if (has_subset)
.data_filtered = filter_genes_on_condition(.data_filtered, !!.subset_for_scaling)

# Filter based on user condition
@@ -150,12 +163,15 @@ setGeneric("scale_abundance", function(.data,
if (nrow(.data_filtered) == 0)
stop("tidybulk says: there are 0 genes that passes the filters (.abundant and/or .subset_for_scaling). Please check your filtering or your data.")

# Extract the filtered counts for the selected assay
my_counts_filtered = assays(.data_filtered)[[my_assay]] |> na.omit()
library_size_filtered = my_counts_filtered |> colSums(na.rm = TRUE)

# Get the rownames of features that passed filtering
features_to_use <- rownames(.data_filtered)

# Carry only the gene names; no need to keep the full data frame in memory
rm(.data_filtered)


# If not enough genes, warning
if(nrow(my_counts_filtered)<100) warning(warning_for_scaling_with_few_genes)
if(length(features_to_use)<100) warning(warning_for_scaling_with_few_genes)

# Set column name for value scaled
value_scaled = paste0(my_assay, suffix)
@@ -164,54 +180,101 @@ setGeneric("scale_abundance", function(.data,
reference <-
reference_sample

if (is.null(reference))
if (is.null(reference)){
# Get filtered counts for reference selection
library_size_filtered = assays(.data[features_to_use, ])[[my_assay]] |> colSums(na.rm = TRUE)
reference = library_size_filtered |>
sort() |>
tail(1) |>
names()
}



# Communicate the reference if chosen by default
if(is.null(reference_sample)) message(sprintf("tidybulk says: the sample with the largest library size, %s, was chosen as the reference for scaling", reference))

# Calculate TMM
nf <-
edgeR::calcNormFactors(
my_counts_filtered,
refColumn = reference,
method = method
)

# Calculate multiplier
multiplier =
# Reflecting the ratio of the effective library size of the reference sample to the effective library size of each sample
(library_size_filtered[reference] * nf[reference]) |>
divide_by(library_size_filtered * nf)

# Add to sample info
colData(.data)$TMM = nf
colData(.data)$multiplier = multiplier

# Calculate TMM normalization factors once for all samples
chunk_counts_filtered <- assays(.data[features_to_use, ])[[my_assay]] |> na.omit()
chunk_library_size <- chunk_counts_filtered |> colSums(na.rm = TRUE)

my_counts_scaled =
list(
assay(.data) %*%
diag(multiplier)

) |>
setNames(value_scaled)
colnames(my_counts_scaled[[1]]) = assay(.data) |> colnames()
nf <- edgeR::calcNormFactors(
chunk_counts_filtered,
refColumn = reference,
method = method
)

# Calculate multiplier for all samples
multiplier <-
(chunk_library_size[reference] * nf[reference]) |>
divide_by(chunk_library_size * nf)

# Add the assay
assays(.data, withDimnames=FALSE) = assays(.data) |> c(my_counts_scaled)
# Add to sample info
colData(.data)$TMM <- nf
colData(.data)$multiplier <- multiplier
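# Illustrative arithmetic with hypothetical numbers: if the reference sample
# has library size 2e6 with nf = 1.0 and sample A has library size 1e6 with
# nf = 0.8, then multiplier["A"] = (2e6 * 1.0) / (1e6 * 0.8) = 2.5, i.e. A's
# counts are inflated to the reference's effective library size.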

.data |>
# If chunk_sample_size covers all samples (the default Inf), scale in one pass; otherwise process in chunks with parallelization
if (!is.finite(chunk_sample_size) || chunk_sample_size >= ncol(.data)) {

# Standard processing without chunking - just apply the multipliers
.data <- apply_scaling_only(.data, my_assay, multiplier, value_scaled)

} else {

sample_names <- colnames(.data)
sample_indices <- seq_along(sample_names)
sample_groups <- split(sample_names, ceiling(sample_indices / chunk_sample_size))
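# For example (hypothetical numbers), 5 samples with chunk_sample_size = 2:
# split(paste0("s", 1:5), ceiling(1:5 / 2)) yields three groups,
# c("s1", "s2"), c("s3", "s4"), c("s5").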

# Add methods
memorise_methods_used(c("edger", "tmm")) |>
# Check if BiocParallel is available, otherwise install
check_and_install_packages("BiocParallel")

# Attach column internals
add_tt_columns(.abundance_scaled = !!(((function(x, v) enquo(v))(x,!!as.symbol(value_scaled))) |> drop_enquo_env()) )
# Get the current BiocParallel backend
bp_param <- BiocParallel::bpparam()

# Inform user about parallelization settings
if (is(bp_param, "SerialParam")) {
message("tidybulk says: Processing chunks serially. For parallel processing, configure BiocParallel with BiocParallel::register() before calling this function. For example: BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))")
} else {
message(sprintf("tidybulk says: Processing %d chunks in parallel using %s with %d workers",
length(sample_groups), class(bp_param)[1], BiocParallel::bpnworkers(bp_param)))
}

chunk_results <-
BiocParallel::bplapply(
sample_groups,
function(current_samples) {
# Extract chunk
chunk_se <- .data[, current_samples, drop = FALSE]

# Get multipliers for this chunk
chunk_multiplier <- multiplier[current_samples]

# Apply scaling to chunk (TMM already calculated)
chunk_scaled <- apply_scaling_only(
chunk_se,
my_assay,
chunk_multiplier,
value_scaled
)

chunk_scaled
},
BPPARAM = bp_param
)

# Combine chunks - use SummarizedExperiment::cbind for proper S4 dispatch
message("tidybulk says: Combining chunks")
.data <- do.call(SummarizedExperiment::cbind, args = chunk_results)

}

scaled_symbol <- rlang::sym(value_scaled)
scaled_quosure <- drop_enquo_env(rlang::new_quosure(scaled_symbol))

.data |>
memorise_methods_used(c("edger", "tmm")) |>
add_tt_columns(.abundance_scaled = !!scaled_quosure)

}
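The quo_get_expr() check earlier in .scale_abundance_se() follows from the comment about S4 dispatch issues with default arguments: the unsupplied default arrives as a quosure wrapping a literal NULL, so the captured expression is inspected directly. A standalone sketch of the pattern in isolation (the function f and column gene_count are hypothetical):

    f <- function(cond = NULL) {
      q <- rlang::enquo(cond)                  # defuse the argument
      expr <- rlang::quo_get_expr(q)           # inspect the captured expression
      !is.null(expr) && !identical(expr, quote(NULL))
    }
    f()                   # FALSE: no condition supplied
    f(gene_count > 10)    # TRUE: a real gene-wise condition was captured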

@@ -236,3 +299,32 @@ setMethod("scale_abundance",
setMethod("scale_abundance",
"RangedSummarizedExperiment",
.scale_abundance_se)


# Core scaling function - applies pre-calculated multipliers to create scaled assay
apply_scaling_only <- function(data_obj, assay_name, multipliers, scaled_name) {

# Get the assay data and apply scaling
chunk_assay <- assay(data_obj, assay_name)
is_delayed <- is(chunk_assay, "DelayedArray")

if (is_delayed) {
# For DelayedArray, use sweep to create a delayed operation
check_and_install_packages("DelayedArray")
my_counts_scaled <- list(
DelayedArray::sweep(chunk_assay, 2, multipliers, "*")
) |> setNames(scaled_name)
} else {
# For regular matrices, use matrix multiplication
my_counts_scaled <- list(
chunk_assay %*% diag(multipliers)
) |> setNames(scaled_name)
}

colnames(my_counts_scaled[[1]]) <- colnames(chunk_assay)

# Add the scaled assay
assays(data_obj, withDimnames = FALSE) <- assays(data_obj) |> c(my_counts_scaled)
data_obj
}
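A quick equivalence check of the two scaling paths in apply_scaling_only() on a dense matrix (toy values; for a DelayedArray the same sweep() call stays lazy rather than materializing):

    m <- matrix(1:6, nrow = 2, dimnames = list(c("g1", "g2"), c("s1", "s2", "s3")))
    mult <- c(s1 = 2, s2 = 0.5, s3 = 1)

    a <- sweep(m, 2, mult, "*")      # column-wise multiply, keeps dimnames
    b <- m %*% diag(mult)            # same values, but diag() drops colnames
    all.equal(unname(a), unname(b))  # TRUE

The dropped colnames on the diag() path are why apply_scaling_only() restores them explicitly afterwards.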
