2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: tidybulk
Title: Brings transcriptomics to the tidyverse
Version: 2.1.0
Version: 2.1.1
Authors@R: c(person("Stefano", "Mangiola", email = "[email protected]",
role = c("aut", "cre")),
person("Maria", "Doyle", email = "[email protected]",
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -96,6 +96,7 @@ importFrom(ggplot2,scale_x_continuous)
importFrom(ggplot2,scale_y_continuous)
importFrom(lifecycle,deprecate_soft)
importFrom(lifecycle,deprecate_warn)
importFrom(lifecycle,is_present)
importFrom(magrittr,"%$%")
importFrom(magrittr,divide_by)
importFrom(magrittr,equals)
@@ -127,6 +128,7 @@ importFrom(rlang,enquo)
importFrom(rlang,inform)
importFrom(rlang,is_quosure)
importFrom(rlang,quo)
importFrom(rlang,quo_get_expr)
importFrom(rlang,quo_is_missing)
importFrom(rlang,quo_is_null)
importFrom(rlang,quo_is_symbol)
180 changes: 136 additions & 44 deletions R/scale_abundance.R
@@ -4,9 +4,10 @@
#'
#' @description scale_abundance() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or a `SummarizedExperiment` (more convenient if abstracted to a tibble with library(tidySummarizedExperiment)) and scales transcript abundance, compensating for sequencing depth (e.g., with the TMM algorithm; Robinson and Oshlack, doi.org/10.1186/gb-2010-11-3-r25).
#'
#' @importFrom rlang enquo quo_name
#' @importFrom rlang enquo quo_name quo_is_null
#' @importFrom stats median
#' @importFrom SummarizedExperiment assays colnames
#' @importFrom lifecycle is_present deprecate_warn
#'
#' @name scale_abundance
#'
@@ -16,6 +17,7 @@
#' @param reference_sample A character string. The name of the reference sample. If NULL, the sample with the highest total read count is selected as the reference.
#' @param .subset_for_scaling A gene-wise quosure condition used to filter rows (features/genes) of the dataset before scaling factors are computed.
#' @param suffix A character string to append to the scaled abundance column name. Default is "_scaled".
#' @param chunk_sample_size An integer indicating how many samples to process per chunk. Default is `Inf` (no chunking). For HDF5-backed data or large datasets, set to a finite value (e.g., 50) to enable memory-efficient chunked processing with BiocParallel parallelization.
#'
#' @param reference_selection_function DEPRECATED. Please use reference_sample.
#' @param ... Further arguments.
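A minimal usage sketch of the new chunked interface (the object `se`, its assay, and the worker count are illustrative assumptions, not part of this diff):

    # Optionally register a parallel backend first; the default is serial
    BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))

    # Scale in chunks of 50 samples to bound memory use (e.g., HDF5-backed data)
    se_scaled <- scale_abundance(se, method = "TMM", chunk_sample_size = 50)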
@@ -62,17 +64,20 @@
setGeneric("scale_abundance", function(.data,


abundance = assayNames(.data)[1],
abundance = SummarizedExperiment::assayNames(.data)[1],
method = "TMM",
reference_sample = NULL,
.subset_for_scaling = NULL,
suffix = "_scaled",
chunk_sample_size = Inf,
# DEPRECATED
reference_selection_function = NULL,
...,
.abundance = NULL)
standardGeneric("scale_abundance"))



#' @importFrom magrittr multiply_by
#' @importFrom magrittr divide_by
#' @importFrom SummarizedExperiment assays
@@ -92,15 +97,19 @@ setGeneric("scale_abundance", function(.data,
#' @importFrom SummarizedExperiment "rowData<-"
#' @importFrom magrittr "%$%"
#' @importFrom SummarizedExperiment "assays<-"
#' @importFrom rlang enquo quo_is_null quo_get_expr
#' @importFrom lifecycle is_present deprecate_warn
#' @importFrom methods is
#'
.scale_abundance_se = function(.data,


abundance = assayNames(.data)[1],
abundance = SummarizedExperiment::assayNames(.data)[1],
method = "TMM",
reference_sample = NULL,
.subset_for_scaling = NULL,
suffix = "_scaled",
chunk_sample_size = Inf,
# DEPRECATED
reference_selection_function = NULL,
...,
@@ -123,7 +132,7 @@ setGeneric("scale_abundance", function(.data,


# DEPRECATION OF reference function
if (is_present(reference_selection_function) & !is.null(reference_selection_function)) {
if (is_present(reference_selection_function) && !is.null(reference_selection_function)) {

# Signal the deprecation to the user
deprecate_warn("1.1.8", "tidybulk::scale_abundance(reference_selection_function = )", details = "The argument reference_selection_function is now deprecated please use reference_sample. By default the reference selection function is max()")
@@ -134,14 +143,18 @@ setGeneric("scale_abundance", function(.data,
if(!is.null(reference_sample) && !reference_sample %in% (.data |> colnames()))
stop("tidybulk says: your reference sample is not among the samples in your data frame")

# Force evaluation of .subset_for_scaling to avoid S4 dispatch issues with default arguments
.subset_for_scaling <- force(.subset_for_scaling)
.subset_for_scaling = enquo(.subset_for_scaling)


.data_filtered =
filter_if_abundant_were_identified(.data)

# Only apply filtering if a non-NULL subset condition was provided
subset_expr <- rlang::quo_get_expr(.subset_for_scaling)
has_subset <- !is.null(subset_expr) && !identical(subset_expr, quote(NULL))

if (!quo_is_null(.subset_for_scaling))
if (has_subset)
.data_filtered = filter_genes_on_condition(.data_filtered, !!.subset_for_scaling)

# Filter based on user condition
@@ -150,12 +163,15 @@ setGeneric("scale_abundance", function(.data,
if (nrow(.data_filtered) == 0)
stop("tidybulk says: there are 0 genes that passes the filters (.abundant and/or .subset_for_scaling). Please check your filtering or your data.")

# Extract the filtered counts for the selected assay
my_counts_filtered = assays(.data_filtered)[[my_assay]] |> na.omit()
library_size_filtered = my_counts_filtered |> colSums(na.rm = TRUE)

# Get the rownames of features that passed filtering
features_to_use <- rownames(.data_filtered)

# Carry only the gene names; no need to keep the full data frame in memory
rm(.data_filtered)


# If not enough genes, warning
if(nrow(my_counts_filtered)<100) warning(warning_for_scaling_with_few_genes)
if(length(features_to_use)<100) warning(warning_for_scaling_with_few_genes)

# Set column name for value scaled
value_scaled = paste0(my_assay, suffix)
@@ -164,54 +180,101 @@ setGeneric("scale_abundance", function(.data,
reference <-
reference_sample

if (is.null(reference))
if (is.null(reference)){
# Get filtered counts for reference selection
library_size_filtered = assays(.data[features_to_use, ])[[my_assay]] |> colSums(na.rm = TRUE)
reference = library_size_filtered |>
sort() |>
tail(1) |>
names()
}



# Communicate the reference if chosen by default
if(is.null(reference_sample)) message(sprintf("tidybulk says: the sample with the largest library size, %s, was chosen as the reference for scaling", reference))

# Calculate TMM
nf <-
edgeR::calcNormFactors(
my_counts_filtered,
refColumn = reference,
method = method
)

# Calculate multiplier
multiplier =
# Reflecting the ratio of the effective library size of the reference sample to the effective library size of each sample
(library_size_filtered[reference] * nf[reference]) |>
divide_by(library_size_filtered * nf)

# Add to sample info
colData(.data)$TMM = nf
colData(.data)$multiplier = multiplier

# Calculate TMM normalization factors once for all samples
chunk_counts_filtered <- assays(.data[features_to_use, ])[[my_assay]] |> na.omit()
chunk_library_size <- chunk_counts_filtered |> colSums(na.rm = TRUE)

my_counts_scaled =
list(
assay(.data) %*%
diag(multiplier)

) |>
setNames(value_scaled)
colnames(my_counts_scaled[[1]]) = assay(.data) |> colnames()
nf <- edgeR::calcNormFactors(
chunk_counts_filtered,
refColumn = reference,
method = method
)

# Calculate multiplier for all samples
multiplier <-
(chunk_library_size[reference] * nf[reference]) |>
divide_by(chunk_library_size * nf)

# Add the assay
assays(.data, withDimnames=FALSE) = assays(.data) |> c(my_counts_scaled)
# Add to sample info
colData(.data)$TMM <- nf
colData(.data)$multiplier <- multiplier
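# Illustrative arithmetic with hypothetical numbers: if the reference sample
# has library size 2e6 with nf = 1.0 and sample A has library size 1e6 with
# nf = 0.8, then multiplier["A"] = (2e6 * 1.0) / (1e6 * 0.8) = 2.5, i.e. A's
# counts are inflated to the reference's effective library size.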

.data |>
# If chunk_sample_size covers all samples (the default Inf), scale in one pass; otherwise process in chunks with parallelization
if (!is.finite(chunk_sample_size) || chunk_sample_size >= ncol(.data)) {

# Standard processing without chunking - just apply the multipliers
.data <- apply_scaling_only(.data, my_assay, multiplier, value_scaled)

} else {

sample_names <- colnames(.data)
sample_indices <- seq_along(sample_names)
sample_groups <- split(sample_names, ceiling(sample_indices / chunk_sample_size))
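# For example (hypothetical numbers), 5 samples with chunk_sample_size = 2:
# split(paste0("s", 1:5), ceiling(1:5 / 2)) yields three groups,
# c("s1", "s2"), c("s3", "s4"), c("s5").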

# Add methods
memorise_methods_used(c("edger", "tmm")) |>
# Check if BiocParallel is available, otherwise install
check_and_install_packages("BiocParallel")

# Attach column internals
add_tt_columns(.abundance_scaled = !!(((function(x, v) enquo(v))(x,!!as.symbol(value_scaled))) |> drop_enquo_env()) )
# Get the current BiocParallel backend
bp_param <- BiocParallel::bpparam()

# Inform user about parallelization settings
if (is(bp_param, "SerialParam")) {
message("tidybulk says: Processing chunks serially. For parallel processing, configure BiocParallel with BiocParallel::register() before calling this function. For example: BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))")
} else {
message(sprintf("tidybulk says: Processing %d chunks in parallel using %s with %d workers",
length(sample_groups), class(bp_param)[1], BiocParallel::bpnworkers(bp_param)))
}

chunk_results <-
BiocParallel::bplapply(
sample_groups,
function(current_samples) {
# Extract chunk
chunk_se <- .data[, current_samples, drop = FALSE]

# Get multipliers for this chunk
chunk_multiplier <- multiplier[current_samples]

# Apply scaling to chunk (TMM already calculated)
chunk_scaled <- apply_scaling_only(
chunk_se,
my_assay,
chunk_multiplier,
value_scaled
)

chunk_scaled
},
BPPARAM = bp_param
)

# Combine chunks - use SummarizedExperiment::cbind for proper S4 dispatch
message("tidybulk says: Combining chunks")
.data <- do.call(SummarizedExperiment::cbind, args = chunk_results)

}

scaled_symbol <- rlang::sym(value_scaled)
scaled_quosure <- drop_enquo_env(rlang::new_quosure(scaled_symbol))

.data |>
memorise_methods_used(c("edger", "tmm")) |>
add_tt_columns(.abundance_scaled = !!scaled_quosure)

}
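The quo_get_expr() check earlier in .scale_abundance_se() follows from the comment about S4 dispatch issues with default arguments: the unsupplied default arrives as a quosure wrapping a literal NULL, so the captured expression is inspected directly. A standalone sketch of the pattern in isolation (the function f and column gene_count are hypothetical):

    f <- function(cond = NULL) {
      q <- rlang::enquo(cond)                  # defuse the argument
      expr <- rlang::quo_get_expr(q)           # inspect the captured expression
      !is.null(expr) && !identical(expr, quote(NULL))
    }
    f()                   # FALSE: no condition supplied
    f(gene_count > 10)    # TRUE: a real gene-wise condition was captured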

@@ -236,3 +299,32 @@ setMethod("scale_abundance",
setMethod("scale_abundance",
"RangedSummarizedExperiment",
.scale_abundance_se)


# Core scaling function - applies pre-calculated multipliers to create scaled assay
apply_scaling_only <- function(data_obj, assay_name, multipliers, scaled_name) {

# Get the assay data and apply scaling
chunk_assay <- assay(data_obj, assay_name)
is_delayed <- is(chunk_assay, "DelayedArray")

if (is_delayed) {
# For DelayedArray, use sweep to create a delayed operation
check_and_install_packages("DelayedArray")
my_counts_scaled <- list(
DelayedArray::sweep(chunk_assay, 2, multipliers, "*")
) |> setNames(scaled_name)
} else {
# For regular matrices, use matrix multiplication
my_counts_scaled <- list(
chunk_assay %*% diag(multipliers)
) |> setNames(scaled_name)
}

colnames(my_counts_scaled[[1]]) <- colnames(chunk_assay)

# Add the scaled assay
assays(data_obj, withDimnames = FALSE) <- assays(data_obj) |> c(my_counts_scaled)
data_obj
}
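A quick equivalence check of the two scaling paths in apply_scaling_only() on a dense matrix (toy values; for a DelayedArray the same sweep() call stays lazy rather than materializing):

    m <- matrix(1:6, nrow = 2, dimnames = list(c("g1", "g2"), c("s1", "s2", "s3")))
    mult <- c(s1 = 2, s2 = 0.5, s3 = 1)

    a <- sweep(m, 2, mult, "*")      # column-wise multiply, keeps dimnames
    b <- m %*% diag(mult)            # same values, but diag() drops colnames
    all.equal(unname(a), unname(b))  # TRUE

The dropped colnames on the diag() path are why apply_scaling_only() restores them explicitly afterwards.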
