diff --git a/CHANGELOG.md b/CHANGELOG.md index 615b44a0c78..1f08cc5d5af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,9 +34,10 @@ * `integrate/scarches` and `workflows/annotate/scanvi_scarches`: Enable correction for technical variability by multiple continuous and categorical covariates. - ## BUG FIXES +* `differential_expression/create_pseudobulk`: Fixed the check to verify that the raw counts layer was passed (PR #1072). + * `filter/filter_with_counts`: this component would sometimes crash (segfault) when processing malformatted sparse matrices. A proper error message is now provided in this case (PR #1086). # openpipelines 3.0.0 diff --git a/src/differential_expression/create_pseudobulk/script.py b/src/differential_expression/create_pseudobulk/script.py index 257893c6e69..e9993ffec8b 100644 --- a/src/differential_expression/create_pseudobulk/script.py +++ b/src/differential_expression/create_pseudobulk/script.py @@ -3,7 +3,6 @@ import pandas as pd import sys import scanpy as sc -import scipy.sparse as sp ## VIASH START par = { @@ -27,12 +26,16 @@ def is_normalized(layer): - if sp.issparse(layer): - row_sums = np.array(layer.sum(axis=1)).flatten() - else: - row_sums = layer.sum(axis=1) + exp_layer = np.expm1(layer) # Inverse of log1p + row_sums = np.asarray(np.sum(layer, axis=1)).ravel() + exp_row_sums = np.asarray(np.sum(exp_layer, axis=1)).ravel() - return np.allclose(row_sums, 1) + is_normalized = np.allclose(row_sums, row_sums[0]) + is_log1p_normalized = np.isfinite(exp_row_sums).all() and np.allclose( + exp_row_sums, exp_row_sums[0] + ) + + return is_normalized or is_log1p_normalized def count_obs(adata, pb_adata, obs_cols): diff --git a/src/differential_expression/create_pseudobulk/test.py b/src/differential_expression/create_pseudobulk/test.py index 859be8874b1..5db7f821990 100644 --- a/src/differential_expression/create_pseudobulk/test.py +++ b/src/differential_expression/create_pseudobulk/test.py @@ -1,6 +1,8 @@ import os import mudata as mu 
import sys +import re +import subprocess import pytest ## VIASH START @@ -41,6 +43,32 @@ def test_simple_execution(run_component, random_h5mu_path): assert adata.shape[0] == 8, "Expected a total of 8 pseudobulk samples in the output" +def test_log_normalized_counts(run_component, random_h5mu_path): + output_path = random_h5mu_path() + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--input_layer", + "log_normalized", + "--obs_label", + "cell_type", + "--obs_groups", + "treatment", + "--output_compression", + "gzip", + ] + ) + + assert re.search( + r"ValueError: Input layer must contain raw counts.", + err.value.stdout.decode("utf-8"), + ) + + def test_multiple_factors(run_component, random_h5mu_path): output_path = random_h5mu_path() diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index c724d44f72b..e3a6620b7ef 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -59,17 +59,6 @@ h5mu_to_h5ad <- function(h5mu_path, modality_name) { tmp_path } -# Check if expression data is normalized (row sums =~ 1) -is_normalized <- function(layer) { - row_sums <- if (is(layer, "sparseMatrix") || is(layer, "dgCMatrix")) { - Matrix::rowSums(layer) - } else { - rowSums(layer) - } - - all(abs(row_sums - 1) < 1e-6, na.rm = TRUE) -} - # Extract design factors from formula parse_design_formula <- function(design_formula) { if (!grepl("^~\\s*\\w+(\\s*\\+\\s*\\w+)*$", design_formula)) { @@ -295,10 +284,6 @@ main <- function() { mod$X } - if (is_normalized(layer)) { - stop("Input layer must contain raw counts.") - } - # Prepare analysis components cat("Preparing design formula\n") design_factors <- parse_design_formula(par$design_formula) diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index 63a05f0f4e6..54bab701b81 100644 --- 
a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -267,7 +267,6 @@ def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_p ] ) - # Updated regex to match actual R error format (no square brackets) assert re.search( r"Missing required columns in metadata: nonexistent_column", err.value.stdout.decode("utf-8"), @@ -296,7 +295,6 @@ def test_invalid_design_column(run_component, tmp_path, pseudobulk_test_data_pat ] ) - # Updated regex to match actual R error format (no square brackets) assert re.search( r"Invalid design formula: 'malformed formula'", err.value.stdout.decode("utf-8"),