Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@

* `workflows/differential_expression/pseudobulk_deseq2`: Workflow for generating pseudobulk samples from single-cell data followed by DESeq2 differential expression analysis (PR #1044)

## BUG FIX

* `differential_expression/create_pseudobulks`: Fixed the check to verify that the raw counts layer was passed (PR #1072).

# openpipelines 3.0.0

## BREAKING CHANGES
Expand Down
11 changes: 10 additions & 1 deletion src/differential_expression/create_pseudobulk/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,21 @@


def is_normalized(layer):
exp_layer = np.expm1(layer) # Inverse of log1p

if sp.issparse(layer):
row_sums = np.array(layer.sum(axis=1)).flatten()
exp_row_sums = np.array(exp_layer.sum(axis=1)).flatten()
else:
row_sums = layer.sum(axis=1)
exp_row_sums = exp_layer.sum(axis=1)

is_normalized = np.allclose(row_sums, 1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think is_normalized is asserted correctly here. Normalization scales the raw counts such that the sum of the counts for each cell are equal to a number. With CPM (counts per million) this is number is 1 milion, but the scanpy method is to use the median of total counts before normalization. So this number can differ, its not just 1

is_log1p_normalized = np.isfinite(exp_row_sums).all() and np.allclose(
exp_row_sums, exp_row_sums[0]
)

return np.allclose(row_sums, 1)
return is_normalized or is_log1p_normalized


def count_obs(adata, pb_adata, obs_cols):
Expand Down
28 changes: 28 additions & 0 deletions src/differential_expression/create_pseudobulk/test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import mudata as mu
import sys
import re
import subprocess
import pytest

## VIASH START
Expand Down Expand Up @@ -41,6 +43,32 @@ def test_simple_execution(run_component, random_h5mu_path):
assert adata.shape[0] == 8, "Expected a total of 8 pseudobulk samples in the output"


def test_log_normalized_counts(run_component, random_h5mu_path):
output_path = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
input_path,
"--output",
output_path,
"--input_layer",
"log_normalized",
"--obs_label",
"cell_type",
"--obs_groups",
"treatment",
"--output_compression",
"gzip",
]
)

assert re.search(
r"ValueError: Input layer must contain raw counts.",
err.value.stdout.decode("utf-8"),
)


def test_multiple_factors(run_component, random_h5mu_path):
output_path = random_h5mu_path()

Expand Down
15 changes: 0 additions & 15 deletions src/differential_expression/deseq2/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,6 @@ h5mu_to_h5ad <- function(h5mu_path, modality_name) {
tmp_path
}

# Check if expression data is normalized (row sums =~ 1)
is_normalized <- function(layer) {
row_sums <- if (is(layer, "sparseMatrix") || is(layer, "dgCMatrix")) {
Matrix::rowSums(layer)
} else {
rowSums(layer)
}

all(abs(row_sums - 1) < 1e-6, na.rm = TRUE)
}

# Extract design factors from formula
parse_design_formula <- function(design_formula) {
if (!grepl("^~\\s*\\w+(\\s*\\+\\s*\\w+)*$", design_formula)) {
Expand Down Expand Up @@ -293,10 +282,6 @@ main <- function() {
mod$X
}

if (is_normalized(layer)) {
stop("Input layer must contain raw counts.")
}

# Prepare analysis components
cat("Preparing design formula\n")
design_factors <- parse_design_formula(par$design_formula)
Expand Down
2 changes: 0 additions & 2 deletions src/differential_expression/deseq2/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,6 @@ def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_p
]
)

# Updated regex to match actual R error format (no square brackets)
assert re.search(
r"Missing required columns in metadata: nonexistent_column",
err.value.stdout.decode("utf-8"),
Expand Down Expand Up @@ -296,7 +295,6 @@ def test_invalid_design_column(run_component, tmp_path, pseudobulk_test_data_pat
]
)

# Updated regex to match actual R error format (no square brackets)
assert re.search(
r"Invalid design formula: 'malformed formula'",
err.value.stdout.decode("utf-8"),
Expand Down
Loading