batch correction dockers added
janursa committed Aug 5, 2024
1 parent 25cabcc commit d07dea0
Showing 22 changed files with 593 additions and 174 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ local/
datasets_raw/
state*
trace*
+tw-*

# related to python
.ipynb_checkpoints
27 changes: 27 additions & 0 deletions scripts/run_process_perturbation_tw.sh
@@ -0,0 +1,27 @@
#!/bin/bash

RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
resources_dir="s3://openproblems-data/resources/grn/"
publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
param_list:
  - id: test_process_perturbation
    perturbation_counts: "$resources_dir/datasets_raw/perturbation_counts.h5ad"
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

./tw-windows-x86_64.exe launch openproblems-bio/task_grn_benchmark \
--revision build/main \
--pull-latest \
--main-script target/nextflow/workflows/process_perturbation/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--config src/common/nextflow_helpers/labels_tw.config



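The heredoc interpolates shell variables straight into YAML, so a malformed value would only surface once the Tower run starts. A minimal pre-flight check, assuming PyYAML is available in the local environment:

import yaml  # PyYAML, assumed installed locally

# Parse the params file generated above and sanity-check its shape.
with open('/tmp/params.yaml') as f:
    params = yaml.safe_load(f)

for entry in params['param_list']:
    # Both keys are required by the workflow invocation above.
    assert 'id' in entry and 'perturbation_counts' in entry, entry

print(params['publish_dir'])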
9 changes: 6 additions & 3 deletions src/api/comp_metric.yaml
@@ -10,16 +10,19 @@ functionality:
  arguments:
    - name: --perturbation_data
      __merge__: file_perturbation_h5ad.yaml
-      required: true
+      required: false
      direction: input
+      default: resources/grn-benchmark/perturbation_data.h5ad
    - name: --prediction
      __merge__: file_prediction.yaml
-      required: true
+      required: false
      direction: input
+      default: resources/grn-benchmark/grn_models/collectri.csv
    - name: --score
      __merge__: file_score.yaml
-      required: true
+      required: false
      direction: output
+      default: output/score.h5ad
    - name: --reg_type
      type: string
      direction: input
6 changes: 6 additions & 0 deletions src/api/comp_test.yaml
@@ -0,0 +1,6 @@
functionality:
  test_resources:
    - type: python_script
      path: /src/common/component_tests/run_and_check_output.py
    - path: /resources/grn-benchmark
      dest: resources/grn-benchmark
4 changes: 2 additions & 2 deletions src/metrics/regression_1/main.py
@@ -243,10 +243,10 @@ def main(par):

        output = run_method_1(net_subset, pert_df, exclude_missing_genes=exclude_missing_genes, reg_type=reg_type)
        result_key = f'ex({exclude_missing_genes})_tf({tf_n})'
-        layer_results[result_key] = output['mean_score_r2']
+        layer_results[result_key] = [output['mean_score_r2']]

    # Convert results to DataFrame
-    df_results = pd.DataFrame(results, index=layers)
+    df_results = pd.DataFrame(layer_results)
    if 'ex(True)_tf(140)' not in df_results.columns:
        df_results['ex(True)_tf(140)'] = df_results['ex(True)_tf(-1)']
    if 'ex(False)_tf(140)' not in df_results.columns:
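The list wrapping above matters because pd.DataFrame rejects a dict of bare scalars. A minimal sketch with hypothetical scores:

import pandas as pd

try:
    pd.DataFrame({'ex(True)_tf(-1)': 0.42})  # bare scalar value
except ValueError as err:
    print(err)  # "If using all scalar values, you must pass an index"

# Wrapping each value in a one-element list yields a one-row DataFrame.
df_results = pd.DataFrame({'ex(True)_tf(-1)': [0.42]})
print(df_results)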
5 changes: 3 additions & 2 deletions src/metrics/regression_1/script.py
@@ -27,13 +27,14 @@
print("Write output to file", flush=True)
print(output)
# output.to_csv(par['score'])
# print("Completed", flush=True)
print("Completed", flush=True)
print(output)

output = ad.AnnData(
X=np.empty((0, 0)),
uns={
"layer": par["layer"],
"metric_ids": output.columns,
"metric_ids": output.columns.to_numpy(),
"metric_values": output.values
}
)
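For context, the pattern used here stores the scores table in the uns slot of an otherwise empty AnnData; converting the pandas Index to a numpy array keeps uns restricted to types the h5ad writer handles reliably. A minimal sketch with hypothetical metric values:

import anndata as ad
import numpy as np
import pandas as pd

output = pd.DataFrame({'ex(True)_tf(-1)': [0.42], 'ex(False)_tf(-1)': [0.37]})

score = ad.AnnData(
    X=np.empty((0, 0)),
    uns={
        "layer": "pearson",
        "metric_ids": output.columns.to_numpy(),  # Index -> ndarray before writing
        "metric_values": output.values,
    },
)
score.write_h5ad("score.h5ad")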
84 changes: 41 additions & 43 deletions src/metrics/regression_2/main.py
@@ -171,49 +171,47 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
    grn = load_grn(par['prediction'], gene_names)

    results = []
-    layers = ['scgen_pearson', 'scgen_lognorm']
-    for layer in layers:
-        print(f'Compute metrics for layer: {layer}', flush=True)
-
-        # Load and standardize perturbation data
-        X = perturbation_data.layers[layer]
-        X = RobustScaler().fit_transform(X)
-
-        # Determine maximum number of input features
-        max_n_regulators = min(100, int(0.5 * n_genes))
-
-        # Learn background distribution for each value of `n_features`:
-        # r2 scores using random genes as features.
-        background = learn_background_distribution(par['reg_type'], X, groups, max_n_regulators, random_state=random_state)
-
-        # Cross-validate each gene using the inferred GRN to define select input features
-        res = cross_validate(
-            par['reg_type'],
-            gene_names,
-            X,
-            groups,
-            grn,
-            np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
-        )
-
-        # Compute z-scores from r2 scores to take into account the fact
-        # that random network can still perform well when the number of features is large
-        scores = []
-        for j in range(n_genes):
-            if np.isnan(res['scores'][j]['avg-r2']):
-                continue
-            n_features = int(np.sum(grn[:, j] != 0))
-            if n_features in background:
-                mu, sigma = background[n_features]
-            else:
-                mu, sigma = background['max']
-            z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
-            z_score = max(0, z_score)
-            scores.append(z_score)
-
-        total_score = np.mean(scores)
-        print(f'Score on {layer}: {total_score}')
-        results.append({'score': total_score})
+    layer = 'pearson'
+    print(f'Compute metrics for layer: {layer}', flush=True)
+
+    # Load and standardize perturbation data
+    X = perturbation_data.layers[layer]
+    X = RobustScaler().fit_transform(X)
+
+    # Determine maximum number of input features
+    max_n_regulators = min(100, int(0.5 * n_genes))
+
+    # Learn background distribution for each value of `n_features`:
+    # r2 scores using random genes as features.
+    background = learn_background_distribution(par['reg_type'], X, groups, max_n_regulators, random_state=random_state)
+
+    # Cross-validate each gene, using the inferred GRN to select input features
+    res = cross_validate(
+        par['reg_type'],
+        gene_names,
+        X,
+        groups,
+        grn,
+        np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
+    )
+
+    # Compute z-scores from r2 scores to account for the fact that
+    # a random network can still perform well when the number of features is large
+    scores = []
+    for j in range(n_genes):
+        if np.isnan(res['scores'][j]['avg-r2']):
+            continue
+        n_features = int(np.sum(grn[:, j] != 0))
+        if n_features in background:
+            mu, sigma = background[n_features]
+        else:
+            mu, sigma = background['max']
+        z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
+        z_score = max(0, z_score)
+        scores.append(z_score)
+
+    total_score = np.mean(scores)
+    print(f'Score on {layer}: {total_score}')

    # Convert results to DataFrame
    df_results = pd.DataFrame(results, index=layers)
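The z-score step above normalizes each gene's cross-validated r2 against a background of random regulator sets of the same size, so densely connected networks get no free advantage. A self-contained numeric sketch with hypothetical background statistics:

import numpy as np

# (mean, std) of r2 scores from random regulators, keyed by feature count.
background = {5: (0.10, 0.05), 10: (0.20, 0.04), 'max': (0.25, 0.03)}

def normalized_score(avg_r2, n_features):
    # Unseen feature counts fall back to the largest modeled size.
    mu, sigma = background.get(n_features, background['max'])
    # Negative z-scores are clipped: doing worse than random scores 0, not a penalty.
    return max(0.0, (avg_r2 - mu) / sigma)

print(normalized_score(0.30, 5))   # (0.30 - 0.10) / 0.05 = 4.0
print(normalized_score(0.05, 10))  # negative z-score, clipped to 0.0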
@@ -0,0 +1,41 @@
functionality:
  name: batch_correction_evaluation
  namespace: "perturbation"
  info:
    label: batch_correction_evaluation
    summary: "Evaluate batch correction using different metrics."

  arguments:
    - name: --perturbation_data
      __merge__: ../../../api/file_perturbation_h5ad.yaml
      required: false
      direction: input
      default: resources/grn-benchmark/perturbation_data.h5ad
    - name: --output
      type: file
      required: true
      direction: output
      default: output/batch_correction_metrics.csv

  resources:
    - type: python_script
      path: script.py
    - path: helper.py
platforms:
  - type: docker
    # image: ghcr.io/openproblems-bio/base_python:1.0.4
    image: ghcr.io/openproblems-bio/base_images/r:1.1.0

    setup:
      - type: python
        packages: [lightgbm==4.5.0, scib==1.1.5, louvain==0.8.2, rpy2]
      # - type: r
      #   packages:
      #     install.packages('remotes')
      #     remotes::install_github('theislab/kBET')

  - type: native
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu]
@@ -0,0 +1,55 @@

import scib
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import make_scorer, accuracy_score

def run_scib(bulk_adata, layer='lognorm', layer_baseline='n_counts', batch_key='plate_name', label_key='cell_type'):
    # Uncorrected baseline goes into .X; the corrected layer into a copy.
    bulk_adata.X = bulk_adata.layers[layer_baseline].copy()

    bulk_adata_c = bulk_adata.copy()
    bulk_adata_c.X = bulk_adata_c.layers[layer].copy()

    scib.pp.reduce_data(
        bulk_adata_c, n_top_genes=None, batch_key=batch_key, pca=True, neighbors=True
    )
    rr = scib.metrics.metrics(
        bulk_adata, bulk_adata_c, batch_key, label_key, organism='human',
        # biological conservation (label-based)
        nmi_=True,
        ari_=False,
        silhouette_=True,
        isolated_labels_f1_=False,  # there is no isolated cell type
        isolated_labels_asw_=False,  # there is no isolated cell type
        # biological conservation (label-free)
        cell_cycle_=True,
        hvg_score_=False,
        trajectory_=False,
        # batch correction
        pcr_=False,
        graph_conn_=False,
        kBET_=True,
        ilisi_=False,
        clisi_=False,
        # backwards-compatibility flags
        isolated_labels_=False,
        n_isolated=None,
        lisi_graph_=False,
        verbose=0
    )
    rr = rr.dropna().T
    return rr

def run_classifier(adata, layer, batch_key):
    print('GB classifier')
    # `silent` was removed in lightgbm 4.x; `verbose=-1` suppresses log output.
    model = lgb.LGBMClassifier(verbose=-1)
    # model = RidgeClassifier()
    X = adata.layers[layer].copy()
    y = adata.obs[batch_key]
    scoring = {
        'accuracy_score': make_scorer(accuracy_score)
    }
    # Report 1 - accuracy: the harder the batch is to predict, the better the correction.
    score = 1 - cross_validate(model, X, y, cv=5, scoring=scoring, return_train_score=False)['test_accuracy_score'].mean()

    return pd.DataFrame({'Batch classifier': [score]})
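run_classifier turns batch predictability into a score: the 5-fold accuracy of a gradient-boosted classifier at predicting the batch label, reported as 1 - accuracy so that higher means better mixing. A toy usage sketch; the data, layer, and batch labels below are fabricated:

import anndata as ad
import numpy as np
from helper import run_classifier  # assumes helper.py is importable

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(60, 20)).astype(np.float32))
adata.obs['plate_name'] = ['plate_%d' % (i % 3) for i in range(60)]
adata.layers['lognorm'] = adata.X.copy()

# Near 1.0: batches indistinguishable (well mixed); near 0.0: easily separated.
print(run_classifier(adata, layer='lognorm', batch_key='plate_name'))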
@@ -0,0 +1,51 @@
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
import sys
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

## VIASH START
par = {
    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
    'output': 'output/batch_correction_metrics.csv'
}
## VIASH END

meta = {
    'resources_dir': './'
}

sys.path.append(meta['resources_dir'])
from helper import run_scib, run_classifier

bulk_adata = ad.read_h5ad(par['perturbation_data'])
print(bulk_adata)

baseline_layer = 'n_counts'
layers = ['n_counts', 'pearson', 'lognorm', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm', 'scgen_pearson']
batch_key = 'plate_name'
label_key = 'cell_type'


def run_metrics(bulk_adata, layer='lognorm', batch_key='plate_name', label_key='cell_type'):
    rr_scib = run_scib(bulk_adata, layer=layer, layer_baseline=baseline_layer, batch_key=batch_key, label_key=label_key)
    print("classifier")
    rr_classifier = run_classifier(bulk_adata, layer, batch_key)
    rr = pd.concat([rr_scib, rr_classifier], axis=1)
    rr.index = [layer]
    return rr


for i, layer in enumerate(layers):
    print('\n', layer)
    rr = run_metrics(bulk_adata, layer=layer, batch_key=batch_key, label_key=label_key)
    if i == 0:
        rr_all = rr
    else:
        rr_all = pd.concat([rr_all, rr], axis=0)
    print(rr_all)
rr_all.to_csv(par["output"])
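The accumulation loop can equivalently collect one frame per layer and concatenate once, which avoids the i == 0 special case. A sketch reusing the names defined above:

import pandas as pd

# One metrics DataFrame per layer, concatenated into a single table.
frames = [
    run_metrics(bulk_adata, layer=layer, batch_key=batch_key, label_key=label_key)
    for layer in layers
]
rr_all = pd.concat(frames, axis=0)
rr_all.to_csv(par["output"])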
@@ -1,3 +1,5 @@
+__merge__: ../../../api/comp_test.yaml
+
functionality:
  name: batch_correction_scgen
  namespace: "perturbation"
@@ -66,10 +68,11 @@ functionality:
      path: script.py
platforms:
  - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
-    setup:
-      - type: python
-        packages: [requests, scgen]
+    image: janursa/scgen:19-08-2024
+    # setup:
+    #   - type: python
+    #     packages: [requests]
+    #     git: [https://github.com/theislab/scgen.git]

  - type: native
  - type: nextflow