batch correction dockers added
janursa committed Aug 5, 2024
1 parent 25cabcc commit d07dea0
Showing 22 changed files with 593 additions and 174 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ local/
datasets_raw/
state*
trace*
+tw-*

# related to python
.ipynb_checkpoints
27 changes: 27 additions & 0 deletions scripts/run_process_perturbation_tw.sh
@@ -0,0 +1,27 @@
#!/bin/bash

RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
resources_dir="s3://openproblems-data/resources/grn/"
publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
param_list:
  - id: test_process_perturbation
    perturbation_counts: "$resources_dir/datasets_raw/perturbation_counts.h5ad"
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

./tw-windows-x86_64.exe launch openproblems-bio/task_grn_benchmark \
--revision build/main \
--pull-latest \
--main-script target/nextflow/workflows/process_perturbation/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--config src/common/nextflow_helpers/labels_tw.config



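The heredoc interpolates shell variables straight into YAML, so a malformed value would only surface once the Tower run starts. A minimal pre-flight check, assuming PyYAML is available in the local environment:

import yaml  # PyYAML, assumed installed locally

# Parse the params file generated above and sanity-check its shape.
with open('/tmp/params.yaml') as f:
    params = yaml.safe_load(f)

for entry in params['param_list']:
    # Both keys are required by the workflow invocation above.
    assert 'id' in entry and 'perturbation_counts' in entry, entry

print(params['publish_dir'])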
9 changes: 6 additions & 3 deletions src/api/comp_metric.yaml
@@ -10,16 +10,19 @@ functionality:
  arguments:
    - name: --perturbation_data
      __merge__: file_perturbation_h5ad.yaml
-      required: true
+      required: false
      direction: input
+      default: resources/grn-benchmark/perturbation_data.h5ad
    - name: --prediction
      __merge__: file_prediction.yaml
-      required: true
+      required: false
      direction: input
+      default: resources/grn-benchmark/grn_models/collectri.csv
    - name: --score
      __merge__: file_score.yaml
-      required: true
+      required: false
      direction: output
+      default: output/score.h5ad
    - name: --reg_type
      type: string
      direction: input
6 changes: 6 additions & 0 deletions src/api/comp_test.yaml
@@ -0,0 +1,6 @@
functionality:
  test_resources:
    - type: python_script
      path: /src/common/component_tests/run_and_check_output.py
    - path: /resources/grn-benchmark
      dest: resources/grn-benchmark
4 changes: 2 additions & 2 deletions src/metrics/regression_1/main.py
@@ -243,10 +243,10 @@ def main(par):

        output = run_method_1(net_subset, pert_df, exclude_missing_genes=exclude_missing_genes, reg_type=reg_type)
        result_key = f'ex({exclude_missing_genes})_tf({tf_n})'
-        layer_results[result_key] = output['mean_score_r2']
+        layer_results[result_key] = [output['mean_score_r2']]

    # Convert results to DataFrame
-    df_results = pd.DataFrame(results, index=layers)
+    df_results = pd.DataFrame(layer_results)
    if 'ex(True)_tf(140)' not in df_results.columns:
        df_results['ex(True)_tf(140)'] = df_results['ex(True)_tf(-1)']
    if 'ex(False)_tf(140)' not in df_results.columns:
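The list wrapping above matters because pd.DataFrame rejects a dict of bare scalars. A minimal sketch with hypothetical scores:

import pandas as pd

try:
    pd.DataFrame({'ex(True)_tf(-1)': 0.42})  # bare scalar value
except ValueError as err:
    print(err)  # "If using all scalar values, you must pass an index"

# Wrapping each value in a one-element list yields a one-row DataFrame.
df_results = pd.DataFrame({'ex(True)_tf(-1)': [0.42]})
print(df_results)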
5 changes: 3 additions & 2 deletions src/metrics/regression_1/script.py
@@ -27,13 +27,14 @@
print("Write output to file", flush=True)
print(output)
# output.to_csv(par['score'])
# print("Completed", flush=True)
print("Completed", flush=True)
print(output)

output = ad.AnnData(
X=np.empty((0, 0)),
uns={
"layer": par["layer"],
"metric_ids": output.columns,
"metric_ids": output.columns.to_numpy(),
"metric_values": output.values
}
)
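For context, the pattern used here stores the scores table in the uns slot of an otherwise empty AnnData; converting the pandas Index to a numpy array keeps uns restricted to types the h5ad writer handles reliably. A minimal sketch with hypothetical metric values:

import anndata as ad
import numpy as np
import pandas as pd

output = pd.DataFrame({'ex(True)_tf(-1)': [0.42], 'ex(False)_tf(-1)': [0.37]})

score = ad.AnnData(
    X=np.empty((0, 0)),
    uns={
        "layer": "pearson",
        "metric_ids": output.columns.to_numpy(),  # Index -> ndarray before writing
        "metric_values": output.values,
    },
)
score.write_h5ad("score.h5ad")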
84 changes: 41 additions & 43 deletions src/metrics/regression_2/main.py
@@ -171,49 +171,47 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
    grn = load_grn(par['prediction'], gene_names)

    results = []
-    layers = ['scgen_pearson', 'scgen_lognorm']
-    for layer in layers:
-        print(f'Compute metrics for layer: {layer}', flush=True)
-
-        # Load and standardize perturbation data
-        X = perturbation_data.layers[layer]
-        X = RobustScaler().fit_transform(X)
-
-        # Determine maximum number of input features
-        max_n_regulators = min(100, int(0.5 * n_genes))
-
-        # Learn background distribution for each value of `n_features`:
-        # r2 scores using random genes as features.
-        background = learn_background_distribution(par['reg_type'], X, groups, max_n_regulators, random_state=random_state)
-
-        # Cross-validate each gene using the inferred GRN to define select input features
-        res = cross_validate(
-            par['reg_type'],
-            gene_names,
-            X,
-            groups,
-            grn,
-            np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
-        )
-
-        # Compute z-scores from r2 scores to take into account the fact
-        # that random network can still perform well when the number of features is large
-        scores = []
-        for j in range(n_genes):
-            if np.isnan(res['scores'][j]['avg-r2']):
-                continue
-            n_features = int(np.sum(grn[:, j] != 0))
-            if n_features in background:
-                mu, sigma = background[n_features]
-            else:
-                mu, sigma = background['max']
-            z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
-            z_score = max(0, z_score)
-            scores.append(z_score)
-
-        total_score = np.mean(scores)
-        print(f'Score on {layer}: {total_score}')
-        results.append({'score': total_score})
+    layer = 'pearson'
+    print(f'Compute metrics for layer: {layer}', flush=True)
+
+    # Load and standardize perturbation data
+    X = perturbation_data.layers[layer]
+    X = RobustScaler().fit_transform(X)
+
+    # Determine maximum number of input features
+    max_n_regulators = min(100, int(0.5 * n_genes))
+
+    # Learn background distribution for each value of `n_features`:
+    # r2 scores using random genes as features.
+    background = learn_background_distribution(par['reg_type'], X, groups, max_n_regulators, random_state=random_state)
+
+    # Cross-validate each gene, using the inferred GRN to select input features
+    res = cross_validate(
+        par['reg_type'],
+        gene_names,
+        X,
+        groups,
+        grn,
+        np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
+    )
+
+    # Compute z-scores from r2 scores to account for the fact that
+    # a random network can still perform well when the number of features is large
+    scores = []
+    for j in range(n_genes):
+        if np.isnan(res['scores'][j]['avg-r2']):
+            continue
+        n_features = int(np.sum(grn[:, j] != 0))
+        if n_features in background:
+            mu, sigma = background[n_features]
+        else:
+            mu, sigma = background['max']
+        z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
+        z_score = max(0, z_score)
+        scores.append(z_score)
+
+    total_score = np.mean(scores)
+    print(f'Score on {layer}: {total_score}')

    # Convert results to DataFrame
    df_results = pd.DataFrame(results, index=layers)
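The z-score step above normalizes each gene's cross-validated r2 against a background of random regulator sets of the same size, so densely connected networks get no free advantage. A self-contained numeric sketch with hypothetical background statistics:

import numpy as np

# (mean, std) of r2 scores from random regulators, keyed by feature count.
background = {5: (0.10, 0.05), 10: (0.20, 0.04), 'max': (0.25, 0.03)}

def normalized_score(avg_r2, n_features):
    # Unseen feature counts fall back to the largest modeled size.
    mu, sigma = background.get(n_features, background['max'])
    # Negative z-scores are clipped: doing worse than random scores 0, not a penalty.
    return max(0.0, (avg_r2 - mu) / sigma)

print(normalized_score(0.30, 5))   # (0.30 - 0.10) / 0.05 = 4.0
print(normalized_score(0.05, 10))  # negative z-score, clipped to 0.0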
@@ -0,0 +1,41 @@
functionality:
  name: batch_correction_evaluation
  namespace: "perturbation"
  info:
    label: batch_correction_evaluation
    summary: "Evaluate batch correction using different metrics."

  arguments:
    - name: --perturbation_data
      __merge__: ../../../api/file_perturbation_h5ad.yaml
      required: false
      direction: input
      default: resources/grn-benchmark/perturbation_data.h5ad
    - name: --output
      type: file
      required: true
      direction: output
      default: output/batch_correction_metrics.csv

  resources:
    - type: python_script
      path: script.py
    - path: helper.py
platforms:
  - type: docker
    # image: ghcr.io/openproblems-bio/base_python:1.0.4
    image: ghcr.io/openproblems-bio/base_images/r:1.1.0

    setup:
      - type: python
        packages: [lightgbm==4.5.0, scib==1.1.5, louvain==0.8.2, rpy2]
      # - type: r
      #   packages:
      #     install.packages('remotes')
      #     remotes::install_github('theislab/kBET')

  - type: native
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu]
@@ -0,0 +1,55 @@

import scib
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import make_scorer, accuracy_score

def run_scib(bulk_adata, layer='lognorm', layer_baseline='n_counts', batch_key='plate_name', label_key='cell_type'):
    # Uncorrected baseline goes into .X; the corrected layer into a copy.
    bulk_adata.X = bulk_adata.layers[layer_baseline].copy()

    bulk_adata_c = bulk_adata.copy()
    bulk_adata_c.X = bulk_adata_c.layers[layer].copy()

    scib.pp.reduce_data(
        bulk_adata_c, n_top_genes=None, batch_key=batch_key, pca=True, neighbors=True
    )
    rr = scib.metrics.metrics(
        bulk_adata, bulk_adata_c, batch_key, label_key, organism='human',
        # biological conservation (label-based)
        nmi_=True,
        ari_=False,
        silhouette_=True,
        isolated_labels_f1_=False,  # there is no isolated cell type
        isolated_labels_asw_=False,  # there is no isolated cell type
        # biological conservation (label-free)
        cell_cycle_=True,
        hvg_score_=False,
        trajectory_=False,
        # batch correction
        pcr_=False,
        graph_conn_=False,
        kBET_=True,
        ilisi_=False,
        clisi_=False,
        # backwards-compatibility flags
        isolated_labels_=False,
        n_isolated=None,
        lisi_graph_=False,
        verbose=0
    )
    rr = rr.dropna().T
    return rr

def run_classifier(adata, layer, batch_key):
    print('GB classifier')
    # `silent` was removed in lightgbm 4.x; `verbose=-1` suppresses log output.
    model = lgb.LGBMClassifier(verbose=-1)
    # model = RidgeClassifier()
    X = adata.layers[layer].copy()
    y = adata.obs[batch_key]
    scoring = {
        'accuracy_score': make_scorer(accuracy_score)
    }
    # Report 1 - accuracy: the harder the batch is to predict, the better the correction.
    score = 1 - cross_validate(model, X, y, cv=5, scoring=scoring, return_train_score=False)['test_accuracy_score'].mean()

    return pd.DataFrame({'Batch classifier': [score]})
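run_classifier turns batch predictability into a score: the 5-fold accuracy of a gradient-boosted classifier at predicting the batch label, reported as 1 - accuracy so that higher means better mixing. A toy usage sketch; the data, layer, and batch labels below are fabricated:

import anndata as ad
import numpy as np
from helper import run_classifier  # assumes helper.py is importable

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(60, 20)).astype(np.float32))
adata.obs['plate_name'] = ['plate_%d' % (i % 3) for i in range(60)]
adata.layers['lognorm'] = adata.X.copy()

# Near 1.0: batches indistinguishable (well mixed); near 0.0: easily separated.
print(run_classifier(adata, layer='lognorm', batch_key='plate_name'))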
@@ -0,0 +1,51 @@
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
import sys
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

## VIASH START
par = {
    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
    'output': 'output/batch_correction_metrics.csv'
}
## VIASH END

meta = {
    'resources_dir': './'
}

sys.path.append(meta['resources_dir'])
from helper import run_scib, run_classifier

bulk_adata = ad.read_h5ad(par['perturbation_data'])
print(bulk_adata)

baseline_layer = 'n_counts'
layers = ['n_counts', 'pearson', 'lognorm', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm', 'scgen_pearson']
batch_key = 'plate_name'
label_key = 'cell_type'


def run_metrics(bulk_adata, layer='lognorm', batch_key='plate_name', label_key='cell_type'):
    rr_scib = run_scib(bulk_adata, layer=layer, layer_baseline=baseline_layer, batch_key=batch_key, label_key=label_key)
    print("classifier")
    rr_classifier = run_classifier(bulk_adata, layer, batch_key)
    rr = pd.concat([rr_scib, rr_classifier], axis=1)
    rr.index = [layer]
    return rr


for i, layer in enumerate(layers):
    print('\n', layer)
    rr = run_metrics(bulk_adata, layer=layer, batch_key=batch_key, label_key=label_key)
    if i == 0:
        rr_all = rr
    else:
        rr_all = pd.concat([rr_all, rr], axis=0)
    print(rr_all)
rr_all.to_csv(par["output"])
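The accumulation loop can equivalently collect one frame per layer and concatenate once, which avoids the i == 0 special case. A sketch reusing the names defined above:

import pandas as pd

# One metrics DataFrame per layer, concatenated into a single table.
frames = [
    run_metrics(bulk_adata, layer=layer, batch_key=batch_key, label_key=label_key)
    for layer in layers
]
rr_all = pd.concat(frames, axis=0)
rr_all.to_csv(par["output"])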
@@ -1,3 +1,5 @@
+__merge__: ../../../api/comp_test.yaml
+
functionality:
  name: batch_correction_scgen
  namespace: "perturbation"
@@ -66,10 +68,11 @@ functionality:
      path: script.py
platforms:
  - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
-    setup:
-      - type: python
-        packages: [requests, scgen]
+    image: janursa/scgen:19-08-2024
+    # setup:
+    #   - type: python
+    #     packages: [requests]
+    #     git: [https://github.com/theislab/scgen.git]

  - type: native
  - type: nextflow