readme updated

openproblems-bio · Feb 2, 2025 · 457753d · 457753d
1 parent 232ef11
commit 457753d
Show file tree

Hide file tree

Showing 13 changed files with 417 additions and 299 deletions.
diff --git a/README.md b/README.md
@@ -6,8 +6,7 @@ This file is automatically generated from the tasks's api/*.yaml files.
 Do not edit this file directly.
 -->
 
-Benchmarking GRN inference methods The full documentation is hosted on
-[ReadTheDocs](https://grn-inference-benchmarking.readthedocs.io/en/latest/index.html).
+Benchmarking GRN inference methods
 
 Repository:
 [openproblems-bio/task_grn_inference](https://github.com/openproblems-bio/task_grn_inference)
@@ -64,8 +63,9 @@ distance, accounting for both accuracy and comprehensiveness.
 Five datasets have been integrated so far, namely OPSCA, Nakatake,
 Norman, Adamson, and Replogle. For each dataset, standardized inference
 datasets are provided to be used for GRN inference and evaluation
-datasets are employed to benchmark. See our publication for the details
-of methods.
+datasets are employed to benchmark.
+
+See our publication for the details of methods.
 
 ## Authors & contributors
 
@@ -83,25 +83,23 @@ of methods.
 flowchart TB
   file_atac_h5ad("<a href='https://github.com/openproblems-bio/task_grn_inference#file-format-NA'>NA</a>")
   comp_method[/"<a href='https://github.com/openproblems-bio/task_grn_inference#component-type-method'>method</a>"/]
-  file_prediction("<a href='https://github.com/openproblems-bio/task_grn_inference#file-format-NA'>NA</a>")
+  file_prediction_h5ad("<a href='https://github.com/openproblems-bio/task_grn_inference#file-format-NA'>NA</a>")
   comp_metric_regression[/"<a href='https://github.com/openproblems-bio/task_grn_inference#component-type-metric-regression'>metric_regression</a>"/]
   comp_metric_ws[/"<a href='https://github.com/openproblems-bio/task_grn_inference#component-type-ws-distance'>ws_distance</a>"/]
   comp_metric[/"<a href='https://github.com/openproblems-bio/task_grn_inference#component-type-metrics'>metrics</a>"/]
-  file_score("<a href='https://github.com/openproblems-bio/task_grn_inference#file-format-NA'>NA</a>")
+  file_score_h5ad("<a href='https://github.com/openproblems-bio/task_grn_inference#file-format-NA'>NA</a>")
   file_evaluation_h5ad("<a href='https://github.com/openproblems-bio/task_grn_inference#file-format-NA'>NA</a>")
   file_rna_h5ad("<a href='https://github.com/openproblems-bio/task_grn_inference#file-format-multiomics-rna'>multiomics rna</a>")
-  comp_method_r[/"<a href='https://github.com/openproblems-bio/task_grn_inference#component-type-method-r'>Method r</a>"/]
   file_atac_h5ad-.-comp_method
-  comp_method-.->file_prediction
-  file_prediction---comp_metric_regression
-  file_prediction---comp_metric_ws
-  file_prediction---comp_metric
-  comp_metric_regression-->file_score
-  comp_metric_ws-->file_score
-  comp_metric-->file_score
-  file_evaluation_h5ad-.-comp_metric_regression
+  comp_method-.->file_prediction_h5ad
+  file_prediction_h5ad---comp_metric_regression
+  file_prediction_h5ad---comp_metric_ws
+  file_prediction_h5ad---comp_metric
+  comp_metric_regression-->file_score_h5ad
+  comp_metric_ws-->file_score_h5ad
+  comp_metric-->file_score_h5ad
+  file_evaluation_h5ad---comp_metric_regression
   file_rna_h5ad---comp_method
-  comp_method_r-.->file_prediction
 ```
 
 ## File format: op_atac.h5ad
@@ -118,25 +116,27 @@ Arguments:
 
 <div class="small">
 
-| Name            | Type      | Description                                 |
-|:----------------|:----------|:--------------------------------------------|
-| `--rna`         | `file`    | RNA expression for multiomics data.         |
-| `--atac`        | `file`    | (*Optional*) Peak data for multiomics data. |
-| `--prediction`  | `file`    | (*Optional, Output*) GRN prediction.        |
-| `--tf_all`      | `file`    | (*Optional*) NA.                            |
-| `--max_n_links` | `integer` | (*Optional*) NA. Default: `50000`.          |
-| `--num_workers` | `integer` | (*Optional*) NA. Default: `4`.              |
-| `--temp_dir`    | `string`  | (*Optional*) NA. Default: `output/temdir`.  |
-| `--seed`        | `integer` | (*Optional*) NA. Default: `32`.             |
-| `--causal`      | `boolean` | (*Optional*) NA. Default: `TRUE`.           |
+| Name | Type | Description |
+|:---|:---|:---|
+| `--rna` | `file` | RNA expression for multiomics data. |
+| `--atac` | `file` | (*Optional*) Peak data for multiomics data. |
+| `--prediction` | `file` | (*Optional, Output*) File indicating the inferred GRN. |
+| `--tf_all` | `file` | (*Optional*) NA. Default: `resources_test/prior/tf_all.csv`. |
+| `--max_n_links` | `integer` | (*Optional*) NA. Default: `50000`. |
+| `--num_workers` | `integer` | (*Optional*) NA. Default: `4`. |
+| `--temp_dir` | `string` | (*Optional*) NA. Default: `output/temdir`. |
+| `--seed` | `integer` | (*Optional*) NA. Default: `32`. |
+| `--causal` | `boolean` | (*Optional*) NA. Default: `TRUE`. |
+| `--dataset_id` | `string` | (*Optional*) NA. Default: `op`. |
+| `--method_id` | `string` | (*Optional*) NA. Default: `grnboost2`. |
 
 </div>
 
-## File format: collectri.csv
+## File format: collectri.h5ad
 
 NA
 
-Example file: `resources_test/grn_models/op/collectri.csv`
+Example file: `resources/grn_models/op/collectri.h5ad`
 
 ## Component type: metric_regression
 
@@ -148,14 +148,14 @@ Arguments:
 
 | Name | Type | Description |
 |:---|:---|:---|
-| `--prediction` | `file` | GRN prediction. |
+| `--prediction` | `file` | File indicating the inferred GRN. |
 | `--score` | `file` | (*Output*) File indicating the score of a metric. |
 | `--method_id` | `string` | (*Optional*) NA. |
 | `--layer` | `string` | (*Optional*) NA. Default: `X_norm`. |
 | `--max_n_links` | `integer` | (*Optional*) NA. Default: `50000`. |
 | `--verbose` | `integer` | (*Optional*) NA. Default: `2`. |
 | `--dataset_id` | `string` | (*Optional*) NA. Default: `op`. |
-| `--evaluation_data` | `file` | (*Optional*) Perturbation dataset for benchmarking. |
+| `--evaluation_data` | `file` | Perturbation dataset for benchmarking. |
 | `--tf_all` | `file` | NA. |
 | `--reg_type` | `string` | (*Optional*) NA. Default: `ridge`. |
 | `--subsample` | `integer` | (*Optional*) NA. Default: `-1`. |
@@ -175,7 +175,7 @@ Arguments:
 
 | Name | Type | Description |
 |:---|:---|:---|
-| `--prediction` | `file` | GRN prediction. |
+| `--prediction` | `file` | File indicating the inferred GRN. |
 | `--score` | `file` | (*Output*) File indicating the score of a metric. |
 | `--method_id` | `string` | (*Optional*) NA. |
 | `--layer` | `string` | (*Optional*) NA. Default: `X_norm`. |
@@ -198,7 +198,7 @@ Arguments:
 
 | Name | Type | Description |
 |:---|:---|:---|
-| `--prediction` | `file` | GRN prediction. |
+| `--prediction` | `file` | File indicating the inferred GRN. |
 | `--score` | `file` | (*Output*) File indicating the score of a metric. |
 | `--method_id` | `string` | (*Optional*) NA. |
 | `--layer` | `string` | (*Optional*) NA. Default: `X_norm`. |
@@ -249,21 +249,3 @@ Data structure:
 
 </div>
 
-## Component type: Method r
-
-A GRN inference method
-
-Arguments:
-
-<div class="small">
-
-| Name            | Type      | Description                                |
-|:----------------|:----------|:-------------------------------------------|
-| `--rna_r`       | `file`    | (*Optional*) NA.                           |
-| `--atac_r`      | `file`    | (*Optional*) NA.                           |
-| `--prediction`  | `file`    | (*Optional, Output*) GRN prediction.       |
-| `--temp_dir`    | `string`  | (*Optional*) NA. Default: `output/temdir`. |
-| `--num_workers` | `integer` | (*Optional*) NA. Default: `4`.             |
-
-</div>
-
diff --git a/_viash.yaml b/_viash.yaml
@@ -15,7 +15,7 @@ links:
 label: GRN Inference
 summary: |
   Benchmarking GRN inference methods
-  The full documentation is hosted on [ReadTheDocs](https://grn-inference-benchmarking.readthedocs.io/en/latest/index.html). 
+
 description: |
   GRNs are essential for understanding cellular identity and behavior. They are simplified models of gene expression regulated by complex processes involving multiple layers of control, from transcription to post-transcriptional modifications, incorporating various regulatory elements and non-coding RNAs. Gene transcription is controlled by a regulatory complex that includes transcription factors (TFs), cis-regulatory elements (CREs) like promoters and enhancers, and essential co-factors. High-throughput datasets, covering thousands of genes, facilitate the use of machine learning approaches to decipher GRNs. The advent of single-cell sequencing technologies, such as scRNA-seq, has made it possible to infer GRNs from a single experiment due to the abundance of samples. This allows researchers to infer condition-specific GRNs, such as for different cell types or diseases, and study potential regulatory factors associated with these conditions. Combining chromatin accessibility data with gene expression measurements has led to the development of enhancer-driven GRN (eGRN) inference pipelines, which offer significantly improved accuracy over single-modality methods.
 
@@ -26,11 +26,12 @@ description: |
   So far, ten GRN inference methods have been integrated: five single-omics methods of GRNBoost2, GENIE3, Portia, PPCOR, and Scenic; and five eGRN inference methods of Scenic+, CellOracle, FigR, scGLUE, and GRaNIE.
 
   Due to its flexible nature, the platform can incorporate various benchmark datasets and evaluation methods, using either prior knowledge or feature-based approaches. 
-  In the current version, due to the absence of standardized prior knowledge, we use indirect approaches to benchmark GRNs. Employing interventional data as evaluation datasets, we have developed 8 metrics using feature-based approach and Wasserstein distance, accounting for both accuracy and comprehensiveness.
+  In the current version, due to the absence of standardized prior knowledge, we use indirect approaches to benchmark GRNs. Employing interventional data as evaluation datasets, 
+  we have developed 8 metrics using feature-based approach and Wasserstein distance, accounting for both accuracy and comprehensiveness.
 
   Five datasets have been integrated so far, namely OPSCA, Nakatake, Norman, Adamson, and Replogle. For each dataset, standardized inference datasets are provided to be used for GRN inference and evaluation datasets are employed to benchmark.
-  See our publication for the details of methods. 
 
+  See our publication for the details of methods. 
 info:
   image: thumbnail.svg # todo: create a thumbnail
   test_resources:
@@ -59,17 +60,25 @@ info:
 
     cd task_grn_inference
     
-    # download resources
+    # download resources 
+    To interact with the framework, you should download the resources containing the necessary inference and evaluation datasets to get started.
+    ```bash
     scripts/download_resources.sh
     ```
-    The datasets for GRN inference are located in `resources/inference_datasets`. 
+
     ## Infer a GRN 
-    One GRN should be inferred for each inference dataset (op, norman, replogle2, adamson, and nakatake). The inferred GRN should have three columns of `source, target, weight`. See `resources/grn_models/op/grnboost2.csv` as an example.
     
+    To infer a GRN for a given dataset (e.g. `norman`) using simple Pearson correlation:
+    ```bash
+    viash run src/control_methods/pearson_corr/config.vsh.yaml -- \
+              --rna resources/inference_datasets/norman_rna.h5ad --prediction output/net.h5ad
+    ```
+
+
     ## Evaluate a GRN
-    Once a GRN is inferred (e.g. located in `output/your_GRN.csv`) for a given dataset (e.g. `norman`), use the following code to obtain evaluation scores. 
+    Once you have the prediction for a given dataset, use the following code to obtain evaluation scores. 
     ```bash
-    scripts/calculate_score.sh output/your_GRN.csv norman
+    scripts/single_grn_evaluation.sh output/net.h5ad norman
     ```
     This will calculate and print the scores as well as output the scores into `output/score.h5ad`
   

diff --git a/scripts/add_a_method.sh b/scripts/add_a_method.sh
@@ -11,9 +11,8 @@ echo "Please run the script step-by-step."
 method_id="dummy"
 method_lang="python" # change this to "r" if need be
 
-viash run src/common/create_component/config.vsh.yaml -- \
-  --language "$method_lang" \
-  --name "$method_id"
+
+bash common/scripts/create_component --type method --language ${method_lang} --name ${method_id}
 
 # TODO: fill in required fields in src/methods/foo/config.vsh.yaml
 # TODO: edit src/methods/foo/script.py/R
@@ -26,17 +25,13 @@ viash test src/methods/$method_id/config.vsh.yaml
 viash run src/methods/$method_id/config.vsh.yaml -- \
   ---setup cachedbuild ---verbose
 
-# run the method (using h5ad as input)
+# run inference with the method on the op dataset using only RNA data. Add more arguments if needed.
 viash run src/methods/$method_id/config.vsh.yaml -- \
-  --multiomics_rna "resources/grn-benchmark/multiomics_rna.h5ad" \
-  --multiomics_atac "resources/grn-benchmark/multiomics_atac.h5ad" \
-  --output "output/prediction.csv"
-
-# run evaluation metric
-viash run src/metrics/regression_1/config.vsh.yaml -- \
-  --perturbation_file "resources/grn-benchmark/perturbation_file.h5ad" \
-  --prediction "output/prediction.csv" \
-  --output "output/score.csv"
-
-# print score on kaggle test dataset
-python -c 'import pandas as pd; print(pd.read_csv("output/score.csv"))'
+  --rna "resources/inference_datasets/op_rna.h5ad" \
+  --prediction "output/prediction.h5ad"
+
+# run evaluation metrics
+bash scripts/calculate_score.sh output/prediction.h5ad op
+
+# print the score
+python -c 'import pandas as ad; print(ad.read_h5ad("output/score.h5ad"))'
diff --git a/scripts/calculate_score.sh → scripts/single_grn_evaluation.sh b/scripts/calculate_score.sh → scripts/single_grn_evaluation.sh
diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
@@ -24,6 +24,7 @@ arguments:
     required: false
     direction: input
     example: resources_test/prior/tf_all.csv
+    default: resources_test/prior/tf_all.csv
   - name: --max_n_links
     type: integer
     default: 50000

diff --git a/src/control_methods/pearson_corr/helper_local_runs.py b/src/control_methods/pearson_corr/helper_local_runs.py
@@ -0,0 +1,29 @@
+
+import argparse
+parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
+parser.add_argument('--rna', type=str, help='Path to the multiomics RNA file')
+parser.add_argument('--prediction', type=str, help='Path to the prediction file')
+parser.add_argument('--tf_all', type=str, help='Path to the tf_all')
+parser.add_argument('--num_workers', type=str, help='Number of cores')
+parser.add_argument('--max_n_links', type=str, help='Number of top links to retain')
+
+args = parser.parse_args()
+
+if args.max_n_links:
+    par['max_n_links'] = int(args.max_n_links)
+
+if args.rna:
+    par['rna'] = args.rna
+if args.prediction:
+    par['prediction'] = args.prediction
+if args.tf_all:
+    par['tf_all'] = args.tf_all
+if args.num_workers:
+    par['num_workers'] = args.num_workers
+
+os.makedirs(par['temp_dir'], exist_ok=True)
+
+meta = {
+    'resources_dir': 'src/utils'
+    }
+sys.path.append(meta["resources_dir"])
diff --git a/src/control_methods/pearson_corr/script.py b/src/control_methods/pearson_corr/script.py
@@ -3,73 +3,55 @@
 import pandas as pd
 import os
 import scanpy as sc 
+import sys
+import numpy as np
+
+# - whatever is between viash start and end will be replaced by Viash with the parameters from the config. file
+
 ## VIASH START
 par = {
-    'rna': 'resources/grn-benchmark/rna.h5ad',
+    'rna': 'resources/evaluation_datasets/op_rna.h5ad',
     'tf_all': 'resources/prior/tf_all.csv',
     'cell_type_specific': False,
     'max_n_links': 50000,
-    'prediction': 'output/pearson_net.csv',
-    "seed": 32,
-    'normalize': False,
-    'donor_specific': False,
-    'temp_dir': 'output/pearson_corr',
+    'prediction': 'output/pearson_net.h5ad',
     'apply_tf': True,
     'normalize': True}
 ## VIASH END
 
-
-import argparse
-parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
-parser.add_argument('--rna', type=str, help='Path to the multiomics RNA file')
-parser.add_argument('--prediction', type=str, help='Path to the prediction file')
-parser.add_argument('--tf_all', type=str, help='Path to the tf_all')
-parser.add_argument('--num_workers', type=str, help='Number of cores')
-parser.add_argument('--max_n_links', type=str, help='Number of top links to retain')
-
-args = parser.parse_args()
-
-if args.max_n_links:
-    par['max_n_links'] = int(args.max_n_links)
-
-if args.rna:
-    par['rna'] = args.rna
-
-if args.prediction:
-    par['prediction'] = args.prediction
-if args.tf_all:
-    par['tf_all'] = args.tf_all
-if args.num_workers:
-    par['num_workers'] = args.num_workers
-
-os.makedirs(par['temp_dir'], exist_ok=True)
-import sys
-
-try:
-    sys.path.append(meta["resources_dir"])
-except:
-    meta = {
-    'resources_dir': 'src/utils'
-    }
-    sys.path.append(meta["resources_dir"])
+sys.path.append(meta["resources_dir"])
 from util import corr_net
 
 
-def create_corr_net(par):
+def infer_net(par: dict) -> pd.DataFrame:
     print(par)
     print('Read data')
     adata = ad.read_h5ad(par["rna"])
-
-    X = adata.layers['X_norm']
-
-    # - corr
-    gene_names = adata.var_names.to_numpy()
+    try:
+        X = adata.layers['X_norm'].todense().A
+    except:
+        X = adata.X
+
+    # - remove genes with 0 standard deviation
+    gene_std = np.std(X, axis=0)
+    nonzero_std_genes = gene_std > 0
+    X = X[:, nonzero_std_genes]
+    # - get the net
+    gene_names = adata[:, nonzero_std_genes].var_names.to_numpy()
     grn = corr_net(X, gene_names, par)    
     return grn
 
-net = create_corr_net(par)
-
-print('Output GRN')
-net['weight'] = net['weight'].astype(str)
-output = ad.AnnData(X=None, uns={"method_id": par['method_id'], "dataset_id": par['dataset_id'], "prediction": net[["source", "target", "weight"]]})
-output.write(par['prediction'])
+if __name__ == '__main__':
+    net = infer_net(par)
+    # - format of net
+    '''
+        the net is a pandas dataframe with the following columns:
+            - source: the source gene of the interaction
+            - target: the target gene of the interaction
+            - weight: the weight of the interaction
+    '''
+
+    print('Output GRN')
+    net['weight'] = net['weight'].astype(str)
+    output = ad.AnnData(X=None, uns={"method_id": par['method_id'], "dataset_id": par['dataset_id'], "prediction": net[["source", "target", "weight"]]})
+    output.write(par['prediction'])