From b056d7438424ef3f259d4dcd810364e9d15a83b5 Mon Sep 17 00:00:00 2001 From: Jalil Nourisa Date: Thu, 6 Feb 2025 14:06:15 +0100 Subject: [PATCH] restructuring data storage and local workflows --- README.md | 6 +- _viash.yaml | 6 +- runs.ipynb | 125 ++++----- scripts/add_a_method.sh | 2 +- scripts/single_grn_evaluation.sh | 12 +- src/api/file_atac_h5ad.yaml | 2 +- src/api/file_evaluation_h5ad.yaml | 2 +- src/api/file_prediction_h5ad.yaml | 2 +- src/api/file_rna_h5ad.yaml | 2 +- .../negative_control/script.py | 2 +- src/control_methods/pearson_corr/script.py | 4 +- src/control_methods/pearson_corr/test.sh | 2 +- .../positive_control/script.py | 2 +- src/control_methods/positive_control/test.sh | 2 +- src/helper.py | 26 +- src/methods/multi_omics/celloracle/script.py | 4 +- src/methods/multi_omics/figr/script.R | 2 +- src/methods/multi_omics/scenicplus/test.sh | 2 +- src/methods/single_omics/ennet/script.R | 2 +- src/methods/single_omics/ennet/test.sh | 2 +- src/methods/single_omics/genie3/script.py | 2 +- src/methods/single_omics/genie3/test.sh | 2 +- src/methods/single_omics/grnboost2/script.py | 2 +- src/methods/single_omics/grnboost2/test.sh | 2 +- src/methods/single_omics/portia/script.py | 2 +- src/methods/single_omics/portia/test.sh | 2 +- src/methods/single_omics/ppcor/script.R | 2 +- src/methods/single_omics/scenic/script.py | 2 +- src/methods/single_omics/scenic/test.sh | 2 +- src/methods/single_omics/scgpt/run.sh | 4 +- src/methods/single_omics/scgpt/test.sh | 2 +- src/methods/single_omics/scprint/run.sh | 33 ++- src/methods/single_omics/scprint/script.py | 8 +- src/methods/single_omics/tigress/script.R | 2 +- src/methods/single_omics/tigress/test.sh | 4 +- src/metrics/all_metrics/helper.py | 12 +- src/metrics/all_metrics/script.py | 52 ++-- src/metrics/all_metrics/script_all.py | 12 +- src/metrics/regression_1/config.vsh.yaml | 2 +- .../regression_1/{main.py => helper.py} | 185 +------------- src/metrics/regression_1/run.sh | 4 +- src/metrics/regression_1/script.py | 148 +++++++---- src/metrics/regression_1/test.sh | 2 +- src/metrics/regression_2/consensus/script.py | 4 +- src/metrics/regression_2/main.py | 2 +- src/metrics/regression_2/run.sh | 6 +- src/metrics/regression_2/script.py | 6 +- src/metrics/regression_2/test.sh | 2 +- src/metrics/regression_2/theta_experiment.py | 4 +- src/metrics/script_all_experiment.py | 185 -------------- .../wasserstein/background_distance/run.sh | 15 ++ .../wasserstein/background_distance/script.py | 9 +- src/metrics/wasserstein/consensus/script.py | 4 +- src/metrics/wasserstein/run.sh | 4 +- src/metrics/wasserstein/script.py | 4 +- src/metrics/wasserstein/script_all.py | 10 +- .../gene_annotation/config.novsh.yaml | 2 +- .../gene_annotation/script.py | 2 +- .../peak_annotation/script.R | 4 +- .../format_data/config.novsh.yaml | 0 .../format_data/script.py | 4 +- .../format_resources_r/config.vsh.yaml | 0 .../format_resources_r/script.R | 0 .../multiome_matrix/config.vsh.yaml | 0 .../multiome_matrix/script.py | 0 .../config.novsh.yaml | 0 .../opsca => opsca_perturbation}/script.py | 58 +---- src/process_data/pereggrn/acquisition.py | 83 ++++++ src/process_data/pereggrn/run.sh | 13 + src/process_data/pereggrn/script.py | 123 +++++++++ src/process_data/replogle_k562_gwps/run.sh | 22 ++ src/process_data/replogle_k562_gwps/script.py | 153 +++++++++++ .../replogle_k562_gwps/sparsify_raw_counts.py | 8 + .../config.novsh.yaml | 0 .../batch_correction_evaluation/helper.py | 0 .../batch_correction_evaluation/script.py | 0 
.../batch_correction_scgen/config.novsh.yaml | 2 +- .../repo/batch_correction_scgen/script.py | 0 .../batch_correction_seurat/config.novsh.yaml | 2 +- .../repo/batch_correction_seurat/script.R | 0 src/process_data/test_data/run.sh | 10 +- src/process_data/test_data/script.py | 12 +- src/robustness_analysis/script_all.py | 8 +- src/utils/util.py | 53 +++- src/workflows/run_benchmark/config.novsh.yaml | 4 +- .../run_robustness_analysis/config.novsh.yaml | 2 +- .../benchmark/methods/script.py} | 68 ++--- .../benchmark/metrics/script.py | 241 ++++++++++++++++++ src/workflows_local/benchmark/run.sh | 48 ++++ src/workflows_local/run_all_local.sh | 28 ++ test.ipynb | 79 ++++++ 91 files changed, 1288 insertions(+), 693 deletions(-) rename src/metrics/regression_1/{main.py => helper.py} (51%) delete mode 100644 src/metrics/script_all_experiment.py create mode 100644 src/metrics/wasserstein/background_distance/run.sh rename src/process_data/{multiomics => op_multiomics}/format_data/config.novsh.yaml (100%) rename src/process_data/{multiomics => op_multiomics}/format_data/script.py (93%) rename src/process_data/{multiomics => op_multiomics}/format_resources_r/config.vsh.yaml (100%) rename src/process_data/{multiomics => op_multiomics}/format_resources_r/script.R (100%) rename src/process_data/{multiomics => op_multiomics}/multiome_matrix/config.vsh.yaml (100%) rename src/process_data/{multiomics => op_multiomics}/multiome_matrix/script.py (100%) rename src/process_data/{perturbation/opsca => opsca_perturbation}/config.novsh.yaml (100%) rename src/process_data/{perturbation/opsca => opsca_perturbation}/script.py (82%) create mode 100644 src/process_data/pereggrn/acquisition.py create mode 100644 src/process_data/pereggrn/run.sh create mode 100644 src/process_data/pereggrn/script.py create mode 100644 src/process_data/replogle_k562_gwps/run.sh create mode 100644 src/process_data/replogle_k562_gwps/script.py create mode 100644 src/process_data/replogle_k562_gwps/sparsify_raw_counts.py rename src/process_data/{perturbation => }/repo/batch_correction_evaluation/config.novsh.yaml (100%) rename src/process_data/{perturbation => }/repo/batch_correction_evaluation/helper.py (100%) rename src/process_data/{perturbation => }/repo/batch_correction_evaluation/script.py (100%) rename src/process_data/{perturbation => }/repo/batch_correction_scgen/config.novsh.yaml (96%) rename src/process_data/{perturbation => }/repo/batch_correction_scgen/script.py (100%) rename src/process_data/{perturbation => }/repo/batch_correction_seurat/config.novsh.yaml (93%) rename src/process_data/{perturbation => }/repo/batch_correction_seurat/script.R (100%) rename src/{methods/script_all.py => workflows_local/benchmark/methods/script.py} (79%) create mode 100644 src/workflows_local/benchmark/metrics/script.py create mode 100644 src/workflows_local/benchmark/run.sh create mode 100644 src/workflows_local/run_all_local.sh create mode 100644 test.ipynb diff --git a/README.md b/README.md index 8b0dcdba6..e77993827 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ flowchart TB Chromatin accessibility data -Example file: `resources_test/inference_datasets/op_atac.h5ad` +Example file: `resources_test/grn_benchmark/inference_datasets//op_atac.h5ad` Format: @@ -245,7 +245,7 @@ Data structure: Perturbation dataset for benchmarking. 
-Example file: `resources_test/evaluation_datasets/op_perturbation.h5ad` +Example file: `resources_test/grn_benchmark/evaluation_datasets//op_perturbation.h5ad` Format: @@ -275,7 +275,7 @@ Data structure: RNA expression data. -Example file: `resources_test/inference_datasets/op_rna.h5ad` +Example file: `resources_test/grn_benchmark/inference_datasets//op_rna.h5ad` Format: diff --git a/_viash.yaml b/_viash.yaml index 833be62e8..2ed8d9e2d 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -30,10 +30,10 @@ info: test_resources: - type: s3 path: s3://openproblems-data/resources_test/grn/inference_datasets/ - dest: resources_test/inference_datasets/ + dest: resources_test/grn_benchmark/inference_datasets// - type: s3 path: s3://openproblems-data/resources_test/grn/evaluation_datasets/ - dest: resources_test/evaluation_datasets/ + dest: resources_test/grn_benchmark/evaluation_datasets// - type: s3 path: s3://openproblems-data/resources_test/grn/prior/ dest: resources_test/prior/ @@ -67,7 +67,7 @@ info: ```bash viash run src/control_methods/pearson_corr/config.vsh.yaml -- \ - --rna resources/inference_datasets/norman_rna.h5ad --prediction output/net.h5ad + --rna resources/grn_benchmark/inference_datasets/norman_rna.h5ad --prediction output/net.h5ad ``` ## Evaluate a GRN diff --git a/runs.ipynb b/runs.ipynb index e27ee309a..4aabd2004 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -14,42 +14,48 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "delete: s3://openproblems-data/resources_test/grn/grn_models/op/granie.csv\n", - "delete: s3://openproblems-data/resources_test/grn/grn_models/op/scenicplus.csv\n", - "delete: s3://openproblems-data/resources_test/grn/grn_models/op/figr.csv\n", - "delete: s3://openproblems-data/resources_test/grn/grn_models/op/collectri.csv\n", - "delete: s3://openproblems-data/resources_test/grn/grn_models/op/celloracle.csv\n", - "delete: s3://openproblems-data/resources_test/grn/grn_models/op/scglue.csv\n", - "upload: resources_test/grn_models/op/collectri.h5ad to s3://openproblems-data/resources_test/grn/grn_models/op/collectri.h5ad\n", - "upload: resources_test/inference_datasets/op_atac.rds to s3://openproblems-data/resources_test/grn/inference_datasets/op_atac.rds\n", - "upload: resources_test/inference_datasets/op_rna.rds to s3://openproblems-data/resources_test/grn/inference_datasets/op_rna.rds\n", - "upload: resources_test/inference_datasets/op_atac.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_atac.h5ad\n", - "upload: resources_test/prior/regulators_consensus_adamson.json to s3://openproblems-data/resources_test/grn/prior/regulators_consensus_adamson.json\n", - "upload: resources_test/prior/regulators_consensus_norman.json to s3://openproblems-data/resources_test/grn/prior/regulators_consensus_norman.json\n", - "upload: resources_test/datasets_raw/adamson_sc_counts.h5ad to s3://openproblems-data/resources_test/grn/datasets_raw/adamson_sc_counts.h5ad\n", - "upload: resources_test/prior/op_consensus-num-regulators.json to s3://openproblems-data/resources_test/grn/prior/op_consensus-num-regulators.json\n", - "upload: resources_test/prior/regulators_consensus_op.json to s3://openproblems-data/resources_test/grn/prior/regulators_consensus_op.json\n", - "upload: resources_test/prior/regulators_consensus_nakatake.json to s3://openproblems-data/resources_test/grn/prior/regulators_consensus_nakatake.json\n", - "upload: 
resources_test/prior/regulators_consensus_replogle2.json to s3://openproblems-data/resources_test/grn/prior/regulators_consensus_replogle2.json\n", - "upload: resources_test/inference_datasets/op_rna.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_rna.h5ad\n", - "upload: resources_test/prior/tf_all.csv to s3://openproblems-data/resources_test/grn/prior/tf_all.csv\n", - "upload: resources_test/prior/ws_consensus_adamson.csv to s3://openproblems-data/resources_test/grn/prior/ws_consensus_adamson.csv\n", - "upload: resources_test/prior/ws_consensus_norman.csv to s3://openproblems-data/resources_test/grn/prior/ws_consensus_norman.csv\n", - "upload: resources_test/prior/ws_distance_background_adamson.csv to s3://openproblems-data/resources_test/grn/prior/ws_distance_background_adamson.csv\n", - "upload: resources_test/prior/skeleton.csv to s3://openproblems-data/resources_test/grn/prior/skeleton.csv\n", - "upload: resources_test/prior/ws_distance_background_norman.csv to s3://openproblems-data/resources_test/grn/prior/ws_distance_background_norman.csv\n" + "delete: s3://openproblems-data/resources/grn/datasets_raw/adamson_bulked.h5ad\n", + "delete: s3://openproblems-data/resources/grn/datasets_raw/norman_sc_counts.h5ad \n", + "delete: s3://openproblems-data/resources/grn/datasets_raw/norman_bulked.h5ad \n", + "delete: s3://openproblems-data/resources/grn/datasets_raw/adamson_sc_counts.h5ad \n", + "delete: s3://openproblems-data/resources/grn/datasets_raw/nakatake_bulked.h5ad \n", + "delete: s3://openproblems-data/resources/grn/datasets_raw/op_bulked.h5ad \n", + "delete: s3://openproblems-data/resources/grn/datasets_raw/replogle2_bulked.h5ad \n", + "delete: s3://openproblems-data/resources/grn/evaluation_datasets/adamson_perturbation.h5ad\n", + "delete: s3://openproblems-data/resources/grn/evaluation_datasets/nakatake_perturbation.h5ad\n", + "delete: s3://openproblems-data/resources/grn/evaluation_datasets/norman_perturbation.h5ad\n", + "delete: s3://openproblems-data/resources/grn/evaluation_datasets/op_perturbation.h5ad \n", + "delete: s3://openproblems-data/resources/grn/evaluation_datasets/replogle2_perturbation.h5ad\n", + "upload: resources/extended_data/adamson_bulk.h5ad to s3://openproblems-data/resources/grn/extended_data/adamson_bulk.h5ad\n", + "upload: resources/datasets_raw/nakatake.h5ad to s3://openproblems-data/resources/grn/datasets_raw/nakatake.h5ad\n", + "upload: resources/extended_data/norman_bulk.h5ad to s3://openproblems-data/resources/grn/extended_data/norman_bulk.h5ad\n", + "upload: resources/extended_data/nakatake_bulk.h5ad to s3://openproblems-data/resources/grn/extended_data/nakatake_bulk.h5ad\n", + "upload: resources/datasets_raw/adamson.h5ad to s3://openproblems-data/resources/grn/datasets_raw/adamson.h5ad\n", + "upload: resources/extended_data/replogle_bulk.h5ad to s3://openproblems-data/resources/grn/extended_data/replogle_bulk.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/adamson_bulk.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/adamson_bulk.h5ad\n", + "upload: resources/extended_data/op_perturbation_bulk.h5ad to s3://openproblems-data/resources/grn/extended_data/op_perturbation_bulk.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/nakatake_bulk.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/nakatake_bulk.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/norman_bulk.h5ad to 
s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/norman_bulk.h5ad\n", + "upload: resources/datasets_raw/norman.h5ad to s3://openproblems-data/resources/grn/datasets_raw/norman.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/adamson_sc.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/adamson_sc.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/replogle_bulk.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/replogle_bulk.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/op_bulk.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/op_bulk.h5ad\n", + "upload: resources/grn_benchmark/inference_datasets/adamson_rna.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/inference_datasets/adamson_rna.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/norman_sc.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/norman_sc.h5ad\n", + "upload: resources/grn_benchmark/inference_datasets/nakatake_rna.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/inference_datasets/nakatake_rna.h5ad\n", + "Completed 25.9 GiB/~223.1 GiB (16.3 MiB/s) with ~7 file(s) remaining (calculating...)\r" ] } ], "source": [ - "!aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete" + "!aws s3 sync resources/ s3://openproblems-data/resources/grn/ --delete" ] }, { @@ -60,11 +66,11 @@ "source": [ "# !aws s3 sync resources/ s3://openproblems-data/resources/grn/ --delete\n", "# !aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n", - "# !aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n", + "# !aws s3 sync resources/grn_benchmark/prior/ s3://openproblems-data/resources/grn/prior --delete\n", "# !aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete\n", "# !aws s3 sync resources/scores/ s3://openproblems-data/resources/grn/scores --delete\n", - "# !aws s3 sync resources/evaluation_datasets/ s3://openproblems-data/resources/grn/evaluation_datasets/ --delete\n", - "# !aws s3 sync resources/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" + "# !aws s3 sync resources/grn_benchmark/evaluation_datasets// s3://openproblems-data/resources/grn/evaluation_datasets/ --delete\n", + "# !aws s3 sync resources/grn_benchmark/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" ] }, { @@ -201,8 +207,8 @@ "!aws s3 sync s3://openproblems-data/resources/grn/ resources/ --delete\n", "# aws s3 sync s3://openproblems-data/resources/grn/results resources/results/ --delete\n", "# aws s3 sync s3://openproblems-data/resources/grn/grn_models resources/grn_models/\n", - "# aws s3 sync s3://openproblems-data/resources/grn/inference_datasets/ resources/inference_datasets/\n", - "# aws s3 sync s3://openproblems-data/resources/grn/evaluation_datasets/ resources/evaluation_datasets/" + "# aws s3 sync s3://openproblems-data/resources/grn/inference_datasets/ resources/grn_benchmark/inference_datasets/\n", + "# aws s3 sync s3://openproblems-data/resources/grn/evaluation_datasets/ resources/grn_benchmark/evaluation_datasets//" ] }, { @@ -212,7 +218,7 @@ "outputs": [], "source": [ "# !aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n", - "# !aws s3 sync resources/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ 
--delete" + "# !aws s3 sync resources/grn_benchmark/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" ] }, { @@ -224,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -236,7 +242,6 @@ "import anndata as ad \n", "import numpy as np\n", "import scanpy as sc \n", - "from src.exp_analysis.helper import plot_cumulative_density\n", "import matplotlib.pyplot as plt\n", "import sys \n", "import subprocess\n", @@ -250,7 +255,9 @@ "\n", "sys.path.append('../')\n", "from grn_benchmark.src.helper import surragate_names\n", - "from src.helper import *\n", + "from src.exp_analysis.helper import plot_cumulative_density\n", + "\n", + "# from src.helper import *\n", "par = {\n", " # 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'],\n", " 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'scglue', 'celloracle', 'scenicplus'],\n", @@ -291,36 +298,34 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "adata = ad.read_h5ad('resources/inference_datasets/norman_rna.h5ad')" + "import anndata as ad\n", + "adata = ad.read_h5ad('K562_gwps_raw_singlecell.h5ad', backed='r')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([[ 0., 0., 0., ..., 48., 0., 0.],\n", - " [ 0., 0., 0., ..., 61., 0., 0.],\n", - " [ 0., 0., 0., ..., 21., 0., 0.],\n", - " ...,\n", - " [ 0., 0., 0., ..., 47., 0., 0.],\n", - " [ 0., 0., 0., ..., 27., 0., 0.],\n", - " [ 0., 0., 0., ..., 49., 0., 0.]], dtype=float32)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'adata' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43madata\u001b[49m\u001b[38;5;241m.\u001b[39mobs\n", + "\u001b[0;31mNameError\u001b[0m: name 'adata' is not defined" + ] } ], - "source": [] + "source": [ + "adata.obs" + ] }, { "cell_type": "markdown", @@ -342,7 +347,7 @@ "Error occurred while submitting job for scprint: Command '['bash', 'scripts/sbatch/grn_inference.sh', 'python src/methods/single_omics/scprint/script.py --rna resources/inference_datasets/replogle2_rna.h5ad --prediction resources/grn_models/replogle2//scprint.csv --num_workers 10 ']' returned non-zero exit status 1.\n", "Return code: 1\n", "Command output: \u001b[92m→\u001b[0m connected lamindb: anonymous/test\n", - "{'rna': 'resources/inference_datasets/replogle2_rna.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'prediction': 'resources/grn_models/replogle2//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", + "{'rna': 'resources/inference_datasets/replogle2_rna.h5ad', 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'resources/grn_models/replogle2//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 
'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", "Downloading checkpoint\n", "Downloaded checkpoint to output/scprint//scprint.ckpt\n", "\n", @@ -361,7 +366,7 @@ "Error occurred while submitting job for scprint: Command '['bash', 'scripts/sbatch/grn_inference.sh', 'python src/methods/single_omics/scprint/script.py --rna resources/inference_datasets/norman_rna.h5ad --prediction resources/grn_models/norman//scprint.csv --num_workers 10 ']' returned non-zero exit status 1.\n", "Return code: 1\n", "Command output: \u001b[92m→\u001b[0m connected lamindb: anonymous/test\n", - "{'rna': 'resources/inference_datasets/norman_rna.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'prediction': 'resources/grn_models/norman//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", + "{'rna': 'resources/inference_datasets/norman_rna.h5ad', 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'resources/grn_models/norman//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", "Downloading checkpoint\n", "Downloaded checkpoint to output/scprint//scprint.ckpt\n", "X was not raw counts, using 'counts' layer\n", @@ -385,7 +390,7 @@ "Error occurred while submitting job for scprint: Command '['bash', 'scripts/sbatch/grn_inference.sh', 'python src/methods/single_omics/scprint/script.py --rna resources/inference_datasets/adamson_rna.h5ad --prediction resources/grn_models/adamson//scprint.csv --num_workers 10 ']' returned non-zero exit status 1.\n", "Return code: 1\n", "Command output: \u001b[92m→\u001b[0m connected lamindb: anonymous/test\n", - "{'rna': 'resources/inference_datasets/adamson_rna.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'prediction': 'resources/grn_models/adamson//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", + "{'rna': 'resources/inference_datasets/adamson_rna.h5ad', 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'resources/grn_models/adamson//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", "Downloading checkpoint\n", "Downloaded checkpoint to output/scprint//scprint.ckpt\n", "Dropping layers: KeysView(Layers with keys: X_norm)\n", @@ -404,7 +409,7 @@ "Error occurred while submitting job for scprint: Command '['bash', 'scripts/sbatch/grn_inference.sh', 'python src/methods/single_omics/scprint/script.py --rna resources/inference_datasets/nakatake_rna.h5ad --prediction resources/grn_models/nakatake//scprint.csv --num_workers 10 ']' returned non-zero exit status 1.\n", "Return code: 1\n", "Command output: \u001b[92m→\u001b[0m connected lamindb: anonymous/test\n", - "{'rna': 'resources/inference_datasets/nakatake_rna.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'prediction': 'resources/grn_models/nakatake//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", + "{'rna': 
'resources/inference_datasets/nakatake_rna.h5ad', 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'resources/grn_models/nakatake//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", "Downloading checkpoint\n", "Downloaded checkpoint to output/scprint//scprint.ckpt\n", "Dropping layers: KeysView(Layers with keys: X_norm)\n", @@ -423,7 +428,7 @@ "Error occurred while submitting job for scprint: Command '['bash', 'scripts/sbatch/grn_inference.sh', 'python src/methods/single_omics/scprint/script.py --rna resources/inference_datasets/op_rna.h5ad --prediction resources/grn_models/op//scprint.csv --num_workers 10 ']' returned non-zero exit status 1.\n", "Return code: 1\n", "Command output: \u001b[92m→\u001b[0m connected lamindb: anonymous/test\n", "{'rna': 'resources/inference_datasets/op_rna.h5ad', 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'resources/grn_models/op//scprint.csv', 'filtration': 'top-k', 'max_n_links': 50000, 'num_genes': 5000, 'max_cells': 1000, 'num_workers': '10', 'temp_dir': 'output/scprint/', 'method_id': 'scprint', 'dataset_id': 'op'}\n", "Downloading checkpoint\n", "Downloaded checkpoint to output/scprint//scprint.ckpt\n", "Dropping layers: KeysView(Layers with keys: X_norm, counts)\n", @@ -3833,7 +3838,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -3847,7 +3852,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.16" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/scripts/add_a_method.sh b/scripts/add_a_method.sh index 3b6a8faa8..e697fbf71 100644 --- a/scripts/add_a_method.sh +++ b/scripts/add_a_method.sh @@ -27,7 +27,7 @@ viash run src/methods/$method_id/config.vsh.yaml -- \ # run the inference using the method for the op dataset using only RNA data. Add more arguments if needed. 
viash run src/methods/$method_id/config.vsh.yaml -- \ - --rna "resources/inference_datasets/op_rna.h5ad" \ + --rna "resources/grn_benchmark/inference_datasets/op_rna.h5ad" \ --prediction "output/prediction.h5ad" # run evaluation metrics diff --git a/scripts/single_grn_evaluation.sh b/scripts/single_grn_evaluation.sh index 0be7f4bf1..65dc14902 100644 --- a/scripts/single_grn_evaluation.sh +++ b/scripts/single_grn_evaluation.sh @@ -7,9 +7,9 @@ viash run src/metrics/all_metrics/config.novsh.yaml -- \ --prediction ${prediction} \ --dataset_id ${dataset_id} \ --score output/score.h5ad \ - --tf_all resources/prior/tf_all.csv \ - --regulators_consensus resources/prior/regulators_consensus_${dataset_id}.json \ - --ws_consensus resources/prior/ws_consensus_${dataset_id}.csv \ - --ws_distance_background resources/prior/ws_distance_background_${dataset_id}.csv \ - --evaluation_data_sc resources/evaluation_datasets/${dataset_id}_sc_counts.h5ad \ - --evaluation_data resources/evaluation_datasets/${dataset_id}_perturbation.h5ad \ No newline at end of file + --tf_all resources/grn_benchmark/prior/tf_all.csv \ + --regulators_consensus resources/grn_benchmark/prior/regulators_consensus_${dataset_id}.json \ + --ws_consensus resources/grn_benchmark/prior/ws_consensus_${dataset_id}.csv \ + --ws_distance_background resources/grn_benchmark/prior/ws_distance_background_${dataset_id}.csv \ + --evaluation_data_sc resources/grn_benchmark/evaluation_datasets/${dataset_id}_sc_counts.h5ad \ + --evaluation_data resources/grn_benchmark/evaluation_datasets/${dataset_id}_perturbation.h5ad \ No newline at end of file diff --git a/src/api/file_atac_h5ad.yaml b/src/api/file_atac_h5ad.yaml index d557d960e..e0a4aefd0 100644 --- a/src/api/file_atac_h5ad.yaml +++ b/src/api/file_atac_h5ad.yaml @@ -1,5 +1,5 @@ type: file -example: resources_test/inference_datasets/op_atac.h5ad +example: resources_test/grn_benchmark/inference_datasets/op_atac.h5ad label: chromatin accessibility data summary: "Chromatin accessibility data" info: diff --git a/src/api/file_evaluation_h5ad.yaml b/src/api/file_evaluation_h5ad.yaml index cd18d1ff8..871c6af1d 100644 --- a/src/api/file_evaluation_h5ad.yaml +++ b/src/api/file_evaluation_h5ad.yaml @@ -1,5 +1,5 @@ type: file -example: resources_test/evaluation_datasets/op_perturbation.h5ad +example: resources_test/grn_benchmark/evaluation_datasets/op_bulk.h5ad label: perturbation data summary: "Perturbation dataset for benchmarking." diff --git a/src/api/file_prediction_h5ad.yaml b/src/api/file_prediction_h5ad.yaml index a14ebe0f1..dff343fbe 100644 --- a/src/api/file_prediction_h5ad.yaml +++ b/src/api/file_prediction_h5ad.yaml @@ -15,7 +15,7 @@ info: name: method_id description: "A unique identifier for the inference method" required: true - - type: DataFrame + - type: object name: prediction description: "Inferred GRNs in the format of source, target, weight" required: true diff --git a/src/api/file_rna_h5ad.yaml b/src/api/file_rna_h5ad.yaml index 8d70d6e34..74e981370 100644 --- a/src/api/file_rna_h5ad.yaml +++ b/src/api/file_rna_h5ad.yaml @@ -1,5 +1,5 @@ type: file -example: resources_test/inference_datasets/op_rna.h5ad +example: resources_test/grn_benchmark/inference_datasets/op_rna.h5ad label: gene expression data summary: "RNA expression data."
info: diff --git a/src/control_methods/negative_control/script.py b/src/control_methods/negative_control/script.py index d3e6c13c3..e23381654 100644 --- a/src/control_methods/negative_control/script.py +++ b/src/control_methods/negative_control/script.py @@ -8,7 +8,7 @@ par = { "rna": "resources/grn-benchmark/rna.h5ad", "prediction": "resources/grn_models/default/negative_control.csv", - "tf_all": "resources/prior/tf_all.csv", + "tf_all": "resources/grn_benchmark/prior/tf_all.csv", "max_n_links": 50000 } ## VIASH END diff --git a/src/control_methods/pearson_corr/script.py b/src/control_methods/pearson_corr/script.py index 283262da0..ff2026969 100644 --- a/src/control_methods/pearson_corr/script.py +++ b/src/control_methods/pearson_corr/script.py @@ -10,8 +10,8 @@ ## VIASH START par = { - 'rna': 'resources/evaluation_datasets/op_rna.h5ad', - 'tf_all': 'resources/prior/tf_all.csv', + 'rna': 'resources/grn_benchmark/evaluation_datasets//op_rna.h5ad', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'cell_type_specific': False, 'max_n_links': 50000, 'prediction': 'output/pearson_net.h5ad', diff --git a/src/control_methods/pearson_corr/test.sh b/src/control_methods/pearson_corr/test.sh index 14cf51c2c..3f76b005d 100644 --- a/src/control_methods/pearson_corr/test.sh +++ b/src/control_methods/pearson_corr/test.sh @@ -1,4 +1,4 @@ viash run src/control_methods/pearson/config.vsh.yaml -- \ --prediction output/baseline_corr.csv \ --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ No newline at end of file + --tf_all resources/grn_benchmark/prior/tf_all.csv \ No newline at end of file diff --git a/src/control_methods/positive_control/script.py b/src/control_methods/positive_control/script.py index 2d60ef0c4..68269d2e8 100644 --- a/src/control_methods/positive_control/script.py +++ b/src/control_methods/positive_control/script.py @@ -12,7 +12,7 @@ ## VIASH START par = { 'rna': 'resources/grn-benchmark/rna.h5ad', - 'tf_all': 'resources/prior/tf_all.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'max_n_links': 50000, 'prediction': 'resources/grn_models/positive_control.csv', "seed": 32, diff --git a/src/control_methods/positive_control/test.sh b/src/control_methods/positive_control/test.sh index 884e70266..1cf9ad889 100644 --- a/src/control_methods/positive_control/test.sh +++ b/src/control_methods/positive_control/test.sh @@ -1,5 +1,5 @@ viash run src/control_methods/baseline_corr/config.vsh.yaml -- \ --prediction output/baseline_corr.csv \ --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ + --tf_all resources/grn_benchmark/prior/tf_all.csv \ --causal true diff --git a/src/helper.py b/src/helper.py index 8beaf81de..844bbff76 100644 --- a/src/helper.py +++ b/src/helper.py @@ -40,12 +40,12 @@ def analyse_meta_cells(task_grn_inference_dir): par = { - 'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad', - "evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad", + 'rna': f'{task_grn_inference_dir}/resources/grn_benchmark/inference_datasets/{dataset}_rna.h5ad', + "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", 'layer': 'X_norm', - 'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json', - 'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv', + 'consensus': 
f'{task_grn_inference_dir}/resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', + 'tf_all': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/tf_all.csv', 'apply_tf': True, 'apply_skeleton': False, 'verbose': 2, @@ -123,12 +123,12 @@ def analyse_imputation(task_grn_inference_dir): par = { - 'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad', - "evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad", + 'rna': f'{task_grn_inference_dir}/resources/grn_benchmark/inference_datasets/{dataset}_rna.h5ad', + "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", 'layer': 'X_norm', - 'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json', - 'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv', + 'consensus': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', + 'tf_all': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/tf_all.csv', 'apply_tf': True, 'apply_skeleton': False, 'verbose': 2, @@ -204,12 +204,12 @@ def analyse_imputation(task_grn_inference_dir): def analyse_corr_vs_tfmasked_corr(task_grn_inference_dir): for i_run, dataset in enumerate(['op', 'replogle2', 'nakatake', 'norman', 'adamson']): par = { - 'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad', - "evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad", + 'rna': f'{task_grn_inference_dir}/resources/grn_benchmark/inference_datasets/{dataset}_rna.h5ad', + "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", 'layer': 'X_norm', - 'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json', - 'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv', + 'consensus': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', + 'tf_all': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/tf_all.csv', 'apply_skeleton': False, 'verbose': 2, 'max_n_links': 50_000, @@ -342,7 +342,7 @@ def create_skeleton(): skeleton = skeleton.drop_duplicates().reset_index(drop=True) skeleton.nunique() all_links = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str) - np.savetxt('resources/prior/skeleton.csv', all_links.values, fmt='%s') + np.savetxt('resources/grn_benchmark/prior/skeleton.csv', all_links.values, fmt='%s') def merge_tf_motifs(): cols = ['chrom', 'chromStart', 'chromEnd', 'name'] diff --git a/src/methods/multi_omics/celloracle/script.py b/src/methods/multi_omics/celloracle/script.py index 6940de7d7..0ca2676b8 100644 --- a/src/methods/multi_omics/celloracle/script.py +++ b/src/methods/multi_omics/celloracle/script.py @@ -6,8 +6,8 @@ ## VIASH START par = { - "rna": "resources/inference_datasets/op_rna.h5ad", - "atac": "resources/inference_datasets/op_atac.h5ad", + "rna": "resources/grn_benchmark/inference_datasets/op_rna.h5ad", + "atac": "resources/grn_benchmark/inference_datasets/op_atac.h5ad", "base_grn": 'output/celloracle/base_grn.csv', "temp_dir": 'output/celloracle/', "num_workers": 10, diff --git a/src/methods/multi_omics/figr/script.R b/src/methods/multi_omics/figr/script.R index 98422b67d..bc96a5cda 100644 --- a/src/methods/multi_omics/figr/script.R +++ b/src/methods/multi_omics/figr/script.R @@ -11,7 +11,7 @@ 
par <- list( multiomics_rna_r = "resources/grn-benchmark/multiomics_rna.rds", multiomics_atac_r = "resources/grn-benchmark/multiomics_atac.rds", temp_dir = "output/figr/", - cell_topic = "resources/prior/cell_topic.csv", + cell_topic = "resources/grn_benchmark/prior/cell_topic.csv", num_workers = 20, n_topics = 48, peak_gene = "output/figr/peak_gene.csv", diff --git a/src/methods/multi_omics/scenicplus/test.sh b/src/methods/multi_omics/scenicplus/test.sh index 4139f543c..b1edb9119 100644 --- a/src/methods/multi_omics/scenicplus/test.sh +++ b/src/methods/multi_omics/scenicplus/test.sh @@ -1,2 +1,2 @@ -viash run src/methods/multi_omics/scenicplus/config.vsh.yaml -- --multiomics_atac ./resources_test/grn-benchmark/multiomics_atac.h5ad --multiomics_rna ./resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all ./resources/prior/tf_all.csv --prediction ./output/scenicplus/prediction.csv --cell_topic ./output/scenicplus/cell_topic.csv --scplus_mdata ./output/scenicplus/scplus_mdata.h5mu --temp_dir ./output/scenicplus/ +viash run src/methods/multi_omics/scenicplus/config.vsh.yaml -- --multiomics_atac ./resources_test/grn-benchmark/multiomics_atac.h5ad --multiomics_rna ./resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all ./resources/grn_benchmark/prior/tf_all.csv --prediction ./output/scenicplus/prediction.csv --cell_topic ./output/scenicplus/cell_topic.csv --scplus_mdata ./output/scenicplus/scplus_mdata.h5mu --temp_dir ./output/scenicplus/ #viash run src/methods/multi_omics/scenicplus/config.vsh.yaml -p docker -- ---setup build \ No newline at end of file diff --git a/src/methods/single_omics/ennet/script.R b/src/methods/single_omics/ennet/script.R index 64e5c076a..87453d799 100644 --- a/src/methods/single_omics/ennet/script.R +++ b/src/methods/single_omics/ennet/script.R @@ -5,7 +5,7 @@ library(dplyr) ## VIASH START par <- list( "multiomics_rna" = 'resources/resources_test/grn-benchmark/multiomics_rna.h5ad', - "tf_all" = 'resources/prior/tf_all.csv', + "tf_all" = 'resources/grn_benchmark/prior/tf_all.csv', "prediction" = 'output/ennet/prediction.csv', "temp_dir": 'output/ennet', "max_n_links": 50000, diff --git a/src/methods/single_omics/ennet/test.sh b/src/methods/single_omics/ennet/test.sh index e9778e232..a55ad42f7 100644 --- a/src/methods/single_omics/ennet/test.sh +++ b/src/methods/single_omics/ennet/test.sh @@ -1 +1 @@ -viash run src/methods/single_omics/ennet/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/prior/tf_all.csv --prediction output/ennet/prediction.csv \ No newline at end of file +viash run src/methods/single_omics/ennet/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/grn_benchmark/prior/tf_all.csv --prediction output/ennet/prediction.csv \ No newline at end of file diff --git a/src/methods/single_omics/genie3/script.py b/src/methods/single_omics/genie3/script.py index 2f2ceca64..19fc30f81 100644 --- a/src/methods/single_omics/genie3/script.py +++ b/src/methods/single_omics/genie3/script.py @@ -12,7 +12,7 @@ ## VIASH START par = { 'rna': 'resources/grn-benchmark/rna_d0_hvg.h5ad', - "tf_all": 'resources/prior/tf_all.csv', + "tf_all": 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'output/genie3_donor0_hvg.csv', 'max_n_links': 50000, 'normalize': False diff --git a/src/methods/single_omics/genie3/test.sh b/src/methods/single_omics/genie3/test.sh index 345f3f140..221ec93d3 100644 --- a/src/methods/single_omics/genie3/test.sh +++ 
b/src/methods/single_omics/genie3/test.sh @@ -1 +1 @@ -viash run src/methods/single_omics/genie3/config.novsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/genie3/prediction.csv \ No newline at end of file +viash run src/methods/single_omics/genie3/config.novsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tfs resources/grn_benchmark/prior/tf_all.csv --prediction output/genie3/prediction.csv \ No newline at end of file diff --git a/src/methods/single_omics/grnboost2/script.py b/src/methods/single_omics/grnboost2/script.py index 63047564a..d128fc162 100644 --- a/src/methods/single_omics/grnboost2/script.py +++ b/src/methods/single_omics/grnboost2/script.py @@ -9,7 +9,7 @@ ## VIASH START par = { 'rna': 'resources/grn-benchmark/rna_d0_hvg.h5ad', - "tf_all": 'resources/prior/tf_all.csv', + "tf_all": 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'output/grnboost2_donor_0_hvg.csv', 'max_n_links': 50000, 'cell_type_specific': False, diff --git a/src/methods/single_omics/grnboost2/test.sh b/src/methods/single_omics/grnboost2/test.sh index abb5d5b06..f6222b758 100644 --- a/src/methods/single_omics/grnboost2/test.sh +++ b/src/methods/single_omics/grnboost2/test.sh @@ -1,4 +1,4 @@ viash run src/methods/single_omics/grnboost2/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ + --tf_all resources/grn_benchmark/prior/tf_all.csv \ --cell_type_specific false \ --prediction output/grnboost2/prediction.csv \ No newline at end of file diff --git a/src/methods/single_omics/portia/script.py b/src/methods/single_omics/portia/script.py index 9c186a5a3..030ac4bd0 100644 --- a/src/methods/single_omics/portia/script.py +++ b/src/methods/single_omics/portia/script.py @@ -12,7 +12,7 @@ ## VIASH START par = { 'rna': 'resources/grn-benchmark/rna.h5ad', - 'tf_all': 'resources/prior/tf_all.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'output/portia.csv', 'max_n_links': 50000, 'donor_specific': False, diff --git a/src/methods/single_omics/portia/test.sh b/src/methods/single_omics/portia/test.sh index f46994b44..7e8d6c82e 100644 --- a/src/methods/single_omics/portia/test.sh +++ b/src/methods/single_omics/portia/test.sh @@ -1,4 +1,4 @@ viash run src/methods/single_omics/portia/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ + --tf_all resources/grn_benchmark/prior/tf_all.csv \ --prediction output/portia/prediction.csv \ --cell_type_specific false \ No newline at end of file diff --git a/src/methods/single_omics/ppcor/script.R b/src/methods/single_omics/ppcor/script.R index c7eeed743..dffdc1b92 100644 --- a/src/methods/single_omics/ppcor/script.R +++ b/src/methods/single_omics/ppcor/script.R @@ -6,7 +6,7 @@ library(dplyr) par <- list( rna = 'resources/grn-benchmark/rna.h5ad', prediction = 'resources/grn_models/ppcor.csv', - tf_all = 'resources/prior/tf_all.csv', + tf_all = 'resources/grn_benchmark/prior/tf_all.csv', max_n_links = 50000 ) ## VIASH END diff --git a/src/methods/single_omics/scenic/script.py b/src/methods/single_omics/scenic/script.py index 4433ca8dd..e5fbccb4f 100644 --- a/src/methods/single_omics/scenic/script.py +++ b/src/methods/single_omics/scenic/script.py @@ -19,7 +19,7 @@ ## VIASH START par = { 'rna': 'resources/grn-benchmark/rna_0.h5ad', - "tf_all": 'resources/prior/tf_all.csv', + "tf_all": 
'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'output/scenic_0_hvgs.csv', 'temp_dir': 'output/scenic', 'num_workers': 20, diff --git a/src/methods/single_omics/scenic/test.sh b/src/methods/single_omics/scenic/test.sh index f7af173e6..dd1069e0d 100644 --- a/src/methods/single_omics/scenic/test.sh +++ b/src/methods/single_omics/scenic/test.sh @@ -1 +1 @@ -viash run src/methods/single_omics/scenic/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/prior/tf_all.csv --prediction output/scenic_prediction.csv --temp_dir output/scenic \ No newline at end of file +viash run src/methods/single_omics/scenic/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/grn_benchmark/prior/tf_all.csv --prediction output/scenic_prediction.csv --temp_dir output/scenic \ No newline at end of file diff --git a/src/methods/single_omics/scgpt/run.sh b/src/methods/single_omics/scgpt/run.sh index 1021776d6..7b409ad1f 100644 --- a/src/methods/single_omics/scgpt/run.sh +++ b/src/methods/single_omics/scgpt/run.sh @@ -1,4 +1,4 @@ viash run src/methods/single_omics/scgpt/config.vsh.yaml -- \ - --rna resources_test/inference_datasets/op_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ + --rna resources_test/grn_benchmark/inference_datasets//op_rna.h5ad \ + --tf_all resources/grn_benchmark/prior/tf_all.csv \ --prediction output/prediction.h5ad \ No newline at end of file diff --git a/src/methods/single_omics/scgpt/test.sh b/src/methods/single_omics/scgpt/test.sh index 1adaf9cf5..607f3f27e 100644 --- a/src/methods/single_omics/scgpt/test.sh +++ b/src/methods/single_omics/scgpt/test.sh @@ -1 +1 @@ -viash run src/methods/single_omics/portia/config.novsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/portia/prediction.csv \ No newline at end of file +viash run src/methods/single_omics/portia/config.novsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tfs resources/grn_benchmark/prior/tf_all.csv --prediction output/portia/prediction.csv \ No newline at end of file diff --git a/src/methods/single_omics/scprint/run.sh b/src/methods/single_omics/scprint/run.sh index de532b91b..e8988169d 100644 --- a/src/methods/single_omics/scprint/run.sh +++ b/src/methods/single_omics/scprint/run.sh @@ -1,10 +1,31 @@ +#!/bin/bash +#SBATCH --job-name=scprint +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=10 +#SBATCH --time=10:00:00 +#SBATCH --mem=100GB +#SBATCH --partition=cpu +#SBATCH --mail-type=END,FAIL +#SBATCH --mail-user=jalil.nourisa@gmail.com + + + # viash run src/methods/single_omics/scprint/config.vsh.yaml -- \ -# --rna resources_test/inference_datasets/op_rna.h5ad \ -# --tf_all resources/prior/tf_all.csv \ +# --rna resources_test/grn_benchmark/inference_datasets//op_rna.h5ad \ +# --tf_all resources/grn_benchmark/prior/tf_all.csv \ +# --prediction output/prediction.h5ad + + +# python src/methods/single_omics/scprint/script.py \ +# --rna resources/grn_benchmark/inference_datasets/op_rna.h5ad \ +# --tf_all resources/grn_benchmark/prior/tf_all.csv \ # --prediction output/prediction.h5ad +# Exit immediately if a command exits with a non-zero status +set -e +source ~/miniconda3/bin/activate scprint -python src/methods/single_omics/scprint/script.py \ - --rna resources/inference_datasets/op_rna.h5ad \ - --tf_all resources/prior/tf_all.csv \ - --prediction output/prediction.h5ad 
\ No newline at end of file +python src/workflows_local/benchmark/methods/script.py +python src/workflows_local/benchmark/metrics/script.py \ No newline at end of file diff --git a/src/methods/single_omics/scprint/script.py b/src/methods/single_omics/scprint/script.py index 5dd3664e0..6ff098cf1 100644 --- a/src/methods/single_omics/scprint/script.py +++ b/src/methods/single_omics/scprint/script.py @@ -28,8 +28,8 @@ ## VIASH START par = { - 'rna': 'resources/inference_datasets/op_rna.h5ad', - 'tf_all': 'resources/prior/tf_all.csv', + 'rna': 'resources/grn_benchmark/inference_datasets/op_rna.h5ad', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'prediction': 'output/grn.h5ad', 'filtration': 'top-k', 'max_n_links': 50_000, @@ -148,12 +148,12 @@ def main(par): if __name__ == '__main__': - if True: #TODO:remove this + if False: #TODO:remove this populate_my_ontology( organisms= ["NCBITaxon:9606"] ) par['checkpoint'] = par['temp_dir'] + '/scprint.ckpt' - if True: + if False: os.makedirs(par['temp_dir'], exist_ok=True) print(f"Downloading checkpoint") diff --git a/src/methods/single_omics/tigress/script.R b/src/methods/single_omics/tigress/script.R index b706da454..abe4b181c 100644 --- a/src/methods/single_omics/tigress/script.R +++ b/src/methods/single_omics/tigress/script.R @@ -5,7 +5,7 @@ library(dplyr) ## VIASH START par <- list( multiomics_rna = 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad', - tf_all = 'resources/prior/tf_all.csv', + tf_all = 'resources/grn_benchmark/prior/tf_all.csv', prediction = 'output/tigress_d0_hvg.csv', temp_dir = 'output/tigress', max_n_links = 50000, diff --git a/src/methods/single_omics/tigress/test.sh b/src/methods/single_omics/tigress/test.sh index 637824103..02e52d780 100644 --- a/src/methods/single_omics/tigress/test.sh +++ b/src/methods/single_omics/tigress/test.sh @@ -1,2 +1,2 @@ -#viash run src/methods/single_omics/tigress/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/prior/tf_all.csv --prediction output/tigress/prediction.csv -viash run src/methods/single_omics/tigress/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tf_all resources/prior/tf_all.csv --prediction output/tigress/prediction.csv \ No newline at end of file +#viash run src/methods/single_omics/tigress/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/grn_benchmark/prior/tf_all.csv --prediction output/tigress/prediction.csv +viash run src/methods/single_omics/tigress/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tf_all resources/grn_benchmark/prior/tf_all.csv --prediction output/tigress/prediction.csv \ No newline at end of file diff --git a/src/metrics/all_metrics/helper.py b/src/metrics/all_metrics/helper.py index 65e975dd1..15ef4533b 100644 --- a/src/metrics/all_metrics/helper.py +++ b/src/metrics/all_metrics/helper.py @@ -37,12 +37,12 @@ def run_consensus(datasets): for dataset in datasets: par = { 'models': models, - 'evaluation_data': f'resources/evaluation_datasets/{dataset}_perturbation.h5ad', + 'evaluation_data': f'resources/grn_benchmark/evaluation_datasets/{dataset}_perturbation.h5ad', 'evaluation_data_sc': f'resources/datasets_raw/{dataset}_sc_counts.h5ad', 'models_dir': f'resources/grn_models/{dataset}/', - 'regulators_consensus': f'resources/prior/regulators_consensus_{dataset}.json', - 'ws_consensus': f'resources/prior/ws_consensus_{dataset}.csv', - 'tf_all': 'resources/prior/tf_all.csv', + 'regulators_consensus': 
f'resources/grn_benchmark/prior/regulators_consensus_{dataset}.json', + 'ws_consensus': f'resources/grn_benchmark/prior/ws_consensus_{dataset}.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', } # - reg2 consensus @@ -57,8 +57,8 @@ def run_ws_distance_background(datasets): for dataset in datasets: par = { 'evaluation_data_sc': f'resources/datasets_raw/{dataset}_sc_counts.h5ad', - 'background_distance': f'resources/prior/ws_distance_background_{dataset}.csv', - 'tf_all': 'resources/prior/tf_all.csv', + 'background_distance': f'resources/grn_benchmark/prior/ws_distance_background_{dataset}.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'layer': 'X_norm' } print(f'--run ws distance background --{dataset}') diff --git a/src/metrics/all_metrics/script.py b/src/metrics/all_metrics/script.py index 986ce6961..ede22eb6c 100644 --- a/src/metrics/all_metrics/script.py +++ b/src/metrics/all_metrics/script.py @@ -3,16 +3,27 @@ import sys import numpy as np import os +import argparse +import scanpy as sc +argparser = argparse.ArgumentParser() +argparser.add_argument('--run_local', action='store_true', help='Run locally') +argparser.add_argument('--prediction', help='Path to the GRN prediction file') +argparser.add_argument('--evaluation_dataset') +argparser.add_argument('--evaluation_dataset_sc') +argparser.add_argument('--regulators_consensus') +argparser.add_argument('--ws_consensus') +argparser.add_argument('--ws_distance_background') +argparser.add_argument('--method_id', help='Method ID') +argparser.add_argument('--dataset_id', help='Dataset ID') +argparser.add_argument('--score', help='Where to store scores') + +par_local = vars(argparser.parse_args()) ## VIASH START par = { - 'prediction': f'resources/grn_models/norman/grnboost2.csv', - 'method_id': 'grnboost2', - - "tf_all": f"resources/prior/tf_all.csv", - 'skeleton': f'resources/prior/skeleton.csv', - 'dataset_id': 'norman', + "tf_all": f"resources/grn_benchmark/prior/tf_all.csv", + 'skeleton': f'resources/grn_benchmark/prior/skeleton.csv', 'layer': 'X_norm', "apply_tf": True, 'subsample': -1, @@ -22,13 +33,24 @@ 'max_n_links': 50000, 'apply_skeleton': False, 'reg_type':'ridge', - 'score': 'output/score.h5ad' } ## VIASH END -sys.path.append(meta["resources_dir"]) -from reg1_main import main as main_reg1 -from reg2_main import main as main_reg2 -from ws_main import main as main_ws +if par_local['run_local']: + for key in par_local: + par[key] = par_local[key] + + meta = { + "resources_dir": 'src/metrics/', + } + sys.path.append(meta["resources_dir"]) + from regression_1.script import main as main_reg1 + from regression_2.main import main as main_reg2 + from wasserstein.script import main as main_ws +else: + sys.path.append(meta["resources_dir"]) + from reg1_main import main as main_reg1 + from reg2_main import main as main_reg2 + from ws_main import main as main_ws # try: # sys.path.append(meta["resources_dir"]) @@ -56,11 +78,11 @@ def main(par): assert par['dataset_id'] dataset = par['dataset_id'] - # par['evaluation_data'] = f'resources/evaluation_datasets/{dataset}_perturbation.h5ad' + # par['evaluation_data'] = f'resources/grn_benchmark/evaluation_datasets/{dataset}_perturbation.h5ad' # par['evaluation_data_sc'] = f'resources/datasets_raw/{dataset}_sc_counts.h5ad' - # par['regulators_consensus'] = f'resources/prior/regulators_consensus_{dataset}.json' - # par['ws_consensus'] = f'resources/prior/ws_consensus_{dataset}.csv' - # par['ws_distance_background'] = f'resources/prior/ws_distance_background_{dataset}.csv' + 
# par['regulators_consensus'] = f'resources/grn_benchmark/prior/regulators_consensus_{dataset}.json' + # par['ws_consensus'] = f'resources/grn_benchmark/prior/ws_consensus_{dataset}.csv' + # par['ws_distance_background'] = f'resources/grn_benchmark/prior/ws_distance_background_{dataset}.csv' scores_all = [] diff --git a/src/metrics/all_metrics/script_all.py b/src/metrics/all_metrics/script_all.py index cb44dacdb..fd54ef756 100644 --- a/src/metrics/all_metrics/script_all.py +++ b/src/metrics/all_metrics/script_all.py @@ -15,8 +15,8 @@ par = { 'layer': 'X_norm', - "tf_all": "resources/prior/tf_all.csv", - 'skeleton': 'resources/prior/skeleton.csv', + "tf_all": "resources/grn_benchmark/prior/tf_all.csv", + 'skeleton': 'resources/grn_benchmark/prior/skeleton.csv', "apply_tf": True, 'subsample': -1, 'verbose': 4, @@ -146,13 +146,13 @@ def run_evaluation(dataset, models, models_dir, save_file_name): # def define_par(dataset): # par = { -# "evaluation_data": f"resources/evaluation_datasets/{dataset}_perturbation.h5ad", -# 'consensus': f'resources/prior/{dataset}_consensus-num-regulators.json', +# "evaluation_data": f"resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", +# 'consensus': f'resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', # 'layer': 'X_norm', -# "tf_all": "resources/prior/tf_all.csv", -# 'skeleton': 'resources/prior/skeleton.csv', +# "tf_all": "resources/grn_benchmark/prior/tf_all.csv", +# 'skeleton': 'resources/grn_benchmark/prior/skeleton.csv', # "apply_tf": True, # 'subsample': -1, # 'verbose': 4, diff --git a/src/metrics/regression_1/config.vsh.yaml b/src/metrics/regression_1/config.vsh.yaml index 81c51164a..6e5120eb5 100644 --- a/src/metrics/regression_1/config.vsh.yaml +++ b/src/metrics/regression_1/config.vsh.yaml @@ -16,7 +16,7 @@ arguments: resources: - type: python_script path: script.py - - path: main.py + - path: helper.py - path: /src/utils/util.py dest: util.py diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/helper.py similarity index 51% rename from src/metrics/regression_1/main.py rename to src/metrics/regression_1/helper.py index 488e8b257..13cee2572 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/helper.py @@ -1,3 +1,6 @@ +import pandas as pd +import anndata as ad +import sys import numpy as np from sklearn.metrics import r2_score from sklearn.pipeline import Pipeline @@ -6,16 +9,15 @@ import lightgbm import random from concurrent.futures import ThreadPoolExecutor -import pandas as pd -import anndata as ad from tqdm import tqdm -from util import verbose_print, process_links, verbose_tqdm from sklearn.multioutput import MultiOutputRegressor import os import warnings -warnings.filterwarnings("ignore", category=FutureWarning) - +def set_global_seed(seed): + np.random.seed(seed) + random.seed(seed) + lightgbm.LGBMRegressor().set_params(random_state=seed) class lightgbm_wrapper: @@ -91,7 +93,6 @@ def cross_validation(net, prturb_adata, par:dict): # fill the actual regulatory links X_df.loc[mask_gene_net, :] = net.values X = X_df.values.copy() - # run cv groups = cv_5(len(gene_names)) @@ -105,9 +106,6 @@ def cross_validation(net, prturb_adata, par:dict): Y = Y_df.values y_true = Y.copy() - # print(r2_score(y_true, y_pred, multioutput='variance_weighted')) - - unique_groups = np.unique(groups) # determine regressor reg_type = par['reg_type'] @@ -127,7 +125,9 @@ def cross_validation(net, prturb_adata, par:dict): raise ValueError(f"{reg_type} is not defined.") reg_models = [] + 
# initialize baselines n_baselines = 100 + unique_groups = np.unique(groups) y_pred_baselines = [y_pred.copy() for i in range(n_baselines)] for group in verbose_tqdm(unique_groups, "Processing groups", 2, par['verbose']): mask_va = groups == group @@ -156,14 +156,14 @@ def cross_validation(net, prturb_adata, par:dict): r2score_baselines = [] for y_pred_baseline in y_pred_baselines: r2score_baselines.append(r2_score(y_true, y_pred_baseline, multioutput='variance_weighted')) - r2score_n = r2score - np.mean(r2score_baselines) + r2score_n_all = r2score - np.mean(r2score_baselines) # - normalized r2 score - S2 r2score = r2_score(y_true[mask_gene_net, :], y_pred[mask_gene_net, :], multioutput='variance_weighted') r2score_baselines = [] for y_pred_baseline in y_pred_baselines: r2score_baselines.append(r2_score(y_true[mask_gene_net, :], y_pred_baseline[mask_gene_net, :], multioutput='variance_weighted')) - r2score_n_s2 = r2score - np.mean(r2score_baselines) + r2score_n_grn = r2score - np.mean(r2score_baselines) # - normalized r2 scores per sample r2score_samples = [] @@ -174,171 +174,12 @@ def cross_validation(net, prturb_adata, par:dict): r2score_baselines.append(r2_score(y_true[:, i_sample], y_pred_baseline[:, i_sample])) r2score_samples.append(score_sample - np.mean(r2score_baselines)) - results = { - 'r2score-aver': r2score_n, - 'r2score-aver-s2': r2score_n_s2, + 'r2score-aver-all': r2score_n_all, + 'r2score-aver-grn': r2score_n_grn, 'r2scores': r2score_samples, 'reg_models' : reg_models } - return results -def regression_1( - net: pd.DataFrame, - prturb_adata: ad.AnnData, - par:dict, - verbose: int = 0, - max_workers: int = 4) -> None: - """ - net: a df with index as genes and columns as tfs - prturb_adata: a adata - """ - - gene_names = prturb_adata.var_names - if 'donor_id' in prturb_adata.obs.columns: - donor_ids = prturb_adata.obs.donor_id.unique() - else: - donor_ids = ['default'] - score_list = [] - score_s2_list = [] - for donor_id in donor_ids: - verbose_print(par['verbose'], f'----cross validate for {donor_id}----', 2) - # check if net is donor specific - if 'donor_id' in net.columns: - if donor_id not in net.donor_id.unique(): - raise ValueError(f'{donor_id} is not present in grn.') - net_sub = net[net.donor_id==donor_id] - else: - net_sub = net - if (len(net_sub)>par['max_n_links']) and (par['max_n_links']!=-1): - net_sub = process_links(net_sub, par) #only top links are considered - verbose_print(par['verbose'], f"Number of links reduced to {par['max_n_links']}", 2) - if par['binarize']: - net['weight'] = net['weight'].apply(binarize_weight) - if 'donor_id' in prturb_adata.obs.columns: - prturb_adata_sub = prturb_adata[prturb_adata.obs.donor_id==donor_id,:] - else: - prturb_adata_sub = prturb_adata - results = cross_validation(net_sub, prturb_adata_sub, par) - - - - score_list.append(results['r2score-aver']) - score_s2_list.append(results['r2score-aver-s2']) - s1 = [np.mean(score_list)] - s2 = [np.mean(score_s2_list)] - - output = dict(S1=s1, S2=s2) - return output - -def set_global_seed(seed): - np.random.seed(seed) - random.seed(seed) - lightgbm.LGBMRegressor().set_params(random_state=seed) - - -def pivot_grn(net): - ''' make net to have gene*tf format''' - net = net.drop_duplicates(subset=['target', 'source']) - df_tmp = net.pivot(index='target', columns='source', values='weight') - return df_tmp.fillna(0) - - -def process_net(net, gene_names): - # Remove self-regulations - net = net.drop_duplicates() - net = net[net['source'] != net['target']] - # pivot - net = pivot_grn(net) 
- # subset - net = net[net.index.isin(gene_names)] - return net - -def binarize_weight(weight): - if weight > 0: - return 1 - elif weight < 0: - return -1 - else: - return 0 - -def main(par): - random_state = 42 - set_global_seed(random_state) - par['theta'] = 1 # no subsetting based on theta - manipulate = None - ## read and process input data - verbose_print(par['verbose'], 'Reading input files', 3) - net = ad.read_h5ad(par['prediction']) - net = pd.DataFrame(net.uns['prediction']) - - evaluation_data = ad.read_h5ad(par['evaluation_data']) - - if 'is_test' in evaluation_data.obs.columns: - evaluation_data = evaluation_data[evaluation_data.obs['is_test']] - print(evaluation_data.shape) - - tf_all = np.loadtxt(par['tf_all'], dtype=str) - gene_names = evaluation_data.var.index.to_numpy() - - - if par['apply_skeleton']: #apply skeleton - print('Before filtering with skeleton:', net.shape) - skeleton = pd.read_csv(par['skeleton']) - skeleton['link'] = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str) - skeleton = skeleton['link'].values.flatten() - - net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str) - net = net[net['link'].isin(skeleton)] - print('After filtering with skeleton:', net.shape) - - - # net['weight'] = net.weight.abs() - # subset to keep only those links with source as tf - - if par['apply_tf']: - net = net[net.source.isin(tf_all)] - - if 'cell_type' in net.columns: - # print('Taking mean of cell type specific grns') - # net.drop(columns=['cell_type'], inplace=True) - # net = net.groupby(['source', 'target']).mean().reset_index() - raise ValueError('define this') - - subsample = par['subsample'] - max_workers = par['num_workers'] - layer = par["layer"] - if subsample == -1: - pass - elif subsample == -2: # one combination of cell_type, perturbation - sampled_obs = evaluation_data.obs.groupby(['perturbation', 'cell_type'], observed=False).apply(lambda x: x.sample(1)).reset_index(drop=True) - obs = evaluation_data.obs - mask = [] - for _, row in obs.iterrows(): - mask.append((sampled_obs==row).all(axis=1).any()) - evaluation_data = evaluation_data[mask,:] - elif subsample == -3: #negative control - mask = evaluation_data.obs.perturbation == 'Dimethyl Sulfoxide' - evaluation_data = evaluation_data[mask,:] - elif subsample == -4: #positive control - mask = evaluation_data.obs.perturbation.isin(['Dabrafenib', 'Belinostat']) - evaluation_data = evaluation_data[mask,:] - else: - evaluation_data = evaluation_data[np.random.choice(evaluation_data.n_obs, subsample, replace=False), :] - verbose_print(par['verbose'], evaluation_data.shape,4) - - if layer=='X': - pass - else: - evaluation_data.X = evaluation_data.layers[layer] - - verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3) - - - results = regression_1(net, evaluation_data, par=par, max_workers=max_workers) - - - df_results = pd.DataFrame(results) - return df_results \ No newline at end of file diff --git a/src/metrics/regression_1/run.sh b/src/metrics/regression_1/run.sh index 45c242a1e..689b2ee5f 100644 --- a/src/metrics/regression_1/run.sh +++ b/src/metrics/regression_1/run.sh @@ -1,5 +1,5 @@ viash run src/metrics/regression_1/config.vsh.yaml -- \ --prediction resources/grn_models/norman/grnboost2.h5ad \ - --dataset_id norman --evaluation_data resources/evaluation_datasets/norman_perturbation.h5ad \ + --dataset_id norman --evaluation_data resources/grn_benchmark/evaluation_datasets//norman_perturbation.h5ad \ --score output/score.h5ad \ - --tf_all 
resources/prior/tf_all.csv
\ No newline at end of file
+    --tf_all resources/grn_benchmark/prior/tf_all.csv
\ No newline at end of file
diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py
index 18f0cd2d4..e8ded33b3 100644
--- a/src/metrics/regression_1/script.py
+++ b/src/metrics/regression_1/script.py
@@ -2,16 +2,26 @@
 import anndata as ad
 import sys
 import numpy as np
+from sklearn.metrics import r2_score
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import Ridge
+from sklearn.preprocessing import StandardScaler
+import lightgbm
+import random
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+from sklearn.multioutput import MultiOutputRegressor
+import os
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
-file_name = 'op' #nakatake
 ## VIASH START
 par = {
-    "evaluation_data": f"resources/evaluation_datasets/{file_name}_perturbation.h5ad",
-    "tf_all": "resources/prior/tf_all.csv",
-    # "prediction": "output/pearson_net.csv",
-    "prediction": f"resources/grn_models/{file_name}/grnboost2.csv",
+    "evaluation_data": "resources/grn_benchmark/evaluation_datasets/op.h5ad",
+    "tf_all": "resources/grn_benchmark/prior/tf_all.csv",
+    "prediction": "resources/grn_models/op/grnboost2.csv",
     "method_id": "scenic",
     "max_n_links": 50000,
     "apply_tf": True,
@@ -20,7 +30,7 @@
     'layer': 'pearson',
     'subsample': -1,
     'num_workers': 4,
-    'skeleton': 'resources/prior/skeleton.csv',
+    'skeleton': 'resources/grn_benchmark/prior/skeleton.csv',
     'apply_skeleton': False,
     'verbose': 4,
     'binarize': True
@@ -29,6 +39,7 @@
 import argparse
 parser = argparse.ArgumentParser()
+parser.add_argument('--run_local', action='store_true', help='Run locally')
 parser.add_argument('--evaluation_data', type=str, help='Path to the evaluation_data file')
 parser.add_argument('--prediction', type=str, help='Path to the prediction file')
 parser.add_argument('--tf_all', type=str, help='Path to the tf_all')
@@ -39,49 +50,90 @@
 args = parser.parse_args()
-if args.evaluation_data:
-    par['evaluation_data'] = args.evaluation_data
-if args.layer:
-    par['layer'] = args.layer
-if args.causal:
-    par['causal'] = True
-else:
-    par['causal'] = False
+var_local = vars(args)
-if args.prediction:
-    par['prediction'] = args.prediction
-if args.tf_all:
-    par['tf_all'] = args.tf_all
-if args.num_workers:
-    par['num_workers'] = args.num_workers
+if args.run_local:
+    for key in var_local:
+        par[key] = var_local[key]
+    meta = {
+        "resources_dir":'src/metrics/regression_1/',
+        "util_dir":'src/utils'
+    }
+    sys.path.append(meta["resources_dir"])
+    sys.path.append(meta["util_dir"])
-try:
-    sys.path.append(meta["resources_dir"])
-except:
-    meta = {
-        "resources_dir":'src/metrics/regression_1/',
-        "util_dir":'src/utils'
-    }
+else:
     sys.path.append(meta["resources_dir"])
-    sys.path.append(meta["util_dir"])
-
-from main import main
-print(par)
-output = main(par)
-print(output)
-
-metric_ids = output.columns.to_numpy()
-metric_values = output.values[0]
-
-print(metric_ids.shape, metric_values.shape)
-output = ad.AnnData(
-    X=np.empty((0, 0)),
-    uns={
-        "dataset_id": par["dataset_id"],
-        "method_id": f"reg1-{par['method_id']}",
-        "metric_ids": metric_ids,
-        "metric_values": metric_values
-    }
-)
-output.write_h5ad(par["score"], compression="gzip")
\ No newline at end of file
+from util import verbose_print, process_links, verbose_tqdm, process_net, binarize_weight
+from helper import set_global_seed, cross_validation
+
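+# read_net (defined below) loads the predicted GRN from .uns['prediction'] of the
+# h5ad and applies the optional post-processing controlled by par: weight
+# binarization, skeleton filtering, TF filtering, and capping at max_n_links.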
+def read_net(par):
+    net = ad.read_h5ad(par['prediction'])
+    net = pd.DataFrame(net.uns['prediction'])
+    if par['binarize']:
+        net['weight'] = net['weight'].apply(binarize_weight)
+    if par['apply_skeleton']:  # apply skeleton
+        print('Before filtering with skeleton:', net.shape)
+        skeleton = pd.read_csv(par['skeleton'])
+        skeleton['link'] = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str)
+        skeleton = skeleton['link'].values.flatten()
+
+        net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
+        net = net[net['link'].isin(skeleton)]
+        print('After filtering with skeleton:', net.shape)
+
+    if par['apply_tf']:
+        tf_all = np.loadtxt(par['tf_all'], dtype=str)
+        net = net[net.source.isin(tf_all)]
+    if 'cell_type' in net.columns:
+        print('Taking mean of cell type specific grns')
+        net.drop(columns=['cell_type'], inplace=True)
+        net = net.groupby(['source', 'target']).mean().reset_index()
+
+    if (len(net)>par['max_n_links']) and (par['max_n_links']!=-1):
+        net = process_links(net, par)  # only top links are considered
+        verbose_print(par['verbose'], f"Number of links reduced to {par['max_n_links']}", 2)
+
+    return net
+
+def main(par):
+    random_state = 42
+    set_global_seed(random_state)
+
+    # -- read input data
+    net = read_net(par)
+    evaluation_data = ad.read_h5ad(par['evaluation_data'])
+    evaluation_data.X = evaluation_data.layers[par["layer"]]
+    gene_names = evaluation_data.var.index.to_numpy()
+
+    # -- calculate scores
+    results = cross_validation(net, evaluation_data, par)
+
+    return results
+
+if __name__ == '__main__':
+    print(par)
+    results = main(par)
+
+    # - format results
+    results = dict(all=[results['r2score-aver-all']], grn=[results['r2score-aver-grn']])
+    results = pd.DataFrame(results)
+    print(results)
+
+    metric_ids = results.columns.to_numpy()
+    metric_values = results.values[0]
+
+    print(metric_ids.shape, metric_values.shape)
+    results = ad.AnnData(
+        X=np.empty((0, 0)),
+        uns={
+            "dataset_id": par["dataset_id"],
+            "method_id": f"reg1-{par['method_id']}",
+            "metric_ids": metric_ids,
+            "metric_values": metric_values
+        }
+    )
+
+    results.write_h5ad(par["score"], compression="gzip")
\ No newline at end of file
diff --git a/src/metrics/regression_1/test.sh b/src/metrics/regression_1/test.sh
index c7266bbc4..f789966f1 100644
--- a/src/metrics/regression_1/test.sh
+++ b/src/metrics/regression_1/test.sh
@@ -1,5 +1,5 @@
 viash run src/metrics/regression_1/config.vsh.yaml -- \
     --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
-    --tf_all resources/prior/tf_all.csv \
+    --tf_all resources/grn_benchmark/prior/tf_all.csv \
     --prediction output/portia/prediction.csv \
     --score output/score.h5ad
\ No newline at end of file
diff --git a/src/metrics/regression_2/consensus/script.py b/src/metrics/regression_2/consensus/script.py
index 465b4f2d1..8259ace16 100644
--- a/src/metrics/regression_2/consensus/script.py
+++ b/src/metrics/regression_2/consensus/script.py
@@ -10,10 +10,10 @@
 ## VIASH START
 par = {
-    'evaluation_data': 'resources/evaluation_datasets/op_perturbation.h5ad',
+    'evaluation_data': 'resources/grn_benchmark/evaluation_datasets/op_perturbation.h5ad',
     'models_dir': 'resources/grn_models/op/',
     'models': ['pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'],
-    'regulators_consensus': 'resources/prior/regulators_consensus_op.json'
+    'regulators_consensus': 'resources/grn_benchmark/prior/regulators_consensus_op.json'
 }
 ## VIASH END
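+# The regulators-consensus JSON written here records, per target gene, the consensus
+# numbers of putative regulators across the benchmarked models; regression_2/main.py
+# consumes it below ("Load consensus numbers of putative regulators").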
 def main(par):
diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
index 583a95429..9e89c498e 100644
--- a/src/metrics/regression_2/main.py
+++ b/src/metrics/regression_2/main.py
@@ -17,7 +17,6 @@
 SEED = 0xCAFE
 N_POINTS_TO_ESTIMATE_BACKGROUND = 20
-
 def net_to_matrix(net, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray:
     gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
     if 'cell_type' in net.columns:
@@ -316,6 +315,7 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
     # Load consensus numbers of putative regulators
     with open(par['regulators_consensus'], 'r') as f:
         data = json.load(f)
+    gene_names_ = np.asarray(list(data.keys()), dtype=object)
     n_features_theta_min = np.asarray([data[gene_name]['0'] for gene_name in gene_names], dtype=int)
diff --git a/src/metrics/regression_2/run.sh b/src/metrics/regression_2/run.sh
index 9d0caa28f..5a6e5b34b 100644
--- a/src/metrics/regression_2/run.sh
+++ b/src/metrics/regression_2/run.sh
@@ -1,6 +1,6 @@
 viash run src/metrics/regression_2/config.vsh.yaml -- \
     --prediction resources/grn_models/norman/grnboost2.h5ad \
-    --dataset_id norman --evaluation_data resources/evaluation_datasets/norman_perturbation.h5ad \
+    --dataset_id norman --evaluation_data resources/grn_benchmark/evaluation_datasets/norman_perturbation.h5ad \
     --score output/score.h5ad \
-    --tf_all resources/prior/tf_all.csv \
-    --regulators_consensus resources/prior/regulators_consensus_norman.json
\ No newline at end of file
+    --tf_all resources/grn_benchmark/prior/tf_all.csv \
+    --regulators_consensus resources/grn_benchmark/prior/regulators_consensus_norman.json
\ No newline at end of file
diff --git a/src/metrics/regression_2/script.py b/src/metrics/regression_2/script.py
index 2d906769b..bad17dff1 100644
--- a/src/metrics/regression_2/script.py
+++ b/src/metrics/regression_2/script.py
@@ -6,10 +6,10 @@
 ## VIASH START
 par = {
-    'evaluation_data': 'resources/evaluation_datasets/op_perturbation.h5ad',
+    'evaluation_data': 'resources/grn_benchmark/evaluation_datasets/op_perturbation.h5ad',
     'layer': 'X_norm',
     "prediction": "output/models/collectri.csv",
-    'tf_all': 'resources/prior/tf_all.csv',
+    'tf_all': 'resources/grn_benchmark/prior/tf_all.csv',
     "max_n_links": 50000,
     'consensus': 'output/models/op_consensus-num-regulators.json',
     'score': 'output/score_regression2.h5ad',
@@ -21,7 +21,7 @@
     'clip_scores': True,
     'method_id': 'grnboost',
     'apply_skeleton': False,
-    'skeleton': 'resources/prior/skeleton.csv',
+    'skeleton': 'resources/grn_benchmark/prior/skeleton.csv',
     'verbose': 2
 }
diff --git a/src/metrics/regression_2/test.sh b/src/metrics/regression_2/test.sh
index 97426f567..93534b3dc 100644
--- a/src/metrics/regression_2/test.sh
+++ b/src/metrics/regression_2/test.sh
@@ -1 +1 @@
-viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2 && bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --tfs resources/prior/tf_all.csv --consensus resources/prior/consensus-num-regulators.json --layer scgen_pearson --reg_type ridge --prediction resources/grn-benchmark/grn_models/scenicplus.csv
\ No newline at end of file
+viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2 && bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --tfs resources/grn_benchmark/prior/tf_all.csv --consensus resources/grn_benchmark/prior/consensus-num-regulators.json --layer scgen_pearson --reg_type ridge --prediction resources/grn-benchmark/grn_models/scenicplus.csv
\ No newline at end of file
diff --git 
a/src/metrics/regression_2/theta_experiment.py b/src/metrics/regression_2/theta_experiment.py index 2d9f904aa..109e6169a 100644 --- a/src/metrics/regression_2/theta_experiment.py +++ b/src/metrics/regression_2/theta_experiment.py @@ -15,14 +15,14 @@ 'layers': ['scgen_pearson'], "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", - "tf_all": "resources/prior/tf_all.csv", + "tf_all": "resources/grn_benchmark/prior/tf_all.csv", "max_n_links": 50000, "apply_tf": "true", 'subsample': -2, 'verbose': 1, 'binarize': True, 'num_workers': 20, - 'consensus': 'resources/prior/consensus-num-regulators.json', + 'consensus': 'resources/grn_benchmark/prior/consensus-num-regulators.json', 'static_only': True, 'clip_scores': True } diff --git a/src/metrics/script_all_experiment.py b/src/metrics/script_all_experiment.py deleted file mode 100644 index 0e3851920..000000000 --- a/src/metrics/script_all_experiment.py +++ /dev/null @@ -1,185 +0,0 @@ -import pandas as pd -import anndata as ad -import sys -import numpy as np -import os - -meta = { - "resources_dir": 'src/metrics/', - "util": 'src/utils' -} -sys.path.append(meta["resources_dir"]) -sys.path.append(meta["util"]) - -from regression_1.main import main as main_reg1 -from regression_1.main import binarize_weight -from regression_2.main import main as main_reg2 -from util import process_links -# - run consensus -from consensus.script import main as main_consensus - -global_models = [ - 'collectri', - 'Ananse:Lung', - 'Ananse:Stomach', - 'Ananse:Heart', - 'Ananse:Bone marrow', - 'Gtex:Whole blood', - 'Gtex:Brain amygdala', - 'Gtex:Breast mammary tissue', - 'Gtex:Lung', - 'Gtex:Stomach', - 'Cellnet:Bcell', - 'Cellnet:Tcell', - 'Cellnet:Skin', - 'Cellnet:Neuron', - 'Cellnet:Heart' - ], - - - -def define_par(dataset): - - par = { - "evaluation_data": f"resources/evaluation_datasets/{dataset}_perturbation.h5ad", - 'consensus': f'resources/prior/{dataset}_consensus-num-regulators.json', - - 'layer': 'X_norm', - - "tf_all": "resources/prior/tf_all.csv", - 'skeleton': 'resources/prior/skeleton.csv', - "apply_tf": True, - 'subsample': -1, - 'verbose': 4, - 'num_workers': 20 - } - - return par - -def run_consensus(dataset): - par = define_par(dataset) - main_consensus(par) - -def run_evaluation(dataset, models, models_dir, scores_dir, save_file_name, binarize=False, max_n_links=50000, apply_skeleton=False, run_global_models=False, reg_type='ridge'): - print('------ ', dataset, '------') - par = define_par(dataset) - os.makedirs(scores_dir, exist_ok=True) - - par['binarize'] = binarize - par['max_n_links'] = max_n_links - par['apply_skeleton'] = apply_skeleton - par['reg_type'] = reg_type - - # - determines models to run - grn_files_dict = {} - # - add models - for model in models: - print(model) - grn_file = f"{models_dir}/{model}.csv" - if not os.path.exists(grn_file): - print(f"{grn_file} doesnt exist. 
Skipped.") - continue - grn_files_dict[model] = grn_file - - # - actual runs - i = 0 - for model, grn_file in grn_files_dict.items(): - par['prediction'] = grn_file - reg1 = main_reg1(par) - reg2 = main_reg2(par) - score = pd.concat([reg1, reg2], axis=1) - score.index = [model] - if i==0: - df_all = score - else: - df_all = pd.concat([df_all, score]) - df_all.to_csv(save_file_name) - print(df_all) - i+=1 - - - -if __name__ == '__main__': - # - define settings - # models = ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'] - models = ['scprint'] - - if False: # subsample - # for dataset in ['op', 'replogle2', 'nakatake', 'norman', 'adamson']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson' - for dataset in ['op']: - if dataset == 'op': - models_subsampled = [f'{model}_{subsample}' for subsample in [1, 2] for model in models] - else: - models_subsampled = [f'{model}_{subsample}' for subsample in [0.2, 0.5] for model in models] - models_dir = f"resources/grn_models/{dataset}" - scores_dir = f"resources/scores/{dataset}" - - save_file_name = f"{scores_dir}/subsampled.csv" - - run_evaluation(dataset, models_subsampled, models_dir, scores_dir, save_file_name) - - - if True: # default run - # models = ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'] - - for dataset in ['op', 'replogle2', 'nakatake', 'norman', 'adamson']: - models_dir = f"resources/grn_models/{dataset}" - scores_dir = f"resources/scores/{dataset}" - run_consensus(dataset) - # save_file_name = f"{scores_dir}/X_norm-50000-skeleton_False-binarize_False-ridge-global-False.csv" - save_file_name = f"{scores_dir}/scprint.csv" - - run_evaluation(dataset, models, models_dir, scores_dir, run_global_models, save_file_name) - - if False: # run global models - models = ['pearson_corr'] - dataset = 'op' - - models_dir = "resources/grn_models/global/" - scores_dir = f"resources/scores/{dataset}" - # run_consensus(dataset) - save_file_name = f"{scores_dir}/X_norm-50000-skeleton_False-binarize_False-ridge-global-True.csv" - - run_evaluation(dataset, models, models_dir, scores_dir, run_global_models, save_file_name) - - if False: # run skeleton - models = ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'] - - dataset = 'op' - - models_dir = f"resources/grn_models/{dataset}" - scores_dir = f"resources/scores/{dataset}" - save_file_name = f"{scores_dir}/X_norm-50000-skeleton_True-binarize_False-ridge-global-False.csv" - - # run_consensus(dataset) - run_evaluation(dataset, models, models_dir, scores_dir, save_file_name, apply_skeleton=True) - - if False: # run GB - models = ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'] - - dataset = 'op' - - models_dir = f"resources/grn_models/{dataset}" - scores_dir = f"resources/scores/{dataset}" - save_file_name = f"{scores_dir}/X_norm-50000-skeleton_True-binarize_False-GB-global-False.csv" - - # run_consensus(dataset) - run_evaluation(dataset, models, models_dir, scores_dir, save_file_name, apply_skeleton=True, reg_type='GB') - - - - -# - repo -# models_dir = f"../ciim/output/grns/" - # scores_dir = f"../ciim/output/scores/" - # models = [ - # 'pearson_corr', - # 'grnboost2', 'celloracle', 
'scenicplus', - # 'net_all_celltypes_young_all_batches', - # 'net_all_celltypes_young_batch_1', - # 'net_all_celltypes_old_all_batches', - # 'net_all_celltypes_old_batch_1', - # 'net_B cells_all_ages_all_batches', - # 'net_T cells_all_ages_all_batches', - # 'net_Myeloid cells_all_ages_all_batches', - # 'net_NK cells_all_ages_all_batches'], \ No newline at end of file diff --git a/src/metrics/wasserstein/background_distance/run.sh b/src/metrics/wasserstein/background_distance/run.sh new file mode 100644 index 000000000..577b0b6fc --- /dev/null +++ b/src/metrics/wasserstein/background_distance/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=ws +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=10 +#SBATCH --time=10:00:00 +#SBATCH --mem=100GB +#SBATCH --partition=cpu +#SBATCH --mail-type=END,FAIL +#SBATCH --mail-user=jalil.nourisa@gmail.com + +set -e + +python src/metrics/wasserstein/background_distance/script.py \ No newline at end of file diff --git a/src/metrics/wasserstein/background_distance/script.py b/src/metrics/wasserstein/background_distance/script.py index 674808f4a..6c8023ee0 100644 --- a/src/metrics/wasserstein/background_distance/script.py +++ b/src/metrics/wasserstein/background_distance/script.py @@ -9,9 +9,9 @@ # - create background dist. par = { - 'evaluation_data_sc': f'resources/datasets_raw/norman_sc_counts.h5ad', - 'background_distance': 'resources/prior/ws_distance_background_norman.csv', - 'tf_all': 'resources/prior/tf_all.csv', + 'evaluation_data_sc': f'resources/grn_benchmark/evaluation_datasets//replogle_sc.h5ad', + 'background_distance': 'resources/grn_benchmark/prior/ws_distance_background_replogle.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'layer': 'X_norm' } @@ -27,6 +27,9 @@ def calculate_ws_distance(net, adata) -> pd.DataFrame: for parent, child in zip(net['source'],net['target']): # - get observational X mask_gene = gene_names==child + if 'is_control' not in adata.obs.columns: + adata.obs['is_control'] = adata.obs['perturbation']=='non-targeting' + mask_sample = adata.obs['is_control'] X_observ = adata[mask_sample, mask_gene].X.todense().A # - get interventional diff --git a/src/metrics/wasserstein/consensus/script.py b/src/metrics/wasserstein/consensus/script.py index c21392f4a..7a4daa4d7 100644 --- a/src/metrics/wasserstein/consensus/script.py +++ b/src/metrics/wasserstein/consensus/script.py @@ -7,8 +7,8 @@ # - determine consensus par = { 'evaluation_data_sc': f'resources/datasets_raw/norman_sc_counts.h5ad', - 'ws_consensus': 'resources/prior/consensus_ws_distance_norman.csv', - 'tf_all': 'resources/prior/tf_all.csv', + 'ws_consensus': 'resources/grn_benchmark/prior/consensus_ws_distance_norman.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'models_dir': 'resources/grn_models/norman', 'models': ['pearson_corr', 'grnboost2','portia', 'ppcor','scenic'] } diff --git a/src/metrics/wasserstein/run.sh b/src/metrics/wasserstein/run.sh index ee1f7514e..cab5c0931 100644 --- a/src/metrics/wasserstein/run.sh +++ b/src/metrics/wasserstein/run.sh @@ -1,7 +1,7 @@ viash run src/metrics/wasserstein/config.vsh.yaml -- \ --prediction resources/grn_models/norman/grnboost2.h5ad \ --dataset_id norman \ - --ws_consensus resources/prior/ws_consensus_norman.csv \ - --ws_distance_background resources/prior/ws_distance_background_norman.csv \ + --ws_consensus resources/grn_benchmark/prior/ws_consensus_norman.csv \ + --ws_distance_background 
resources/grn_benchmark/prior/ws_distance_background_norman.csv \ --evaluation_data_sc resources/datasets_raw/norman_sc_counts.h5ad \ --score output/score.h5ad \ No newline at end of file diff --git a/src/metrics/wasserstein/script.py b/src/metrics/wasserstein/script.py index 7efccbb9d..45e1303f0 100644 --- a/src/metrics/wasserstein/script.py +++ b/src/metrics/wasserstein/script.py @@ -9,8 +9,8 @@ 'prediction': 'resources/grn_models/adamson/pearson_corr.csv', 'evaluation_data_sc': 'resources/datasets_raw/adamson_sc_counts.h5ad', 'score': 'output/score.h5ad', - 'ws_consensus': 'resources/prior/consensus_ws_distance_adamson.csv', - 'ws_distance_background':'resources/prior/ws_distance_background_adamson.csv', + 'ws_consensus': 'resources/grn_benchmark/prior/consensus_ws_distance_adamson.csv', + 'ws_distance_background':'resources/grn_benchmark/prior/ws_distance_background_adamson.csv', 'layer': 'X_norm', 'dataset_id': 'dataset_id', 'method_id': 'pearson_corr' diff --git a/src/metrics/wasserstein/script_all.py b/src/metrics/wasserstein/script_all.py index 86f4bca83..599a1f729 100644 --- a/src/metrics/wasserstein/script_all.py +++ b/src/metrics/wasserstein/script_all.py @@ -17,9 +17,9 @@ 'evaluation_data_sc': 'resources/datasets_raw/adamson_sc_counts.h5ad', 'mean_scores_all': 'resources/scores/ws_distance_mean.csv', 'scores_all': 'resources/scores/ws_distance.csv', - 'consensus': 'resources/prior/consensus_ws_distance_adamson.csv', - 'tf_all': 'resources/prior/tf_all.csv', - 'background_distance':'resources/prior/ws_distance_background_adamson.csv', + 'consensus': 'resources/grn_benchmark/prior/consensus_ws_distance_adamson.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', + 'background_distance':'resources/grn_benchmark/prior/ws_distance_background_adamson.csv', 'layer': 'X_norm', } @@ -30,8 +30,8 @@ def main(par): par['grns_dir'] = f'resources/grn_models/{dataset}' par['evaluation_data_sc'] = f'resources/datasets_raw/{dataset}_sc_counts.h5ad' - par['consensus'] = f'resources/prior/consensus_ws_distance_{dataset}.csv' - par['background_distance'] = f'resources/prior/ws_distance_background_{dataset}.csv' + par['consensus'] = f'resources/grn_benchmark/prior/consensus_ws_distance_{dataset}.csv' + par['background_distance'] = f'resources/grn_benchmark/prior/ws_distance_background_{dataset}.csv' if True: main_consensus(par) diff --git a/src/process_data/explanatory_analysis/gene_annotation/config.novsh.yaml b/src/process_data/explanatory_analysis/gene_annotation/config.novsh.yaml index 21a991cec..639f21cb5 100644 --- a/src/process_data/explanatory_analysis/gene_annotation/config.novsh.yaml +++ b/src/process_data/explanatory_analysis/gene_annotation/config.novsh.yaml @@ -11,7 +11,7 @@ functionality: type: file required: false direction: output - example: resources/prior/gene_annotations.txt + example: resources/grn_benchmark/prior/gene_annotations.txt resources: - type: python_script diff --git a/src/process_data/explanatory_analysis/gene_annotation/script.py b/src/process_data/explanatory_analysis/gene_annotation/script.py index 6b3f94ea8..b0141645f 100644 --- a/src/process_data/explanatory_analysis/gene_annotation/script.py +++ b/src/process_data/explanatory_analysis/gene_annotation/script.py @@ -7,7 +7,7 @@ ## VIASH START par = { - 'gene_annotations': 'resources/prior/gene_annotations.txt' + 'gene_annotations': 'resources/grn_benchmark/prior/gene_annotations.txt' } ## VIASH END diff --git a/src/process_data/explanatory_analysis/peak_annotation/script.R 
b/src/process_data/explanatory_analysis/peak_annotation/script.R index 9410ab3a2..e3cda8e0f 100644 --- a/src/process_data/explanatory_analysis/peak_annotation/script.R +++ b/src/process_data/explanatory_analysis/peak_annotation/script.R @@ -11,8 +11,8 @@ library(tibble) ## VIASH START par <- list( - multiomics_atac = "resources/inference_datasets/op_atac.h5ad", - annot_peak_database = "resources/prior/peak_annotation.csv" + multiomics_atac = "resources/grn_benchmark/inference_datasets/op_atac.h5ad", + annot_peak_database = "resources/grn_benchmark/prior/peak_annotation.csv" ) ## VIASH END diff --git a/src/process_data/multiomics/format_data/config.novsh.yaml b/src/process_data/op_multiomics/format_data/config.novsh.yaml similarity index 100% rename from src/process_data/multiomics/format_data/config.novsh.yaml rename to src/process_data/op_multiomics/format_data/config.novsh.yaml diff --git a/src/process_data/multiomics/format_data/script.py b/src/process_data/op_multiomics/format_data/script.py similarity index 93% rename from src/process_data/multiomics/format_data/script.py rename to src/process_data/op_multiomics/format_data/script.py index 6f7339161..7031a712d 100644 --- a/src/process_data/multiomics/format_data/script.py +++ b/src/process_data/op_multiomics/format_data/script.py @@ -4,8 +4,8 @@ ## VIASH START par = { 'multiome_counts': 'resources/datasets_raw/op_multiome_sc_counts.h5ad', - 'multiomics_rna': 'resources/inference_datasets/op_rna.h5ad', - 'multiomics_atac': 'resources/inference_datasets/op_atac.h5ad' + 'multiomics_rna': 'resources/grn_benchmark/inference_datasets/op_rna.h5ad', + 'multiomics_atac': 'resources/grn_benchmark/inference_datasets/op_atac.h5ad' } ## VIASH END diff --git a/src/process_data/multiomics/format_resources_r/config.vsh.yaml b/src/process_data/op_multiomics/format_resources_r/config.vsh.yaml similarity index 100% rename from src/process_data/multiomics/format_resources_r/config.vsh.yaml rename to src/process_data/op_multiomics/format_resources_r/config.vsh.yaml diff --git a/src/process_data/multiomics/format_resources_r/script.R b/src/process_data/op_multiomics/format_resources_r/script.R similarity index 100% rename from src/process_data/multiomics/format_resources_r/script.R rename to src/process_data/op_multiomics/format_resources_r/script.R diff --git a/src/process_data/multiomics/multiome_matrix/config.vsh.yaml b/src/process_data/op_multiomics/multiome_matrix/config.vsh.yaml similarity index 100% rename from src/process_data/multiomics/multiome_matrix/config.vsh.yaml rename to src/process_data/op_multiomics/multiome_matrix/config.vsh.yaml diff --git a/src/process_data/multiomics/multiome_matrix/script.py b/src/process_data/op_multiomics/multiome_matrix/script.py similarity index 100% rename from src/process_data/multiomics/multiome_matrix/script.py rename to src/process_data/op_multiomics/multiome_matrix/script.py diff --git a/src/process_data/perturbation/opsca/config.novsh.yaml b/src/process_data/opsca_perturbation/config.novsh.yaml similarity index 100% rename from src/process_data/perturbation/opsca/config.novsh.yaml rename to src/process_data/opsca_perturbation/config.novsh.yaml diff --git a/src/process_data/perturbation/opsca/script.py b/src/process_data/opsca_perturbation/script.py similarity index 82% rename from src/process_data/perturbation/opsca/script.py rename to src/process_data/opsca_perturbation/script.py index 1cbf755f9..49f13051e 100644 --- a/src/process_data/perturbation/opsca/script.py +++ 
b/src/process_data/opsca_perturbation/script.py @@ -8,14 +8,22 @@ from scipy import sparse import scanpy as sc from scipy.sparse import csr_matrix +import sys ## VIASH START par = { 'perturbation_counts': 'resources/datasets_raw/op_perturbation_sc_counts.h5ad', - 'pseudobulked_data': 'resources/datasets_raw/op_bulked.h5ad', - 'evaluation_data': 'resources/evaluation_datasets/op_perturbation.h5ad', + 'perturbation_bulk': 'resources/extended_data/op_perturbation_bulk.h5ad', + 'evaluation_data': 'resources/grn_benchmark/evaluation_datasets/op_bulk.h5ad', } ## VIASH END +meta = { + 'resources': 'src/utils/' +} + +sys.path.append(meta['resources']) + +from util import sum_by def preprocess_sc(par): # clean up @@ -77,49 +85,7 @@ def preprocess_sc(par): del sc_counts.obsm del sc_counts.uns return sc_counts -def sum_by(adata: ad.AnnData, col: str) -> ad.AnnData: - """ - Adapted from this forum post: - https://discourse.scverse.org/t/group-sum-rows-based-on-jobs-feature/371/4 - """ - - assert pd.api.types.is_categorical_dtype(adata.obs[col]) - - # sum `.X` entries for each unique value in `col` - cat = adata.obs[col].values - - indicator = sparse.coo_matrix( - ( - np.broadcast_to(True, adata.n_obs), - (cat.codes, np.arange(adata.n_obs)) - ), - shape=(len(cat.categories), adata.n_obs), - ) - - sum_adata = ad.AnnData( - indicator @ adata.X, - var=adata.var, - obs=pd.DataFrame(index=cat.categories), - ) - - # copy over `.obs` values that have a one-to-one-mapping with `.obs[col]` - obs_cols = adata.obs.columns - obs_cols = list(set(adata.obs.columns) - set([col])) - - one_to_one_mapped_obs_cols = [] - nunique_in_col = adata.obs[col].nunique() - for other_col in obs_cols: - if len(adata.obs[[col, other_col]].drop_duplicates()) == nunique_in_col: - one_to_one_mapped_obs_cols.append(other_col) - - joining_df = adata.obs[[col] + one_to_one_mapped_obs_cols].drop_duplicates().set_index(col) - assert (sum_adata.obs.index == sum_adata.obs.join(joining_df).index).all() - sum_adata.obs = sum_adata.obs.join(joining_df) - sum_adata.obs.index.name = col - sum_adata.obs = sum_adata.obs.reset_index() - sum_adata.obs.index = sum_adata.obs.index.astype('str') - - return sum_adata + def pseudobulk_sum_func(sc_counts): # pseudobulk #group cell types per well @@ -225,5 +191,5 @@ def normalize_func(adata): bulk_adata.obs['is_control'] = bulk_adata.obs['perturbation'].isin(['Dimethyl Sulfoxide']) bulk_adata.obs['is_positive_control'] = bulk_adata.obs['perturbation'].isin(['Dabrafenib', 'Belinostat']) - bulk_adata.write(par['pseudobulked_data']) + bulk_adata.write(par['perturbation_bulk']) bulk_adata.write(par['evaluation_data']) \ No newline at end of file diff --git a/src/process_data/pereggrn/acquisition.py b/src/process_data/pereggrn/acquisition.py new file mode 100644 index 000000000..8833934ad --- /dev/null +++ b/src/process_data/pereggrn/acquisition.py @@ -0,0 +1,83 @@ +import pereggrn_perturbations +import pereggrn_networks + +import os +import anndata as ad +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import sys +import scanpy as sc + +pereggrn_dir = '/home/jnourisa/projs/ongoing/pereggrn' + +pereggrn_perturbations.set_data_path(f"{pereggrn_dir}/perturbation_data/perturbations") + +pereggrn_perturbations.load_perturbation_metadata() + +meta = { + "resources_dir": 'src/utils/', +} +sys.path.append(meta["resources_dir"]) +from util import process_links + +def get_dataset(par): + for file_name in par['datasets']: + + adata = pereggrn_perturbations.load_perturbation(file_name) + # 
pereggrn_perturbations.check_perturbation_dataset(ad = adata)
+        adata.write(f"{par['raw_datasets_dir']}/{file_name}.h5ad")
+
+def get_networks(par):
+    os.makedirs(par['nets_dir'], exist_ok=True)
+
+    names = []
+    for model in par['nets']:
+        net = pd.read_parquet(f"{pereggrn_dir}/network_collection/networks/{model}")
+        net.columns = ['source','target','weight']
+        method = model.split('/')[0].split('_')[0].capitalize()
+        tissue = model.split('/')[-1].split('.')[0].replace('_', ' ').capitalize()
+        name = method+':'+tissue
+
+        net = process_links(net, par)
+
+        net.to_csv(f"{par['nets_dir']}/{name}.csv")
+
+def main(par):
+    print('Getting data ...')
+    get_dataset(par)
+    print('Getting networks ...')
+    get_networks(par)
+
+if __name__ == '__main__':
+    par = {
+        'datasets': ['norman', 'adamson', 'nakatake'],
+        'nets': [
+            'ANANSE_tissue/networks/lung.parquet',
+            'ANANSE_tissue/networks/stomach.parquet',
+            'ANANSE_tissue/networks/heart.parquet',
+            'ANANSE_tissue/networks/bone_marrow.parquet',
+
+            'gtex_rna/networks/Whole_Blood.parquet',
+            'gtex_rna/networks/Brain_Amygdala.parquet',
+            'gtex_rna/networks/Breast_Mammary_Tissue.parquet',
+            'gtex_rna/networks/Lung.parquet',
+            'gtex_rna/networks/Stomach.parquet',
+
+            'cellnet_human_Hg1332/networks/bcell.parquet',
+            'cellnet_human_Hg1332/networks/tcell.parquet',
+            'cellnet_human_Hg1332/networks/skin.parquet',
+            'cellnet_human_Hg1332/networks/neuron.parquet',
+            'cellnet_human_Hg1332/networks/heart.parquet',
+        ],
+        'raw_datasets_dir': 'resources/datasets_raw/',
+        'nets_dir': 'resources/grn_models/global/',
+        'max_n_links': 50_000,
+    }
+
+    main(par)
\ No newline at end of file
diff --git a/src/process_data/pereggrn/run.sh b/src/process_data/pereggrn/run.sh
new file mode 100644
index 000000000..3f0be7b41
--- /dev/null
+++ b/src/process_data/pereggrn/run.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --job-name=pereggrn
+#SBATCH --output=logs/%j.out
+#SBATCH --error=logs/%j.err
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=10
+#SBATCH --time=48:00:00
+#SBATCH --mem=500GB
+#SBATCH --partition=cpu
+#SBATCH --mail-type=END,FAIL
+#SBATCH --mail-user=jalil.nourisa@gmail.com
+
+python src/process_data/pereggrn/script.py
\ No newline at end of file
diff --git a/src/process_data/pereggrn/script.py b/src/process_data/pereggrn/script.py
new file mode 100644
index 000000000..0505100a0
--- /dev/null
+++ b/src/process_data/pereggrn/script.py
@@ -0,0 +1,123 @@
+import os
+import anndata as ad
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import sys
+import scanpy as sc
+from sklearn.model_selection import train_test_split
+
+from scipy.sparse import csr_matrix
+
+meta = {
+    'resources_dir': 'src/utils/'
+}
+
+sys.path.append(meta["resources_dir"])
+
+from util import sum_by
+
+def psedudobulk_fun(adata: ad.AnnData) -> ad.AnnData:
+    metadata = (
+        adata.obs.groupby('perturbation')
+        .agg(lambda x: x.mode()[0] if x.nunique() == 1 else x.iloc[0])
+    )
+
+    pseudobulk_data = adata.to_df().groupby(adata.obs['perturbation']).sum()
+    # Ensure the metadata index matches pseudobulk counts
+    metadata = metadata.loc[pseudobulk_data.index]
+
+    # Create a new AnnData object for pseudobulked data
+    adata_bulked = sc.AnnData(
+        X=pseudobulk_data.values,
+        obs=metadata.reset_index(),
+        var=adata.var.copy()
+    )
+
+    return adata_bulked
+
+def process_dataset(file_name):
+    # - get the data
+    adata = ad.read_h5ad(f'resources/datasets_raw/{file_name}.h5ad')
+    adata.var_names_make_unique()
+    adata.var.index = adata.var.index.astype(str)
+    # - clean up
+    del 
adata.obsp + del adata.varm + del adata.uns + del adata.obsm + if 'gene_name' in adata.var.columns: + adata.var = adata.var[['gene_name']] + adata.var = adata.var.set_index('gene_name') + else: + adata.var = adata.var[[]] + adata.obs = adata.obs[['perturbation', 'is_control', 'perturbation_type']] + + # - data split + if file_name == 'replogle2': + ctr_samples = adata.obs.is_control + samples = adata.obs.index[~ctr_samples] + _, test_samples = train_test_split(samples, test_size=.2, random_state=32) + adata.obs['is_test'] = adata.obs.index.isin(test_samples) + elif file_name == 'norman': + ctr_samples = adata.obs.is_control + samples = adata[adata.obs.index[~ctr_samples]].obs.perturbation.unique() + _, test_samples = train_test_split(samples, test_size=.5, random_state=32) + adata.obs['is_test'] = adata.obs.perturbation.isin(test_samples) + elif file_name == 'nakatake': + samples = adata.obs.perturbation.unique() + _, test_samples = train_test_split(samples, test_size=.5, random_state=32) + adata.obs['is_test'] = adata.obs.perturbation.isin(test_samples) + elif file_name == 'adamson': + ctr_samples = adata.obs.is_control + samples = adata[adata.obs.index[~ctr_samples]].obs.perturbation.unique() + _, test_samples = train_test_split(samples, test_size=.8, random_state=32) + adata.obs['is_test'] = adata.obs.perturbation.isin(test_samples) + + adata_train = adata[~adata.obs['is_test']] # we use single cells for train (if not already bulked) + + if file_name in ['norman', 'adamson']: # these two are single cells. for norman, we have .counts but not for adamson -> different preprocessing + adata_bulked = psedudobulk_fun(adata) # also normalize + else: + adata_bulked = adata + # adata_bulked.layers['X_norm'] = adata_bulked.X.copy() + + adata_test = adata_bulked[adata_bulked.obs['is_test']] # we use bulked data for test + + + # - duplicated gene names + duplicates = adata_train.var_names[adata_train.var_names.duplicated()].unique() + adata_train = adata_train[:, ~adata_train.var_names.isin(duplicates)] + + duplicates = adata_test.var_names[adata_test.var_names.duplicated()].unique() + adata_test = adata_test[:, ~adata_test.var_names.isin(duplicates)] + + + adata_test = adata_test.copy() # Ensure it's a full AnnData object + adata_train = adata_train.copy() # Ensure it's a full AnnData object + adata = adata.copy() # Ensure it's a full AnnData object + + adata_train.layers['X_norm'] = adata_train.X.copy() + adata_test.layers['X_norm'] = adata_test.X.copy() + adata.layers['X_norm'] = adata.X.copy() + + if file_name in ['norman', 'adamson']: + adata.write(f'resources/grn_benchmark/evaluation_datasets/{file_name}_sc.h5ad') + + adata_bulked.write(f'resources/extended_data/{file_name}_bulk.h5ad') + adata_train.write(f'resources/grn_benchmark/inference_datasets/{file_name}_rna.h5ad') + adata_test.write(f'resources/grn_benchmark/evaluation_datasets/{file_name}_bulk.h5ad') + + +def main(par): + for file_name in par['datasets']: + process_dataset(file_name) + +if __name__ == '__main__': + + par = { + 'datasets': ['norman', 'adamson', 'nakatake'] + } + + + main(par) \ No newline at end of file diff --git a/src/process_data/replogle_k562_gwps/run.sh b/src/process_data/replogle_k562_gwps/run.sh new file mode 100644 index 000000000..bfc360f83 --- /dev/null +++ b/src/process_data/replogle_k562_gwps/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +#SBATCH --job-name=run_replogle +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=10 +#SBATCH --time=48:00:00 +#SBATCH 
--mem=500GB
+#SBATCH --partition=cpu
+#SBATCH --mail-type=END,FAIL
+#SBATCH --mail-user=jalil.nourisa@gmail.com
+
+# python src/process_data/replogle_k562_gwps/sparsify_raw_counts.py  # run once at the beginning to convert the downloaded data to sparsified raw counts
+
+python src/process_data/replogle_k562_gwps/script.py \
+    --input resources/datasets_raw/replogle_K562_gwps_raw_singlecell.h5ad \
+    --tf_all resources/grn_benchmark/prior/tf_all.csv \
+    --adata_bulk resources/extended_data/replogle_bulk.h5ad \
+    --adata_test_sc resources/grn_benchmark/evaluation_datasets/replogle_sc.h5ad \
+    --adata_test_bulk resources/grn_benchmark/evaluation_datasets/replogle_bulk.h5ad \
+    --adata_train_bulk resources/grn_benchmark/inference_datasets/replogle_rna.h5ad \
+    --adata_train_sc resources/extended_data/replogle_train_sc.h5ad
\ No newline at end of file
diff --git a/src/process_data/replogle_k562_gwps/script.py b/src/process_data/replogle_k562_gwps/script.py
new file mode 100644
index 000000000..a5779276d
--- /dev/null
+++ b/src/process_data/replogle_k562_gwps/script.py
@@ -0,0 +1,153 @@
+import sys
+import anndata as ad
+
+import argparse
+import numpy as np
+from sklearn.model_selection import train_test_split
+import scanpy as sc
+import gc
+
+## VIASH START
+argparser = argparse.ArgumentParser()
+argparser.add_argument('--input', type=str, required=True)
+argparser.add_argument('--tf_all', type=str, required=True)
+argparser.add_argument('--adata_bulk', type=str, required=True)
+argparser.add_argument('--adata_test_sc', type=str, required=True)
+argparser.add_argument('--adata_test_bulk', type=str, required=True)
+argparser.add_argument('--adata_train_sc', type=str, required=True)
+argparser.add_argument('--adata_train_bulk', type=str, required=True)
+
+args = argparser.parse_args()
+par = vars(args)
+## VIASH END
+
+meta = {
+    'resources_dir': 'src/utils/'
+}
+sys.path.append(meta["resources_dir"])
+
+from util import sum_by
+
+
+def format_raw_data(adata: ad.AnnData) -> ad.AnnData:
+    '''
+    Format the raw data
+    '''
+    tf_all = np.loadtxt(par['tf_all'], dtype=str)
+    adata.var['gene_name'] = adata.var['gene_name'].astype(str)
+    adata.var = adata.var.reset_index().set_index('gene_name')[['gene_id','chr', 'start', 'end']]
+    adata.obs = adata.obs.rename(columns={'gene': 'perturbation'})[['perturbation']]
+    adata.obs['perturbation_type'] = 'knockdown'
+    adata.obs['is_tf'] = adata.obs['perturbation'].isin(tf_all)
+    adata.var_names_make_unique()
+    adata.var['gene_id'] = adata.var['gene_id'].astype(str)
+    adata.obs['is_control'] = adata.obs['perturbation']=='non-targeting'
+    return adata
+
+def split_data(adata: ad.AnnData):
+    '''
+    Split the data into train and test
+    '''
+    unique_perts = adata.obs['perturbation'].unique()
+
+    tf_all = adata.obs[adata.obs['is_tf']]['perturbation'].unique()
+    non_tfs = np.setdiff1d(unique_perts, tf_all)
+
+    n_tfs_in_test = int(len(tf_all)/2)
+
+    test_tfs = np.random.choice(tf_all, size=n_tfs_in_test, replace=False)
+    ctrs = ['non-targeting']
+    test_tfs = np.concatenate([test_tfs, ctrs])
+
+    print(f"Test TFs: {n_tfs_in_test}, Train TFs: {len(tf_all) - n_tfs_in_test}")
+
+    adata_test = adata[adata.obs['perturbation'].isin(test_tfs)]
+    adata_train = adata[~adata.obs['perturbation'].isin(test_tfs)]  # all perturbs
+
+    print(f"Train : {adata_train.shape}, Test: {adata_test.shape}")
+
+    return adata_train, adata_test
+
+def psedudobulk_fun(adata: ad.AnnData) -> ad.AnnData:
+    metadata = (
+        adata.obs.groupby('perturbation')
+        .agg(lambda x: x.mode()[0] if x.nunique() == 1 
else x.iloc[0]) + ) + + pseudobulk_data = adata.to_df().groupby(adata.obs['perturbation']).sum() + # Ensure the metadata index matches pseudobulk counts + metadata = metadata.loc[pseudobulk_data.index] + + # Create a new AnnData object for pseudobulked data + adata_bulked = sc.AnnData( + X=pseudobulk_data.values, + obs=metadata.reset_index(), + var=adata.var.copy() + ) + + return adata_bulked + + + +def normalize(adata: ad.AnnData) -> ad.AnnData: + X_norm = sc.pp.normalize_total(adata, inplace=False)['X'] + X_norm = sc.pp.log1p(X_norm, copy=True) + + adata.layers['X_norm'] = X_norm + return adata + +def main(par): + adata = ad.read_h5ad(par['input'], backed='r') + if False: # to test + tf_all = np.loadtxt(par['tf_all'], dtype=str) + filter = adata.obs['gene'][adata.obs['gene'].isin(tf_all)].unique()[0:10] + adata = adata[adata.obs['gene'].isin(filter)].to_memory() + + # Format the raw data + print('Formatting raw data...') + adata = format_raw_data(adata) + + # - process adata + print('Processing data...') + adata_sc = adata.to_memory() + adata_bulk = psedudobulk_fun(adata_sc) + adata_bulk = normalize(adata_bulk) + adata_bulk.write(par['adata_bulk']) + del adata_bulk, adata_sc + gc.collect() + # - data split + print('Splitting data...') + adata_train_sc, adata_test_sc = split_data(adata) + del adata + gc.collect() + + # - process inference data + print('Process inference data...') + adata_train_sc = adata_train_sc.to_memory() + adata_train_bulk = psedudobulk_fun(adata_train_sc) + adata_train_sc = normalize(adata_train_sc) + adata_train_bulk = normalize(adata_train_bulk) + adata_train_bulk.write(par['adata_train_bulk']) + adata_train_sc.write(par['adata_train_sc']) + + del adata_train_sc, adata_train_bulk + gc.collect() + + # - process test data + print('Process test data...') + adata_test_sc = adata_test_sc.to_memory() + adata_test_bulk = psedudobulk_fun(adata_test_sc) + adata_test_sc = normalize(adata_test_sc) + adata_test_bulk = normalize(adata_test_bulk) + adata_test_bulk.write(par['adata_test_bulk']) + adata_test_sc.write(par['adata_test_sc']) + + del adata_test_sc, adata_test_bulk + gc.collect() + + + +if __name__ == '__main__': + main(par) + diff --git a/src/process_data/replogle_k562_gwps/sparsify_raw_counts.py b/src/process_data/replogle_k562_gwps/sparsify_raw_counts.py new file mode 100644 index 000000000..65ca0cc8c --- /dev/null +++ b/src/process_data/replogle_k562_gwps/sparsify_raw_counts.py @@ -0,0 +1,8 @@ + + +import anndata as ad +from scipy import sparse +adata = ad.read_h5ad('resources/datasets_raw/replogle_K562_gwps_raw_singlecell.h5ad') +adata.X = sparse.csr_matrix(adata.X) +adata.write_h5ad('resources/datasets_raw/replogle_K562_gwps_raw_singlecell.h5ad') + diff --git a/src/process_data/perturbation/repo/batch_correction_evaluation/config.novsh.yaml b/src/process_data/repo/batch_correction_evaluation/config.novsh.yaml similarity index 100% rename from src/process_data/perturbation/repo/batch_correction_evaluation/config.novsh.yaml rename to src/process_data/repo/batch_correction_evaluation/config.novsh.yaml diff --git a/src/process_data/perturbation/repo/batch_correction_evaluation/helper.py b/src/process_data/repo/batch_correction_evaluation/helper.py similarity index 100% rename from src/process_data/perturbation/repo/batch_correction_evaluation/helper.py rename to src/process_data/repo/batch_correction_evaluation/helper.py diff --git a/src/process_data/perturbation/repo/batch_correction_evaluation/script.py b/src/process_data/repo/batch_correction_evaluation/script.py 
similarity index 100%
rename from src/process_data/perturbation/repo/batch_correction_evaluation/script.py
rename to src/process_data/repo/batch_correction_evaluation/script.py
diff --git a/src/process_data/perturbation/repo/batch_correction_scgen/config.novsh.yaml b/src/process_data/repo/batch_correction_scgen/config.novsh.yaml
similarity index 96%
rename from src/process_data/perturbation/repo/batch_correction_scgen/config.novsh.yaml
rename to src/process_data/repo/batch_correction_scgen/config.novsh.yaml
index b62352112..a8da67f4a 100644
--- a/src/process_data/perturbation/repo/batch_correction_scgen/config.novsh.yaml
+++ b/src/process_data/repo/batch_correction_scgen/config.novsh.yaml
@@ -30,7 +30,7 @@ functionality:
         required: true
       required: true
       direction: input
-      example: resources_test/evaluation_datasets/op_perturbation.h5ad
+      example: resources_test/grn_benchmark/evaluation_datasets/op_perturbation.h5ad
     - name: --perturbation_data_bc
       type: file
       info:
diff --git a/src/process_data/perturbation/repo/batch_correction_scgen/script.py b/src/process_data/repo/batch_correction_scgen/script.py
similarity index 100%
rename from src/process_data/perturbation/repo/batch_correction_scgen/script.py
rename to src/process_data/repo/batch_correction_scgen/script.py
diff --git a/src/process_data/perturbation/repo/batch_correction_seurat/config.novsh.yaml b/src/process_data/repo/batch_correction_seurat/config.novsh.yaml
similarity index 93%
rename from src/process_data/perturbation/repo/batch_correction_seurat/config.novsh.yaml
rename to src/process_data/repo/batch_correction_seurat/config.novsh.yaml
index 304b903aa..b8379619b 100644
--- a/src/process_data/perturbation/repo/batch_correction_seurat/config.novsh.yaml
+++ b/src/process_data/repo/batch_correction_seurat/config.novsh.yaml
@@ -30,7 +30,7 @@ functionality:
         required: true
       required: true
       direction: input
-      example: resources_test/evaluation_datasets/op_perturbation.h5ad
+      example: resources_test/grn_benchmark/evaluation_datasets/op_perturbation.h5ad
     - name: --perturbation_data_bc
       __merge__: ../../../api/file_evaluation_h5ad.yaml
       required: false
diff --git a/src/process_data/perturbation/repo/batch_correction_seurat/script.R b/src/process_data/repo/batch_correction_seurat/script.R
similarity index 100%
rename from src/process_data/perturbation/repo/batch_correction_seurat/script.R
rename to src/process_data/repo/batch_correction_seurat/script.R
diff --git a/src/process_data/test_data/run.sh b/src/process_data/test_data/run.sh
index 510d25df9..a97b2c5f8 100644
--- a/src/process_data/test_data/run.sh
+++ b/src/process_data/test_data/run.sh
@@ -1,7 +1,7 @@
 viash run src/process_data/test_data/config.novsh.yaml -- \
-    --rna resources/inference_datasets/op_rna.h5ad --rna_test resources_test/inference_datasets/op_rna.h5ad \
-    --atac resources/inference_datasets/op_atac.h5ad --atac_test resources_test/inference_datasets/op_atac.h5ad \
-    --perturbation_data resources/evaluation_datasets/op_perturbation.h5ad --perturbation_data_test resources_test/evaluation_datasets/op_perturbation.h5ad \
-    --multiomics_counts resources/datasets_raw/op_multiome_sc_counts.h5ad --multiomics_counts_test resources_test/datasets_raw/op_multiome_sc_counts.h5ad \
+    --rna resources/grn_benchmark/inference_datasets/op_rna.h5ad --rna_test resources_test/grn_benchmark/inference_datasets/op_rna.h5ad \
+    --atac resources/grn_benchmark/inference_datasets/op_atac.h5ad --atac_test resources_test/grn_benchmark/inference_datasets/op_atac.h5ad \
+    --perturbation_data 
resources/grn_benchmark/evaluation_datasets/op_perturbation.h5ad --perturbation_data_test resources_test/grn_benchmark/evaluation_datasets/op_perturbation.h5ad \
+    --multiomics_counts resources/grn_benchmark/datasets_raw/op_multiome_sc_counts.h5ad --multiomics_counts_test resources_test/grn_benchmark/datasets_raw/op_multiome_sc_counts.h5ad \
     # --perturbation_counts resources/datasets_raw/op_perturbation_sc_counts.h5ad --perturbation_counts_test resources_test/datasets_raw/op_perturbation_sc_counts.h5ad \
-    --perturbation_counts 'resources/datasets_raw/adamson_sc_counts.h5ad' --perturbation_counts_test resources_test/datasets_raw/adamson_sc_counts.h5ad
+    # --perturbation_counts 'resources/datasets_raw/adamson_sc_counts.h5ad' --perturbation_counts_test resources_test/datasets_raw/adamson_sc_counts.h5ad
diff --git a/src/process_data/test_data/script.py b/src/process_data/test_data/script.py
index 0af3a31e3..b893d00fa 100644
--- a/src/process_data/test_data/script.py
+++ b/src/process_data/test_data/script.py
@@ -12,14 +12,14 @@
 ## VIASH START
 par = {
-    'rna': 'resources/inference_datasets/op_rna.h5ad',
-    'rna_test': 'resources_test/inference_datasets/op_rna.h5ad',
+    'rna': 'resources/grn_benchmark/inference_datasets/op_rna.h5ad',
+    'rna_test': 'resources_test/grn_benchmark/inference_datasets/op_rna.h5ad',
 
-    'atac': 'resources/inference_datasets/op_atac.h5ad',
-    'atac_test': 'resources_test/inference_datasets/op_atac.h5ad',
+    'atac': 'resources/grn_benchmark/inference_datasets/op_atac.h5ad',
+    'atac_test': 'resources_test/grn_benchmark/inference_datasets/op_atac.h5ad',
 
-    'perturbation_data': 'resources/evaluation_datasets/op_perturbation.h5ad',
-    'perturbation_data_test': 'resources_test/evaluation_datasets/op_perturbation.h5ad',
+    'perturbation_data': 'resources/grn_benchmark/evaluation_datasets/op.h5ad',
+    'perturbation_data_test': 'resources_test/grn_benchmark/evaluation_datasets/op.h5ad',
 
     'multiomics_counts': 'resources/datasets_raw/op_multiome_sc_counts.h5ad',
     'multiomics_counts_test': 'resources_test/datasets_raw/op_multiome_sc_counts.h5ad',
diff --git a/src/robustness_analysis/script_all.py b/src/robustness_analysis/script_all.py
index 410344ffc..61bb53a6e 100644
--- a/src/robustness_analysis/script_all.py
+++ b/src/robustness_analysis/script_all.py
@@ -17,19 +17,19 @@
     'methods': ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],
     # 'methods': ['pearson_corr'],
-    "evaluation_data": "resources/evaluation_datasets/op_perturbation.h5ad",
-    "tf_all": "resources/prior/tf_all.csv",
+    "evaluation_data": "resources/grn_benchmark/evaluation_datasets/op_perturbation.h5ad",
+    "tf_all": "resources/grn_benchmark/prior/tf_all.csv",
     "max_n_links": 50000,
     "apply_tf": False,
     'binarize': False,
     'subsample': -1,
     'verbose': 0,
     'num_workers': 20,
-    'consensus': 'resources/prior/op_consensus-num-regulators.json',
+    'consensus': 'resources/grn_benchmark/prior/op_consensus-num-regulators.json',
     'static_only': True,
     'layer': 'X_norm',
     'apply_skeleton': False,
-    'skeleton': 'resources/prior/skeleton.csv'
+    'skeleton': 'resources/grn_benchmark/prior/skeleton.csv'
 }
 
 meta = {
diff --git a/src/utils/util.py b/src/utils/util.py
index f21567195..1ee1d4a7a 100644
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -14,7 +14,13 @@
     '#D55E00', # Vermillion
     '#CC79A7'] # Reddish Purple
 
-
+def binarize_weight(weight):
+    if weight > 0:
+        return 1
+    elif weight < 0:
+        return -1
+    else:
+        return 0
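+# binarize_weight maps an edge weight to its sign (+1, -1, or 0); regression_1's
+# read_net applies it to predicted links when par['binarize'] is set.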
 def verbose_print(verbose_level, message, level):
     if level <= verbose_level:
         print(message)
@@ -116,3 +122,48 @@ def read_gmt(file_path:str) -> dict[str, list[str]]:
             'genes': genes
         }
     return gene_sets
+def sum_by(adata: ad.AnnData, col: str) -> ad.AnnData:
+    """
+    Sum the rows of `adata.X` that share the same value of the categorical
+    obs column `col` (e.g. pseudobulking cells per group).
+    Adapted from this forum post:
+    https://discourse.scverse.org/t/group-sum-rows-based-on-jobs-feature/371/4
+    """
+    assert pd.api.types.is_categorical_dtype(adata.obs[col])
+    from scipy import sparse
+
+    # sum `.X` entries for each unique value in `col`, using a sparse
+    # (n_categories x n_obs) indicator matrix
+    cat = adata.obs[col].values
+
+    indicator = sparse.coo_matrix(
+        (
+            np.broadcast_to(True, adata.n_obs),
+            (cat.codes, np.arange(adata.n_obs))
+        ),
+        shape=(len(cat.categories), adata.n_obs),
+    )
+
+    sum_adata = ad.AnnData(
+        indicator @ adata.X,
+        var=adata.var,
+        obs=pd.DataFrame(index=cat.categories),
+    )
+
+    # copy over `.obs` values that have a one-to-one mapping with `.obs[col]`
+    obs_cols = list(set(adata.obs.columns) - set([col]))
+
+    one_to_one_mapped_obs_cols = []
+    nunique_in_col = adata.obs[col].nunique()
+    for other_col in obs_cols:
+        if len(adata.obs[[col, other_col]].drop_duplicates()) == nunique_in_col:
+            one_to_one_mapped_obs_cols.append(other_col)
+
+    joining_df = adata.obs[[col] + one_to_one_mapped_obs_cols].drop_duplicates().set_index(col)
+    assert (sum_adata.obs.index == sum_adata.obs.join(joining_df).index).all()
+    sum_adata.obs = sum_adata.obs.join(joining_df)
+    sum_adata.obs.index.name = col
+    sum_adata.obs = sum_adata.obs.reset_index()
+    sum_adata.obs.index = sum_adata.obs.index.astype('str')
+
+    return sum_adata
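+
+# Illustrative usage of sum_by (a sketch, not part of the pipeline; the
+# 'cell_type' column and its values are placeholders):
+#   import anndata as ad, numpy as np, pandas as pd
+#   from scipy import sparse
+#   X = sparse.random(6, 4, density=0.5, format='csr')
+#   obs = pd.DataFrame({'cell_type': pd.Categorical(['B', 'B', 'T', 'T', 'T', 'NK'])})
+#   pb = sum_by(ad.AnnData(X=X, obs=obs), 'cell_type')  # one summed row per category
\ No newline at end of file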
diff --git a/src/workflows/run_benchmark/config.novsh.yaml b/src/workflows/run_benchmark/config.novsh.yaml
index f3ff99b97..9f0d341bb 100644
--- a/src/workflows/run_benchmark/config.novsh.yaml
+++ b/src/workflows/run_benchmark/config.novsh.yaml
@@ -38,12 +38,12 @@ functionality:
         type: file
         required: false
         direction: input
-        default: resources/prior/tf_all.csv
+        default: resources/grn_benchmark/prior/tf_all.csv
       - name: --consensus
         type: file
         required: false
         direction: input
-        default: resources/prior/consensus.json
+        default: resources/grn_benchmark/prior/consensus.json
       - name: --layer
         type: string
         required: false
diff --git a/src/workflows/run_robustness_analysis/config.novsh.yaml b/src/workflows/run_robustness_analysis/config.novsh.yaml
index a69cc1ba1..3abb9111e 100644
--- a/src/workflows/run_robustness_analysis/config.novsh.yaml
+++ b/src/workflows/run_robustness_analysis/config.novsh.yaml
@@ -38,7 +38,7 @@ functionality:
         type: file
         required: false
         direction: input
-        default: resources/prior/consensus.json
+        default: resources/grn_benchmark/prior/consensus.json
      - name: --degree
        type: integer
        required: false
diff --git a/src/methods/script_all.py b/src/workflows_local/benchmark/methods/script.py
similarity index 79%
rename from src/methods/script_all.py
rename to src/workflows_local/benchmark/methods/script.py
index a2350c88b..f6dbf6fc0 100644
--- a/src/methods/script_all.py
+++ b/src/workflows_local/benchmark/methods/script.py
@@ -4,19 +4,31 @@
 import subprocess
 import anndata as ad
 import numpy as np
+import argparse
+argparser = argparse.ArgumentParser()
+argparser.add_argument('--force', action='store_true', help='Force overwrite existing GRN predictions')
+argparser.add_argument('--sbatch', action='store_true', help='Submit jobs to SLURM')
+argparser.add_argument('--partition', type=str, default='cpu', help='Partition to submit jobs to')
+argparser.add_argument('--methods', type=str, nargs='+', required=True, help='Methods to run')
+argparser.add_argument('--datasets', type=str, nargs='+', required=True, help='Datasets to run inference on')
+args = argparser.parse_args()
+par = vars(args)
 
-def run_grn_inference(dataset='op', subsample=None):
-    par = {
+
+
+def run_grn_inference(par, dataset='op', subsample=None):
+    par_local = {
         'models_dir': f'resources/grn_models/{dataset}/',
-        'rna': f'resources/inference_datasets/{dataset}_rna.h5ad',
-        'atac': f'resources/inference_datasets/{dataset}_atac.h5ad',
+        'rna': f'resources/grn_benchmark/inference_datasets/{dataset}_rna.h5ad',
+        'atac': f'resources/grn_benchmark/inference_datasets/{dataset}_atac.h5ad',
         'rna_positive_control': f'resources/datasets_raw/{dataset}.h5ad',
         'num_workers': 10,
         'tmp_dir': 'output/grn_inference'
         # 'max_n_links': 10000,
     }
-    
+    par.update(par_local)
+
     os.makedirs(par['models_dir'], exist_ok=True)
     os.makedirs(par['tmp_dir'], exist_ok=True)
@@ -54,14 +66,14 @@ def run_grn_inference(dataset='op', subsample=None):
 
         par['rna'] = new_rna
 
-    for method in methods:
+    for method in par['methods']:
         print(method)
         if subsample is None:
             par['prediction'] = f"{par['models_dir']}/{method}.h5ad"
         else:
             par['prediction'] = f"{par['models_dir']}/{method}_{subsample}.h5ad"
 
-        if (force == False) & (os.path.exists(par['prediction'])):
+        if not par['force'] and os.path.exists(par['prediction']):
             continue
 
         # Method arguments
@@ -126,7 +138,7 @@ def run_grn_inference(dataset='op', subsample=None):
             # Prepare sbatch command
             tag = f"--job-name={method}"  # No spaces around '='
             resources = (f"--cpus-per-task={par['num_workers']} "
-                         f"--mem={mem} --time={time} --partition={partition}")
+                         f"--mem={mem} --time={time} --partition={par['partition']}")
 
             # Combine tags and resources
             full_tag = [tag] + resources.split()
@@ -136,7 +148,7 @@ def run_grn_inference(dataset='op', subsample=None):
                 full_tag += ["--partition=gpu", "--gres=gpu:1"]
 
         try:
-            if sbatch:
+            if par['sbatch']:
                 result = subprocess.run(['sbatch'] + full_tag + ['scripts/sbatch/grn_inference.sh', command], check=True, capture_output=True, text=True)
             else:
                 result = subprocess.run(['bash'] + ['scripts/sbatch/grn_inference.sh', command], check=True, capture_output=True, text=True)
@@ -152,27 +164,21 @@ def run_grn_inference(dataset='op', subsample=None):
             print(f"Command output: {e.output}")
             print(f"Command error output: {e.stderr}")
 
-if __name__ == '__main__':
-    force = True
-    sbatch = False
-    # methods = ["positive_control", "negative_control", "pearson_corr", "portia", "grnboost2", "ppcor", "scenic"],
-    # methods = ["portia", "grnboost2"]
-    methods = ["scprint"]
-    datasets = ['norman'] # 'replogle2', 'norman', 'adamson', 'nakatake', 'op'
-
-    partition='cpu'
-
+def main(par):
+    for dataset in par['datasets']:
+        run_grn_inference(par, dataset, subsample=None)
 
-    if True: # normal run
-        for dataset in datasets:
-            run_grn_inference(dataset, subsample=None)
 
-    if False: # subsample
-        # for dataset in ['replogle2', 'norman', 'adamson', 'nakatake']: # 'replogle2' 'op' norman
-        for dataset in ['op']:
-            if dataset == 'op':
-                for subsample in [1, 2]: # number of donors
-                    run_grn_inference(dataset, subsample=subsample)
-            else:
-                for subsample in [0.2, 0.5, 1.0]:
-                    run_grn_inference(dataset, subsample=subsample)
\ No newline at end of file
+if __name__ == '__main__':
+    print(par)
+    main(par)
+
+    # if False: # subsample
+    #     # for dataset in ['replogle2', 'norman', 'adamson', 'nakatake']: # 'replogle2' 'op' norman
+    #     for dataset in ['op']:
+    #         if dataset == 'op':
+    #             for subsample in [1, 2]: # number of donors
+    #                 run_grn_inference(dataset, subsample=subsample)
+    #         else:
+    #             for subsample in [0.2, 0.5, 1.0]:
+    #                 run_grn_inference(dataset, subsample=subsample)
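+
+# Illustrative invocation (a sketch; method/dataset names must match the
+# configs under src/methods and the files under resources/grn_benchmark):
+#   python src/workflows_local/benchmark/methods/script.py \
+#       --datasets op norman --methods pearson_corr portia --partition cpu --sbatch
\ No newline at end of file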
diff --git a/src/workflows_local/benchmark/metrics/script.py b/src/workflows_local/benchmark/metrics/script.py
new file mode 100644
index 000000000..7ec92150f
--- /dev/null
+++ b/src/workflows_local/benchmark/metrics/script.py
@@ -0,0 +1,241 @@
+import pandas as pd
+import anndata as ad
+import sys
+import numpy as np
+import os
+import subprocess
+
+import argparse
+argparser = argparse.ArgumentParser()
+
+argparser.add_argument('--datasets', nargs='+', help='List of datasets to include', required=True)
+argparser.add_argument('--methods', nargs='+', help='List of methods to include', required=True)
+argparser.add_argument('--run_consensus_flag', action='store_true', help='Run consensus')
+argparser.add_argument('--save_scores_file', help='Save file name', required=True)
+args = argparser.parse_args()
+par = vars(args)
+
+par['temp_dir'] = 'output/temp'
+os.makedirs(par['temp_dir'], exist_ok=True)
+
+meta = {
+    "resources_dir": './',
+    "util": 'src/utils'
+}
+sys.path.append(meta["resources_dir"])
+sys.path.append(meta["util"])
+
+dependencies = {
+    'all_metrics': 'src/metrics/all_metrics/script.py'
+}
+
+# - run consensus
+from src.metrics.regression_2.consensus.script import main as main_consensus
+
+
+global_models = [
+    'collectri',
+    'Ananse:Lung',
+    'Ananse:Stomach',
+    'Ananse:Heart',
+    'Ananse:Bone marrow',
+    'Gtex:Whole blood',
+    'Gtex:Brain amygdala',
+    'Gtex:Breast mammary tissue',
+    'Gtex:Lung',
+    'Gtex:Stomach',
+    'Cellnet:Bcell',
+    'Cellnet:Tcell',
+    'Cellnet:Skin',
+    'Cellnet:Neuron',
+    'Cellnet:Heart'
+]
+
+
+def define_par(dataset):
+
+    par_local = {
+        "evaluation_dataset": f"resources/grn_benchmark/evaluation_datasets/{dataset}_bulk.h5ad",
+        "evaluation_dataset_sc": f"resources/grn_benchmark/evaluation_datasets/{dataset}_sc.h5ad",
+        'regulators_consensus': f'resources/grn_benchmark/prior/regulators_consensus_{dataset}.json',
+        'ws_consensus': f'resources/grn_benchmark/prior/ws_consensus_{dataset}.json',
+        'ws_distance_background': f'resources/grn_benchmark/prior/ws_distance_background_{dataset}.json',
+
+        'layer': 'X_norm',
+        "tf_all": "resources/grn_benchmark/prior/tf_all.csv",
+        'skeleton': 'resources/grn_benchmark/prior/skeleton.csv',
+        "apply_tf": True,
+        'subsample': -1,
+        'verbose': 4,
+        'num_workers': 20
+    }
+
+    return par_local
+
+def run_consensus(dataset):
+    par_local = define_par(dataset)
+    main_consensus(par_local)
+
+def run_metrics(par):
+    par['score'] = f"output/score_{par['dataset_id']}_{par['method_id']}.h5ad"
+    args = f"--run_local --prediction {par['prediction']} \
+            --dataset_id {par['dataset_id']} \
+            --method_id {par['method_id']} \
+            --evaluation_dataset {par['evaluation_dataset']} \
+            --evaluation_dataset_sc {par['evaluation_dataset_sc']} \
+            --regulators_consensus {par['regulators_consensus']} \
+            --ws_consensus {par['ws_consensus']} \
+            --ws_distance_background {par['ws_distance_background']} \
+            --score {par['score']} "
+
+    command = f"python {dependencies['all_metrics']} {args}"
+
+    print(command)
+
+    result = subprocess.run(command, shell=True, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print("STDOUT:", result.stdout)
+        print("STDERR:", result.stderr)
+        raise RuntimeError(f"Error: all_metrics run failed with exit code {result.returncode}")
+    print('all_metrics run completed')
+
+    score = ad.read_h5ad(par['score']).uns['score']
+
+    return score
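+
+# run_metrics assumes the all_metrics script writes an AnnData whose
+# .uns['score'] holds a one-row pandas DataFrame of metric values, e.g.
+# (column names and numbers purely illustrative):
+#   ad.read_h5ad('output/score_op_pearson_corr.h5ad').uns['score']
+#   #    r1_all   r2_all
+#   # 0  0.12     0.34
+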
+
+def run_evaluation(dataset, binarize=False, max_n_links=50000, apply_skeleton=False, reg_type='ridge'):
+    print('------ ', dataset, '------')
+    par_local = define_par(dataset)
+    par.update(par_local)
+
+    par['binarize'] = binarize
+    par['max_n_links'] = max_n_links
+    par['apply_skeleton'] = apply_skeleton
+    par['reg_type'] = reg_type
+
+    # - determine models to run (models_dir is set per dataset in __main__ below)
+    grn_files_dict = {}
+    for model in par['methods']:
+        print(model)
+        grn_file = f"{models_dir}/{model}.h5ad"
+        if not os.path.exists(grn_file):
+            print(f"{grn_file} doesn't exist. Skipped.")
+            continue
+        grn_files_dict[model] = grn_file
+
+    # - actual runs
+    i = 0
+    for model, grn_file in grn_files_dict.items():
+        par['prediction'] = grn_file
+        par['method_id'] = model
+        par['dataset_id'] = dataset
+
+        score = run_metrics(par)
+
+        score.index = [model]
+        if i == 0:
+            df_all = score
+        else:
+            df_all = pd.concat([df_all, score])
+        print(df_all)
+
+        i += 1
+    return df_all
+
+
+if __name__ == '__main__':
+    # - define settings
+    # models = ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus']
+
+    for i, dataset in enumerate(par['datasets']):
+        models_dir = f"resources/grn_models/{dataset}"
+        if par['run_consensus_flag']:
+            run_consensus(dataset)
+
+        df_dataset = run_evaluation(dataset)
+        df_dataset['dataset'] = dataset
+        if i == 0:
+            df_all = df_dataset
+        else:
+            df_all = pd.concat([df_all, df_dataset])
+    df_all.to_csv(par['save_scores_file'])
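+
+    # Illustrative invocation (a sketch; see run.sh next to this script):
+    #   python src/workflows_local/benchmark/metrics/script.py \
+    #       --datasets op --methods pearson_corr portia \
+    #       --run_consensus_flag --save_scores_file output/scores.csv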
f"resources/grn_models/{dataset}" + # scores_dir = f"resources/scores/{dataset}" + # save_scores_file = f"{scores_dir}/X_norm-50000-skeleton_True-binarize_False-GB-global-False.csv" + + # # run_consensus(dataset) + # run_evaluation(dataset, models, models_dir, scores_dir, save_scores_file, apply_skeleton=True, reg_type='GB') + + + + +# - repo +# models_dir = f"../ciim/output/grns/" + # scores_dir = f"../ciim/output/scores/" + # models = [ + # 'pearson_corr', + # 'grnboost2', 'celloracle', 'scenicplus', + # 'net_all_celltypes_young_all_batches', + # 'net_all_celltypes_young_batch_1', + # 'net_all_celltypes_old_all_batches', + # 'net_all_celltypes_old_batch_1', + # 'net_B cells_all_ages_all_batches', + # 'net_T cells_all_ages_all_batches', + # 'net_Myeloid cells_all_ages_all_batches', + # 'net_NK cells_all_ages_all_batches'], \ No newline at end of file diff --git a/src/workflows_local/benchmark/run.sh b/src/workflows_local/benchmark/run.sh new file mode 100644 index 000000000..d6c8ed87c --- /dev/null +++ b/src/workflows_local/benchmark/run.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH --job-name=workflow_local +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=10 +#SBATCH --time=10:00:00 +#SBATCH --mem=100GB +#SBATCH --partition=cpu +#SBATCH --mail-type=END,FAIL +#SBATCH --mail-user=jalil.nourisa@gmail.com + +set -e + + +DATASETS=( + "op" +) +METHODS=( + "scprint" +) + +SAVE_SCORES_FILE="output/scores.h5ad" + +FORCE=true +RUN_CONSENSUS_FLAG=False + + +# ----- run methods ----- +# cmd="python src/workflows_local/benchmark/methods/script.py +# --datasets ${DATASETS[@]} +# --methods ${METHODS[@]}" + +# [ "$FORCE" = true ] && cmd="${cmd} --force" + +# echo "Running: $cmd" +# $cmd + +# ----- run metrics ----- +cmd="python src/workflows_local/benchmark/metrics/script.py + --datasets ${DATASETS[@]} + --methods ${METHODS[@]} + --save_scores_file ${SAVE_SCORES_FILE}" + +[ "$RUN_CONSENSUS_FLAG" = true ] && cmd="${cmd} --run_consensus_flag" + +echo "Running: $cmd" +$cmd \ No newline at end of file diff --git a/src/workflows_local/run_all_local.sh b/src/workflows_local/run_all_local.sh new file mode 100644 index 000000000..f982fc773 --- /dev/null +++ b/src/workflows_local/run_all_local.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=run_all_local +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=10 +#SBATCH --time=12:00:00 +#SBATCH --mem=500GB +#SBATCH --partition=cpu +#SBATCH --mail-type=END,FAIL +#SBATCH --mail-user=jalil.nourisa@gmail.com + + +set -e +# - data preprocessing +echo "Running opsc_perturbation ..." +python src/process_data/opsca_perturbation/script.py +echo "Running replogle_k562_gwps ..." +bash src/process_data/replogle_k562_gwps/run.sh +echo "Running pereggrn ..." +bash src/process_data/pereggrn/run.sh +echo "Running test_data ..." +python src/process_data/test_data/script.py + +# - GRN inference and evaluation +echo "Running workflows_local ..." 
+bash src/workflows_local/benchmark/run.sh + diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 000000000..ba802058c --- /dev/null +++ b/test.ipynb @@ -0,0 +1,79 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import anndata as ad\n", + "adata = ad.read_h5ad('resources/grn_benchmark/evaluation_datasets/replogle_sc.h5ad')\n", + "adata.obs['is_control'] = adata.obs['perturbation']=='non-targeting'\n", + " \n", + "mask_sample = adata.obs['is_control']\n", + "mask_sample.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jnourisa/miniconda3/envs/gimme/lib/python3.10/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.\n", + " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "(585, 8244)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata = ad.read_h5ad('resources/grn_benchmark/inference_datasets/replogle2_rna.h5ad')\n", + "adata[adata.obs['is_control']]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}