From cb0cafc792311922e3d17bf11f078864c0fec9b5 Mon Sep 17 00:00:00 2001
From: Jalil Nourisa
Date: Thu, 6 Feb 2025 19:50:44 +0100
Subject: [PATCH] local workflow debugged

---
 README.md                                     |   2 +-
 _viash.yaml                                   |   4 +-
 runs.ipynb                                    | 108 +++++++++-----
 scripts/run_benchmark_all.sh                  |   2 +-
 scripts/run_grn_evaluation.sh                 |   4 +-
 scripts/single_grn_evaluation.sh              |   4 +-
 src/api/file_evaluation_h5ad.yaml             |   2 +-
 src/control_methods/pearson_corr/script.py    |   2 +-
 src/helper.py                                 |   8 +-
 src/metrics/all_metrics/helper.py             |   2 +-
 src/metrics/all_metrics/run.sh                |  14 ++
 src/metrics/all_metrics/script.py             |  18 +--
 src/metrics/all_metrics/script_all.py         |   6 +-
 src/metrics/regression_1/helper.py            |  19 ++-
 src/metrics/regression_1/run.sh               |  19 ++-
 src/metrics/regression_1/script.py            |  37 ++---
 src/metrics/regression_2/consensus/script.py  |   2 +-
 src/metrics/regression_2/main.py              | 138 ++++++++----------
 src/metrics/regression_2/run.sh               |  19 ++-
 src/metrics/regression_2/script.py            |  40 +++--
 .../wasserstein/background_distance/run.sh    |   4 +-
 .../wasserstein/background_distance/script.py |  58 ++++++--
 src/metrics/wasserstein/main.py               |   1 +
 src/metrics/wasserstein/run.sh                |  16 +-
 src/metrics/wasserstein/script.py             |  47 ++++--
 src/process_data/opsca_perturbation/script.py |   2 +-
 src/process_data/pereggrn/script.py           |   6 +-
 src/process_data/replogle_k562_gwps/run.sh    |   4 +-
 .../batch_correction_scgen/config.novsh.yaml  |   2 +-
 .../batch_correction_seurat/config.novsh.yaml |   2 +-
 src/process_data/test_data/run.sh             |   2 +-
 src/process_data/test_data/script.py          |   4 +-
 src/robustness_analysis/script_all.py         |   2 +-
 .../benchmark/methods/script.py               |   2 +-
 .../benchmark/metrics/script.py               |  92 +++++++++---
 src/workflows_local/benchmark/run.sh          |  15 +-
 test.ipynb                                    |   4 +-
 37 files changed, 449 insertions(+), 264 deletions(-)
 create mode 100644 src/metrics/all_metrics/run.sh

diff --git a/README.md b/README.md
index e776af892..21d1e372a 100644
--- a/README.md
+++ b/README.md
@@ -245,7 +245,7 @@ Data structure:
 
 Perturbation dataset for benchmarking. 
 
-Example file: `resources_test/grn_benchmark/evaluation_datasets//op_perturbation.h5ad` +Example file: `resources_test/grn_benchmark/evaluation_data//op.h5ad` Format: diff --git a/_viash.yaml b/_viash.yaml index bb607c5e0..bab148809 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -32,8 +32,8 @@ info: path: s3://openproblems-data/resources_test/grn/inference_datasets/ dest: resources_test/grn_benchmark/inference_datasets// - type: s3 - path: s3://openproblems-data/resources_test/grn/evaluation_datasets/ - dest: resources_test/grn_benchmark/evaluation_datasets// + path: s3://openproblems-data/resources_test/grn/evaluation_data/ + dest: resources_test/grn_benchmark/evaluation_data// - type: s3 path: s3://openproblems-data/resources_test/grn/prior/ dest: resources_test/grn_benchmark/prior/ diff --git a/runs.ipynb b/runs.ipynb index ce1a93a00..63654579d 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -14,11 +14,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "upload: resources/grn_benchmark/inference_datasets/op_rna.rds to s3://openproblems-data/resources/grn/grn_benchmark/inference_datasets/op_rna.rds\n", + "upload: resources/grn_benchmark/inference_datasets/op_atac.rds to s3://openproblems-data/resources/grn/grn_benchmark/inference_datasets/op_atac.rds\n", + "upload: resources/grn_benchmark/prior/op/annot_peak_database.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/op/annot_peak_database.csv\n", + "upload: resources/grn_benchmark/prior/op/cell_topic.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/op/cell_topic.csv\n", + "upload: resources/grn_benchmark/prior/op/peaks.bed to s3://openproblems-data/resources/grn/grn_benchmark/prior/op/peaks.bed\n", + "upload: resources/grn_benchmark/prior/op/peak_annotation.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/op/peak_annotation.csv\n", + "upload: resources/grn_benchmark/prior/op/peaks.txt to s3://openproblems-data/resources/grn/grn_benchmark/prior/op/peaks.txt\n", + "upload: resources/grn_benchmark/prior/regulators_consensus_adamson.json to s3://openproblems-data/resources/grn/grn_benchmark/prior/regulators_consensus_adamson.json\n", + "upload: resources/grn_benchmark/prior/regulators_consensus_nakatake.json to s3://openproblems-data/resources/grn/grn_benchmark/prior/regulators_consensus_nakatake.json\n", + "upload: resources/grn_benchmark/prior/regulators_consensus_norman.json to s3://openproblems-data/resources/grn/grn_benchmark/prior/regulators_consensus_norman.json\n", + "upload: resources/grn_benchmark/prior/regulators_consensus_op.json to s3://openproblems-data/resources/grn/grn_benchmark/prior/regulators_consensus_op.json\n", + "upload: resources/grn_benchmark/prior/regulators_consensus_replogle2.json to s3://openproblems-data/resources/grn/grn_benchmark/prior/regulators_consensus_replogle2.json\n", + "upload: resources/grn_benchmark/prior/tf_all.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/tf_all.csv\n", + "upload: resources/grn_benchmark/inference_datasets/replogle_rna.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/inference_datasets/replogle_rna.h5ad\n", + "upload: resources/grn_benchmark/prior/ws_consensus_adamson.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/ws_consensus_adamson.csv\n", + "upload: resources/grn_benchmark/prior/ws_consensus_norman.csv to 
s3://openproblems-data/resources/grn/grn_benchmark/prior/ws_consensus_norman.csv\n", + "upload: resources/grn_benchmark/prior/ws_distance_background_adamson.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/ws_distance_background_adamson.csv\n", + "upload: resources/grn_benchmark/prior/ws_distance_background_norman.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/ws_distance_background_norman.csv\n", + "upload: resources/grn_benchmark/prior/skeleton.csv to s3://openproblems-data/resources/grn/grn_benchmark/prior/skeleton.csv\n", + "upload: resources/grn_benchmark/inference_datasets/op_rna.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/inference_datasets/op_rna.h5ad\n", + "upload: resources/grn_benchmark/inference_datasets/op_atac.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/inference_datasets/op_atac.h5ad\n", + "upload: resources/grn_benchmark/evaluation_datasets/replogle_sc.h5ad to s3://openproblems-data/resources/grn/grn_benchmark/evaluation_datasets/replogle_sc.h5ad\n" + ] + } + ], "source": [ - "!aws s3 sync s3://openproblems-data/resources_test/grn/ resources_test/ --delete" + "!aws s3 sync resources/grn_benchmark s3://openproblems-data/resources/grn/grn_benchmark --delete" ] }, { @@ -32,7 +61,7 @@ "# !aws s3 sync resources/grn_benchmark/prior/ s3://openproblems-data/resources/grn/prior --delete\n", "# !aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete\n", "# !aws s3 sync resources/scores/ s3://openproblems-data/resources/grn/scores --delete\n", - "# !aws s3 sync resources/grn_benchmark/evaluation_datasets// s3://openproblems-data/resources/grn/evaluation_datasets/ --delete\n", + "# !aws s3 sync resources/grn_benchmark/evaluation_data// s3://openproblems-data/resources/grn/evaluation_data/ --delete\n", "# !aws s3 sync resources/grn_benchmark/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" ] }, @@ -171,7 +200,7 @@ "# aws s3 sync s3://openproblems-data/resources/grn/results resources/results/ --delete\n", "# aws s3 sync s3://openproblems-data/resources/grn/grn_models resources/grn_models/\n", "# aws s3 sync s3://openproblems-data/resources/grn/inference_datasets/ resources/grn_benchmark/inference_datasets/\n", - "# aws s3 sync s3://openproblems-data/resources/grn/evaluation_datasets/ resources/grn_benchmark/evaluation_datasets//" + "# aws s3 sync s3://openproblems-data/resources/grn/evaluation_data/ resources/grn_benchmark/evaluation_data//" ] }, { @@ -193,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -228,26 +257,14 @@ " 'scores_dir': 'resources/scores'\n", "}\n", "\n", - "datasets = ['op', 'replogle2', 'nakatake', 'norman', 'adamson']" + "datasets = ['op', 'replogle', 'nakatake', 'norman', 'adamson']" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'datasets' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m dataset \u001b[38;5;129;01min\u001b[39;00m \u001b[43mdatasets\u001b[49m:\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m method \u001b[38;5;129;01min\u001b[39;00m 
par[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmethods\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 3\u001b[0m file_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresources/grn_models/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdataset\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n", - "\u001b[0;31mNameError\u001b[0m: name 'datasets' is not defined" - ] - } - ], + "outputs": [], "source": [ "for dataset in datasets:\n", " for method in par['methods']:\n", @@ -261,33 +278,46 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.\n", + " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n", + "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.\n", + " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" + ] + } + ], "source": [ "import anndata as ad\n", - "adata = ad.read_h5ad('K562_gwps_raw_singlecell.h5ad', backed='r')" + "adata = ad.read_h5ad('output/score_op_scprint.h5ad')" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'adata' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43madata\u001b[49m\u001b[38;5;241m.\u001b[39mobs\n", - "\u001b[0;31mNameError\u001b[0m: name 'adata' is not defined" - ] + "data": { + "text/plain": [ + "{'dataset_id': 'op',\n", + " 'method_id': 'reg2-scprint',\n", + " 'metric_ids': array(['reg2-theta-0.0', 'reg2-theta-0.5', 'reg2-theta-1.0'], dtype=object),\n", + " 'metric_values': array([0.23660427, 0.28158203, 0.31389769])}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "adata.obs" + "adata.uns" ] }, { @@ -1039,7 +1069,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### replogle2" + "### replogle" ] }, { @@ -1257,7 +1287,7 @@ } ], "source": [ - "df_scores = pd.read_csv(f\"resources/scores/replogle2/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/replogle/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "\n", "df_scores_f = df_scores[['static-theta-0.0', 'static-theta-0.5', 'static-theta-1.0']]\n", @@ -3801,7 +3831,7 @@ ], "metadata": { "kernelspec": { - "display_name": "py10", + "display_name": "base", "language": "python", "name": "python3" }, @@ -3815,7 +3845,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/scripts/run_benchmark_all.sh b/scripts/run_benchmark_all.sh index 025a37f73..01b96b479 100644 --- a/scripts/run_benchmark_all.sh +++ b/scripts/run_benchmark_all.sh @@ 
-21,7 +21,7 @@ param_list: - id: ${reg_type} metric_ids: $metric_ids method_ids: $method_ids - evaluation_data: ${resources_dir}/evaluation_datasets/${dataset}_perturbation.h5ad + evaluation_data: ${resources_dir}/evaluation_data/${dataset}.h5ad rna: ${resources_dir}/inference_datasets/${dataset}_rna.h5ad atac: ${resources_dir}/inference_datasets/${dataset}_atac.h5ad reg_type: $reg_type diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh index cd07e0ba0..4e4a606cb 100644 --- a/scripts/run_grn_evaluation.sh +++ b/scripts/run_grn_evaluation.sh @@ -57,8 +57,8 @@ append_entry() { cat >> $param_file << HERE - id: ${reg_type}_${1} metric_ids: ${metric_ids} - evaluation_data: ${resources_dir}/evaluation_datasets/${dataset}_perturbation.h5ad - evaluation_data_sc: ${resources_dir}/evaluation_datasets/${dataset}_sc_counts.h5ad + evaluation_data: ${resources_dir}/evaluation_data/${dataset}.h5ad + evaluation_data_sc: ${resources_dir}/evaluation_data/${dataset}_sc_counts.h5ad reg_type: $reg_type method_id: $1 num_workers: $num_workers diff --git a/scripts/single_grn_evaluation.sh b/scripts/single_grn_evaluation.sh index 65dc14902..28c37187d 100644 --- a/scripts/single_grn_evaluation.sh +++ b/scripts/single_grn_evaluation.sh @@ -11,5 +11,5 @@ viash run src/metrics/all_metrics/config.novsh.yaml -- \ --regulators_consensus resources/grn_benchmark/prior/regulators_consensus_${dataset_id}.json \ --ws_consensus resources/grn_benchmark/prior/ws_consensus_${dataset_id}.csv \ --ws_distance_background resources/grn_benchmark/prior/ws_distance_background_${dataset_id}.csv \ - --evaluation_data_sc resources/grn_benchmark/evaluation_datasets//${dataset_id}_sc_counts.h5ad \ - --evaluation_data resources/grn_benchmark/evaluation_datasets//${dataset_id}_perturbation.h5ad \ No newline at end of file + --evaluation_data_sc resources/grn_benchmark/evaluation_data//${dataset_id}_sc_counts.h5ad \ + --evaluation_data resources/grn_benchmark/evaluation_data//${dataset_id}.h5ad \ No newline at end of file diff --git a/src/api/file_evaluation_h5ad.yaml b/src/api/file_evaluation_h5ad.yaml index 871c6af1d..1e3386131 100644 --- a/src/api/file_evaluation_h5ad.yaml +++ b/src/api/file_evaluation_h5ad.yaml @@ -1,5 +1,5 @@ type: file -example: resources_test/grn_benchmark/evaluation_datasets/op_bulk.h5ad +example: resources_test/grn_benchmark/evaluation_data/op_bulk.h5ad label: perturbation data summary: "Perturbation dataset for benchmarking." 
diff --git a/src/control_methods/pearson_corr/script.py b/src/control_methods/pearson_corr/script.py index ff2026969..2eeb696da 100644 --- a/src/control_methods/pearson_corr/script.py +++ b/src/control_methods/pearson_corr/script.py @@ -10,7 +10,7 @@ ## VIASH START par = { - 'rna': 'resources/grn_benchmark/evaluation_datasets//op_rna.h5ad', + 'rna': 'resources/grn_benchmark/evaluation_data//op_rna.h5ad', 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'cell_type_specific': False, 'max_n_links': 50000, diff --git a/src/helper.py b/src/helper.py index 844bbff76..6dc454835 100644 --- a/src/helper.py +++ b/src/helper.py @@ -41,7 +41,7 @@ def analyse_meta_cells(task_grn_inference_dir): par = { 'rna': f'{task_grn_inference_dir}/resources/grn_benchmark/inference_datasets/{dataset}_rna.h5ad', - "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", + "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_data//{dataset}.h5ad", 'layer': 'X_norm', 'consensus': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', @@ -124,7 +124,7 @@ def analyse_imputation(task_grn_inference_dir): par = { 'rna': f'{task_grn_inference_dir}/resources/grn_benchmark/inference_datasets/{dataset}_rna.h5ad', - "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", + "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_data//{dataset}.h5ad", 'layer': 'X_norm', 'consensus': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', @@ -202,10 +202,10 @@ def analyse_imputation(task_grn_inference_dir): scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv") def analyse_corr_vs_tfmasked_corr(task_grn_inference_dir): - for i_run, dataset in enumerate(['op', 'replogle2', 'nakatake', 'norman', 'adamson']): + for i_run, dataset in enumerate(['op', 'replogle', 'nakatake', 'norman', 'adamson']): par = { 'rna': f'{task_grn_inference_dir}/resources/grn_benchmark/inference_datasets/{dataset}_rna.h5ad', - "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", + "evaluation_data": f"{task_grn_inference_dir}/resources/grn_benchmark/evaluation_data//{dataset}.h5ad", 'layer': 'X_norm', 'consensus': f'{task_grn_inference_dir}/resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', diff --git a/src/metrics/all_metrics/helper.py b/src/metrics/all_metrics/helper.py index 15ef4533b..88eba7316 100644 --- a/src/metrics/all_metrics/helper.py +++ b/src/metrics/all_metrics/helper.py @@ -37,7 +37,7 @@ def run_consensus(datasets): for dataset in datasets: par = { 'models': models, - 'evaluation_data': f'resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad', + 'evaluation_data': f'resources/grn_benchmark/evaluation_data//{dataset}.h5ad', 'evaluation_data_sc': f'resources/datasets_raw/{dataset}_sc_counts.h5ad', 'models_dir': f'resources/grn_models/{dataset}/', 'regulators_consensus': f'resources/grn_benchmark/prior/regulators_consensus_{dataset}.json', diff --git a/src/metrics/all_metrics/run.sh b/src/metrics/all_metrics/run.sh new file mode 100644 index 000000000..1acab4db6 --- /dev/null +++ b/src/metrics/all_metrics/run.sh @@ -0,0 +1,14 @@ + + +python src/metrics/all_metrics/script.py \ + --run_local \ + --prediction resources/grn_models/op/scprint.csv \ + # --dataset_id op \ + # 
--method_id scprint \ + # --evaluation_data resources/grn_benchmark/evaluation_data/op_bulk.h5ad \ + # --evaluation_data_sc resources/grn_benchmark/evaluation_data/op_sc.h5ad \ + # --regulators_consensus resources/grn_benchmark/prior/regulators_consensus_op.json \ + # --ws_consensus resources/grn_benchmark/prior/ws_consensus_op.json \ + # --ws_distance_background resources/grn_benchmark/prior/ws_distance_background_op.json \ + # --score output/score_op_scprint.h5ad \ + \ No newline at end of file diff --git a/src/metrics/all_metrics/script.py b/src/metrics/all_metrics/script.py index ede22eb6c..97e7315cf 100644 --- a/src/metrics/all_metrics/script.py +++ b/src/metrics/all_metrics/script.py @@ -9,14 +9,14 @@ argparser = argparse.ArgumentParser() argparser.add_argument('--run_local', action='store_true', help='Run locally') argparser.add_argument('--prediction', help='Path to the GRN prediction file') -argparser.add_argument('--evaluation_dataset') -argparser.add_argument('--evaluation_dataset_sc') -argparser.add_argument('--regulators_consensus') -argparser.add_argument('--ws_consensus') -argparser.add_argument('--ws_distance_background') -argparser.add_argument('--method_id', help='Method ID') -argparser.add_argument('--dataset_id', help='Dataset ID') -argparser.add_argument('--score', help='Where to store scores') +# argparser.add_argument('--evaluation_data') +# argparser.add_argument('--evaluation_data_sc') +# argparser.add_argument('--regulators_consensus') +# argparser.add_argument('--ws_consensus') +# argparser.add_argument('--ws_distance_background') +# argparser.add_argument('--method_id', help='Method ID') +# argparser.add_argument('--dataset_id', help='Dataset ID') +# argparser.add_argument('--score', help='Where to store scores') par_local = vars(argparser.parse_args()) @@ -78,7 +78,7 @@ def main(par): assert par['dataset_id'] dataset = par['dataset_id'] - # par['evaluation_data'] = f'resources/grn_benchmark/evaluation_datasets/{dataset}_perturbation.h5ad' + # par['evaluation_data'] = f'resources/grn_benchmark/evaluation_data/{dataset}.h5ad' # par['evaluation_data_sc'] = f'resources/datasets_raw/{dataset}_sc_counts.h5ad' # par['regulators_consensus'] = f'resources/grn_benchmark/prior/regulators_consensus_{dataset}.json' # par['ws_consensus'] = f'resources/grn_benchmark/prior/ws_consensus_{dataset}.csv' diff --git a/src/metrics/all_metrics/script_all.py b/src/metrics/all_metrics/script_all.py index fd54ef756..c6dfd1b60 100644 --- a/src/metrics/all_metrics/script_all.py +++ b/src/metrics/all_metrics/script_all.py @@ -62,7 +62,7 @@ def run_evaluation(dataset, models, models_dir, save_file_name): run_scores_flag = True run_consensus_flag = False run_ws_distance_background_flag = False - datasets = ['op', 'replogle2', 'nakatake', 'norman', 'adamson'] + datasets = ['op', 'replogle', 'nakatake', 'norman', 'adamson'] if run_consensus_flag: # run consensus run_consensus(datasets) @@ -88,7 +88,7 @@ def run_evaluation(dataset, models, models_dir, save_file_name): run_evaluation(dataset, models, models_dir, scores_dir, save_file_name) if True: # subsample - # for dataset in ['op', 'replogle2', 'nakatake', 'norman', 'adamson']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson' + # for dataset in ['op', 'replogle', 'nakatake', 'norman', 'adamson']: #'op', 'replogle', 'nakatake', 'norman', 'adamson' for dataset in ['op']: if dataset == 'op': models_subsampled = [f'{model}_{subsample}' for subsample in [1, 2] for model in models] @@ -146,7 +146,7 @@ def run_evaluation(dataset, models, 
models_dir, save_file_name): # def define_par(dataset): # par = { -# "evaluation_data": f"resources/grn_benchmark/evaluation_datasets//{dataset}_perturbation.h5ad", +# "evaluation_data": f"resources/grn_benchmark/evaluation_data//{dataset}.h5ad", # 'consensus': f'resources/grn_benchmark/prior/{dataset}_consensus-num-regulators.json', # 'layer': 'X_norm', diff --git a/src/metrics/regression_1/helper.py b/src/metrics/regression_1/helper.py index 13cee2572..c92beea2a 100644 --- a/src/metrics/regression_1/helper.py +++ b/src/metrics/regression_1/helper.py @@ -68,7 +68,22 @@ def cv_5(genes_n): groups = np.concatenate((groups, np.arange(genes_n % num_groups))) np.random.shuffle(groups) return groups - +def pivot_grn(net): + ''' make net to have gene*tf format''' + net = net.drop_duplicates(subset=['target', 'source']) + df_tmp = net.pivot(index='target', columns='source', values='weight') + return df_tmp.fillna(0) + + +def process_net(net, gene_names): + # Remove self-regulations + net = net.drop_duplicates() + net = net[net['source'] != net['target']] + # pivot + net = pivot_grn(net) + # subset + net = net[net.index.isin(gene_names)] + return net def cross_validation(net, prturb_adata, par:dict): np.random.seed(32) @@ -129,7 +144,7 @@ def cross_validation(net, prturb_adata, par:dict): n_baselines = 100 unique_groups = np.unique(groups) y_pred_baselines = [y_pred.copy() for i in range(n_baselines)] - for group in verbose_tqdm(unique_groups, "Processing groups", 2, par['verbose']): + for group in tqdm(unique_groups, "Cross validation"): mask_va = groups == group mask_tr = ~mask_va X_tr = X[mask_tr & mask_gene_net, :] diff --git a/src/metrics/regression_1/run.sh b/src/metrics/regression_1/run.sh index 689b2ee5f..76b9a710d 100644 --- a/src/metrics/regression_1/run.sh +++ b/src/metrics/regression_1/run.sh @@ -1,5 +1,14 @@ -viash run src/metrics/regression_1/config.vsh.yaml -- \ - --prediction resources/grn_models/norman/grnboost2.h5ad \ - --dataset_id norman --evaluation_data resources/grn_benchmark/evaluation_datasets//norman_perturbation.h5ad \ - --score output/score.h5ad \ - --tf_all resources/grn_benchmark/prior/tf_all.csv \ No newline at end of file +# viash run src/metrics/regression_1/config.vsh.yaml -- \ +# --prediction resources/grn_models/norman/grnboost2.h5ad \ +# --dataset_id norman --evaluation_data resources/grn_benchmark/evaluation_data//norman_bulk.h5ad \ +# --score output/score.h5ad \ +# --tf_all resources/grn_benchmark/prior/tf_all.csv + + +python src/metrics/regression_1/script.py \ + --run_local \ + --prediction resources/grn_models/op/grnboost2.h5ad \ + --dataset_id op \ + --evaluation_data resources/grn_benchmark/evaluation_data/op_bulk.h5ad \ + --method_id grnboost2 \ + --score output/score.h5ad \ No newline at end of file diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py index e8ded33b3..77a14c54c 100644 --- a/src/metrics/regression_1/script.py +++ b/src/metrics/regression_1/script.py @@ -19,15 +19,15 @@ ## VIASH START par = { - "evaluation_data": f"resources/grn_benchmark/evaluation_datasets//op.h5ad", + "evaluation_data": f"resources/grn_benchmark/evaluation_data//op.h5ad", "tf_all": "resources/grn_benchmark/prior/tf_all.csv", - "prediction": f"resources/grn_models/op/grnboost2.csv", + "prediction": f"resources/grn_models/op/grnboost2.h5ad", "method_id": "scenic", "max_n_links": 50000, "apply_tf": True, 'score': 'output/score.h5ad', 'reg_type': 'ridge', - 'layer': 'pearson', + 'layer': 'X_norm', 'subsample': -1, 'num_workers': 4, 
'skeleton': 'resources/grn_benchmark/prior/skeleton.csv', @@ -42,19 +42,18 @@ parser.add_argument('--run_local', action='store_true', help='Run locally') parser.add_argument('--evaluation_data', type=str, help='Path to the evaluation_data file') parser.add_argument('--prediction', type=str, help='Path to the prediction file') -parser.add_argument('--tf_all', type=str, help='Path to the tf_all') -parser.add_argument('--num_workers', type=str, help='Number of cores') -parser.add_argument('--layer', type=str, help='Layer to use') -parser.add_argument('--causal', action='store_true', help='Enable causal mode') -parser.add_argument('--normalize', action='store_true') +parser.add_argument('--method_id', type=str, help='Method id') +parser.add_argument('--dataset_id', type=str, help='Dataset id') +parser.add_argument('--score', type=str, help='score file') args = parser.parse_args() -var_local = vars(args) +var_local = vars(args) if args.run_local: for key in var_local: - par[key] = var_local[key] + if var_local[key] is not None: + par[key] = var_local[key] meta = { "resources_dir":'src/metrics/regression_1/', "util_dir":'src/utils' @@ -65,12 +64,14 @@ else: sys.path.append(meta["resources_dir"]) -from util import verbose_print, process_links, verbose_tqdm, process_net, binarize_weight -from helper import set_global_seed +from util import verbose_print, process_links, verbose_tqdm, binarize_weight +from helper import set_global_seed, process_net, cross_validation def read_net(par): + print(par['prediction'], flush=True) net = ad.read_h5ad(par['prediction']) net = pd.DataFrame(net.uns['prediction']) + net['weight'] = net['weight'].astype(float) if par['binarize']: net['weight'] = net['weight'].apply(binarize_weight) if par['apply_skeleton']: #apply skeleton @@ -83,8 +84,7 @@ def read_net(par): net = net[net['link'].isin(skeleton)] print('After filtering with skeleton:', net.shape) - if par['apply_tf']: - net = net[net.source.isin(tf_all)] + if 'cell_type' in net.columns: print('Taking mean of cell type specific grns') net.drop(columns=['cell_type'], inplace=True) @@ -102,11 +102,14 @@ def main(par): set_global_seed(random_state) # -- read input data + tf_all = np.loadtxt(par['tf_all'], dtype=str) net = read_net(par) + if par['apply_tf']: + net = net[net.source.isin(tf_all)] evaluation_data = ad.read_h5ad(par['evaluation_data']) evaluation_data.X = evaluation_data.layers[par["layer"]] gene_names = evaluation_data.var.index.to_numpy() - tf_all = np.loadtxt(par['tf_all'], dtype=str) + # -- calclate scores results = cross_validation(net, evaluation_data, par) @@ -114,11 +117,11 @@ def main(par): return results if __name__ == '__main__': - print(par) +# print(par) results = main(par) # - formatize results - results = dict(all=[results['r2score-aver-all']], grn=[results['r2score-aver-grn']]) + results = dict(r1_all=[results['r2score-aver-all']], r1_grn=[results['r2score-aver-grn']]) results = pd.DataFrame(results) print(results) diff --git a/src/metrics/regression_2/consensus/script.py b/src/metrics/regression_2/consensus/script.py index 8259ace16..22bcaa006 100644 --- a/src/metrics/regression_2/consensus/script.py +++ b/src/metrics/regression_2/consensus/script.py @@ -10,7 +10,7 @@ ## VIASH START par = { - 'evaluation_data': 'resources/grn_benchmark/evaluation_datasets//op_perturbation.h5ad', + 'evaluation_data': 'resources/grn_benchmark/evaluation_data//op.h5ad', 'models_dir': 'resources/grn_models/op/', 'models': ['pearson_corr', 'pearson_causal', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 
'scglue', 'celloracle'], 'regulators_consensus': 'resources/grn_benchmark/prior/regulators_consensus_op.json' diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py index 9e89c498e..4452cad1b 100644 --- a/src/metrics/regression_2/main.py +++ b/src/metrics/regression_2/main.py @@ -278,86 +278,70 @@ def main(par: Dict[str, Any]) -> pd.DataFrame: net = ad.read_h5ad(par['prediction']) net = pd.DataFrame(net.uns['prediction']) - if 'donor_id' not in prturb_adata.obs.columns: - prturb_adata.obs['donor_id']= 'onebatch' - donor_ids = prturb_adata.obs.donor_id.unique() - df_results_store = [] - for donor_id in donor_ids: - prturb_adata_sub = prturb_adata[prturb_adata.obs.donor_id==donor_id,:] - if 'donor_id' in net.columns: - if donor_id not in net.donor_id.unique(): - raise ValueError(f'{donor_id} is not present in grn.') - net_sub = net[net.donor_id==donor_id] - else: - net_sub = net - net_matrix = net_to_matrix(net, gene_names, par) - - # groups = LabelEncoder().fit_transform(evaluation_data.obs.plate_name) - # groups = LabelEncoder().fit_transforcross_validatem(prturb_adata_sub.obs.cell_type) - n_cells = prturb_adata_sub.shape[0] - random_groups = np.random.choice(range(1, 5+1), size=n_cells, replace=True) # random sampling - groups = LabelEncoder().fit_transform(random_groups) - - # Load and standardize perturbation data - layer = par['layer'] - if layer=='X': - X = prturb_adata_sub.X - else: - X = prturb_adata_sub.layers[layer] - - try: - X = X.todense().A - except: - pass + net['weight'] = net['weight'].astype(float) + + + net_matrix = net_to_matrix(net, gene_names, par) - X = RobustScaler().fit_transform(X) + n_cells = prturb_adata.shape[0] + random_groups = np.random.choice(range(1, 5+1), size=n_cells, replace=True) # random sampling + groups = LabelEncoder().fit_transform(random_groups) - # Load consensus numbers of putative regulators - with open(par['regulators_consensus'], 'r') as f: - data = json.load(f) + # Load and standardize perturbation data + layer = par['layer'] + if layer=='X': + X = prturb_adata.X + else: + X = prturb_adata.layers[layer] + + try: + X = X.todense().A + except: + pass - gene_names_ = np.asarray(list(data.keys()), dtype=object) + X = RobustScaler().fit_transform(X) - n_features_theta_min = np.asarray([data[gene_name]['0'] for gene_name in gene_names], dtype=int) - n_features_theta_median = np.asarray([data[gene_name]['0.5'] for gene_name in gene_names], dtype=int) - n_features_theta_max = np.asarray([data[gene_name]['1'] for gene_name in gene_names], dtype=int) + # Load consensus numbers of putative regulators + with open(par['regulators_consensus'], 'r') as f: + data = json.load(f) - # Load list of putative TFs - tf_names = np.loadtxt(par['tf_all'], dtype=str) - if par['apply_tf']==False: - tf_names = gene_names + gene_names_ = np.asarray(list(data.keys()), dtype=object) - # Evaluate GRN - verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3) - verbose_print(par['verbose'], f'Static approach (theta=0):', 3) - if (n_features_theta_min!=0).any()==False: - score_static_min = np.nan - else: - score_static_min = static_approach(net_matrix, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers']) - verbose_print(par['verbose'], f'Static approach (theta=0.5):', 3) - score_static_median = static_approach(net_matrix, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers']) - print(f'Static approach (theta=1):', flush=True) - score_static_max 
= static_approach(net_matrix, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers']) - - results = { - 'reg2-theta-0.0': [float(score_static_min)], - 'reg2-theta-0.5': [float(score_static_median)], - 'reg2-theta-1.0': [float(score_static_max)], - } - - # # Add dynamic score - # if not par['static_only']: - # score_dynamic = dynamic_approach(grn, X, groups, gene_names, tf_names, par['reg_type']) - # score_overall = score_dynamic + score_static_min + score_static_median + score_static_max - # results['dynamic'] = [float(score_dynamic)] - # results['Overall'] = [float(score_overall)] - - # Convert results to DataFrame - df_results = pd.DataFrame(results) - df_results.index=[donor_id] - - df_results_store.append(df_results) - - df_results_concat = pd.concat(df_results_store, axis=0) - df_results_mean = df_results_concat.mean(axis=0).to_frame().T - return df_results_mean + n_features_theta_min = np.asarray([data[gene_name]['0'] for gene_name in gene_names], dtype=int) + n_features_theta_median = np.asarray([data[gene_name]['0.5'] for gene_name in gene_names], dtype=int) + n_features_theta_max = np.asarray([data[gene_name]['1'] for gene_name in gene_names], dtype=int) + + # Load list of putative TFs + tf_names = np.loadtxt(par['tf_all'], dtype=str) + if par['apply_tf']==False: + tf_names = gene_names + + # Evaluate GRN + verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3) + verbose_print(par['verbose'], f'Static approach (theta=0):', 3) + if (n_features_theta_min!=0).any()==False: + score_static_min = np.nan + else: + score_static_min = static_approach(net_matrix, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers']) + verbose_print(par['verbose'], f'Static approach (theta=0.5):', 3) + score_static_median = static_approach(net_matrix, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers']) + print(f'Static approach (theta=1):', flush=True) + score_static_max = static_approach(net_matrix, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers']) + + results = { + 'reg2-theta-0.0': [float(score_static_min)], + 'reg2-theta-0.5': [float(score_static_median)], + 'reg2-theta-1.0': [float(score_static_max)], + } + + # # Add dynamic score + # if not par['static_only']: + # score_dynamic = dynamic_approach(grn, X, groups, gene_names, tf_names, par['reg_type']) + # score_overall = score_dynamic + score_static_min + score_static_median + score_static_max + # results['dynamic'] = [float(score_dynamic)] + # results['Overall'] = [float(score_overall)] + + # Convert results to DataFrame + df_results = pd.DataFrame(results) + + return df_results diff --git a/src/metrics/regression_2/run.sh b/src/metrics/regression_2/run.sh index 5a6e5b34b..17a70c35b 100644 --- a/src/metrics/regression_2/run.sh +++ b/src/metrics/regression_2/run.sh @@ -1,6 +1,15 @@ -viash run src/metrics/regression_2/config.vsh.yaml -- \ - --prediction resources/grn_models/norman/grnboost2.h5ad \ - --dataset_id norman --evaluation_data resources/grn_benchmark/evaluation_datasets//norman_perturbation.h5ad \ +# viash run src/metrics/regression_2/config.vsh.yaml -- \ +# --prediction resources/grn_models/norman/grnboost2.h5ad \ +# --dataset_id norman --evaluation_data resources/grn_benchmark/evaluation_data//norman_bulk.h5ad \ +# --score output/score.h5ad \ +# --tf_all resources/grn_benchmark/prior/tf_all.csv \ +# --regulators_consensus 
resources/grn_benchmark/prior/regulators_consensus_norman.json + +python src/metrics/regression_2/script.py \ + --run_local \ + --prediction resources/grn_models/op/grnboost2.h5ad \ + --dataset_id op \ + --method_id op \ + --evaluation_data resources/grn_benchmark/evaluation_data/op_bulk.h5ad \ --score output/score.h5ad \ - --tf_all resources/grn_benchmark/prior/tf_all.csv \ - --regulators_consensus resources/grn_benchmark/prior/regulators_consensus_norman.json \ No newline at end of file + --regulators_consensus resources/grn_benchmark/prior/regulators_consensus_op.json \ No newline at end of file diff --git a/src/metrics/regression_2/script.py b/src/metrics/regression_2/script.py index bad17dff1..b22355bb1 100644 --- a/src/metrics/regression_2/script.py +++ b/src/metrics/regression_2/script.py @@ -2,17 +2,12 @@ import anndata as ad import sys import numpy as np - +import argparse ## VIASH START par = { - 'evaluation_data': 'resources/grn_benchmark/evaluation_datasets//op_perturbation.h5ad', 'layer': 'X_norm', - "prediction": "output/models/collectri.csv", - 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', "max_n_links": 50000, - 'consensus': 'output/models/op_consensus-num-regulators.json', - 'score': 'output/score_regression2.h5ad', 'reg_type': 'ridge', 'static_only': True, 'subsample': -1, @@ -22,21 +17,40 @@ 'method_id': 'grnboost', 'apply_skeleton': False, 'skeleton': 'resources/grn_benchmark/prior/skeleton.csv', + 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', 'verbose': 2 } ## VIASH END -print(par) -try: - sys.path.append(meta['resources_dir']) -except: +## LOCAL START +parser = argparse.ArgumentParser() +parser.add_argument('--run_local', action='store_true', help='Run locally') +parser.add_argument('--evaluation_data', type=str) +parser.add_argument('--regulators_consensus', type=str) +parser.add_argument('--prediction', type=str, help='Path to the prediction file') +parser.add_argument('--method_id', type=str, help='Method id') +parser.add_argument('--dataset_id', type=str, help='Dataset id') +parser.add_argument('--score', type=str, help='score file') + +args = parser.parse_args() +var_local = vars(args) + +## LOCAL END + +if args.run_local: + for key in var_local: + if var_local[key] is not None: + par[key] = var_local[key] meta = { - "resources_dir": 'src/metrics/', - "util": 'src/utils' + "resources_dir":'src/metrics/regression_2/', + "util_dir":'src/utils' } sys.path.append(meta["resources_dir"]) - sys.path.append(meta["util"]) + sys.path.append(meta["util_dir"]) + +else: + sys.path.append(meta["resources_dir"]) from main import main diff --git a/src/metrics/wasserstein/background_distance/run.sh b/src/metrics/wasserstein/background_distance/run.sh index 577b0b6fc..f40390917 100644 --- a/src/metrics/wasserstein/background_distance/run.sh +++ b/src/metrics/wasserstein/background_distance/run.sh @@ -3,8 +3,8 @@ #SBATCH --output=logs/%j.out #SBATCH --error=logs/%j.err #SBATCH --ntasks=1 -#SBATCH --cpus-per-task=10 -#SBATCH --time=10:00:00 +#SBATCH --cpus-per-task=100 +#SBATCH --time=20:00:00 #SBATCH --mem=100GB #SBATCH --partition=cpu #SBATCH --mail-type=END,FAIL diff --git a/src/metrics/wasserstein/background_distance/script.py b/src/metrics/wasserstein/background_distance/script.py index 6c8023ee0..0f730c527 100644 --- a/src/metrics/wasserstein/background_distance/script.py +++ b/src/metrics/wasserstein/background_distance/script.py @@ -5,14 +5,17 @@ import pandas as pd import numpy as np +from multiprocessing import Pool + # - create background dist. 
par = { - 'evaluation_data_sc': f'resources/grn_benchmark/evaluation_datasets//replogle_sc.h5ad', + 'evaluation_data_sc': f'resources/grn_benchmark/evaluation_data//replogle_sc.h5ad', 'background_distance': 'resources/grn_benchmark/prior/ws_distance_background_replogle.csv', 'tf_all': 'resources/grn_benchmark/prior/tf_all.csv', - 'layer': 'X_norm' + 'layer': 'X_norm', + 'max_workers': 100 } @@ -50,23 +53,50 @@ def calculate_ws_distance(net, adata) -> pd.DataFrame: return net -def main(par): +# def main(par): +# adata = ad.read_h5ad(par['evaluation_data_sc']) +# adata.X = adata.layers[par['layer']] + +# tf_all = np.loadtxt(par['tf_all'], dtype='str') +# available_tfs = np.intersect1d(adata.obs['perturbation'].unique(), tf_all) + +# # - for each tf, calculate the distances for all possible genes in the network +# net_all_store = [] +# for tf in tqdm(available_tfs): +# net_all = pd.DataFrame([{'source':tf, 'target': gene} for gene in adata.var_names]) +# net_all = calculate_ws_distance(net_all, adata) +# net_all_store.append(net_all) +# net_all_ws_distance = pd.concat(net_all_store) +# net_all_ws_distance.to_csv(par['background_distance']) + +def load_adata(par): adata = ad.read_h5ad(par['evaluation_data_sc']) adata.X = adata.layers[par['layer']] + return adata - tf_all = np.loadtxt(par['tf_all'], dtype='str') +def process_tf(tf_adata): + """Calculate ws distance for a given TF""" + adata = load_adata(par) + tf = tf_adata # Unpack tuple + net_all = pd.DataFrame([{'source': tf, 'target': gene} for gene in adata.var_names]) + net_all = calculate_ws_distance(net_all, adata) + return net_all + +def main(par): + adata = ad.read_h5ad(par['evaluation_data_sc'], backed='r') + + tf_all = np.loadtxt(par['tf_all'], dtype=str) available_tfs = np.intersect1d(adata.obs['perturbation'].unique(), tf_all) - # - for each tf, select a random background net and calculate the distances + # Set up multiprocessing pool + num_workers = min(par['max_workers'], len(available_tfs)) # Use available cores + with Pool(num_workers) as pool: + # tqdm now tracks progress inside imap_unordered() + results = list(tqdm(pool.imap(process_tf, [(tf) for tf in available_tfs]), + total=len(available_tfs), desc="Processing TFs")) - random_grn_store = [] - for tf in tqdm(available_tfs): - # random_genes = np.random.choice(adata.var_names, par['n_selection'], replace=False) - random_grn = pd.DataFrame([{'source':tf, 'target': gene} for gene in adata.var_names]) - - random_grn = calculate_ws_distance(random_grn, adata) - random_grn_store.append(random_grn) - random_grn_ws_distance = pd.concat(random_grn_store) - random_grn_ws_distance.to_csv(par['background_distance']) + # Combine results + net_all_ws_distance = pd.concat(results) + net_all_ws_distance.to_csv(par['background_distance']) if __name__ == '__main__': main(par) \ No newline at end of file diff --git a/src/metrics/wasserstein/main.py b/src/metrics/wasserstein/main.py index 778004771..36a23b123 100644 --- a/src/metrics/wasserstein/main.py +++ b/src/metrics/wasserstein/main.py @@ -6,6 +6,7 @@ def main(par): prediction = ad.read_h5ad(par['prediction']) prediction = pd.DataFrame(prediction.uns['prediction']) + prediction['weight'] = prediction['weight'].astype(float) consensus = pd.read_csv(par['ws_consensus'], index_col=0) background_distance = pd.read_csv(par['ws_distance_background'], index_col=0) evaluation_data = ad.read_h5ad(par['evaluation_data_sc']) diff --git a/src/metrics/wasserstein/run.sh b/src/metrics/wasserstein/run.sh index cab5c0931..5b47401ae 100644 --- 
a/src/metrics/wasserstein/run.sh +++ b/src/metrics/wasserstein/run.sh @@ -1,7 +1,17 @@ -viash run src/metrics/wasserstein/config.vsh.yaml -- \ +# viash run src/metrics/wasserstein/config.vsh.yaml -- \ +# --prediction resources/grn_models/norman/grnboost2.h5ad \ +# --dataset_id norman \ +# --ws_consensus resources/grn_benchmark/prior/ws_consensus_norman.csv \ +# --ws_distance_background resources/grn_benchmark/prior/ws_distance_background_norman.csv \ +# --evaluation_data_sc resources/datasets_raw/norman_sc_counts.h5ad \ +# --score output/score.h5ad + +python src/metrics/wasserstein/script.py \ + --run_local \ --prediction resources/grn_models/norman/grnboost2.h5ad \ --dataset_id norman \ + --method_id grnboost2 \ --ws_consensus resources/grn_benchmark/prior/ws_consensus_norman.csv \ --ws_distance_background resources/grn_benchmark/prior/ws_distance_background_norman.csv \ - --evaluation_data_sc resources/datasets_raw/norman_sc_counts.h5ad \ - --score output/score.h5ad \ No newline at end of file + --evaluation_data_sc resources/grn_benchmark/evaluation_data/norman_sc.h5ad \ + --score output/score.h5ad diff --git a/src/metrics/wasserstein/script.py b/src/metrics/wasserstein/script.py index 45e1303f0..9d00062e7 100644 --- a/src/metrics/wasserstein/script.py +++ b/src/metrics/wasserstein/script.py @@ -2,32 +2,47 @@ import anndata as ad import sys import numpy as np +import argparse ## VIASH START par = { - 'prediction': 'resources/grn_models/adamson/pearson_corr.csv', - 'evaluation_data_sc': 'resources/datasets_raw/adamson_sc_counts.h5ad', - 'score': 'output/score.h5ad', - 'ws_consensus': 'resources/grn_benchmark/prior/consensus_ws_distance_adamson.csv', - 'ws_distance_background':'resources/grn_benchmark/prior/ws_distance_background_adamson.csv', - 'layer': 'X_norm', - 'dataset_id': 'dataset_id', - 'method_id': 'pearson_corr' + 'layer': 'X_norm' } ## VIASH END -try: - sys.path.append(meta["resources_dir"]) - from main import main -except: +## LOCAL START +parser = argparse.ArgumentParser() +parser.add_argument('--run_local', action='store_true', help='Run locally') +parser.add_argument('--evaluation_data_sc', type=str) +parser.add_argument('--ws_consensus', type=str) +parser.add_argument('--ws_distance_background', type=str) +parser.add_argument('--prediction', type=str, help='Path to the prediction file') +parser.add_argument('--method_id', type=str, help='Method id') +parser.add_argument('--dataset_id', type=str, help='Dataset id') +parser.add_argument('--score', type=str, help='score file') + +args = parser.parse_args() +var_local = vars(args) + +## LOCAL END + +if args.run_local: + for key in var_local: + if var_local[key] is not None: + par[key] = var_local[key] meta = { - "resources_dir": 'src/metrics/wasserstein', - "util": 'src/utils' + "resources_dir":'src/metrics/wasserstein/', + "util_dir":'src/utils' } sys.path.append(meta["resources_dir"]) - sys.path.append(meta["util"]) - from main import main + sys.path.append(meta["util_dir"]) + +else: + sys.path.append(meta["resources_dir"]) + + +from main import main if __name__ == '__main__': _, mean_scores = main(par) diff --git a/src/process_data/opsca_perturbation/script.py b/src/process_data/opsca_perturbation/script.py index 49f13051e..124313467 100644 --- a/src/process_data/opsca_perturbation/script.py +++ b/src/process_data/opsca_perturbation/script.py @@ -14,7 +14,7 @@ par = { 'perturbation_counts': 'resources/datasets_raw/op_perturbation_sc_counts.h5ad', 'perturbation_bulk': 'resources/extended_data/op_perturbation_bulk.h5ad', - 
'evaluation_data': 'resources/grn_benchmark/evaluation_datasets/op_bulk.h5ad', + 'evaluation_data': 'resources/grn_benchmark/evaluation_data/op_bulk.h5ad', } ## VIASH END meta = { diff --git a/src/process_data/pereggrn/script.py b/src/process_data/pereggrn/script.py index 0505100a0..bb2c109ce 100644 --- a/src/process_data/pereggrn/script.py +++ b/src/process_data/pereggrn/script.py @@ -54,7 +54,7 @@ def process_dataset(file_name): adata.obs = adata.obs[['perturbation', 'is_control', 'perturbation_type']] # - data split - if file_name == 'replogle2': + if file_name == 'replogle': ctr_samples = adata.obs.is_control samples = adata.obs.index[~ctr_samples] _, test_samples = train_test_split(samples, test_size=.2, random_state=32) @@ -102,11 +102,11 @@ def process_dataset(file_name): adata.layers['X_norm'] = adata.X.copy() if file_name in ['norman', 'adamson']: - adata.write(f'resources/grn_benchmark/evaluation_datasets/{file_name}_sc.h5ad') + adata.write(f'resources/grn_benchmark/evaluation_data/{file_name}_sc.h5ad') adata_bulked.write(f'resources/extended_data/{file_name}_bulk.h5ad') adata_train.write(f'resources/grn_benchmark/inference_datasets/{file_name}_rna.h5ad') - adata_test.write(f'resources/grn_benchmark/evaluation_datasets/{file_name}_bulk.h5ad') + adata_test.write(f'resources/grn_benchmark/evaluation_data/{file_name}_bulk.h5ad') def main(par): diff --git a/src/process_data/replogle_k562_gwps/run.sh b/src/process_data/replogle_k562_gwps/run.sh index bfc360f83..4364003a3 100644 --- a/src/process_data/replogle_k562_gwps/run.sh +++ b/src/process_data/replogle_k562_gwps/run.sh @@ -16,7 +16,7 @@ python src/process_data/replogle_k562_gwps/script.py \ --input resources/datasets_raw/replogle_K562_gwps_raw_singlecell.h5ad \ --tf_all resources/grn_benchmark/prior/tf_all.csv \ --adata_bulk resources/extended_data/replogle_bulk.h5ad \ - --adata_test_sc resources/grn_benchmark/evaluation_datasets/replogle_sc.h5ad \ - --adata_test_bulk resources/grn_benchmark/evaluation_datasets/replogle_bulk.h5ad \ + --adata_test_sc resources/grn_benchmark/evaluation_data/replogle_sc.h5ad \ + --adata_test_bulk resources/grn_benchmark/evaluation_data/replogle_bulk.h5ad \ --adata_train_bulk resources/grn_benchmark/inference_datasets/replogle_rna.h5ad \ --adata_train_sc resources/extended_data/replogle_train_sc.h5ad \ \ No newline at end of file diff --git a/src/process_data/repo/batch_correction_scgen/config.novsh.yaml b/src/process_data/repo/batch_correction_scgen/config.novsh.yaml index a8da67f4a..05a8aa2c1 100644 --- a/src/process_data/repo/batch_correction_scgen/config.novsh.yaml +++ b/src/process_data/repo/batch_correction_scgen/config.novsh.yaml @@ -30,7 +30,7 @@ functionality: required: true required: true direction: input - example: resources_test/grn_benchmark/evaluation_datasets//op_perturbation.h5ad + example: resources_test/grn_benchmark/evaluation_data//op.h5ad - name: --perturbation_data_bc type: file info: diff --git a/src/process_data/repo/batch_correction_seurat/config.novsh.yaml b/src/process_data/repo/batch_correction_seurat/config.novsh.yaml index b8379619b..fac6f1485 100644 --- a/src/process_data/repo/batch_correction_seurat/config.novsh.yaml +++ b/src/process_data/repo/batch_correction_seurat/config.novsh.yaml @@ -30,7 +30,7 @@ functionality: required: true required: true direction: input - example: resources_test/grn_benchmark/evaluation_datasets//op_perturbation.h5ad + example: resources_test/grn_benchmark/evaluation_data//op.h5ad - name: --perturbation_data_bc __merge__: 
../../../api/file_evaluation_h5ad.yaml required: false diff --git a/src/process_data/test_data/run.sh b/src/process_data/test_data/run.sh index a97b2c5f8..0e0af2bb9 100644 --- a/src/process_data/test_data/run.sh +++ b/src/process_data/test_data/run.sh @@ -1,7 +1,7 @@ viash run src/process_data/test_data/config.novsh.yaml -- \ --rna resources/grn_benchmark/inference_datasets/op_rna.h5ad --rna_test resources_test/grn_benchmark/inference_datasets//op_rna.h5ad \ --atac resources/grn_benchmark/inference_datasets/op_atac.h5ad --atac_test resources_test/grn_benchmark/inference_datasets//op_atac.h5ad \ - --perturbation_data resources/grn_benchmark/evaluation_datasets/op_perturbation.h5ad --perturbation_data_test resources_test/grn_benchmark/evaluation_datasets/op_perturbation.h5ad \ + --perturbation_data resources/grn_benchmark/evaluation_data/op.h5ad --perturbation_data_test resources_test/grn_benchmark/evaluation_data/op.h5ad \ --multiomics_counts resources/grn_benchmark/datasets_raw/op_multiome_sc_counts.h5ad --multiomics_counts_test resources_test/grn_benchmark/datasets_raw/op_multiome_sc_counts.h5ad \ # --perturbation_counts resources/datasets_raw/op_perturbation_sc_counts.h5ad --perturbation_counts_test resources_test/datasets_raw/op_perturbation_sc_counts.h5ad \ # --perturbation_counts 'resources/datasets_raw/adamson_sc_counts.h5ad' --perturbation_counts_test resources_test/datasets_raw/adamson_sc_counts.h5ad diff --git a/src/process_data/test_data/script.py b/src/process_data/test_data/script.py index b893d00fa..6588565bd 100644 --- a/src/process_data/test_data/script.py +++ b/src/process_data/test_data/script.py @@ -18,8 +18,8 @@ 'atac': 'resources/grn_benchmark/inference_datasets/op_atac.h5ad', 'atac_test': 'resources_test/grn_benchmark/inference_datasets//op_atac.h5ad', - 'perturbation_data': 'resources/grn_benchmark/evaluation_datasets//op.h5ad', - 'perturbation_data_test': 'resources_test/grn_benchmark/evaluation_datasets//op.h5ad', + 'perturbation_data': 'resources/grn_benchmark/evaluation_data//op.h5ad', + 'perturbation_data_test': 'resources_test/grn_benchmark/evaluation_data//op.h5ad', 'multiomics_counts': 'resources/datasets_raw/op_multiome_sc_counts.h5ad', 'multiomics_counts_test': 'resources_test/datasets_raw/op_multiome_sc_counts.h5ad', diff --git a/src/robustness_analysis/script_all.py b/src/robustness_analysis/script_all.py index 61bb53a6e..8b0f37415 100644 --- a/src/robustness_analysis/script_all.py +++ b/src/robustness_analysis/script_all.py @@ -17,7 +17,7 @@ 'methods': ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], # 'methods': ['pearson_corr'], - "evaluation_data": "resources/grn_benchmark/evaluation_datasets//op_perturbation.h5ad", + "evaluation_data": "resources/grn_benchmark/evaluation_data//op.h5ad", "tf_all": "resources/grn_benchmark/prior/tf_all.csv", "max_n_links": 50000, "apply_tf": False, diff --git a/src/workflows_local/benchmark/methods/script.py b/src/workflows_local/benchmark/methods/script.py index f6dbf6fc0..9c32eb2ff 100644 --- a/src/workflows_local/benchmark/methods/script.py +++ b/src/workflows_local/benchmark/methods/script.py @@ -174,7 +174,7 @@ def main(par): main(par) # if False: # subsample - # # for dataset in ['replogle2', 'norman', 'adamson', 'nakatake']: # 'replogle2' 'op' norman + # # for dataset in ['replogle', 'norman', 'adamson', 'nakatake']: # 'replogle' 'op' norman # for dataset in ['op']: # if dataset == 'op': # for subsample in 
[1, 2]: #number of donors
diff --git a/src/workflows_local/benchmark/metrics/script.py b/src/workflows_local/benchmark/metrics/script.py
index 7ec92150f..c031de01e 100644
--- a/src/workflows_local/benchmark/metrics/script.py
+++ b/src/workflows_local/benchmark/metrics/script.py
@@ -26,7 +26,9 @@ sys.path.append(meta["util"])
 
 dependencies={
-    'all_metrics': 'src/metrics/all_metrics/script.py'
+    'regression_1': 'src/metrics/regression_1/script.py',
+    'regression_2': 'src/metrics/regression_2/script.py',
+    'ws_distance': 'src/metrics/wasserstein/script.py',
 }
 
 # - run consensus
@@ -55,11 +57,11 @@ def define_par(dataset):
 
     par_local = {
-        "evaluation_dataset": f"resources/grn_benchmark/evaluation_datasetsets/{dataset}_bulk.h5ad",
-        "evaluation_dataset_sc": f"resources/grn_benchmark/evaluation_datasetsets/{dataset}_sc.h5ad",
+        "evaluation_data": f"resources/grn_benchmark/evaluation_data/{dataset}_bulk.h5ad",
+        "evaluation_data_sc": f"resources/grn_benchmark/evaluation_data/{dataset}_sc.h5ad",
         'regulators_consensus': f'resources/grn_benchmark/prior/regulators_consensus_{dataset}.json',
-        'ws_consensus': f'resources/grn_benchmark/prior/ws_consensus_{dataset}.json',
-        'ws_distance_background': f'resources/grn_benchmark/prior/ws_distance_background_{dataset}.json',
+        'ws_consensus': f'resources/grn_benchmark/prior/ws_consensus_{dataset}.csv',
+        'ws_distance_background': f'resources/grn_benchmark/prior/ws_distance_background_{dataset}.csv',
 
         'layer': 'X_norm',
         "tf_all": "resources/grn_benchmark/prior/tf_all.csv",
@@ -77,22 +79,66 @@ def run_consensus(dataset):
     main_consensus(par_local)
 
 def run_metrics(par):
-    par['score'] = f"output/score_{par['dataset_id']}_{par['method_id']}.h5ad"
-    args = f"--run_local --prediction {par['prediction']} \
+    scores_store = []
+    par['score'] = f"output/score_{par['dataset_id']}_{par['method_id']}.h5ad"
+    # -- reg1
+    print('Running regression 1 ...')
+    args = f"--run_local --prediction {par['prediction']} \
+            --dataset_id {par['dataset_id']} \
+            --method_id {par['method_id']} \
+            --evaluation_data {par['evaluation_data']} \
+            --score {par['score']} "
+
+    command = f"python {dependencies['regression_1']} {args}"
+
+    result = subprocess.run(command, shell=True, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print("STDOUT:", result.stdout)
+        print("STDERR:", result.stderr)
+        raise RuntimeError(f"Error: regression 1 command failed with exit code {result.returncode}")
+    print('regression 1 completed')
+
+    score_reg1 = ad.read_h5ad(par['score'])
+    score_reg1 = pd.DataFrame([score_reg1.uns["metric_values"]], columns=score_reg1.uns["metric_ids"])
+
+    scores_store.append(score_reg1)
+    # -- reg2
+    print('Running regression 2 ...')
+    args = f"--run_local \
+            --prediction {par['prediction']} \
+            --dataset_id {par['dataset_id']} \
+            --method_id {par['method_id']} \
+            --evaluation_data {par['evaluation_data']} \
+            --regulators_consensus {par['regulators_consensus']} \
+            --score {par['score']} "
+
+    command = f"python {dependencies['regression_2']} {args}"
+    result = subprocess.run(command, shell=True, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print("STDOUT:", result.stdout)
+        print("STDERR:", result.stderr)
+        raise RuntimeError(f"Error: regression 2 command failed with exit code {result.returncode}")
+    print('regression 2 completed')
+
+    score_reg2 = ad.read_h5ad(par['score'])
+    score_reg2 = pd.DataFrame([score_reg2.uns["metric_values"]], columns=score_reg2.uns["metric_ids"])
+
+    scores_store.append(score_reg2)
+    # -- ws distance
+    if 
par['dataset_id'] in ['replogle', 'norman', 'adamson']:
+        print('Running WS distance ...')
+        args = f"--run_local \
+            --prediction {par['prediction']} \
             --dataset_id {par['dataset_id']} \
             --method_id {par['method_id']} \
-            --evaluation_dataset {par['evaluation_dataset']} \
-            --evaluation_dataset_sc {par['evaluation_dataset_sc']} \
-            --regulators_consensus {par['regulators_consensus']} \
+            --evaluation_data_sc {par['evaluation_data_sc']} \
             --ws_consensus {par['ws_consensus']} \
             --ws_distance_background {par['ws_distance_background']} \
             --score {par['score']} "
-
-    command = f"python {dependencies['all_metrics']} {args}"
-
-    print(command)
-
+        command = f"python {dependencies['ws_distance']} {args}"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
 
     if result.returncode != 0:
@@ -101,9 +147,15 @@ def run_metrics(par):
         raise RuntimeError(f"Error: command proprocess failed with exit code {result.returncode}")
     print('preprocess completed')
 
-    score = ad.read_h5ad(par['score']).uns['score']
-     
-    return score
+    score_ws = ad.read_h5ad(par['score'])
+    score_ws = pd.DataFrame([score_ws.uns["metric_values"]], columns=score_ws.uns["metric_ids"])
+
+    scores_store.append(score_ws)
+    # - merge
+    score = pd.concat(scores_store, axis=1)
+    print(score)
+
+    return score
 
 
 def run_evaluation(dataset, binarize=False, max_n_links=50000, apply_skeleton=False, reg_type='ridge'):
@@ -121,7 +173,7 @@
     # - add models
     for model in par['methods']:
         print(model)
-        grn_file = f"{models_dir}/{model}.csv"
+        grn_file = f"{models_dir}/{model}.h5ad"
         if not os.path.exists(grn_file):
             print(f"{grn_file} doesnt exist. Skipped.")
             continue
@@ -168,7 +220,7 @@ def run_evaluation(dataset, binarize=False, max_n_links=50000, apply_skeleton=Fa
 
     # if False: # subsample
-    #     # for dataset in ['op', 'replogle2', 'nakatake', 'norman', 'adamson']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson'
+    #     # for dataset in ['op', 'replogle', 'nakatake', 'norman', 'adamson']: #'op', 'replogle', 'nakatake', 'norman', 'adamson'
    #     save_scores_file = f"{scores_dir}/subsampled.csv"
    #     for i, dataset in enumerate(['op']):
    #         if dataset == 'op':
diff --git a/src/workflows_local/benchmark/run.sh b/src/workflows_local/benchmark/run.sh
index d6c8ed87c..d856395ba 100644
--- a/src/workflows_local/benchmark/run.sh
+++ b/src/workflows_local/benchmark/run.sh
@@ -20,21 +20,20 @@ METHODS=(
     "scprint"
 )
 
-SAVE_SCORES_FILE="output/scores.h5ad"
+SAVE_SCORES_FILE="output/scores.csv"
 FORCE=true
 RUN_CONSENSUS_FLAG=False
-
 
 # ----- run methods -----
-# cmd="python src/workflows_local/benchmark/methods/script.py
-#     --datasets ${DATASETS[@]}
-#     --methods ${METHODS[@]}"
+cmd="python src/workflows_local/benchmark/methods/script.py
+    --datasets ${DATASETS[@]}
+    --methods ${METHODS[@]}"
 
-# [ "$FORCE" = true ] && cmd="${cmd} --force"
+[ "$FORCE" = true ] && cmd="${cmd} --force"
 
-# echo "Running: $cmd"
-# $cmd
+echo "Running: $cmd"
+$cmd
 
 # ----- run metrics -----
 cmd="python src/workflows_local/benchmark/metrics/script.py
diff --git a/test.ipynb b/test.ipynb
index ba802058c..63c2aa963 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -18,7 +18,7 @@
    ],
    "source": [
     "import anndata as ad\n",
-    "adata = ad.read_h5ad('resources/grn_benchmark/evaluation_datasets/replogle_sc.h5ad')\n",
+    "adata = ad.read_h5ad('resources/grn_benchmark/evaluation_data/replogle_sc.h5ad')\n",
     "adata.obs['is_control'] = adata.obs['perturbation']=='non-targeting'\n",
     " \n",
     "mask_sample = adata.obs['is_control']\n",
@@ 
-50,7 +50,7 @@ } ], "source": [ - "adata = ad.read_h5ad('resources/grn_benchmark/inference_datasets/replogle2_rna.h5ad')\n", + "adata = ad.read_h5ad('resources/grn_benchmark/inference_datasets/replogle_rna.h5ad')\n", "adata[adata.obs['is_control']]" ] }