Skip to content

Commit c365ec3

Browse files
committed
tf binding metric improved
1 parent 6a6b641 commit c365ec3

File tree

9 files changed

+78
-264
lines changed

9 files changed

+78
-264
lines changed

src/api/comp_metric.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@ arguments:
3131
type: integer
3232
default: 2
3333
direction: input
34-
34+
- name: --tf_all
35+
type: file
36+
direction: input
37+
required: true
38+
example: resources_test/grn_benchmark/prior/tf_all.csv
3539
- name: --num_workers
3640
type: integer
3741
direction: input

src/api/comp_metric_regression.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,6 @@ info:
88
summary: A regression metric to evaluate the performance of the inferred GRN
99

1010
arguments:
11-
12-
- name: --tf_all
13-
type: file
14-
direction: input
15-
required: true
16-
example: resources_test/grn_benchmark/prior/tf_all.csv
1711
- name: --reg_type
1812
type: string
1913
direction: input

src/metrics/tf_binding/get_chipseq.py renamed to src/metrics/tf_binding/acquire/get_chipseq.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,13 +121,13 @@ def build_celltype_grn(cell_type, genome='hg38', local_path='./data/chip_atlas/'
121121

122122
return grn
123123
if __name__ == '__main__':
124-
# Example usage
125-
cell_type = 'PBMC'
126124
genome = 'hg38'
127125
local_path = 'resources/chip_atlas/'
128-
output_csv_path = f'resources/grn_benchmark/ground_truth/{cell_type}.csv'
129-
os.makedirs(local_path, exist_ok=True)
130-
window_bp = 1000
131-
qval = "50"
132-
grn = build_celltype_grn(cell_type, genome, local_path, window_bp, qval)
133-
grn.to_csv(output_csv_path, index=False)
126+
for cell_type in ['K-562']: #'PBMC'
127+
128+
output_csv_path = f'resources/grn_benchmark/ground_truth/{cell_type.replace("-", "")}.csv'
129+
os.makedirs(local_path, exist_ok=True)
130+
window_bp = 1000
131+
qval = "50"
132+
grn = build_celltype_grn(cell_type, genome, local_path, window_bp, qval)
133+
grn.to_csv(output_csv_path, index=False)
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=tf_binding_data
3+
#SBATCH --output=logs/%j.out
4+
#SBATCH --error=logs/%j.err
5+
#SBATCH --ntasks=1
6+
#SBATCH --cpus-per-task=2
7+
#SBATCH --time=10:00:00
8+
#SBATCH --mem=64GB
9+
#SBATCH --partition=cpu
10+
#SBATCH --mail-type=END,FAIL
11+
12+
13+
python src/metrics/tf_binding/acquire/get_chipseq.py

src/metrics/tf_binding/config.vsh.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__merge__: ../../api/comp_metric_ws.yaml
1+
__merge__: ../../api/comp_metric.yaml
22

33
name: tf_binding
44
namespace: "metrics"

src/metrics/tf_binding/helper.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,13 @@
1313

1414
def main(par):
1515
prediction = read_prediction(par)
16+
test_data = ad.read_h5ad(par['evaluation_data'], backed='r')
17+
evaluation_genes = test_data.var_names.tolist()
18+
n_targets_total = len(evaluation_genes)
19+
1620
tf_all = np.loadtxt(par['tf_all'], dtype=str, delimiter=',', skiprows=1)
1721
true_graph = pd.read_csv(par['ground_truth'])
18-
true_graph = true_graph[true_graph['source'].isin(tf_all)]
22+
true_graph = true_graph[(true_graph['source'].isin(tf_all)) & (true_graph['target'].isin(evaluation_genes))]
1923
assert prediction.shape[0] > 0, 'No links found in the network'
2024
assert true_graph.shape[0] > 0, 'No links found in the ground truth'
2125

@@ -26,22 +30,40 @@ def main(par):
2630
pred_edges = prediction[prediction['source'] == tf]
2731
true_labels = true_edges['target'].isin(pred_edges['target']).astype(int)
2832
pred_scores = pred_edges.set_index('target').reindex(true_edges['target'])['weight'].fillna(0)
29-
ap = average_precision_score(true_labels, pred_scores)
33+
if true_labels.sum() == 0: # no positives
34+
ap = 0.0
35+
else:
36+
ap = average_precision_score(true_labels, pred_scores)
3037
else:
3138
ap = float('nan')
32-
33-
scores_model.append({'source': tf, 'ap': ap})
39+
n_targets = len(true_edges)
40+
41+
# ----- Analytic random baseline -----
42+
# Extend true edges to all evaluation genes
43+
true_labels_random = np.zeros(n_targets_total)
44+
idx = [evaluation_genes.index(t) for t in true_edges['target']]
45+
true_labels_random[idx] = 1
46+
ap_random = true_labels_random.sum() / len(true_labels_random)
47+
48+
scores_model.append({'source': tf, 'ap': ap, 'n_targets': n_targets, 'ap_random': ap_random})
3449

3550
scores_df = pd.DataFrame(scores_model)
36-
print(scores_df)
51+
print('Number of TFs in GRN:', len(scores_df[scores_df['ap'].notna()]))
3752

38-
# Precision: mean over available TFs (ignoring NaNs)
39-
precision = scores_df['ap'].mean(skipna=True)
53+
# Compute weighted mean (ignoring NaNs)
54+
valid = scores_df.dropna(subset=['ap'])
55+
weighted_precision = np.average(valid['ap'], weights=valid['n_targets'])
4056

41-
# Recall: mean over all TFs, punishing NaNs as 0
57+
# Compute unweighted means (for reference)
58+
precision = scores_df['ap'].mean(skipna=True)
59+
precision_random = scores_df['ap_random'].mean(skipna=True)
4260
recall = scores_df['ap'].fillna(0).mean()
4361

44-
# One-row summary DataFrame
45-
summary_df = pd.DataFrame([{'precision': precision, 'recall': recall}])
62+
summary_df = pd.DataFrame([{
63+
'precision': precision,
64+
'precision_random': precision_random,
65+
'recall': recall,
66+
'weighted_precision': weighted_precision
67+
}])
4668

4769
return summary_df

src/metrics/tf_binding/run_local.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ save_dir="output/tf_binding"
1616
mkdir -p "$save_dir"
1717

1818
# datasets to process
19-
datasets=('op' "300BCG" 'parsebioscience' ) #"300BCG" "ibd" 'parsebioscience'
19+
datasets=('replogle' 'norman' 'adamson' ) #"300BCG" "ibd" 'parsebioscience''op' "300BCG" 'parsebioscience'
2020
# methods to process
2121
methods=("negative_control" "pearson_corr" "positive_control" "ppcor" "portia" "scenic" "grnboost" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie")
2222

@@ -42,7 +42,7 @@ for dataset in "${datasets[@]}"; do
4242
python src/metrics/tf_binding/script.py \
4343
--prediction "$prediction" \
4444
--evaluation_data "$evaluation_data" \
45-
--ground_truth "resources/grn_benchmark/ground_truth/PBMC.csv" \
45+
--ground_truth "resources/grn_benchmark/ground_truth/K562.csv" \
4646
--score "$score"
4747

4848
done

src/metrics/tf_recovery/helper.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import pandas as pd
22
import numpy as np
33
import decoupler as dc
4-
import mudata as mu
54
import sys
65
import os
76
import re

0 commit comments

Comments
 (0)