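Improve TF binding metric: restrict ground truth to evaluation genes; add target-weighted precision and an analytic random baseline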

janursa · janursa · commit c365ec31a058 · 2025-10-08T09:55:49.000+02:00
diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
@@ -31,7 +31,11 @@ arguments:
     type: integer
     default: 2
     direction: input
-
+  - name: --tf_all
+    type: file
+    direction: input
+    required: true
+    example: resources_test/grn_benchmark/prior/tf_all.csv
   - name: --num_workers
     type: integer
     direction: input
diff --git a/src/api/comp_metric_regression.yaml b/src/api/comp_metric_regression.yaml
@@ -8,12 +8,6 @@ info:
     summary: A regression metric to evaluate the performance of the inferred GRN
 
 arguments: 
-
-  - name: --tf_all
-    type: file
-    direction: input
-    required: true
-    example: resources_test/grn_benchmark/prior/tf_all.csv
   - name: --reg_type
     type: string
     direction: input
diff --git a/src/metrics/tf_binding/acquire/get_chipseq.py b/src/metrics/tf_binding/acquire/get_chipseq.py
@@ -121,13 +121,13 @@ def build_celltype_grn(cell_type, genome='hg38', local_path='./data/chip_atlas/'
     
     return grn
 if __name__ == '__main__':
-    # Example usage
-    cell_type = 'PBMC'
     genome = 'hg38'
     local_path = 'resources/chip_atlas/'
-    output_csv_path = f'resources/grn_benchmark/ground_truth/{cell_type}.csv'
-    os.makedirs(local_path, exist_ok=True)
-    window_bp = 1000
-    qval = "50"
-    grn = build_celltype_grn(cell_type, genome, local_path, window_bp, qval)
-    grn.to_csv(output_csv_path, index=False)
+    for cell_type in ['K-562']: #'PBMC'
+        
+        output_csv_path = f'resources/grn_benchmark/ground_truth/{cell_type.replace("-", "")}.csv'
+        os.makedirs(local_path, exist_ok=True)
+        window_bp = 1000
+        qval = "50"
+        grn = build_celltype_grn(cell_type, genome, local_path, window_bp, qval)
+        grn.to_csv(output_csv_path, index=False)
diff --git a/src/metrics/tf_binding/acquire/run_local.sh b/src/metrics/tf_binding/acquire/run_local.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --job-name=tf_binding_data
+#SBATCH --output=logs/%j.out
+#SBATCH --error=logs/%j.err
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=2
+#SBATCH --time=10:00:00
+#SBATCH --mem=64GB
+#SBATCH --partition=cpu
+#SBATCH --mail-type=END,FAIL      
+#SBATCH --mail-user=jalil.nourisa@gmail.com   
+
+python src/metrics/tf_binding/acquire/get_chipseq.py 
diff --git a/src/metrics/tf_binding/config.vsh.yaml b/src/metrics/tf_binding/config.vsh.yaml
@@ -1,4 +1,4 @@
-__merge__: ../../api/comp_metric_ws.yaml
+__merge__: ../../api/comp_metric.yaml
 
 name: tf_binding
 namespace: "metrics"
diff --git a/src/metrics/tf_binding/helper.py b/src/metrics/tf_binding/helper.py
@@ -13,9 +13,13 @@
 
 def main(par):
     prediction = read_prediction(par)
+    test_data = ad.read_h5ad(par['evaluation_data'], backed='r')
+    evaluation_genes = test_data.var_names.tolist()
+    n_targets_total = len(evaluation_genes)
+
     tf_all = np.loadtxt(par['tf_all'], dtype=str, delimiter=',', skiprows=1)
     true_graph = pd.read_csv(par['ground_truth'])
-    true_graph = true_graph[true_graph['source'].isin(tf_all)]
+    true_graph = true_graph[(true_graph['source'].isin(tf_all)) & (true_graph['target'].isin(evaluation_genes))]
     assert prediction.shape[0] > 0, 'No links found in the network'
     assert true_graph.shape[0] > 0, 'No links found in the ground truth'
     
@@ -26,22 +30,40 @@ def main(par):
             pred_edges = prediction[prediction['source'] == tf]
             true_labels = true_edges['target'].isin(pred_edges['target']).astype(int)
             pred_scores = pred_edges.set_index('target').reindex(true_edges['target'])['weight'].fillna(0)
-            ap = average_precision_score(true_labels, pred_scores)
+            if true_labels.sum() == 0:  # no positives
+                ap = 0.0
+            else:
+                ap = average_precision_score(true_labels, pred_scores)
         else:
             ap = float('nan')
-        
-        scores_model.append({'source': tf, 'ap': ap})
+        n_targets = len(true_edges)
+
+        # ----- Analytic random baseline -----
+        # Extend true edges to all evaluation genes
+        true_labels_random = np.zeros(n_targets_total)
+        idx = [evaluation_genes.index(t) for t in true_edges['target']]
+        true_labels_random[idx] = 1
+        ap_random = true_labels_random.sum() / len(true_labels_random)
+
+        scores_model.append({'source': tf, 'ap': ap, 'n_targets': n_targets, 'ap_random': ap_random})
     
     scores_df = pd.DataFrame(scores_model)
-    print(scores_df)
+    print('Number of TFs in GRN:', len(scores_df[scores_df['ap'].notna()]))
 
-    # Precision: mean over available TFs (ignoring NaNs)
-    precision = scores_df['ap'].mean(skipna=True)
+    # Compute weighted mean (ignoring NaNs)
+    valid = scores_df.dropna(subset=['ap'])
+    weighted_precision = np.average(valid['ap'], weights=valid['n_targets'])
 
-    # Recall: mean over all TFs, punishing NaNs as 0
+    # Compute unweighted means (for reference)
+    precision = scores_df['ap'].mean(skipna=True)
+    precision_random = scores_df['ap_random'].mean(skipna=True)
     recall = scores_df['ap'].fillna(0).mean()
 
-    # One-row summary DataFrame
-    summary_df = pd.DataFrame([{'precision': precision, 'recall': recall}])
+    summary_df = pd.DataFrame([{
+        'precision': precision,
+        'precision_random': precision_random,
+        'recall': recall,
+        'weighted_precision': weighted_precision
+    }])
 
     return summary_df
diff --git a/src/metrics/tf_binding/run_local.sh b/src/metrics/tf_binding/run_local.sh
@@ -16,7 +16,7 @@ save_dir="output/tf_binding"
 mkdir -p "$save_dir"
 
 # datasets to process
-datasets=('op' "300BCG" 'parsebioscience' ) #"300BCG" "ibd" 'parsebioscience'
+datasets=('replogle' 'norman' 'adamson' ) #"300BCG" "ibd" 'parsebioscience''op' "300BCG" 'parsebioscience' 
 # methods to process
 methods=("negative_control" "pearson_corr" "positive_control" "ppcor" "portia" "scenic" "grnboost" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie")
 
@@ -42,7 +42,7 @@ for dataset in "${datasets[@]}"; do
         python src/metrics/tf_binding/script.py \
             --prediction "$prediction" \
             --evaluation_data "$evaluation_data" \
-            --ground_truth "resources/grn_benchmark/ground_truth/PBMC.csv" \
+            --ground_truth "resources/grn_benchmark/ground_truth/K562.csv" \
             --score "$score"
 
     done
diff --git a/src/metrics/tf_recovery/helper.py b/src/metrics/tf_recovery/helper.py
@@ -1,7 +1,6 @@
 import pandas as pd
 import numpy as np
 import decoupler as dc
-import mudata as mu
 import sys
 import os
 import re
diff --git a/test.ipynb b/test.ipynb

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__merge__: ../../api/comp_metric_ws.yaml`
	`1`	`+__merge__: ../../api/comp_metric.yaml`
`2`	`2`
`3`	`3`	`name: tf_binding`
`4`	`4`	`namespace: "metrics"`