robustness analysis causal added

openproblems-bio · Sep 5, 2024 · 656771a · 656771a
1 parent 63b40d5
commit 656771a
Show file tree

Hide file tree

Showing 11 changed files with 372 additions and 83 deletions.
diff --git a/scripts/run_benchmark_single_omics.sh b/scripts/run_benchmark_single_omics.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-RUN_ID="single_omics_all_test"
-resources_dir="s3://openproblems-data/resources_test/grn"
-publish_dir="s3://openproblems-data/resources_test/grn/results/${RUN_ID}"
+RUN_ID="single_omics_all"
+resources_dir="s3://openproblems-data/resources/grn"
+publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"
 
 # resources_dir="./resources_test/"
 # publish_dir="output/${RUN_ID}"
@@ -31,21 +31,21 @@ output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-nextflow run . \
-  -main-script  target/nextflow/workflows/run_benchmark_single_omics/main.nf \
-  -profile docker \
-  -with-trace \
-  -c src/common/nextflow_helpers/labels_ci.config \
-  -params-file ${param_file}
-
-./tw-windows-x86_64.exe launch `
-    https://github.com/openproblems-bio/task_grn_benchmark.git `
-    --revision build/main `
-    --pull-latest `
-    --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf `
-    --workspace 53907369739130 `
-    --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
-    --params-file ./params/single_omics_all_test.yaml `
-    --config src/common/nextflow_helpers/labels_tw.config
+# nextflow run . \
+#   -main-script  target/nextflow/workflows/run_benchmark_single_omics/main.nf \
+#   -profile docker \
+#   -with-trace \
+#   -c src/common/nextflow_helpers/labels_ci.config \
+#   -params-file ${param_file}
+
+# ./tw-windows-x86_64.exe launch `
+#     https://github.com/openproblems-bio/task_grn_benchmark.git `
+#     --revision build/main `
+#     --pull-latest `
+#     --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf `
+#     --workspace 53907369739130 `
+#     --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
+#     --params-file ./params/single_omics_all_test.yaml `
+#     --config src/common/nextflow_helpers/labels_tw.config
 
 
diff --git a/scripts/run_robust_analys_causal.sh b/scripts/run_robust_analys_causal.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+viash ns build --parallel
+RUN_ID="robust_analy_causal" 
+resources_dir="resources"
+# resources_dir="s3://openproblems-data/resources/grn"
+
+publish_dir="${resources_dir}/results/${RUN_ID}"
+
+reg_type=ridge
+subsample=-2
+max_workers=10
+
+param_file="./params/${RUN_ID}.yaml"
+# Start writing to the YAML file
+cat > $param_file << HERE
+param_list:
+HERE
+
+append_entry() {
+  cat >> $param_file << HERE
+  - id: corr-${1}
+    multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
+    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
+    reg_type: $reg_type
+    method_id: corr-${1}
+    layer: ${2}
+    subsample: $subsample
+    max_workers: $max_workers
+    consensus: ${resources_dir}/prior/consensus-num-regulators.json
+    tf_all: ${resources_dir}/prior/tf_all.csv
+HERE
+}
+# Loop through grn_names and layers
+layers=("pearson")  # Array containing the layer(s)
+
+for layer in "${layers[@]}"; do  # Iterate over each layer in the array
+    for iter in {1..100}; do  # Loop from 1 to 100 iterations
+        append_entry "$iter" "$layer"  # Execute the append_entry command
+    done
+done
+
+
+# Append the remaining output_state and publish_dir to the YAML file
+cat >> $param_file << HERE
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+nextflow run . \
+  -main-script  target/nextflow/workflows/run_robustness_analysis_causal/main.nf \
+  -profile docker \
+  -with-trace \
+  -c src/common/nextflow_helpers/labels_ci.config \
+  -params-file ${param_file}
+
diff --git a/src/methods/multi_omics/scenicplus/config.vsh.yaml b/src/methods/multi_omics/scenicplus/config.vsh.yaml
@@ -27,6 +27,12 @@ functionality:
       required: false
       direction: output
       description: "Cell-topics prob scores"
+    - name: --grn_extended
+      type: file 
+      default: output/grn_extended.csv
+      required: false
+      direction: output
+      description: "Source-target-peak triplets"
   resources:
     - type: python_script
       path: script.py

diff --git a/src/methods/multi_omics/scenicplus/script.py b/src/methods/multi_omics/scenicplus/script.py
@@ -366,7 +366,7 @@
         models = run_cgs_models(
             cistopic_obj,
             n_topics=n_topics,
-            n_cpu=12,
+            n_cpu=par['num_workers'],
             n_iter=500,
             random_state=555,
             alpha=50,
@@ -545,7 +545,7 @@
     contrasts=None,
     adjpval_thr=0.05,
     log2fc_thr=np.log2(1.5),
-    n_cpu=5,
+    n_cpu=par['num_workers'],
     split_pattern='-'
 )
 
@@ -724,11 +724,11 @@ def download(url: str, filepath: str) -> None:
 settings['input_data']['path_to_motif_annotations'] = motif_annotation
 settings['params_general']['temp_dir'] = os.path.join(out_dir, 'scplus_pipeline', 'temp')
 settings['params_general']['n_cpu'] = par['num_workers']
-settings['params_inference']['quantile_thresholds_region_to_gene'] = '0.85 0.90 0.95'
-settings['params_inference']['top_n_regionTogenes_per_gene'] = '5 10 15'
+settings['params_inference']['quantile_thresholds_region_to_gene'] = '0.7 0.8 0.9'
+settings['params_inference']['top_n_regionTogenes_per_gene'] = '10 15 25'
 settings['params_inference']['region_to_gene_importance_method'] = 'GBM'
 settings['params_inference']['tf_to_gene_importance_method'] = 'GBM'
-settings['params_inference']['rho_threshold'] = 0.03
+settings['params_inference']['rho_threshold'] = 0.0
 settings['params_inference']['region_to_gene_correlation_method'] = 'SR'
 settings['params_inference']['min_target_genes'] = 10
 settings['params_motif_enrichment']['species'] = 'homo_sapiens'
@@ -764,8 +764,17 @@ def download(url: str, filepath: str) -> None:
 scplus_mdata = mudata.read(par['scplus_mdata'])
 
 # Save results
-prediction = scplus_mdata.uns['direct_e_regulon_metadata']
-prediction.insert(0, 'source', prediction['TF'])
-prediction.insert(1, 'target', prediction['Gene'])
-prediction.insert(2, 'weight', prediction['importance_x_abs_rho'])
+grn_extended = scplus_mdata.uns['direct_e_regulon_metadata']
+
+grn_extended = grn_extended[['TF', 'Gene', 'rho_TF2G', 'Region']].drop_duplicates().reset_index(drop=True)
+grn_extended = grn_extended[grn_extended.rho_TF2G!=0]
+
+grn_extended.columns = ['source', 'target', 'weight', 'peak']
+
+grn_extended = grn_extended[['source','target','weight']].drop_duplicates(ignore_index=True)
+
+prediction = grn_extended.groupby(['source', 'target'], as_index=False)['weight'].sum()
+
+grn_extended.to_csv(par['grn_extended'])
 prediction.to_csv(par['prediction'])
+
diff --git a/src/robustness_analysis/add_noise_grn.py b/src/robustness_analysis/add_noise_grn.py
diff --git a/src/robustness_analysis/causal/config.vsh.yaml b/src/robustness_analysis/causal/config.vsh.yaml
@@ -0,0 +1,34 @@
+functionality:
+  name: causal_grn
+  namespace: "robustness_analysis"
+  info:
+    label: causal_grn
+    summary: Adds noise to the GRNs
+  arguments:
+    - name: --multiomics_rna
+      type: file
+      direction: input
+      example: resources_test/grn-benchmark/multiomics_rna.h5ad
+
+    - name: --tf_all
+      type: file
+      direction: input
+      example: resources_test/prior/tf_all.csv
+
+    - name: --prediction
+      type: file
+      direction: output 
+      example: resources_test/grn_models/collectri.csv
+
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        packages: []
+  - type: nextflow
+    directives:
+      label: [ midtime, highmem, highcpu ]
diff --git a/src/robustness_analysis/causal/script.py b/src/robustness_analysis/causal/script.py
@@ -0,0 +1,49 @@
+import os
+import pandas as pd
+import numpy as np
+import anndata as ad
+import scanpy as sc
+from tqdm import tqdm
+from sklearn.preprocessing import StandardScaler
+
+
+## VIASH START
+par = {
+
+}
+
+## VIASH END
+
+def create_corr_net(X: np.ndarray, groups: np.ndarray):
+    grns = []
+    for group in tqdm(np.unique(groups), desc="Processing groups"):
+        X_sub = X[groups == group, :]
+        X_sub = StandardScaler().fit_transform(X_sub)
+        grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
+        grns.append(grn)
+    return np.mean(grns, axis=0)
+
+
+print('Read data')
+multiomics_rna = ad.read_h5ad(par["multiomics_rna"])
+gene_names = multiomics_rna.var_names.to_numpy()
+tf_all = np.loadtxt(par['tf_all'], dtype=str)
+groups = multiomics_rna.obs.cell_type
+tf_all = np.intersect1d(tf_all, gene_names)
+
+print('Noramlize data')
+sc.pp.normalize_total(multiomics_rna)
+sc.pp.log1p(multiomics_rna)
+sc.pp.scale(multiomics_rna)
+
+print('Create corr net')
+net = create_corr_net(multiomics_rna.X, groups)
+net = pd.DataFrame(net, index=gene_names, columns=gene_names)
+
+net_corr = net.sample(len(tf_all), axis=1)
+net_corr = net_corr.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
+net_corr.rename(columns={'index': 'target'}, inplace=True)
+
+print('Output noised GRN')
+net_corr.to_csv(par['prediction'])
+
diff --git a/src/robustness_analysis/config.vsh.yaml → ...ness_analysis/permute_grn/config.vsh.yaml b/src/robustness_analysis/config.vsh.yaml → ...ness_analysis/permute_grn/config.vsh.yaml
diff --git a/src/robustness_analysis/script.py → ...robustness_analysis/permute_grn/script.py b/src/robustness_analysis/script.py → ...robustness_analysis/permute_grn/script.py
diff --git a/src/workflows/run_robustness_analysis_causal/config.vsh.yaml b/src/workflows/run_robustness_analysis_causal/config.vsh.yaml
@@ -0,0 +1,80 @@
+functionality:
+  name: run_robustness_analysis_causal
+  namespace: "workflows"
+  info:
+    label: run_robustness_analysis_causal
+    summary: "Evaluates GRNs and provides scores using regression analysis."
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: --multiomics_rna
+          type: file
+          direction: input
+        - name: --perturbation_data
+          type: file
+          direction: input
+        - name: --layer
+          type: string
+          direction: input
+        - name: --subsample
+          type: integer
+          direction: input
+          default: 200
+        - name: --reg_type
+          type: string
+          direction: input
+          default: ridge
+        - name: --method_id 
+          type: string 
+          direction: input 
+          required: True
+          example: collectri
+        - name: --max_workers
+          type: integer
+          direction: input
+          required: True
+        - name: --consensus
+          type: file
+          required: false
+          direction: input
+          default: resources/prior/consensus.json
+        - name: --tf_all
+          type: file
+          required: false
+          direction: input
+
+
+    - name: Outputs
+      arguments:
+        - name: "--scores"
+          type: file
+          required: true
+          direction: output
+          default: "scores.yaml"
+        - name: "--metric_configs"
+          type: file
+          required: true
+          direction: output
+          default: metric_configs.yaml
+
+  resources:
+    - type: nextflow_script
+      path: main.nf
+      entrypoint: run_wf
+    - type: file
+      path: ../../api/task_info.yaml
+  dependencies:
+    - name: common/extract_metadata
+      repository: openproblemsv2
+    - name: metrics/regression_1
+    - name: metrics/regression_2
+    - name: robustness_analysis/causal_grn
+  repositories:
+    - name: openproblemsv2
+      type: github
+      repo: openproblems-bio/openproblems-v2
+      tag: main_build
+platforms:
+  - type: nextflow
+    directives:
+      label: [ midtime, midmem, lowcpu ]