multiple workflows updated

openproblems-bio · Aug 31, 2024 · 1bd94de · 1bd94de
1 parent dd969e0
commit 1bd94de
Show file tree

Hide file tree

Showing 19 changed files with 307 additions and 194 deletions.
diff --git a/_viash.yaml b/_viash.yaml
@@ -11,5 +11,5 @@ config_mods: |
   .platforms[.type == 'docker'].target_image_source := 'https://github.com/openproblems-bio/task_grn_inference'
   .platforms[.type == "nextflow"].directives.tag := "$id"
   .platforms[.type == "nextflow"].auto.simplifyOutput := false
-  .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h",  veryveryhightime : "time = 48.h", threedaystime : "time = 72.h", oneweektime : "time = 168.h" }
+  .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", onedaytime : "time = 24.h", threedaystime : "time = 72.h", oneweektime : "time = 168.h" }
   .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'"
diff --git a/runs.ipynb b/runs.ipynb
diff --git a/scripts/repo/run_grn_evaluation.sh b/scripts/repo/run_grn_evaluation.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Default values
+grn=""
+sample="200"  # Default value for sample
+reg_type="ridge"
+score="output/score.csv"
+
+# Parse arguments
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --grn) grn="$2"; shift ;;
+        --sample) sample="$2"; shift ;;
+        --reg_type) reg_type="$2"; shift ;;
+        --score) score="$2"; shift ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+# Ensure required arguments are provided
+if [ -z "$grn" ]; then
+    echo "Usage: $0 --grn <grn_file> [--sample <sample_value>]"
+    exit 1
+fi
+
+# Print parsed arguments (for debugging purposes)
+echo "GRN file: $grn"
+echo "Sample value: $sample"
+echo "Regression model: $reg_type"
+
+# Clean bin/ folder
+rm -r bin
+mkdir bin
+
+# Run regression analysis 1
+echo "Running GRN benchmark with $grn and sample size $sample"
+echo "Regression 1"
+mkdir -p bin/regression_1
+viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin/regression_1
+bin/regression_1/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
+
+# Run regression analysis 2
+echo "Regression 2"
+if [ ! -f resources/grn-benchmark/consensus-num-regulators.json ]; then
+    viash build src/metrics/regression_2/consensus/config.vsh.yaml --platform docker -o bin/regression_2/consensus
+    bin/regression_2/consensus/consensus_for_regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --output resources/grn-benchmark/consensus-num-regulators.json --grn_folder resources/grn-benchmark/grn_models/ --grns ananse.csv,celloracle.csv,figr.csv,granie.csv,scenicplus.csv,scglue.csv
+fi
+mkdir -p bin/regression_2
+viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2
+bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --consensus resources/grn-benchmark/consensus-num-regulators.json --layer scgen_pearson --reg_type $reg_type --prediction $grn --score $score
diff --git a/scripts/run_grn_evaluation_tw.sh → ...pts/repo/run_grn_evaluation_all_layers.sh b/scripts/run_grn_evaluation_tw.sh → ...pts/repo/run_grn_evaluation_all_layers.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
 
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-RUN_ID="pearson_gb_subsample_RF"
+reg_type=${1} #GB, ridge
+
+RUN_ID="grn_evaluation_${reg_type}"
 resources_dir="s3://openproblems-data/resources/grn"
 publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"
 grn_models_folder="${resources_dir}/grn_models"
-reg_type=RF
+
 subsample=-2
 max_workers=10
 
@@ -21,8 +23,7 @@ grn_names=(
     "scglue"
 )
 
-# layers=("pearson" "lognorm" "scgen_pearson" "scgen_lognorm" "seurat_pearson" "seurat_lognorm")
-layers=( "pearson" )
+layers=("pearson" "lognorm" "scgen_pearson" "scgen_lognorm" "seurat_pearson" "seurat_lognorm")
 
 # Start writing to the YAML file
 cat > $param_file << HERE

diff --git a/scripts/run_benchmark_single_omics.sh b/scripts/run_benchmark_single_omics.sh
@@ -2,11 +2,11 @@
 
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
 RUN_ID="single_omics"
-# resources_dir="s3://openproblems-data/resources_test/grn"
-# publish_dir="s3://openproblems-data/resources_test/grn/results/${RUN_ID}"
+resources_dir="s3://openproblems-data/resources_test/grn"
+publish_dir="s3://openproblems-data/resources_test/grn/results/${RUN_ID}"
 
-resources_dir="./resources_test/"
-publish_dir="output/${RUN_ID}"
+# resources_dir="./resources_test/"
+# publish_dir="output/${RUN_ID}"
 
 reg_type=ridge
 subsample=-2

diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh
@@ -1,51 +1,95 @@
 #!/bin/bash
 
-# Default values
-grn=""
-sample="200"  # Default value for sample
-reg_type="ridge"
-score="output/score.csv"
-
-# Parse arguments
-while [[ "$#" -gt 0 ]]; do
-    case $1 in
-        --grn) grn="$2"; shift ;;
-        --sample) sample="$2"; shift ;;
-        --reg_type) reg_type="$2"; shift ;;
-        --score) score="$2"; shift ;;
-        *) echo "Unknown parameter passed: $1"; exit 1 ;;
-    esac
-    shift
+# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
+reg_type=${1} #GB, ridge
+
+RUN_ID="grn_evaluation_${reg_type}"
+# resources_dir="s3://openproblems-data/resources/grn"
+resources_dir="./resources"
+publish_dir="${resources_dir}/results/${RUN_ID}"
+grn_models_folder="${resources_dir}/grn_models"
+
+subsample=-2
+max_workers=10
+
+param_file="./params/${RUN_ID}.yaml"
+
+grn_names=(
+    "collectri"
+    "celloracle"
+    "scenicplus"
+    "figr"
+    "granie"
+    "scglue"
+)
+# Start writing to the YAML file
+cat > $param_file << HERE
+param_list:
+HERE
+
+append_entry() {
+  cat >> $param_file << HERE
+  - id: ${reg_type}_${1}_${3}
+    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
+    reg_type: $reg_type
+    method_id: $1
+    subsample: $subsample
+    max_workers: $max_workers
+    tf_all: ${resources_dir}/prior/tf_all.csv
+    layer: ${3}
+    consensus: ${resources_dir}/prior/consensus-num-regulators.json
+HERE
+
+  # Conditionally append the prediction line if the second argument is "true"
+  if [[ $2 == "true" ]]; then
+    cat >> $param_file << HERE
+    prediction: ${grn_models_folder}/$1.csv
+HERE
+  fi
+}
+layers=(pearson scgen_pearson)
+# Loop through grn_names and layers
+for layer in "${layers[@]}"; do
+  for grn_name in "${grn_names[@]}"; do
+    append_entry "$grn_name" "true" "$layer"
+  done
+done
+
+# Append negative control
+grn_name="negative_control"
+for layer in "${layers[@]}"; do
+  append_entry "$grn_name" "false" "$layer"
 done
 
-# Ensure required arguments are provided
-if [ -z "$grn" ]; then
-    echo "Usage: $0 --grn <grn_file> [--sample <sample_value>]"
-    exit 1
-fi
-
-# Print parsed arguments (for debugging purposes)
-echo "GRN file: $grn"
-echo "Sample value: $sample"
-echo "Regression model: $reg_type"
-
-# Clean bin/ folder
-rm -r bin
-mkdir bin
-
-# Run regression analysis 1
-echo "Running GRN benchmark with $grn and sample size $sample"
-echo "Regression 1"
-mkdir -p bin/regression_1
-viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin/regression_1
-bin/regression_1/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
-
-# Run regression analysis 2
-echo "Regression 2"
-if [ ! -f resources/grn-benchmark/consensus-num-regulators.json ]; then
-    viash build src/metrics/regression_2/consensus/config.vsh.yaml --platform docker -o bin/regression_2/consensus
-    bin/regression_2/consensus/consensus_for_regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --output resources/grn-benchmark/consensus-num-regulators.json --grn_folder resources/grn-benchmark/grn_models/ --grns ananse.csv,celloracle.csv,figr.csv,granie.csv,scenicplus.csv,scglue.csv
-fi
-mkdir -p bin/regression_2
-viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2
-bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --consensus resources/grn-benchmark/consensus-num-regulators.json --layer scgen_pearson --reg_type $reg_type --prediction $grn --score $score
+
+# Append positive controls
+grn_name="positive_control"
+for layer in "${layers[@]}"; do
+  append_entry "$grn_name" "false" "$layer"
+done
+
+
+# Append the remaining output_state and publish_dir to the YAML file
+cat >> $param_file << HERE
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+nextflow run . \
+  -main-script  target/nextflow/workflows/run_grn_evaluation/main.nf \
+  -profile docker \
+  -with-trace \
+  -c src/common/nextflow_helpers/labels_ci.config \
+  -params-file ${param_file}
+
+# ./tw-windows-x86_64.exe launch `
+#     https://github.com/openproblems-bio/task_grn_benchmark.git `
+#     --revision build/main `
+#     --pull-latest `
+#     --main-script target/nextflow/workflows/run_grn_evaluation/main.nf `
+#     --workspace 53907369739130 `
+#     --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
+#     --params-file ./params/scgen_pearson_gb_pcs.yaml `
+#     --config src/common/nextflow_helpers/labels_tw.config
+
+
diff --git a/scripts/run_robust_analys.sh b/scripts/run_robust_analys.sh
@@ -3,7 +3,7 @@
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
 
 degrees=(0 10 20 50 100)
-noise_type="$1"
+noise_type="$1" #"net"
 echo $noise_type
 
 RUN_ID="robust_analy_$1"
@@ -19,7 +19,6 @@ grn_models_folder="${resources_dir}/grn_models"
 reg_type=ridge
 subsample=-2
 max_workers=10
-layer=pearson
 
 param_file="./params/${RUN_ID}.yaml"
 
@@ -33,19 +32,18 @@ grn_names=(
 )
 
 
-
 # Start writing to the YAML file
 cat > $param_file << HERE
 param_list:
 HERE
 
 append_entry() {
   cat >> $param_file << HERE
-  - id: ${1}_${2}
+  - id: ${1}_${2}_${3}
     perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
-    layer: ${layer}
     reg_type: $reg_type
     method_id: ${2}-${1}
+    layer: ${3}
     subsample: $subsample
     max_workers: $max_workers
     consensus: ${resources_dir}/prior/consensus-num-regulators.json
@@ -55,12 +53,14 @@ append_entry() {
 HERE
 }
 # Loop through grn_names and layers
-for degree in "${degrees[@]}"; do
-    for grn_name in "${grn_names[@]}"; do
-        append_entry "$grn_name" "$degree" 
+layers=(pearson scgen_pearson)
+for layer in "${layers[@]}"; do
+    for degree in "${degrees[@]}"; do
+        for grn_name in "${grn_names[@]}"; do
+            append_entry "$grn_name" "$degree" "$layer"
+        done
     done
-done
-
+done 
 
 # Append the remaining output_state and publish_dir to the YAML file
 cat >> $param_file << HERE

diff --git a/src/control_methods/positive_control/script.py b/src/control_methods/positive_control/script.py
@@ -14,6 +14,7 @@
   "prediction": "output/positive_control.csv",
 }
 ## VIASH END
+print(par)
 print('Reading input data')
 perturbation_data = ad.read_h5ad(par["perturbation_data"])
 gene_names = perturbation_data.var_names.to_numpy()

diff --git a/src/methods/multi_omics/celloracle/main.py b/src/methods/multi_omics/celloracle/main.py
@@ -141,8 +141,13 @@ def refine_grns(par):
         grn = grn[mask]
         grn = grn[~(grn.coef_abs==0)] # remove those with 0 coeff
         # filter based on z score 
-        z_scores = (grn.coef_abs - grn.coef_abs.mean())/grn.coef_abs.std()
-        mask = z_scores > 2
+        # z_scores = (grn.coef_abs - grn.coef_abs.mean())/grn.coef_abs.std()
+        # mask = z_scores > 2
+        # Sort by absolute coefficient values
+        grn = grn.sort_values(by="coef_abs", ascending=False)
+
+        # Select the top par['max_n_links'] links based on absolute weight
+        mask = grn.index[:par['max_n_links']]
         grn = grn.loc[mask, :]
 
         grn = grn[['source', 'target', 'coef_mean']]

diff --git a/src/methods/single_omics/ennet/config.vsh.yaml b/src/methods/single_omics/ennet/config.vsh.yaml
@@ -29,4 +29,4 @@ platforms:
   - type: native
   - type: nextflow
     directives:
-      label: [midtime,midmem,midcpu]
+      label: [onedaytime,midmem,midcpu]
diff --git a/src/methods/single_omics/grnboost2/config.vsh.yaml b/src/methods/single_omics/grnboost2/config.vsh.yaml
@@ -26,4 +26,4 @@ platforms:
   - type: native
   - type: nextflow
     directives:
-      label: [midtime,midmem,midcpu]
+      label: [onedaytime,midmem,midcpu]
diff --git a/src/methods/single_omics/scsgl/config.vsh.yaml b/src/methods/single_omics/scsgl/config.vsh.yaml
@@ -28,4 +28,4 @@ platforms:
   - type: native
   - type: nextflow
     directives:
-      label: [midtime,midmem,midcpu]
+      label: [onedaytime,midmem,midcpu]
diff --git a/src/methods/single_omics/tigress/config.vsh.yaml b/src/methods/single_omics/tigress/config.vsh.yaml
@@ -26,4 +26,4 @@ platforms:
   - type: native
   - type: nextflow
     directives:
-      label: [midtime, midmem, highcpu]
+      label: [onedaytime, midmem, highcpu]
diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py
@@ -9,10 +9,12 @@
   "prediction": "resources/grn-benchmark/grn_models/collectri.csv",
   'score': 'output/score.h5ad',
   'reg_type': 'ridge',
-  'layer': 'lognorm',
+  'layer': 'pearson',
   'subsample': 200,
   'max_workers': 4,
 }
+print(par)
+
 
 ## VIASH END
 sys.path.append(meta["resources_dir"])

diff --git a/src/metrics/regression_2/script.py b/src/metrics/regression_2/script.py
@@ -18,6 +18,7 @@
 }
 ## VIASH END
 
+print(par)
 sys.path.append(meta['resources_dir'])
 from main import main
 

diff --git a/src/workflows/run_benchmark_single_omics/config.vsh.yaml b/src/workflows/run_benchmark_single_omics/config.vsh.yaml
@@ -87,4 +87,4 @@ functionality:
 platforms:
   - type: nextflow
     directives:
-      label: [ hightime, midmem, highcpu ]
+      label: [ onedaytime, midmem, highcpu]
diff --git a/src/workflows/run_benchmark_single_omics/main.nf b/src/workflows/run_benchmark_single_omics/main.nf
@@ -24,11 +24,9 @@ workflow run_wf {
   // ]
 
   methods = [
-    portia,
     ennet,
     grnboost2,
     scsgl,    
-    ppcor,
     tigress
   ]
-Original file line number
+Diff line change
@@ Expand Up / @@ -24,11 +24,9 @@ workflow run_wf { @@
       // ]
       methods = [
-        portia,
         ennet,
         grnboost2,
         scsgl,
-        ppcor,
         tigress
       ]
@@ Expand Down @@