diff --git a/scripts/run_benchmark_single_omics.sh b/scripts/run_benchmark_single_omics.sh
index 2a65f7bac..1860e9a35 100644
--- a/scripts/run_benchmark_single_omics.sh
+++ b/scripts/run_benchmark_single_omics.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 
 # RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-RUN_ID="single_omics_all_test"
-resources_dir="s3://openproblems-data/resources_test/grn"
-publish_dir="s3://openproblems-data/resources_test/grn/results/${RUN_ID}"
+RUN_ID="single_omics_all"
+resources_dir="s3://openproblems-data/resources/grn"
+publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"
 
 # resources_dir="./resources_test/"
 # publish_dir="output/${RUN_ID}"
@@ -31,21 +31,21 @@ output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-nextflow run . \
-  -main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \
-  -profile docker \
-  -with-trace \
-  -c src/common/nextflow_helpers/labels_ci.config \
-  -params-file ${param_file}
-
-./tw-windows-x86_64.exe launch `
-    https://github.com/openproblems-bio/task_grn_benchmark.git `
-    --revision build/main `
-    --pull-latest `
-    --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf `
-    --workspace 53907369739130 `
-    --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
-    --params-file ./params/single_omics_all_test.yaml `
-    --config src/common/nextflow_helpers/labels_tw.config
+# nextflow run . \
+#   -main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \
+#   -profile docker \
+#   -with-trace \
+#   -c src/common/nextflow_helpers/labels_ci.config \
+#   -params-file ${param_file}
+
+# ./tw-windows-x86_64.exe launch `
+#     https://github.com/openproblems-bio/task_grn_benchmark.git `
+#     --revision build/main `
+#     --pull-latest `
+#     --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf `
+#     --workspace 53907369739130 `
+#     --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
+#     --params-file ./params/single_omics_all_test.yaml `
+#     --config src/common/nextflow_helpers/labels_tw.config
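Both run scripts assemble the Nextflow -params-file by appending heredoc blocks to a param_list YAML. As a cross-check of that structure, here is a minimal Python sketch that emits an equivalent file for the robustness script below; paths and values mirror that script, and PyYAML (yaml.safe_dump) is assumed to be available.

# Sketch: build the same param_list YAML that run_robust_analys_causal.sh
# writes via heredocs. Paths/values mirror that script; adjust as needed.
from pathlib import Path
import yaml

resources_dir = "resources"          # same default as the script
run_id = "robust_analy_causal"

entries = [
    {
        "id": f"corr-{i}",
        "multiomics_rna": f"{resources_dir}/grn-benchmark/multiomics_rna.h5ad",
        "perturbation_data": f"{resources_dir}/grn-benchmark/perturbation_data.h5ad",
        "reg_type": "ridge",
        "method_id": f"corr-{i}",
        "layer": "pearson",
        "subsample": -2,
        "max_workers": 10,
        "consensus": f"{resources_dir}/prior/consensus-num-regulators.json",
        "tf_all": f"{resources_dir}/prior/tf_all.csv",
    }
    for i in range(1, 101)
]

params = {
    "param_list": entries,
    "output_state": "state.yaml",
    "publish_dir": f"{resources_dir}/results/{run_id}",
}

Path("params").mkdir(exist_ok=True)
with open(f"params/{run_id}.yaml", "w") as f:
    yaml.safe_dump(params, f, sort_keys=False)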
diff --git a/scripts/run_robust_analys_causal.sh b/scripts/run_robust_analys_causal.sh
new file mode 100644
index 000000000..cc3fc8494
--- /dev/null
+++ b/scripts/run_robust_analys_causal.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+viash ns build --parallel
+RUN_ID="robust_analy_causal"
+resources_dir="resources"
+# resources_dir="s3://openproblems-data/resources/grn"
+
+publish_dir="${resources_dir}/results/${RUN_ID}"
+
+reg_type=ridge
+subsample=-2
+max_workers=10
+
+param_file="./params/${RUN_ID}.yaml"
+# Start writing to the YAML file
+cat > $param_file << HERE
+param_list:
+HERE
+
+append_entry() {
+  cat >> $param_file << HERE
+  - id: corr-${1}
+    multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
+    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
+    reg_type: $reg_type
+    method_id: corr-${1}
+    layer: ${2}
+    subsample: $subsample
+    max_workers: $max_workers
+    consensus: ${resources_dir}/prior/consensus-num-regulators.json
+    tf_all: ${resources_dir}/prior/tf_all.csv
+HERE
+}
+# Loop through layers and iterations
+layers=("pearson")  # Array containing the layer(s)
+
+for layer in "${layers[@]}"; do  # Iterate over each layer in the array
+  for iter in {1..100}; do  # Loop from 1 to 100 iterations
+    append_entry "$iter" "$layer"  # Append one param_list entry
+  done
+done
+
+
+# Append the remaining output_state and publish_dir to the YAML file
+cat >> $param_file << HERE
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+nextflow run . \
+  -main-script target/nextflow/workflows/run_robustness_analysis_causal/main.nf \
+  -profile docker \
+  -with-trace \
+  -c src/common/nextflow_helpers/labels_ci.config \
+  -params-file ${param_file}
+
diff --git a/src/methods/multi_omics/scenicplus/config.vsh.yaml b/src/methods/multi_omics/scenicplus/config.vsh.yaml
index c5671553a..3f8d92aad 100644
--- a/src/methods/multi_omics/scenicplus/config.vsh.yaml
+++ b/src/methods/multi_omics/scenicplus/config.vsh.yaml
@@ -27,6 +27,12 @@ functionality:
         required: false
         direction: output
         description: "Cell-topics prob scores"
+      - name: --grn_extended
+        type: file
+        default: output/grn_extended.csv
+        required: false
+        direction: output
+        description: "Source-target-peak triplets"
   resources:
     - type: python_script
       path: script.py
diff --git a/src/methods/multi_omics/scenicplus/script.py b/src/methods/multi_omics/scenicplus/script.py
index ac6d16f43..88e6e7f8b 100644
--- a/src/methods/multi_omics/scenicplus/script.py
+++ b/src/methods/multi_omics/scenicplus/script.py
@@ -366,7 +366,7 @@
 models = run_cgs_models(
     cistopic_obj,
     n_topics=n_topics,
-    n_cpu=12,
+    n_cpu=par['num_workers'],
     n_iter=500,
     random_state=555,
     alpha=50,
@@ -545,7 +545,7 @@
     contrasts=None,
     adjpval_thr=0.05,
     log2fc_thr=np.log2(1.5),
-    n_cpu=5,
+    n_cpu=par['num_workers'],
     split_pattern='-'
 )
@@ -724,11 +724,11 @@ def download(url: str, filepath: str) -> None:
 settings['input_data']['path_to_motif_annotations'] = motif_annotation
 settings['params_general']['temp_dir'] = os.path.join(out_dir, 'scplus_pipeline', 'temp')
 settings['params_general']['n_cpu'] = par['num_workers']
-settings['params_inference']['quantile_thresholds_region_to_gene'] = '0.85 0.90 0.95'
-settings['params_inference']['top_n_regionTogenes_per_gene'] = '5 10 15'
+settings['params_inference']['quantile_thresholds_region_to_gene'] = '0.7 0.8 0.9'
+settings['params_inference']['top_n_regionTogenes_per_gene'] = '10 15 25'
 settings['params_inference']['region_to_gene_importance_method'] = 'GBM'
 settings['params_inference']['tf_to_gene_importance_method'] = 'GBM'
-settings['params_inference']['rho_threshold'] = 0.03
+settings['params_inference']['rho_threshold'] = 0.0
 settings['params_inference']['region_to_gene_correlation_method'] = 'SR'
 settings['params_inference']['min_target_genes'] = 10
 settings['params_motif_enrichment']['species'] = 'homo_sapiens'
@@ -764,8 +764,17 @@ def download(url: str, filepath: str) -> None:
 scplus_mdata = mudata.read(par['scplus_mdata'])
 
 # Save results
-prediction = scplus_mdata.uns['direct_e_regulon_metadata']
-prediction.insert(0, 'source', prediction['TF'])
-prediction.insert(1, 'target', prediction['Gene'])
-prediction.insert(2, 'weight', prediction['importance_x_abs_rho'])
+grn_extended = scplus_mdata.uns['direct_e_regulon_metadata']
+
+grn_extended = grn_extended[['TF', 'Gene', 'rho_TF2G', 'Region']].drop_duplicates().reset_index(drop=True)
+grn_extended = grn_extended[grn_extended.rho_TF2G != 0]
+
+grn_extended.columns = ['source', 'target', 'weight', 'peak']
+
+# Write the extended network while it still carries the peak column
+grn_extended.to_csv(par['grn_extended'])
+
+prediction = grn_extended[['source', 'target', 'weight']].drop_duplicates(ignore_index=True)
+prediction = prediction.groupby(['source', 'target'], as_index=False)['weight'].sum()
 prediction.to_csv(par['prediction'])
+
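The export above keeps the full source-target-peak table in grn_extended and collapses duplicates into a single weight per (source, target) edge for the prediction output. A toy pandas illustration of that collapse (data invented):

# Toy illustration of the collapse done above: duplicate (source, target)
# pairs arising from multiple peaks are summed into a single edge weight.
import pandas as pd

grn_extended = pd.DataFrame({
    "source": ["TF1", "TF1", "TF2"],
    "target": ["G1",  "G1",  "G1"],
    "weight": [0.4,   0.1,  -0.3],
    "peak":   ["chr1:100-600", "chr1:900-1400", "chr2:200-700"],
})

prediction = (
    grn_extended[["source", "target", "weight"]]
    .drop_duplicates(ignore_index=True)
    .groupby(["source", "target"], as_index=False)["weight"]
    .sum()
)
print(prediction)  # TF1-G1 -> 0.5, TF2-G1 -> -0.3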
diff --git a/src/robustness_analysis/add_noise_grn.py b/src/robustness_analysis/add_noise_grn.py
deleted file mode 100644
index ae6b371b7..000000000
--- a/src/robustness_analysis/add_noise_grn.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-import pandas as pd
-import numpy as np
-
-layer = 'scgen_pearson'
-grn_folder = 'resources/grn_models'
-grn_folder_noised = 'resources/supplementary/grn_models_noised'
-noise_ratio = 0.5
-# permute_ratio = 0.2
-
-# Ensure the output folder exists
-os.makedirs(grn_folder_noised, exist_ok=True)
-
-if True: # add noise
-    # Loop through all files in the grn_folder
-    for file_name in os.listdir(grn_folder):
-        if file_name.endswith('.csv'):
-            # Read the CSV file
-            file_path = os.path.join(grn_folder, file_name)
-            df = pd.read_csv(file_path)
-
-            # Add noise to the 'weight' column
-            if 'weight' in df.columns:
-                std_dev = df['weight'].std()
-                noise = np.random.normal(0, noise_ratio * std_dev, size=df['weight'].shape)
-                df['weight'] += noise
-
-            # Save the noised DataFrame to the new folder
-            noised_file_path = os.path.join(grn_folder_noised, file_name)
-            df.to_csv(noised_file_path, index=False)
-
-    print("Noise added to all GRN models and saved successfully.")
-# Loop through all files in the grn_folder
-else:
-    for file_name in os.listdir(grn_folder):
-        if file_name.endswith('.csv'):
-            # Read the CSV file
-            file_path = os.path.join(grn_folder, file_name)
-            df = pd.read_csv(file_path)
-
-            # Permute 20% of the rows in the 'weight' column
-            if 'weight' in df.columns:
-                num_rows_to_permute = int(len(df) * permute_ratio)
-
-                # Randomly select 20% of the row indices to permute
-                permute_indices = np.random.choice(df.index, size=num_rows_to_permute, replace=False)
-
-                # Shuffle the selected rows in 'weight' column
-                df.loc[permute_indices, 'weight'] = np.random.permutation(df.loc[permute_indices, 'weight'].values)
-
-            # Save the modified DataFrame to the new folder
-            noised_file_path = os.path.join(grn_folder_noised, file_name)
-            df.to_csv(noised_file_path, index=False)
-
-    print("20% of the 'weight' column rows have been permuted for all GRN models and saved successfully.")
\ No newline at end of file
diff --git a/src/robustness_analysis/causal/config.vsh.yaml b/src/robustness_analysis/causal/config.vsh.yaml
new file mode 100644
index 000000000..7537b320b
--- /dev/null
+++ b/src/robustness_analysis/causal/config.vsh.yaml
@@ -0,0 +1,34 @@
+functionality:
+  name: causal_grn
+  namespace: "robustness_analysis"
+  info:
+    label: causal_grn
+    summary: Builds a cell-type-aware correlation GRN with randomly sampled source genes
+  arguments:
+    - name: --multiomics_rna
+      type: file
+      direction: input
+      example: resources_test/grn-benchmark/multiomics_rna.h5ad
+
+    - name: --tf_all
+      type: file
+      direction: input
+      example: resources_test/prior/tf_all.csv
+
+    - name: --prediction
+      type: file
+      direction: output
+      example: resources_test/grn_models/collectri.csv
+
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        packages: []
+  - type: nextflow
+    directives:
+      label: [ midtime, highmem, highcpu ]
\ No newline at end of file
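Viash fills the par dict between the VIASH START/END markers of the script below from the arguments declared in this config. A hypothetical filled-in par for local debugging, using the config's example paths (the output path is invented):

# Hypothetical `par` as viash would populate it for the causal_grn component,
# using the example values from the config above; useful for running
# script.py directly during development.
par = {
    "multiomics_rna": "resources_test/grn-benchmark/multiomics_rna.h5ad",
    "tf_all": "resources_test/prior/tf_all.csv",
    "prediction": "output/causal_grn_prediction.csv",  # invented output path
}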
diff --git a/src/robustness_analysis/causal/script.py b/src/robustness_analysis/causal/script.py
new file mode 100644
index 000000000..006e9bb31
--- /dev/null
+++ b/src/robustness_analysis/causal/script.py
@@ -0,0 +1,49 @@
+import os
+import pandas as pd
+import numpy as np
+import anndata as ad
+import scanpy as sc
+from tqdm import tqdm
+from sklearn.preprocessing import StandardScaler
+
+
+## VIASH START
+par = {
+
+}
+
+## VIASH END
+
+def create_corr_net(X: np.ndarray, groups: np.ndarray):
+    grns = []
+    for group in tqdm(np.unique(groups), desc="Processing groups"):
+        X_sub = X[groups == group, :]
+        X_sub = StandardScaler().fit_transform(X_sub)
+        grn = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
+        grns.append(grn)
+    return np.mean(grns, axis=0)
+
+
+print('Read data')
+multiomics_rna = ad.read_h5ad(par["multiomics_rna"])
+gene_names = multiomics_rna.var_names.to_numpy()
+tf_all = np.loadtxt(par['tf_all'], dtype=str)
+groups = multiomics_rna.obs.cell_type
+tf_all = np.intersect1d(tf_all, gene_names)
+
+print('Normalize data')
+sc.pp.normalize_total(multiomics_rna)
+sc.pp.log1p(multiomics_rna)
+sc.pp.scale(multiomics_rna)
+
+print('Create corr net')
+net = create_corr_net(multiomics_rna.X, groups)
+net = pd.DataFrame(net, index=gene_names, columns=gene_names)
+
+net_corr = net.sample(len(tf_all), axis=1)
+net_corr = net_corr.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
+net_corr.rename(columns={'index': 'target'}, inplace=True)
+
+print('Write prediction')
+net_corr.to_csv(par['prediction'])
+
diff --git a/src/robustness_analysis/config.vsh.yaml b/src/robustness_analysis/permute_grn/config.vsh.yaml
similarity index 100%
rename from src/robustness_analysis/config.vsh.yaml
rename to src/robustness_analysis/permute_grn/config.vsh.yaml
diff --git a/src/robustness_analysis/script.py b/src/robustness_analysis/permute_grn/script.py
similarity index 100%
rename from src/robustness_analysis/script.py
rename to src/robustness_analysis/permute_grn/script.py
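Because StandardScaler standardizes each column with the population standard deviation, the X_sub.T @ X_sub / n product in create_corr_net above is exactly the per-group Pearson correlation matrix. A quick self-contained check on random data:

# Check: column-standardized X gives X.T @ X / n == Pearson correlation.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))            # 200 cells x 5 genes (toy data)

Xs = StandardScaler().fit_transform(X)   # per-gene mean 0, population sd 1
corr = Xs.T @ Xs / Xs.shape[0]

assert np.allclose(corr, np.corrcoef(X, rowvar=False))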
diff --git a/src/workflows/run_robustness_analysis_causal/config.vsh.yaml b/src/workflows/run_robustness_analysis_causal/config.vsh.yaml
new file mode 100644
index 000000000..d8eed5c66
--- /dev/null
+++ b/src/workflows/run_robustness_analysis_causal/config.vsh.yaml
@@ -0,0 +1,80 @@
+functionality:
+  name: run_robustness_analysis_causal
+  namespace: "workflows"
+  info:
+    label: run_robustness_analysis_causal
+    summary: "Evaluates GRNs and provides scores using regression analysis."
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: --multiomics_rna
+          type: file
+          direction: input
+        - name: --perturbation_data
+          type: file
+          direction: input
+        - name: --layer
+          type: string
+          direction: input
+        - name: --subsample
+          type: integer
+          direction: input
+          default: 200
+        - name: --reg_type
+          type: string
+          direction: input
+          default: ridge
+        - name: --method_id
+          type: string
+          direction: input
+          required: true
+          example: collectri
+        - name: --max_workers
+          type: integer
+          direction: input
+          required: true
+        - name: --consensus
+          type: file
+          required: false
+          direction: input
+          default: resources/prior/consensus.json
+        - name: --tf_all
+          type: file
+          required: false
+          direction: input
+
+
+    - name: Outputs
+      arguments:
+        - name: "--scores"
+          type: file
+          required: true
+          direction: output
+          default: "scores.yaml"
+        - name: "--metric_configs"
+          type: file
+          required: true
+          direction: output
+          default: metric_configs.yaml
+
+  resources:
+    - type: nextflow_script
+      path: main.nf
+      entrypoint: run_wf
+    - type: file
+      path: ../../api/task_info.yaml
+  dependencies:
+    - name: common/extract_metadata
+      repository: openproblemsv2
+    - name: metrics/regression_1
+    - name: metrics/regression_2
+    - name: robustness_analysis/causal_grn
+  repositories:
+    - name: openproblemsv2
+      type: github
+      repo: openproblems-bio/openproblems-v2
+      tag: main_build
+platforms:
+  - type: nextflow
+    directives:
+      label: [ midtime, midmem, lowcpu ]
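The workflow publishes its scores output as a YAML list of per-run uns blocks (see the joinStates step in main.nf below). A sketch for flattening such a file into a table; the metric_ids/metric_values keys follow the usual OpenProblems score convention and are an assumption here, so verify them against a real output file:

# Sketch: load the published scores.yaml (a list of per-run `uns` dicts,
# as written by joinStates in main.nf below) into a flat DataFrame.
# The metric_ids/metric_values keys are assumed, not confirmed by this PR.
import yaml
import pandas as pd

with open("scores.yaml") as f:
    score_uns = yaml.safe_load(f)

rows = []
for entry in score_uns:
    for metric, value in zip(entry["metric_ids"], entry["metric_values"]):
        rows.append({"method_id": entry.get("method_id"),
                     "metric": metric, "value": value})

scores = pd.DataFrame(rows)
print(scores.head())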
diff --git a/src/workflows/run_robustness_analysis_causal/main.nf b/src/workflows/run_robustness_analysis_causal/main.nf
new file mode 100644
index 000000000..8ed336618
--- /dev/null
+++ b/src/workflows/run_robustness_analysis_causal/main.nf
@@ -0,0 +1,111 @@
+
+workflow auto {
+  findStatesTemp(params, meta.config)
+    | meta.workflow.run(
+      auto: [publish: "state"]
+    )
+}
+
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+
+  // construct list of metrics
+  metrics = [
+    regression_1,
+    regression_2
+  ]
+
+  /***************************
+   * RUN METRICS *
+   ***************************/
+  score_ch = input_ch
+    | map{ id, state ->
+      [id, state + ["_meta": [join_id: id]]]
+    }
+
+    | causal_grn.run(
+      fromState: [
+        multiomics_rna: "multiomics_rna", tf_all: "tf_all"
+      ],
+      toState: [
+        prediction: "prediction"
+      ]
+    )
+
+    // run all metrics
+    | runEach(
+      components: metrics,
+      id: { id, state, comp ->
+        id + "." + comp.config.functionality.name
+      },
+      // use 'fromState' to fetch the arguments the component requires from the overall state
+      fromState: [
+        perturbation_data: "perturbation_data",
+        prediction: "prediction",
+        layer: "layer",
+        subsample: "subsample",
+        reg_type: "reg_type",
+        method_id: "method_id",
+        max_workers: "max_workers",
+        consensus: "consensus",
+        tf_all: "tf_all"
+      ],
+      // use 'toState' to publish that component's outputs to the overall state
+      toState: { id, output, state, comp ->
+        state + [
+          metric_id: comp.config.functionality.name,
+          metric_output: output.score
+        ]
+      }
+    )
+
+  output_ch = score_ch
+
+    // extract the scores
+    | extract_metadata.run(
+      key: "extract_scores",
+      fromState: [input: "metric_output"],
+      toState: { id, output, state ->
+        state + [
+          score_uns: readYaml(output.output).uns
+        ]
+      }
+    )
+
+    | joinStates { ids, states ->
+      assert states[0]._meta, "no _meta found in state[0]"
+      // store the metric configs in a file
+      def metric_configs = metrics.collect{it.config}
+      def metric_configs_yaml_blob = toYamlBlob(metric_configs)
+      def metric_configs_file = tempFile("metric_configs.yaml")
+      metric_configs_file.write(metric_configs_yaml_blob)
+
+      def task_info_file = meta.resources_dir.resolve("task_info.yaml")
+
+      // store the scores in a file
+      def score_uns = states.collect{it.score_uns}
+      def score_uns_yaml_blob = toYamlBlob(score_uns)
+      def score_uns_file = tempFile("score_uns.yaml")
+      score_uns_file.write(score_uns_yaml_blob)
+
+      def new_state = [
+        metric_configs: metric_configs_file,
+        scores: score_uns_file,
+        _meta: states[0]._meta
+      ]
+
+      ["output", new_state]
+    }
+
+    // merge all of the output data
+    | joinStates{ ids, states ->
+      def mergedStates = states.inject([:]) { acc, m -> acc + m }
+      [ids[0], mergedStates]
+    }
+
+  emit:
+  output_ch
+}
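Because run_robust_analys_causal.sh pushes 100 corr-{iter} entries through this workflow, the natural downstream step is to summarize the score distribution per metric. A small self-contained sketch with invented values, matching the flat table shape from the earlier snippet:

# Sketch: summarize many corr-{iter} runs into mean/sd per metric.
import pandas as pd

# Toy stand-in for the collected per-run scores (values invented).
scores = pd.DataFrame({
    "method_id": ["corr-1", "corr-1", "corr-2", "corr-2"],
    "metric": ["regression_1", "regression_2"] * 2,
    "value": [0.12, 0.30, 0.15, 0.28],
})

summary = scores.groupby("metric")["value"].agg(["mean", "std", "count"]).reset_index()
print(summary)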