diff --git a/runs.ipynb b/runs.ipynb
index 6de68c673..db5f82007 100644
--- a/runs.ipynb
+++ b/runs.ipynb
@@ -69,7 +69,7 @@
"!aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n",
"!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n",
"!aws s3 sync resources/supplementary/ s3://openproblems-data/resources/grn/supplementary --delete\n",
- "!aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete"
+ "# !aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete"
]
},
{
@@ -81,7 +81,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -92,6 +92,8 @@
"import scanpy as sc \n",
"from src.exp_analysis.helper import plot_cumulative_density\n",
"import sys \n",
+ "import subprocess\n",
+ "\n",
"sys.path.append('../')\n",
"from grn_benchmark.src.commons import surragate_names\n",
"def extract_data(data, reg='reg1', dataset_id='scgen_pearson'):\n",
@@ -138,8 +140,18 @@
" df_all = pd.concat([df_reg1, df_reg2], axis=1)\n",
" return df_all\n",
"\n",
+ "par = {\n",
+ " # 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'],\n",
+ " 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'scglue', 'celloracle'],\n",
+ " 'models_dir': 'resources/grn_models/',\n",
+ " 'scores_dir': 'resources/scores',\n",
+ " 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad', \n",
+ " 'num_workers': 20,\n",
+ " 'mem': \"100G\",\n",
+ " 'time': \"24:00:00\"\n",
+ "}\n",
"\n",
- "methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']"
+ "# methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']"
]
},
{
@@ -149,15 +161,6 @@
"# Marco data "
]
},
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "# !ls resources_local/mccalla_extended/"
- ]
- },
{
"cell_type": "code",
"execution_count": 3,
@@ -182,109 +185,141 @@
"# subprocess.run(command, shell=True, check=True)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "# Run GRN inference"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Local runs"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "if False:\n",
+ " for method in par['methods']:\n",
+ " par['prediction'] = f\"{par['models_dir']}/{method}.csv\"\n",
+ " # method arguments \n",
+ " method_args=f\"--multiomics_rna {par['multiomics_rna']} --prediction {par['prediction']} --num_workers {par['num_workers']} --resources_dir src/utils\"\n",
+ " # run command\n",
+ " if method == \"pearson_corr\":\n",
+ " command = f\"python src/control_methods/pearson/script.py {method_args}\"\n",
+ " elif method == \"celloracle\":\n",
+ " command = f\"/home/jnourisa/miniconda3/envs/celloracle/bin/python src/methods/multi_omics/celloracle/script.py {method_args}\"\n",
+ " elif method in [\"grnboost2\", \"scenic\", \"genie3\"]:\n",
+ " command = f\"singularity exec ../../images/scenic python src/methods/single_omics/{method}/script.py {method_args}\" \n",
+ " elif method == 'scglue':\n",
+ " command = f\"singularity exec ../../images/scglue python src/methods/multi_omics/{method}/script.py {method_args}\"\n",
+ " elif method == 'ppcor':\n",
+ " command = f\"singularity exec ../../images/ppcor Rscript src/methods/single_omics/{method}/script.R {method_args}\"\n",
+ " else:\n",
+ " command = f\"singularity exec ../../images/{method} python src/methods/single_omics/{method}/script.py {method_args}\"\n",
+ " # sbatch tags\n",
+ " tag = f\"--job-name {method} \"\n",
+ " resources = f\" --cpus-per-task {par['num_workers']} --mem {par['mem']} --time {par['time']}\" #resources\n",
+ " tag+=resources\n",
+ " if method=='scglue':\n",
+ " tag += f\" --partition=gpu --gres=gpu:1\"\n",
+ " \n",
+ " !sbatch {tag} scripts/sbatch/grn_inference.sh \"{command}\" \n",
+ " # !bash scripts/sbatch/grn_inference.sh \"{command}\" "
+ ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Run grn inference "
+ "## Baseline models"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Submitted batch job 7748321\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "par = {\n",
- " # 'methods': ['pearson_corr', 'portia', 'grnboost2', 'ppcor', 'scenic', 'scglue', 'celloracle'],\n",
- " 'methods': ['ppcor'],\n",
- " 'write_dir': 'resources/grn_models/',\n",
- " 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',\n",
- " 'num_workers': 20,\n",
- " 'mem': \"100G\",\n",
- " 'time': \"24:00:00\"\n",
- "}\n",
- "for method in par['methods']:\n",
- " par['prediction'] = f\"{par['write_dir']}/{method}.csv\"\n",
- " # method arguments \n",
- " method_args=f\"--multiomics_rna {par['multiomics_rna']} --prediction {par['prediction']} --num_workers {par['num_workers']}\"\n",
- " # run command\n",
- " if method == \"pearson_corr\":\n",
- " command = f\"python src/control_methods/pearson/script.py {method_args}\"\n",
- " elif method == \"celloracle\":\n",
- " command = f\"/home/jnourisa/miniconda3/envs/celloracle/bin/python src/methods/multi_omics/celloracle/script.py {method_args}\"\n",
- " elif method in [\"grnboost2\", \"scenic\", \"genie3\"]:\n",
- " command = f\"singularity exec ../../images/scenic python src/methods/single_omics/{method}/script.py {method_args}\" \n",
- " elif method == 'scglue':\n",
- " command = f\"singularity exec ../../images/scglue python src/methods/multi_omics/{method}/script.py {method_args}\"\n",
- " elif method == 'ppcor':\n",
- " command = f\"singularity exec ../../images/ppcor Rscript src/methods/single_omics/{method}/script.R {method_args}\"\n",
- " else:\n",
- " command = f\"singularity exec ../../images/{method} python src/methods/single_omics/{method}/script.py {method_args}\"\n",
- " # sbatch tags\n",
- " tag = f\"--job-name {method} \"\n",
- " resources = f\" --cpus-per-task {par['num_workers']} --mem {par['mem']} --time {par['time']}\" #resources\n",
- " tag+=resources\n",
- " if method=='scglue':\n",
- " tag += f\" --partition=gpu --gres=gpu:1\"\n",
- " \n",
- " !sbatch {tag} scripts/sbatch/grn_inference.sh \"{command}\" \n",
- " # !bash scripts/sbatch/grn_inference.sh \"{command}\" "
+ "if False:\n",
+ " command = f\"--multiomics_rna {par['multiomics_rna']} --models_dir {par['models_dir']}\"\n",
+ " !python src/control_methods/script_all.py {command}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "# d0_hvgs\n",
- "While we run baseline models using the below commands, for the rest of the models, they must be ran already and their predictions should be given in the relevant folder."
+ "## Seqera"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 14,
"metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "download: s3://openproblems-data/resources/grn/results/celloracle/state.yaml to resources/results/celloracle/state.yaml\n",
+ "download: s3://openproblems-data/resources/grn/results/celloracle/trace.txt to resources/results/celloracle/trace.txt\n",
+ "download: s3://openproblems-data/resources/grn/results/celloracle/celloracle.grn_inference_celloracle.prediction.csv to resources/results/celloracle/celloracle.grn_inference_celloracle.prediction.csv\n",
+ "download: s3://openproblems-data/resources/grn/results/celloracle/output/celloracle/base_grn.csv to resources/results/celloracle/output/celloracle/base_grn.csv\n"
+ ]
+ }
+ ],
"source": [
- "## Baseline models"
+ "if False: # submit the job\n",
+ " !bash src/methods/multi_omics/celloracle/run.sh\n",
+ "if False: # get the results\n",
+ " !aws s3 sync s3://openproblems-data/resources/grn/results/celloracle resources/results/celloracle"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "{'write_dir': 'resources/grn_models/d0_hvgs', 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad', 'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'max_n_links': 50000, 'layer': 'scgen_pearson', 'cell_type_specific': False, 'normalize': False, 'causal': True, 'prediction': 'resources/grn_models/d0_hvgs/pearson_corr.csv'}\n",
- "Read data\n",
- "Causal subsetting\n",
- "Reading input data\n",
- "Inferring GRN\n",
- "{'write_dir': 'resources/grn_models/d0_hvgs', 'multiomics_rna': 'resources/grn-benchmark/perturbation_data.h5ad', 'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'max_n_links': 50000, 'layer': 'scgen_pearson', 'cell_type_specific': False, 'normalize': False, 'causal': True, 'prediction': 'resources/grn_models/d0_hvgs/positive_control.csv'}\n",
- "Read data\n",
- "adata.X is already dense.\n",
- "Causal subsetting\n"
+ "alldonors_default donor_0_celltype negative_control.csv positive_control.csv\n",
+ "celloracle.csv\t donor_0_default peak_gene\t\t ppcor.csv\n",
+ "d0_hvg\t\t granie.csv\t pearson_corr.csv\t scenic.csv\n",
+ "default\t\t grnboost2.csv portia.csv\t\t scglue.csv\n"
]
}
],
"source": [
- "!python src/control_methods/script_all.py"
+ "if False: # process celloracle\n",
+ " # - prediction \n",
+ " # !mv resources/results/celloracle/celloracle.grn_inference_celloracle.prediction.csv resources/results/celloracle/celloracle.csv \n",
+ " !cp resources/results/celloracle/celloracle.csv {par['models_dir']}/celloracle.csv\n",
+ " !ls {par['models_dir']}\n",
+ " \n",
+ " # - peak - gene\n",
+ " # !mkdir resources/grn_models/peak_gene/\n",
+ " df = pd.read_csv(\"resources/results/celloracle/output/celloracle/base_grn.csv\")[['peak_id','gene_short_name']]\n",
+ " df.columns = ['peak','target']\n",
+ " df.to_csv('resources/grn_models/peak_gene/celloracle.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "# Scores\n",
+    "The baseline models are run with the commands below; the remaining models must have been run already, with their predictions placed in the relevant folder."
]
},
{
@@ -296,34 +331,356 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "resources/grn-benchmark/perturbation_data.h5ad\n",
+ "['collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'scglue', 'celloracle']\n",
+ "Sparsity of resources/grn_models/collectri.csv: 0.9999170526430831\n",
+ "Sparsity of resources/grn_models/negative_control.csv: 0.9998980328944889\n",
+ "Sparsity of resources/grn_models/positive_control.csv: 0.9997840137565959\n",
+ "Sparsity of resources/grn_models/pearson_corr.csv: 0.9997883809984375\n",
+ "Sparsity of resources/grn_models/portia.csv: 0.9999967429274494\n",
+ "Sparsity of resources/grn_models/ppcor.csv: 0.9999794553885274\n",
+ "Sparsity of resources/grn_models/grnboost2.csv: 0.9997985237124277\n",
+ "Sparsity of resources/grn_models/scenic.csv: 0.9997322245751523\n",
+ "Sparsity of resources/grn_models/scglue.csv: 0.9999847772895649\n",
+ "Sparsity of resources/grn_models/celloracle.csv: 0.9997894825282788\n"
+ ]
+ }
+ ],
"source": [
"# consensus: run this after updating method list\n",
"if False:\n",
- " !python src/metrics/regression_2/consensus/script.py #inside the method names and dir\n"
+    "    command = [\n",
+    "        \"python\",\n",
+    "        \"src/metrics/regression_2/consensus/script.py\",\n",
+    "        \"--models_dir\", par['models_dir'],\n",
+    "        \"--models\"\n",
+    "    ] + par['methods']\n",
+    "\n",
+    "    # run the consensus script\n",
+    "    subprocess.run(command)"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Submitted batch job 7747147\n"
+ "Submitted batch job 7750423\n"
]
}
],
"source": [
"# run calculating scores\n",
- "if False:\n",
+ "if True:\n",
+    "    # !bash scripts/sbatch/calculate_scores.sh # local alternative: runs both reg1 and reg2; set the reg_type, read/write dirs, and methods inside the script\n",
" !sbatch scripts/sbatch/calculate_scores.sh #includes both reg1 and 2. #inside the script, set the reg_type, read and write dirs, and methods"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+      "text/plain": [
+       "                         S1        S2  static-theta-0.0  static-theta-0.5  rank\n",
+       "collectri         -0.100238 -0.211182          0.489316          0.514896    11\n",
+       "negative_control  -0.044574 -0.045158          0.445727          0.501389     9\n",
+       "positive_control   0.197129  0.578822          0.530848          0.584694     3\n",
+       "pearson_corr       0.273443  0.516343          0.639871          0.549188     2\n",
+       "portia             0.101110  0.365418          0.456203          0.513157     6\n",
+       "ppcor              0.017954  0.159754          0.317136          0.506147     8\n",
+       "grnboost2          0.421936  0.489322          0.825025          0.619527     1\n",
+       "scenic             0.172085  0.228266          0.521234          0.583202     5\n",
+       "granie             0.083298  0.106012          0.323324          0.458506    10\n",
+       "scglue             0.099859  0.279999          0.180594          0.533916     7\n",
+       "celloracle         0.209151  0.291478          0.680787          0.584687     4"
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
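+    "# min-max normalize each metric, then rank methods by their mean normalized score\n",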
+ "df_scores = pd.read_csv(f\"{par['scores_dir']}/scgen_pearson-ridge.csv\", index_col=0)\n",
+ "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n",
+ "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n",
+ "df_scores.style.background_gradient()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 12,
diff --git a/scripts/sbatch/calculate_scores.sh b/scripts/sbatch/calculate_scores.sh
index 4fbc56f3d..9533028d9 100644
--- a/scripts/sbatch/calculate_scores.sh
+++ b/scripts/sbatch/calculate_scores.sh
@@ -8,4 +8,4 @@
#SBATCH --mem=64G
#SBATCH --cpus-per-task=20
-python src/metrics/script_all.py
+python src/metrics/script_all.py
diff --git a/src/control_methods/script_all.py b/src/control_methods/script_all.py
index b0ecd1dab..e709d9bc5 100644
--- a/src/control_methods/script_all.py
+++ b/src/control_methods/script_all.py
@@ -6,7 +6,7 @@
import random
par = {
- 'write_dir': "resources/grn_models/d0_hvgs",
+ 'models_dir': "resources/grn_models/d0_hvgs",
"multiomics_rna": "resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad",
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
@@ -18,6 +18,28 @@
'causal': True
}
+import argparse
+parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
+parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file')
+parser.add_argument('--prediction', type=str, help='Path to the prediction file')
+parser.add_argument('--resources_dir', type=str, help='Path to the prediction file')
+parser.add_argument('--tf_all', type=str, help='Path to the tf_all')
+parser.add_argument('--num_workers', type=str, help='Number of cores')
+parser.add_argument('--models_dir', type=str, help='where to write the model')
+args = parser.parse_args()
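+# example invocation (as issued from the notebook's "Baseline models" cell):
+#   python src/control_methods/script_all.py \
+#     --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+#     --models_dir resources/grn_models/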
+
+if args.multiomics_rna:
+ par['multiomics_rna'] = args.multiomics_rna
+if args.tf_all:
+ par['tf_all'] = args.tf_all
+if args.num_workers:
+ par['num_workers'] = args.num_workers
+if args.models_dir:
+ par['models_dir'] = args.models_dir
+
+if args.resources_dir:
+    # make utilities importable from a user-supplied directory as well;
+    # `meta` below keeps its defaults (overwriting it here would be undone)
+    sys.path.append(args.resources_dir)
+
meta = {
"resources_dir": 'src/control_methods',
"util": 'src/utils'
@@ -25,23 +47,26 @@
sys.path.append(meta["resources_dir"])
sys.path.append(meta["util"])
-os.makedirs(par['write_dir'], exist_ok=True)
+os.makedirs(par['models_dir'], exist_ok=True)
from util import create_corr_net
-
+print(par)
#---- run for pearson_corr
-par['prediction'] = f"{par['write_dir']}/pearson_corr.csv"
+print("run for pearson_corr")
+par['prediction'] = f"{par['models_dir']}/pearson_corr.csv"
par['causal'] = True
net = create_corr_net(par)
net.to_csv(par['prediction'])
#---- run for negative control
+print("run for negative control")
from negative_control.main import main
-par['prediction'] = f"{par['write_dir']}/negative_control.csv"
+par['prediction'] = f"{par['models_dir']}/negative_control.csv"
net = main(par)
net.to_csv(par['prediction'])
#---- run for positive_control
+print("run for positive control")
par['multiomics_rna'] = par['perturbation_data']
-par['prediction'] = f"{par['write_dir']}/positive_control.csv"
+par['prediction'] = f"{par['models_dir']}/positive_control.csv"
net = create_corr_net(par)
net.to_csv(par['prediction'])
diff --git a/src/exp_analysis/helper.py b/src/exp_analysis/helper.py
index 1c9340fc2..1925e48b1 100644
--- a/src/exp_analysis/helper.py
+++ b/src/exp_analysis/helper.py
@@ -65,6 +65,63 @@ def plot_cumulative_density(data, title='', ax=None, s=1, **kwdgs):
# self.out_deg = degree_centrality(net, source='source', target='target', **kwargs)
# self.in_deg = degree_centrality(net, source='target', target='source', **kwargs)
+class VSA_analysis:
+    '''Vester's sensitivity analysis.
+    - active_sum: sum along the rows of the influence matrix; indicates how strongly a variable influences all the others.
+    - passive_sum: sum along the columns of the influence matrix; indicates how sensitive a variable is, i.e. how it reacts to the influence of the others.
+    '''
+ def __init__(self, sample: pd.DataFrame, control: pd.DataFrame, mode='weight', top_q_net = 0.75, critical_change_q_t: float = 0.75):
+ # - keep only top quantile links
+ control = control[control.weight > control.weight.quantile(top_q_net)]
+ sample = sample[sample.weight > sample.weight.quantile(top_q_net)]
+
+ print('Determine roles')
+ self.vsa_control = self.determine_roles(control)
+ self.vsa_sample = self.determine_roles(sample)
+ print('Determine role change')
+ self.oo = self.diff_roles(self.vsa_control, self.vsa_sample, critical_change_q_t)
+
+ @staticmethod
+ def determine_roles(net, mode='weight', top_q_role=0.9) -> pd.DataFrame:
+ # active sum and passive sum
+ gene_names = np.union1d(net.source.unique(), net.target.unique())
+ if mode=='weight':
+ active_sum = np.asarray([sum(net.query(f"source == '{gene}'")['weight']) for gene in gene_names])
+ passive_sum = np.asarray([sum(net.query(f"target == '{gene}'")['weight']) for gene in gene_names])
+ else:
+ raise ValueError('define first')
+
+ # define the roles ['Buffering', 'Passive', 'Active', 'Critical'] -> [0, 1, 2, 3]
+        active_sum_threshold = np.quantile(active_sum, top_q_role)
+        passive_sum_threshold = np.quantile(passive_sum, top_q_role)
+        # active_sum, passive_sum = standardize_array(active_sum), standardize_array(passive_sum)
+        roles = [2*as_flag + ps_flag for as_flag, ps_flag in zip(active_sum > active_sum_threshold, passive_sum > passive_sum_threshold)]
+ roles = pd.DataFrame(data={'gene_name': gene_names, 'active_sum': active_sum, 'passive_sum': passive_sum, 'role':roles })
+ roles.set_index('gene_name',inplace=True)
+ return roles
+ @staticmethod
+ def diff_roles(control: pd.DataFrame, sample: pd.DataFrame, critical_change_q_t: float=.75) -> pd.DataFrame:
+        '''
+        Computes the distance in role from control to sample (flagging of the top changes is currently disabled).
+        '''
+ # - match the index
+ common_genes = pd.Index(np.union1d(sample.index, control.index))
+ control = control.reindex(common_genes).fillna(0)
+ sample = sample.reindex(common_genes).fillna(0)
+ # - calculate distance
+ as_distance = (sample['active_sum'] - control['active_sum'])
+ ps_distance = (sample['passive_sum'] - control['passive_sum'])
+ overall_distance = as_distance**2 + ps_distance**2
+ df_distance = pd.DataFrame({'as_distance':as_distance, 'ps_distance':ps_distance, 'overall_distance':overall_distance}, index=common_genes)
+
+ # df_distance = df_distance.assign(passive_sum_c=control['passive_sum'], active_sum_c=control['active_sum'])
+ # df_distance = df_distance.assign(passive_sum_s=sample['passive_sum'], active_sum_s=sample['active_sum'])
+
+ # - define x top percentile as the cut-off value to determine critical role change
+ # df_distance['critical_change_as'] = df_distance['as_distance'] > df_distance['as_distance'].quantile(critical_change_q_t)
+ # df_distance['critical_change_ps'] = df_distance['ps_distance'] > df_distance['ps_distance'].quantile(critical_change_q_t)
+ # df_distance['critical_change_overall'] = df_distance['overall_distance'] > df_distance['overall_distance'].quantile(critical_change_q_t)
+ return df_distance
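+
+# minimal usage sketch (hypothetical; assumes `sample_net` and `control_net` are
+# edge lists with 'source', 'target' and 'weight' columns):
+#   vsa = VSA_analysis(sample_net, control_net, top_q_net=0.75)
+#   top_changes = vsa.oo.sort_values('overall_distance', ascending=False).head(10)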
class Exp_analysis:
'''
This class provides functions for explanatory analysis of GRNs
@@ -101,8 +158,6 @@ def plot_centrality(self, df, title='',ax=None, xlabel='Degree', ylabel='Gene',
if colors is not None:
for label, color in zip(ax.get_yticklabels(), colors):
label.set_color(color)
-
-
def top_edges(self, quantile=0.95):
grn = self.net
grn = grn[grn.weight>grn.weight.quantile(quantile)]
@@ -118,7 +173,6 @@ def determine_source_properties(self):
c_weight = determine_centrality(self.net, mode='weight')
self.sources = c_weight.merge(c_degree, left_index=True, right_index=True)
-
# def calculate_centrality_stats(self) -> None:
# '''Calculate network centrality metrics such as in and out degree
# '''
@@ -164,6 +218,9 @@ def annotate_genes(self, gene_annotation_df) -> dict[str, float]:
# gene_annot.values
self.gene_annot = {key:round((value*100/len(self.targets)), 1) for key, value in gene_annot.items()}
return self.gene_annot
+ def analyse_roles(self, ref_net, **kwargs):
+ self.vsa_obj = VSA_analysis(self.net, ref_net,**kwargs)
+ self.vsa_results = self.vsa_obj.oo
def plot_interactions(interaction_df: pd.DataFrame, min_subset_size=None, min_degree=None, color_map=None) -> plt.Figure:
"""Upset plot of interactions
diff --git a/src/methods/single_omics/genie3/config.vsh.yaml b/src/methods/single_omics/genie3/config.vsh.yaml
index 0b396b941..9711f49a8 100644
--- a/src/methods/single_omics/genie3/config.vsh.yaml
+++ b/src/methods/single_omics/genie3/config.vsh.yaml
@@ -15,6 +15,7 @@ functionality:
path: script.py
- path: /src/utils/util.py
dest: util.py
+
platforms:
- type: docker
diff --git a/src/methods/single_omics/scenic/config.vsh.yaml b/src/methods/single_omics/scenic/config.vsh.yaml
index 1227dac8e..383737ba1 100644
--- a/src/methods/single_omics/scenic/config.vsh.yaml
+++ b/src/methods/single_omics/scenic/config.vsh.yaml
@@ -27,6 +27,9 @@ functionality:
resources:
- type: python_script
path: script.py
+ - path: /src/utils/util.py
+ dest: util.py
+ # - path: main.py
platforms:
- type: docker
diff --git a/src/methods/single_omics/scenic/main.py b/src/methods/single_omics/scenic/main.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/methods/single_omics/scenic/script.py b/src/methods/single_omics/scenic/script.py
index 5e8c52f6a..1f30b977a 100644
--- a/src/methods/single_omics/scenic/script.py
+++ b/src/methods/single_omics/scenic/script.py
@@ -6,6 +6,9 @@
import ast
import requests
import scipy.sparse as sp
+import sys
+
# wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather
@@ -27,13 +30,6 @@
'normalize': False
}
## VIASH END
-
-import sys
-meta= {
- "resources_dir": 'src/utils/'
-}
-
-
import argparse
parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file')
@@ -53,39 +49,27 @@
par['num_workers'] = args.num_workers
if args.resources_dir:
- meta['resources_dir'] = args.resources_dir
+    meta = {'resources_dir': args.resources_dir}
+else:
+    # fall back to the bundled utils so `meta` is always defined
+    meta = {'resources_dir': 'src/utils/'}
sys.path.append(meta["resources_dir"])
from util import process_links
-os.makedirs(par['temp_dir'], exist_ok=True)
-
-databases = f"{par['temp_dir']}/databases/"
-os.makedirs(databases, exist_ok=True)
-
-par['motif_annotation'] = f'{databases}/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl'
-par['genes_vs_motifs_10k'] = f'{databases}/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
-par['genes_vs_motifs_500'] = f'{databases}/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
-
-if not (os.path.exists(par['motif_annotation'])):
- print('downloading motif_annotation')
- response = requests.get("https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl")
- with open(par['motif_annotation'], "wb") as file:
- file.write(response.content)
-if not (os.path.exists(par['genes_vs_motifs_10k'])):
- print('downloading genes_vs_motifs_10k')
- response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
- with open(par['genes_vs_motifs_10k'], "wb") as file:
- file.write(response.content)
-if not (os.path.exists(par['genes_vs_motifs_500'])):
- print('downloading genes_vs_motifs_500')
- response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
- with open(par['genes_vs_motifs_500'], "wb") as file:
- file.write(response.content)
-
-expr_mat_adjacencies = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv")
-expression_data = os.path.join(par['temp_dir'], "expression_data.tsv")
-regulons = f"{par['temp_dir']}/regulons.csv"
+def download_prior(par):
+ if not (os.path.exists(par['motif_annotation'])):
+ print('downloading motif_annotation')
+ response = requests.get("https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl")
+ with open(par['motif_annotation'], "wb") as file:
+ file.write(response.content)
+ if not (os.path.exists(par['genes_vs_motifs_10k'])):
+ print('downloading genes_vs_motifs_10k')
+ response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
+ with open(par['genes_vs_motifs_10k'], "wb") as file:
+ file.write(response.content)
+ if not (os.path.exists(par['genes_vs_motifs_500'])):
+ print('downloading genes_vs_motifs_500')
+ response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
+ with open(par['genes_vs_motifs_500'], "wb") as file:
+ file.write(response.content)
def format_data(par):
print('Read data')
@@ -93,18 +77,17 @@ def format_data(par):
gene_names = adata_rna.var_names
if sp.issparse(adata_rna.X):
adata_rna.X = adata_rna.X.toarray()
- pd.DataFrame(adata_rna.X, columns=gene_names).to_csv(expression_data, sep='\t', index=False)
+ pd.DataFrame(adata_rna.X, columns=gene_names).to_csv(par['expression_data'], sep='\t', index=False)
-
def run_grn(par):
print('Run grn')
command = [
"pyscenic", "grn",
"--num_workers", str(par['num_workers']),
"--seed", str(par['seed']),
- "-o", expr_mat_adjacencies,
+ "-o", str(par['expr_mat_adjacencies']),
"--method", "grnboost2",
- expression_data,
+ str(par['expression_data']),
par['tf_all']
]
subprocess.run(command, check=True)
@@ -114,11 +97,11 @@ def prune_grn(par):
command = [
"pyscenic", "ctx",
- expr_mat_adjacencies, par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'],
+ par['expr_mat_adjacencies'], par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'],
"--annotations_fname", par['motif_annotation'],
- "--expression_mtx_fname", expression_data,
+ "--expression_mtx_fname", par['expression_data'],
"--mode", "custom_multiprocessing",
- "--output", regulons,
+ "--output", str(par['regulons']),
"--num_workers", str(par['num_workers']),
"--rank_threshold", str(par['rank_threshold']),
"--auc_threshold", str(par['auc_threshold']),
@@ -129,7 +112,7 @@ def prune_grn(par):
def format_grn(par):
print('Format regulons')
- regulons_df = pd.read_csv(regulons, index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:]
+ regulons_df = pd.read_csv(par['regulons'], index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:]
def format_it(df):
values = df['TargetGenes'].values[0]
@@ -142,13 +125,30 @@ def format_it(df):
grn = regulons_df.groupby('index').apply(lambda df: format_it(df)).reset_index().rename(columns={'index':'source'})
network = grn[['source','target','weight']]
return network
+def main(par):
+ databases = f"{par['temp_dir']}/databases/"
+ os.makedirs(databases, exist_ok=True)
+
+ par['motif_annotation'] = f'{databases}/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl'
+ par['genes_vs_motifs_10k'] = f'{databases}/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
+ par['genes_vs_motifs_500'] = f'{databases}/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather'
+
+ par['expr_mat_adjacencies'] = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv")
+ par['expression_data'] = os.path.join(par['temp_dir'], "expression_data.tsv")
+ par['regulons'] = f"{par['temp_dir']}/regulons.csv"
+
+    download_prior(par)  # fetch the motif annotation and rankings if missing
+    # format_data(par)
+    # run_grn(par)
+ prune_grn(par)
+ network = format_grn(par)
+ return network
+
+if __name__ == '__main__':
+    os.makedirs(par['temp_dir'], exist_ok=True)
+    network = main(par)
-format_data(par)
-run_grn(par)
-prune_grn(par)
-network = format_grn(par)
-network.to_csv(par['prediction'], sep=',')
+ network.to_csv(par['prediction'], sep=',')
+ print('Finished.')
-print('Finished.')
diff --git a/src/metrics/regression_2/consensus/script.py b/src/metrics/regression_2/consensus/script.py
index 075f29a13..f2ef679f5 100644
--- a/src/metrics/regression_2/consensus/script.py
+++ b/src/metrics/regression_2/consensus/script.py
@@ -11,11 +11,21 @@
## VIASH START
par = {
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
- 'grn_folder': 'resources/grn-benchmark/grn_models/d0_hvg',
- 'grns': 'pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle',
+ # 'models_dir': 'resources/grn-benchmark/grn_models/d0_hvg',
+ # 'models': [pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle],
'output': 'resources/grn-benchmark/consensus-num-regulators.json'
}
## VIASH END
+import argparse
+parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
+parser.add_argument('--models_dir', type=str, help='where to read the models')
+parser.add_argument('--models', nargs='+', help="List of models to process", required=True)
+
+args = parser.parse_args()
+if args.models_dir:
+ par['models_dir'] = args.models_dir
+if args.models:
+ par['models'] = args.models
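+
+# example invocation (matches the command built in the notebook's consensus cell):
+#   python src/metrics/regression_2/consensus/script.py --models_dir resources/grn_models/ \
+#     --models collectri negative_control positive_control pearson_corr portia ppcor grnboost2 scenic scglue celloracle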
# Load perturbation data
@@ -26,12 +36,12 @@
gene_names = adata_rna.var.index.to_numpy()
print(par['perturbation_data'])
-print(par['grns'])
+print(par['models'])
# Load inferred GRNs
grns = []
-for filename in par['grns'].split(','):
- filepath = os.path.join(par['grn_folder'], f'{filename}.csv')
+for filename in par['models']:
+ filepath = os.path.join(par['models_dir'], f'{filename}.csv')
gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
index 1dfd65ed3..e2512045f 100644
--- a/src/metrics/regression_2/main.py
+++ b/src/metrics/regression_2/main.py
@@ -20,22 +20,23 @@
def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray:
gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
- A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
- df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
+ df = pd.read_csv(filepath, sep=',', header='infer')
if 'cell_type' in df.columns:
verbose_print(par['verbose'], 'Taking mean of cell type specific grns', 3)
df.drop(columns=['cell_type'], inplace=True)
df = df.groupby(['source', 'target']).mean().reset_index()
-
+ # keep only top n links
+ if df.shape[0] > par['max_n_links']:
+ verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3)
+ df = process_links(df, par)
+ # convert to matrix
+ A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
for source, target, weight in zip(df['source'], df['target'], df['weight']):
if (source not in gene_dict) or (target not in gene_dict):
continue
i = gene_dict[source]
j = gene_dict[target]
A[i, j] = float(weight)
- if df.shape[0] > par['max_n_links']:
- verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3)
- df = process_links(df, par)
return A
diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py
index 1480346ef..db3c80f32 100644
--- a/src/metrics/script_all.py
+++ b/src/metrics/script_all.py
@@ -7,10 +7,12 @@
## VIASH START
par = {
'reg_type': 'ridge',
- 'read_dir': "resources/grn_models/d0_hvg",
- 'write_dir': "resources/results/scores/d0_hvg",
- 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'],
- 'layers': ['lognorm', 'pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm', 'scgen_pearson'],
+ 'models_dir': "resources/grn_models/",
+ 'scores_dir': "resources/scores/",
+
+ 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle'],
+ # 'layers': ['scgen_pearson', 'lognorm', 'pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm'],
+ 'layers': ['pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm'],
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
"tf_all": "resources/prior/tf_all.csv",
@@ -25,6 +27,25 @@
'clip_scores': True
}
# VIASH END
+
+import argparse
+parser = argparse.ArgumentParser(description="Process multiomics RNA data.")
+parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file')
+parser.add_argument('--num_workers', type=str, help='Number of cores')
+parser.add_argument('--models_dir', type=str, help='where to read the models')
+parser.add_argument('--scores_dir', type=str, help='where to write the model')
+
+args = parser.parse_args()
+
+if args.multiomics_rna:
+ par['multiomics_rna'] = args.multiomics_rna
+if args.num_workers:
+ par['num_workers'] = args.num_workers
+if args.models_dir:
+ par['models_dir'] = args.models_dir
+if args.scores_dir:
+ par['scores_dir'] = args.scores_dir
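+# example invocation (paths mirror the defaults in `par` above; scripts/sbatch/calculate_scores.sh calls it with no arguments):
+#   python src/metrics/script_all.py --models_dir resources/grn_models/ --scores_dir resources/scores/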
+
meta = {
"resources_dir": 'src/metrics/',
"util": 'src/utils'
@@ -32,12 +53,13 @@
sys.path.append(meta["resources_dir"])
sys.path.append(meta["util"])
-os.makedirs(par['write_dir'], exist_ok=True)
+os.makedirs(par['scores_dir'], exist_ok=True)
+
for layer in par['layers']:
par['layer'] = layer
for i, method in enumerate(par['methods']):
- par['prediction'] = f"{par['read_dir']}/{method}.csv"
+ par['prediction'] = f"{par['models_dir']}/{method}.csv"
from regression_1.main import main
reg1 = main(par)
from regression_2.main import main
@@ -48,6 +70,6 @@
df_all = score
else:
df_all = pd.concat([df_all, score])
- df_all.to_csv(f"{par['write_dir']}/{layer}-{par['reg_type']}.csv")
+ df_all.to_csv(f"{par['scores_dir']}/{layer}-{par['reg_type']}.csv")
print(df_all)
diff --git a/src/utils/util.py b/src/utils/util.py
index 162cf6245..5bf2880d5 100644
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -26,7 +26,7 @@ def verbose_tqdm(iterable, desc, level, verbose_level):
def basic_qc(adata, min_genes_per_cell = 200, max_genes_per_cell = 5000, min_cells_per_gene = 10):
mt = adata.var_names.str.startswith('MT-')
-
+ print('shape before ', adata.shape)
# 1. stats
total_counts = adata.X.sum(axis=1)
n_genes_by_counts = (adata.X > 0).sum(axis=1)
@@ -37,15 +37,16 @@ def basic_qc(adata, min_genes_per_cell = 200, max_genes_per_cell = 5000, min_cel
# mt_filter = mt_frac > max_mt_frac
# 2. Filter cells
- print(f'Number of cells removed: below min gene {low_gene_filter.sum()}, exceed max gene {high_gene_filter.sum()}')
+ # print(f'Number of cells removed: below min gene {low_gene_filter.sum()}, exceed max gene {high_gene_filter.sum()}')
mask_cells= (~low_gene_filter)& \
(~high_gene_filter)
# (~mt_filter)
# 3. Filter genes
n_cells = (adata.X!=0).sum(axis=0)
mask_genes = n_cells>min_cells_per_gene
-
- return adata[mask_cells, mask_genes]
+ adata_f = adata[mask_cells, mask_genes]
+ print('shape after ', adata_f.shape)
+ return adata_f
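+
+# hypothetical usage: adata = basic_qc(adata, min_genes_per_cell=200, max_genes_per_cell=5000)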
def process_links(net, par):
net = net[net.source!=net.target]