diff --git a/runs.ipynb b/runs.ipynb index 6de68c673..db5f82007 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -69,7 +69,7 @@ "!aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n", "!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n", "!aws s3 sync resources/supplementary/ s3://openproblems-data/resources/grn/supplementary --delete\n", - "!aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete" + "# !aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete" ] }, { @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -92,6 +92,8 @@ "import scanpy as sc \n", "from src.exp_analysis.helper import plot_cumulative_density\n", "import sys \n", + "import subprocess\n", + "\n", "sys.path.append('../')\n", "from grn_benchmark.src.commons import surragate_names\n", "def extract_data(data, reg='reg1', dataset_id='scgen_pearson'):\n", @@ -138,8 +140,18 @@ " df_all = pd.concat([df_reg1, df_reg2], axis=1)\n", " return df_all\n", "\n", + "par = {\n", + " # 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'],\n", + " 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'scglue', 'celloracle'],\n", + " 'models_dir': 'resources/grn_models/',\n", + " 'scores_dir': 'resources/scores',\n", + " 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad', \n", + " 'num_workers': 20,\n", + " 'mem': \"100G\",\n", + " 'time': \"24:00:00\"\n", + "}\n", "\n", - "methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" + "# methods = [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" ] }, { @@ -149,15 +161,6 @@ "# Marco data " ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# !ls resources_local/mccalla_extended/" - ] - }, { "cell_type": "code", "execution_count": 3, @@ -182,109 +185,141 @@ "# subprocess.run(command, shell=True, check=True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run grn inference " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Local runs" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "\n", + "\n", + "if False:\n", + " for method in par['methods']:\n", + " par['prediction'] = f\"{par['models_dir']}/{method}.csv\"\n", + " # method arguments \n", + " method_args=f\"--multiomics_rna {par['multiomics_rna']} --prediction {par['prediction']} --num_workers {par['num_workers']} --resources_dir src/utils\"\n", + " # run command\n", + " if method == \"pearson_corr\":\n", + " command = f\"python src/control_methods/pearson/script.py {method_args}\"\n", + " elif method == \"celloracle\":\n", + " command = f\"/home/jnourisa/miniconda3/envs/celloracle/bin/python src/methods/multi_omics/celloracle/script.py {method_args}\"\n", + " elif method in [\"grnboost2\", \"scenic\", \"genie3\"]:\n", + " command = f\"singularity exec ../../images/scenic python src/methods/single_omics/{method}/script.py {method_args}\" \n", + " elif method == 'scglue':\n", + " command = f\"singularity exec ../../images/scglue python src/methods/multi_omics/{method}/script.py {method_args}\"\n", + " elif method == 'ppcor':\n", + " command = f\"singularity exec ../../images/ppcor Rscript src/methods/single_omics/{method}/script.R {method_args}\"\n", + " else:\n", + " command = f\"singularity exec ../../images/{method} python src/methods/single_omics/{method}/script.py {method_args}\"\n", + " # sbatch tags\n", + " tag = f\"--job-name {method} \"\n", + " resources = f\" --cpus-per-task {par['num_workers']} --mem {par['mem']} --time {par['time']}\" #resources\n", + " tag+=resources\n", + " if method=='scglue':\n", + " tag += f\" --partition=gpu --gres=gpu:1\"\n", + " \n", + " !sbatch {tag} scripts/sbatch/grn_inference.sh \"{command}\" \n", + " # !bash scripts/sbatch/grn_inference.sh \"{command}\" " + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Run grn inference " + "## Baseline models" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Submitted batch job 7748321\n" - ] - } - ], + "outputs": [], "source": [ - "par = {\n", - " # 'methods': ['pearson_corr', 'portia', 'grnboost2', 'ppcor', 'scenic', 'scglue', 'celloracle'],\n", - " 'methods': ['ppcor'],\n", - " 'write_dir': 'resources/grn_models/',\n", - " 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',\n", - " 'num_workers': 20,\n", - " 'mem': \"100G\",\n", - " 'time': \"24:00:00\"\n", - "}\n", - "for method in par['methods']:\n", - " par['prediction'] = f\"{par['write_dir']}/{method}.csv\"\n", - " # method arguments \n", - " method_args=f\"--multiomics_rna {par['multiomics_rna']} --prediction {par['prediction']} --num_workers {par['num_workers']}\"\n", - " # run command\n", - " if method == \"pearson_corr\":\n", - " command = f\"python src/control_methods/pearson/script.py {method_args}\"\n", - " elif method == \"celloracle\":\n", - " command = f\"/home/jnourisa/miniconda3/envs/celloracle/bin/python src/methods/multi_omics/celloracle/script.py {method_args}\"\n", - " elif method in [\"grnboost2\", \"scenic\", \"genie3\"]:\n", - " command = f\"singularity exec ../../images/scenic python src/methods/single_omics/{method}/script.py {method_args}\" \n", - " elif method == 'scglue':\n", - " command = f\"singularity exec ../../images/scglue python src/methods/multi_omics/{method}/script.py {method_args}\"\n", - " elif method == 'ppcor':\n", - " command = f\"singularity exec ../../images/ppcor Rscript src/methods/single_omics/{method}/script.R {method_args}\"\n", - " else:\n", - " command = f\"singularity exec ../../images/{method} python src/methods/single_omics/{method}/script.py {method_args}\"\n", - " # sbatch tags\n", - " tag = f\"--job-name {method} \"\n", - " resources = f\" --cpus-per-task {par['num_workers']} --mem {par['mem']} --time {par['time']}\" #resources\n", - " tag+=resources\n", - " if method=='scglue':\n", - " tag += f\" --partition=gpu --gres=gpu:1\"\n", - " \n", - " !sbatch {tag} scripts/sbatch/grn_inference.sh \"{command}\" \n", - " # !bash scripts/sbatch/grn_inference.sh \"{command}\" " + "if False:\n", + " command = f\"--multiomics_rna {par['multiomics_rna']} --models_dir {par['models_dir']}\"\n", + " !python src/control_methods/script_all.py {command}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# d0_hvgs\n", - "While we run baseline models using the below commands, for the rest of the models, they must be ran already and their predictions should be given in the relevant folder." + "## Seqera" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 14, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "download: s3://openproblems-data/resources/grn/results/celloracle/state.yaml to resources/results/celloracle/state.yaml\n", + "download: s3://openproblems-data/resources/grn/results/celloracle/trace.txt to resources/results/celloracle/trace.txt\n", + "download: s3://openproblems-data/resources/grn/results/celloracle/celloracle.grn_inference_celloracle.prediction.csv to resources/results/celloracle/celloracle.grn_inference_celloracle.prediction.csv\n", + "download: s3://openproblems-data/resources/grn/results/celloracle/output/celloracle/base_grn.csv to resources/results/celloracle/output/celloracle/base_grn.csv\n" + ] + } + ], "source": [ - "## Baseline models" + "if False: # submit the job\n", + " !bash src/methods/multi_omics/celloracle/run.sh\n", + "if False: # get the results\n", + " !aws s3 sync s3://openproblems-data/resources/grn/results/celloracle resources/results/celloracle" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'write_dir': 'resources/grn_models/d0_hvgs', 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad', 'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'max_n_links': 50000, 'layer': 'scgen_pearson', 'cell_type_specific': False, 'normalize': False, 'causal': True, 'prediction': 'resources/grn_models/d0_hvgs/pearson_corr.csv'}\n", - "Read data\n", - "Causal subsetting\n", - "Reading input data\n", - "Inferring GRN\n", - "{'write_dir': 'resources/grn_models/d0_hvgs', 'multiomics_rna': 'resources/grn-benchmark/perturbation_data.h5ad', 'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad', 'tf_all': 'resources/prior/tf_all.csv', 'max_n_links': 50000, 'layer': 'scgen_pearson', 'cell_type_specific': False, 'normalize': False, 'causal': True, 'prediction': 'resources/grn_models/d0_hvgs/positive_control.csv'}\n", - "Read data\n", - "adata.X is already dense.\n", - "Causal subsetting\n" + "alldonors_default donor_0_celltype negative_control.csv positive_control.csv\n", + "celloracle.csv\t donor_0_default peak_gene\t\t ppcor.csv\n", + "d0_hvg\t\t granie.csv\t pearson_corr.csv\t scenic.csv\n", + "default\t\t grnboost2.csv portia.csv\t\t scglue.csv\n" ] } ], "source": [ - "!python src/control_methods/script_all.py" + "if False: # process celloracle\n", + " # - prediction \n", + " # !mv resources/results/celloracle/celloracle.grn_inference_celloracle.prediction.csv resources/results/celloracle/celloracle.csv \n", + " !cp resources/results/celloracle/celloracle.csv {par['models_dir']}/celloracle.csv\n", + " !ls {par['models_dir']}\n", + " \n", + " # - peak - gene\n", + " # !mkdir resources/grn_models/peak_gene/\n", + " df = pd.read_csv(\"resources/results/celloracle/output/celloracle/base_grn.csv\")[['peak_id','gene_short_name']]\n", + " df.columns = ['peak','target']\n", + " df.to_csv('resources/grn_models/peak_gene/celloracle.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scores\n", + "While we run baseline models using the below commands, for the rest of the models, they must be ran already and their predictions should be given in the relevant folder." ] }, { @@ -296,34 +331,356 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "resources/grn-benchmark/perturbation_data.h5ad\n", + "['collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'scglue', 'celloracle']\n", + "Sparsity of resources/grn_models/collectri.csv: 0.9999170526430831\n", + "Sparsity of resources/grn_models/negative_control.csv: 0.9998980328944889\n", + "Sparsity of resources/grn_models/positive_control.csv: 0.9997840137565959\n", + "Sparsity of resources/grn_models/pearson_corr.csv: 0.9997883809984375\n", + "Sparsity of resources/grn_models/portia.csv: 0.9999967429274494\n", + "Sparsity of resources/grn_models/ppcor.csv: 0.9999794553885274\n", + "Sparsity of resources/grn_models/grnboost2.csv: 0.9997985237124277\n", + "Sparsity of resources/grn_models/scenic.csv: 0.9997322245751523\n", + "Sparsity of resources/grn_models/scglue.csv: 0.9999847772895649\n", + "Sparsity of resources/grn_models/celloracle.csv: 0.9997894825282788\n" + ] + } + ], "source": [ "# consensus: run this after updating method list\n", "if False:\n", - " !python src/metrics/regression_2/consensus/script.py #inside the method names and dir\n" + " model_names = ' '.join(par['methods'])\n", + " command = [\n", + " \"python\", \n", + " \"src/metrics/regression_2/consensus/script.py\", \n", + " \"--models_dir\", par['models_dir'], \n", + " \"--models\"\n", + " ] + model_names.split()\n", + "\n", + " # Print command to verify\n", + " subprocess.run(command)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Submitted batch job 7747147\n" + "Submitted batch job 7750423\n" ] } ], "source": [ "# run calculating scores\n", - "if False:\n", + "if True:\n", + " # !bash scripts/sbatch/calculate_scores.sh #includes both reg1 and 2. #inside the script, set the reg_type, read and write dirs, and methods\n", " !sbatch scripts/sbatch/calculate_scores.sh #includes both reg1 and 2. #inside the script, set the reg_type, read and write dirs, and methods" ] }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 S1S2static-theta-0.0static-theta-0.5rank
collectri-0.100238-0.2111820.4893160.51489611
negative_control-0.044574-0.0451580.4457270.5013899
positive_control0.1971290.5788220.5308480.5846943
pearson_corr0.2734430.5163430.6398710.5491882
portia0.1011100.3654180.4562030.5131576
ppcor0.0179540.1597540.3171360.5061478
grnboost20.4219360.4893220.8250250.6195271
scenic0.1720850.2282660.5212340.5832025
granie0.0832980.1060120.3233240.45850610
scglue0.0998590.2799990.1805940.5339167
celloracle0.2091510.2914780.6807870.5846874
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_scores = pd.read_csv(f\"{par['scores_dir']}/scgen_pearson-ridge.csv\", index_col=0)\n", + "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", + "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", + "df_scores.style.background_gradient()" + ] + }, { "cell_type": "code", "execution_count": 12, diff --git a/scripts/sbatch/calculate_scores.sh b/scripts/sbatch/calculate_scores.sh index 4fbc56f3d..9533028d9 100644 --- a/scripts/sbatch/calculate_scores.sh +++ b/scripts/sbatch/calculate_scores.sh @@ -8,4 +8,4 @@ #SBATCH --mem=64G #SBATCH --cpus-per-task=20 -python src/metrics/script_all.py +python src/metrics/script_all.py diff --git a/src/control_methods/script_all.py b/src/control_methods/script_all.py index b0ecd1dab..e709d9bc5 100644 --- a/src/control_methods/script_all.py +++ b/src/control_methods/script_all.py @@ -6,7 +6,7 @@ import random par = { - 'write_dir': "resources/grn_models/d0_hvgs", + 'models_dir': "resources/grn_models/d0_hvgs", "multiomics_rna": "resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad", "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", @@ -18,6 +18,28 @@ 'causal': True } +import argparse +parser = argparse.ArgumentParser(description="Process multiomics RNA data.") +parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file') +parser.add_argument('--prediction', type=str, help='Path to the prediction file') +parser.add_argument('--resources_dir', type=str, help='Path to the prediction file') +parser.add_argument('--tf_all', type=str, help='Path to the tf_all') +parser.add_argument('--num_workers', type=str, help='Number of cores') +parser.add_argument('--models_dir', type=str, help='where to write the model') +args = parser.parse_args() + +if args.multiomics_rna: + par['multiomics_rna'] = args.multiomics_rna +if args.tf_all: + par['tf_all'] = args.tf_all +if args.num_workers: + par['num_workers'] = args.num_workers +if args.models_dir: + par['models_dir'] = args.models_dir + +if args.resources_dir: + meta = {'resources_dir': args.resources_dir} + meta = { "resources_dir": 'src/control_methods', "util": 'src/utils' @@ -25,23 +47,26 @@ sys.path.append(meta["resources_dir"]) sys.path.append(meta["util"]) -os.makedirs(par['write_dir'], exist_ok=True) +os.makedirs(par['models_dir'], exist_ok=True) from util import create_corr_net - +print(par) #---- run for pearson_corr -par['prediction'] = f"{par['write_dir']}/pearson_corr.csv" +print("run for pearson_corr") +par['prediction'] = f"{par['models_dir']}/pearson_corr.csv" par['causal'] = True net = create_corr_net(par) net.to_csv(par['prediction']) #---- run for negative control +print("run for negative control") from negative_control.main import main -par['prediction'] = f"{par['write_dir']}/negative_control.csv" +par['prediction'] = f"{par['models_dir']}/negative_control.csv" net = main(par) net.to_csv(par['prediction']) #---- run for positive_control +print("run for positive control") par['multiomics_rna'] = par['perturbation_data'] -par['prediction'] = f"{par['write_dir']}/positive_control.csv" +par['prediction'] = f"{par['models_dir']}/positive_control.csv" net = create_corr_net(par) net.to_csv(par['prediction']) diff --git a/src/exp_analysis/helper.py b/src/exp_analysis/helper.py index 1c9340fc2..1925e48b1 100644 --- a/src/exp_analysis/helper.py +++ b/src/exp_analysis/helper.py @@ -65,6 +65,63 @@ def plot_cumulative_density(data, title='', ax=None, s=1, **kwdgs): # self.out_deg = degree_centrality(net, source='source', target='target', **kwargs) # self.in_deg = degree_centrality(net, source='target', target='source', **kwargs) +class VSA_analysis: + '''Vester's sensitivity analysis. + - active_sum: active sum. Sum along rows of the influence matrix and it indicates how much does a variable influence all the others. + - passive_sum: passive sum. Its is the sum along columns of the influence matrix and it indicates how sensitive a variable is, how does it react to the influence of others + ''' + def __init__(self, sample: pd.DataFrame, control: pd.DataFrame, mode='weight', top_q_net = 0.75, critical_change_q_t: float = 0.75): + # - keep only top quantile links + control = control[control.weight > control.weight.quantile(top_q_net)] + sample = sample[sample.weight > sample.weight.quantile(top_q_net)] + + print('Determine roles') + self.vsa_control = self.determine_roles(control) + self.vsa_sample = self.determine_roles(sample) + print('Determine role change') + self.oo = self.diff_roles(self.vsa_control, self.vsa_sample, critical_change_q_t) + + @staticmethod + def determine_roles(net, mode='weight', top_q_role=0.9) -> pd.DataFrame: + # active sum and passive sum + gene_names = np.union1d(net.source.unique(), net.target.unique()) + if mode=='weight': + active_sum = np.asarray([sum(net.query(f"source == '{gene}'")['weight']) for gene in gene_names]) + passive_sum = np.asarray([sum(net.query(f"target == '{gene}'")['weight']) for gene in gene_names]) + else: + raise ValueError('define first') + + # define the roles ['Buffering', 'Passive', 'Active', 'Critical'] -> [0, 1, 2, 3] + active_sum_threhsold = np.quantile(active_sum, top_q_role) + passive_sum_threhsold = np.quantile(passive_sum, top_q_role) + # active_sum, passive_sum = standardize_array(active_sum), standardize_array(passive_sum) + roles = [2*as_flag+ps_flag for as_flag, ps_flag in zip(active_sum>active_sum_threhsold, passive_sum>passive_sum_threhsold)] + roles = pd.DataFrame(data={'gene_name': gene_names, 'active_sum': active_sum, 'passive_sum': passive_sum, 'role':roles }) + roles.set_index('gene_name',inplace=True) + return roles + @staticmethod + def diff_roles(control: pd.DataFrame, sample: pd.DataFrame, critical_change_q_t: float=.75) -> pd.DataFrame: + ''' + Find the distance in the role from control to sample, and flags the top changes. + ''' + # - match the index + common_genes = pd.Index(np.union1d(sample.index, control.index)) + control = control.reindex(common_genes).fillna(0) + sample = sample.reindex(common_genes).fillna(0) + # - calculate distance + as_distance = (sample['active_sum'] - control['active_sum']) + ps_distance = (sample['passive_sum'] - control['passive_sum']) + overall_distance = as_distance**2 + ps_distance**2 + df_distance = pd.DataFrame({'as_distance':as_distance, 'ps_distance':ps_distance, 'overall_distance':overall_distance}, index=common_genes) + + # df_distance = df_distance.assign(passive_sum_c=control['passive_sum'], active_sum_c=control['active_sum']) + # df_distance = df_distance.assign(passive_sum_s=sample['passive_sum'], active_sum_s=sample['active_sum']) + + # - define x top percentile as the cut-off value to determine critical role change + # df_distance['critical_change_as'] = df_distance['as_distance'] > df_distance['as_distance'].quantile(critical_change_q_t) + # df_distance['critical_change_ps'] = df_distance['ps_distance'] > df_distance['ps_distance'].quantile(critical_change_q_t) + # df_distance['critical_change_overall'] = df_distance['overall_distance'] > df_distance['overall_distance'].quantile(critical_change_q_t) + return df_distance class Exp_analysis: ''' This class provides functions for explanatory analysis of GRNs @@ -101,8 +158,6 @@ def plot_centrality(self, df, title='',ax=None, xlabel='Degree', ylabel='Gene', if colors is not None: for label, color in zip(ax.get_yticklabels(), colors): label.set_color(color) - - def top_edges(self, quantile=0.95): grn = self.net grn = grn[grn.weight>grn.weight.quantile(quantile)] @@ -118,7 +173,6 @@ def determine_source_properties(self): c_weight = determine_centrality(self.net, mode='weight') self.sources = c_weight.merge(c_degree, left_index=True, right_index=True) - # def calculate_centrality_stats(self) -> None: # '''Calculate network centrality metrics such as in and out degree # ''' @@ -164,6 +218,9 @@ def annotate_genes(self, gene_annotation_df) -> dict[str, float]: # gene_annot.values self.gene_annot = {key:round((value*100/len(self.targets)), 1) for key, value in gene_annot.items()} return self.gene_annot + def analyse_roles(self, ref_net, **kwargs): + self.vsa_obj = VSA_analysis(self.net, ref_net,**kwargs) + self.vsa_results = self.vsa_obj.oo def plot_interactions(interaction_df: pd.DataFrame, min_subset_size=None, min_degree=None, color_map=None) -> plt.Figure: """Upset plot of interactions diff --git a/src/methods/single_omics/genie3/config.vsh.yaml b/src/methods/single_omics/genie3/config.vsh.yaml index 0b396b941..9711f49a8 100644 --- a/src/methods/single_omics/genie3/config.vsh.yaml +++ b/src/methods/single_omics/genie3/config.vsh.yaml @@ -15,6 +15,7 @@ functionality: path: script.py - path: /src/utils/util.py dest: util.py + platforms: - type: docker diff --git a/src/methods/single_omics/scenic/config.vsh.yaml b/src/methods/single_omics/scenic/config.vsh.yaml index 1227dac8e..383737ba1 100644 --- a/src/methods/single_omics/scenic/config.vsh.yaml +++ b/src/methods/single_omics/scenic/config.vsh.yaml @@ -27,6 +27,9 @@ functionality: resources: - type: python_script path: script.py + - path: /src/utils/util.py + dest: util.py + # - path: main.py platforms: - type: docker diff --git a/src/methods/single_omics/scenic/main.py b/src/methods/single_omics/scenic/main.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/methods/single_omics/scenic/script.py b/src/methods/single_omics/scenic/script.py index 5e8c52f6a..1f30b977a 100644 --- a/src/methods/single_omics/scenic/script.py +++ b/src/methods/single_omics/scenic/script.py @@ -6,6 +6,9 @@ import ast import requests import scipy.sparse as sp +import sys + + # wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather @@ -27,13 +30,6 @@ 'normalize': False } ## VIASH END - -import sys -meta= { - "resources_dir": 'src/utils/' -} - - import argparse parser = argparse.ArgumentParser(description="Process multiomics RNA data.") parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file') @@ -53,39 +49,27 @@ par['num_workers'] = args.num_workers if args.resources_dir: - meta['resources_dir'] = args.resources_dir + meta = {'resources_dir': args.resources_dir} sys.path.append(meta["resources_dir"]) from util import process_links -os.makedirs(par['temp_dir'], exist_ok=True) - -databases = f"{par['temp_dir']}/databases/" -os.makedirs(databases, exist_ok=True) - -par['motif_annotation'] = f'{databases}/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl' -par['genes_vs_motifs_10k'] = f'{databases}/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather' -par['genes_vs_motifs_500'] = f'{databases}/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather' - -if not (os.path.exists(par['motif_annotation'])): - print('downloading motif_annotation') - response = requests.get("https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl") - with open(par['motif_annotation'], "wb") as file: - file.write(response.content) -if not (os.path.exists(par['genes_vs_motifs_10k'])): - print('downloading genes_vs_motifs_10k') - response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather") - with open(par['genes_vs_motifs_10k'], "wb") as file: - file.write(response.content) -if not (os.path.exists(par['genes_vs_motifs_500'])): - print('downloading genes_vs_motifs_500') - response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather") - with open(par['genes_vs_motifs_500'], "wb") as file: - file.write(response.content) - -expr_mat_adjacencies = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv") -expression_data = os.path.join(par['temp_dir'], "expression_data.tsv") -regulons = f"{par['temp_dir']}/regulons.csv" +def download_prior(par): + if not (os.path.exists(par['motif_annotation'])): + print('downloading motif_annotation') + response = requests.get("https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl") + with open(par['motif_annotation'], "wb") as file: + file.write(response.content) + if not (os.path.exists(par['genes_vs_motifs_10k'])): + print('downloading genes_vs_motifs_10k') + response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather") + with open(par['genes_vs_motifs_10k'], "wb") as file: + file.write(response.content) + if not (os.path.exists(par['genes_vs_motifs_500'])): + print('downloading genes_vs_motifs_500') + response = requests.get("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather") + with open(par['genes_vs_motifs_500'], "wb") as file: + file.write(response.content) def format_data(par): print('Read data') @@ -93,18 +77,17 @@ def format_data(par): gene_names = adata_rna.var_names if sp.issparse(adata_rna.X): adata_rna.X = adata_rna.X.toarray() - pd.DataFrame(adata_rna.X, columns=gene_names).to_csv(expression_data, sep='\t', index=False) + pd.DataFrame(adata_rna.X, columns=gene_names).to_csv(par['expression_data'], sep='\t', index=False) - def run_grn(par): print('Run grn') command = [ "pyscenic", "grn", "--num_workers", str(par['num_workers']), "--seed", str(par['seed']), - "-o", expr_mat_adjacencies, + "-o", str(par['expr_mat_adjacencies']), "--method", "grnboost2", - expression_data, + str(par['expression_data']), par['tf_all'] ] subprocess.run(command, check=True) @@ -114,11 +97,11 @@ def prune_grn(par): command = [ "pyscenic", "ctx", - expr_mat_adjacencies, par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'], + par['expr_mat_adjacencies'], par['genes_vs_motifs_500'], par['genes_vs_motifs_10k'], "--annotations_fname", par['motif_annotation'], - "--expression_mtx_fname", expression_data, + "--expression_mtx_fname", par['expression_data'], "--mode", "custom_multiprocessing", - "--output", regulons, + "--output", str(par['regulons']), "--num_workers", str(par['num_workers']), "--rank_threshold", str(par['rank_threshold']), "--auc_threshold", str(par['auc_threshold']), @@ -129,7 +112,7 @@ def prune_grn(par): def format_grn(par): print('Format regulons') - regulons_df = pd.read_csv(regulons, index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:] + regulons_df = pd.read_csv(par['regulons'], index_col=0, skiprows=1)['TargetGenes'].reset_index().iloc[1:,:] def format_it(df): values = df['TargetGenes'].values[0] @@ -142,13 +125,30 @@ def format_it(df): grn = regulons_df.groupby('index').apply(lambda df: format_it(df)).reset_index().rename(columns={'index':'source'}) network = grn[['source','target','weight']] return network +def main(par): + databases = f"{par['temp_dir']}/databases/" + os.makedirs(databases, exist_ok=True) + + par['motif_annotation'] = f'{databases}/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl' + par['genes_vs_motifs_10k'] = f'{databases}/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather' + par['genes_vs_motifs_500'] = f'{databases}/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather' + + par['expr_mat_adjacencies'] = os.path.join(par['temp_dir'], "expr_mat_adjacencies.tsv") + par['expression_data'] = os.path.join(par['temp_dir'], "expression_data.tsv") + par['regulons'] = f"{par['temp_dir']}/regulons.csv" + + # format_data(par) + # run_grn(par) + prune_grn(par) + network = format_grn(par) + return network + +if __name__=='__main__': + network = main(par) + os.makedirs(par['temp_dir'], exist_ok=True) -format_data(par) -run_grn(par) -prune_grn(par) -network = format_grn(par) -network.to_csv(par['prediction'], sep=',') + network.to_csv(par['prediction'], sep=',') + print('Finished.') -print('Finished.') diff --git a/src/metrics/regression_2/consensus/script.py b/src/metrics/regression_2/consensus/script.py index 075f29a13..f2ef679f5 100644 --- a/src/metrics/regression_2/consensus/script.py +++ b/src/metrics/regression_2/consensus/script.py @@ -11,11 +11,21 @@ ## VIASH START par = { 'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad', - 'grn_folder': 'resources/grn-benchmark/grn_models/d0_hvg', - 'grns': 'pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle', + # 'models_dir': 'resources/grn-benchmark/grn_models/d0_hvg', + # 'models': [pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle], 'output': 'resources/grn-benchmark/consensus-num-regulators.json' } ## VIASH END +import argparse +parser = argparse.ArgumentParser(description="Process multiomics RNA data.") +parser.add_argument('--models_dir', type=str, help='where to read the models') +parser.add_argument('--models', nargs='+', help="List of models to process", required=True) + +args = parser.parse_args() +if args.models_dir: + par['models_dir'] = args.models_dir +if args.models: + par['models'] = args.models # Load perturbation data @@ -26,12 +36,12 @@ gene_names = adata_rna.var.index.to_numpy() print(par['perturbation_data']) -print(par['grns']) +print(par['models']) # Load inferred GRNs grns = [] -for filename in par['grns'].split(','): - filepath = os.path.join(par['grn_folder'], f'{filename}.csv') +for filename in par['models']: + filepath = os.path.join(par['models_dir'], f'{filename}.csv') gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} A = np.zeros((len(gene_names), len(gene_names)), dtype=float) df = pd.read_csv(filepath, sep=',', header='infer', index_col=0) diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py index 1dfd65ed3..e2512045f 100644 --- a/src/metrics/regression_2/main.py +++ b/src/metrics/regression_2/main.py @@ -20,22 +20,23 @@ def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray: gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} - A = np.zeros((len(gene_names), len(gene_names)), dtype=float) - df = pd.read_csv(filepath, sep=',', header='infer', index_col=0) + df = pd.read_csv(filepath, sep=',', header='infer') if 'cell_type' in df.columns: verbose_print(par['verbose'], 'Taking mean of cell type specific grns', 3) df.drop(columns=['cell_type'], inplace=True) df = df.groupby(['source', 'target']).mean().reset_index() - + # keep only top n links + if df.shape[0] > par['max_n_links']: + verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3) + df = process_links(df, par) + # convert to matrix + A = np.zeros((len(gene_names), len(gene_names)), dtype=float) for source, target, weight in zip(df['source'], df['target'], df['weight']): if (source not in gene_dict) or (target not in gene_dict): continue i = gene_dict[source] j = gene_dict[target] A[i, j] = float(weight) - if df.shape[0] > par['max_n_links']: - verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3) - df = process_links(df, par) return A diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py index 1480346ef..db3c80f32 100644 --- a/src/metrics/script_all.py +++ b/src/metrics/script_all.py @@ -7,10 +7,12 @@ ## VIASH START par = { 'reg_type': 'ridge', - 'read_dir': "resources/grn_models/d0_hvg", - 'write_dir': "resources/results/scores/d0_hvg", - 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle'], - 'layers': ['lognorm', 'pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm', 'scgen_pearson'], + 'models_dir': "resources/grn_models/", + 'scores_dir': "resources/scores/", + + 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle'], + # 'layers': ['scgen_pearson', 'lognorm', 'pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm'], + 'layers': ['pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm'], "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", "tf_all": "resources/prior/tf_all.csv", @@ -25,6 +27,25 @@ 'clip_scores': True } # VIASH END + +import argparse +parser = argparse.ArgumentParser(description="Process multiomics RNA data.") +parser.add_argument('--multiomics_rna', type=str, help='Path to the multiomics RNA file') +parser.add_argument('--num_workers', type=str, help='Number of cores') +parser.add_argument('--models_dir', type=str, help='where to read the models') +parser.add_argument('--scores_dir', type=str, help='where to write the model') + +args = parser.parse_args() + +if args.multiomics_rna: + par['multiomics_rna'] = args.multiomics_rna +if args.num_workers: + par['num_workers'] = args.num_workers +if args.models_dir: + par['models_dir'] = args.models_dir +if args.scores_dir: + par['scores_dir'] = args.scores_dir + meta = { "resources_dir": 'src/metrics/', "util": 'src/utils' @@ -32,12 +53,13 @@ sys.path.append(meta["resources_dir"]) sys.path.append(meta["util"]) -os.makedirs(par['write_dir'], exist_ok=True) +os.makedirs(par['scores_dir'], exist_ok=True) + for layer in par['layers']: par['layer'] = layer for i, method in enumerate(par['methods']): - par['prediction'] = f"{par['read_dir']}/{method}.csv" + par['prediction'] = f"{par['models_dir']}/{method}.csv" from regression_1.main import main reg1 = main(par) from regression_2.main import main @@ -48,6 +70,6 @@ df_all = score else: df_all = pd.concat([df_all, score]) - df_all.to_csv(f"{par['write_dir']}/{layer}-{par['reg_type']}.csv") + df_all.to_csv(f"{par['scores_dir']}/{layer}-{par['reg_type']}.csv") print(df_all) diff --git a/src/utils/util.py b/src/utils/util.py index 162cf6245..5bf2880d5 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -26,7 +26,7 @@ def verbose_tqdm(iterable, desc, level, verbose_level): def basic_qc(adata, min_genes_per_cell = 200, max_genes_per_cell = 5000, min_cells_per_gene = 10): mt = adata.var_names.str.startswith('MT-') - + print('shape before ', adata.shape) # 1. stats total_counts = adata.X.sum(axis=1) n_genes_by_counts = (adata.X > 0).sum(axis=1) @@ -37,15 +37,16 @@ def basic_qc(adata, min_genes_per_cell = 200, max_genes_per_cell = 5000, min_cel # mt_filter = mt_frac > max_mt_frac # 2. Filter cells - print(f'Number of cells removed: below min gene {low_gene_filter.sum()}, exceed max gene {high_gene_filter.sum()}') + # print(f'Number of cells removed: below min gene {low_gene_filter.sum()}, exceed max gene {high_gene_filter.sum()}') mask_cells= (~low_gene_filter)& \ (~high_gene_filter) # (~mt_filter) # 3. Filter genes n_cells = (adata.X!=0).sum(axis=0) mask_genes = n_cells>min_cells_per_gene - - return adata[mask_cells, mask_genes] + adata_f = adata[mask_cells, mask_genes] + print('shape after ', adata_f.shape) + return adata_f def process_links(net, par): net = net[net.source!=net.target]