From 4336a112c2ec9a389a263ad4fa5a8cef0e56ee5f Mon Sep 17 00:00:00 2001 From: matin Date: Fri, 19 Jul 2024 22:08:35 +0100 Subject: [PATCH 1/4] before pull --- celloracle.ipynb | 1241 +++++++++++++++++++ dockerfiles/scenicplus/Dockerfile | 0 dockerfiles/scglue/Dockerfile | 29 + scripts/run_grn_inference.sh | 12 +- src/methods/scenicplus/config.vsh.yaml | 4 +- src/methods/scenicplus/script.py | 2 +- src/workflows/run_benchmark/config.vsh.yaml | 25 +- src/workflows/run_benchmark/main.nf | 2 +- 8 files changed, 1283 insertions(+), 32 deletions(-) create mode 100644 celloracle.ipynb create mode 100644 dockerfiles/scenicplus/Dockerfile create mode 100644 dockerfiles/scglue/Dockerfile diff --git a/celloracle.ipynb b/celloracle.ipynb new file mode 100644 index 000000000..d14b4de57 --- /dev/null +++ b/celloracle.ipynb @@ -0,0 +1,1241 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.local/lib/python3.10/site-packages/numba/np/ufunc/parallel.py:371: NumbaWarning: \u001b[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.\u001b[0m\n", + " warnings.warn(problem)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import os \n", + "from celloracle import motif_analysis as ma\n", + "import pandas as pd\n", + "import celloracle as co\n", + "import anndata\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import anndata as ad\n", + "from local_utils import plots" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import anndata as ad\n", + "\n", + "## VIASH START\n", + "par = {\n", + " \"multiomics_rna\": \"resources/grn-benchmark/multiomics_rna.h5ad\",\n", + " \"multiomics_atac\": \"resources/grn-benchmark/multiomics_atac.h5ad\",\n", + " \"annotation_file\": \"resources/grn-benchmark/annotation_file\",\n", + " \"motif_file\": \"resources/grn-benchmark/motif_file\",\n", + " \"prediction\": \"output/prediction.csv\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading input files\n" + ] + } + ], + "source": [ + "print('Reading input files', flush=True)\n", + "multiomics_rna = ad.read_h5ad(par[\"multiomics_rna\"])\n", + "multiomics_atac = ad.read_h5ad(par[\"multiomics_atac\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "que bed peaks: 135358\n", + "tss peaks in que: 21028\n" + ] + } + ], + "source": [ + "peaks = multiomics_atac.var_names.to_numpy()\n", + "\n", + "peaks = [peak.replace(':','_').replace(\"-\",'_') for peak in peaks]\n", + "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "tss_annotated['peak_id'] = tss_annotated['chr'].astype(str)+\"_\"+tss_annotated['start'].astype(str)+\"_\"+tss_annotated['end'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# pd.read_csv('../perturb-multiomics-grn/output/infer/celloracle/peak_gene.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hg38 installation: True\n", + "No motif data entered. Loading default motifs for your species ...\n", + " Default motif for vertebrate: gimme.vertebrate.v5.0. \n", + " For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html \n", + "\n", + "Initiating scanner... \n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:gimme.scanner:using background: genome hg38 with size 200\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n", + "\n", + "Motif scan started .. It may take long time.\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "392dd15eb4634227843443fd729b77fe", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "scanning: 0%| | 0/17276 [00:00 1 else chromsizes['Chromosome'][x] for x in range(len(chromsizes['Chromosome']))]\n", + "\n", + "chromsizes = chromsizes[['Chromosome', 'End']]\n", + "# save\n", + "chromsizes.to_csv(f'{work_dir}/cicero/chromsizes.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check if all chr are found\n", + "chrs = df_peaks.locations.apply(lambda x:x.split('_')[0])\n", + "chrs.isin(chromsizes.Chromosome).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Celloracle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Base GRN\n", + "This section create base GRN given cicero results.\n", + "### Associate peaks with TSS\n", + "Each peak is associated with promotors of target genes (+- 1kbp). Celloracle should be installed for this step." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "que bed peaks: 131047\n", + "tss peaks in que: 20898\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n", + "GL000194.1\t55749\t56580\n", + "\n", + "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n", + "GL000194.1\t55749\t56580\n", + "\n" + ] + } + ], + "source": [ + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Merge TSS peaks with cicero connections" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "integrated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Shortlisten the peak-gene connections" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "peak_gene = integrated[integrated.coaccess>0.8].reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create base GRN\n", + "Running interactively might take a long time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Base GRN: only proximal \n", + "This is only based on proximal cis elements and doesnt require cicero" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "import anndata as ad \n", + "adata_atac = ad.read_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "peaks = format_peakname(adata_atac.var.reset_index()).location.values" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "que bed peaks: 135418\n", + "tss peaks in que: 21028\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n", + "chr10\t100001032\t100001800\n", + "\n", + "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n", + "chr10\t100001032\t100001800\n", + "\n" + ] + } + ], + "source": [ + "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n", + "tss_annotated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "##----- integrate_tss_peak_with_cicero\n", + "import numpy as np\n", + "from celloracle.motif_analysis.process_bed_file import df_to_list_peakstr\n", + "# 1. check tss data format and convert if needed\n", + "tss_peak=tss_annotated\n", + "tss = tss_peak.copy()\n", + "if np.all([i in tss.columns for i in [\"chr\", \"start\", \"end\"]]):\n", + " tss = pd.DataFrame({\"peak_id\": df_to_list_peakstr(tss),\n", + " \"gene_short_name\": tss.gene_short_name.values})\n", + "else:\n", + " raise ValueError(\"tss_peak format error\")\n", + "\n", + "peak_gene = tss" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TF motifs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd \n", + "from celloracle import motif_analysis as ma\n", + "import genomepy\n", + "genomes_dir='/beegfs/desy/user/nourisaj/op_multiomics_grn/output/celloracle'\n", + "peak_gene = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n", + "genomepy.install_genome(name=\"hg38\", provider=\"UCSC\", genomes_dir=genomes_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "# PLEASE make sure reference genome is correct.\n", + "ref_genome = \"hg38\"\n", + "\n", + "genome_installation = ma.is_genome_installed(ref_genome=ref_genome,\n", + " genomes_dir=genomes_dir)\n", + "print(ref_genome, \"installation: \", genome_installation)\n", + "\n", + "# Instantiate TFinfo object\n", + "tfi = ma.TFinfo(peak_data_frame=peak_gene, \n", + " ref_genome=\"hg38\",\n", + " genomes_dir=genomes_dir) \n", + "\n", + "tfi.scan(fpr=0.05, \n", + " motifs=None, # If you enter None, default motifs will be loaded.\n", + " verbose=True)\n", + "# Check motif scan results\n", + "tfi.scanned_df.head()\n", + "# Reset filtering \n", + "tfi.reset_filtering()\n", + "\n", + "# Do filtering\n", + "tfi.filter_motifs_by_score(threshold=10)\n", + "\n", + "# Format post-filtering results.\n", + "tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)\n", + "\n", + "# Format and save \n", + "df = tfi.to_dataframe()\n", + "df.head()\n", + "df.to_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## GRN construction\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocessing scRNA-seq" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import celloracle as co\n", + "import anndata\n", + "import scanpy as sc\n", + "adata = anndata.read_h5ad(f'{work_dir}/scRNA/adata_rna.h5ad')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 25551 × 22787\n", + " obs: 'cell_type', 'donor_id', 'n_genes'\n", + " var: 'n_cells'\n", + " layers: 'counts', 'x_norm'" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del adata.varm \n", + "del adata.uns \n", + "del adata.obsp \n", + "del adata.obsm \n", + "del adata.obs['louvain']\n", + "adata" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "adata.X = adata.layers['counts'].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1., 1., 1., ..., 2., 1., 1.])" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.X.data" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "filter_result = sc.pp.filter_genes_dispersion(adata.X,\n", + " flavor='cell_ranger',\n", + " n_top_genes=3000,\n", + " log=False)\n", + "\n", + "# Subset the genes\n", + "adata = adata[:, filter_result.gene_subset]\n", + "\n", + "# Renormalize after filtering\n", + "sc.pp.normalize_per_cell(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# Log transformation and scaling\n", + "sc.pp.log1p(adata)\n", + "sc.pp.scale(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "# PCA\n", + "sc.tl.pca(adata, svd_solver='arpack')\n", + "\n", + "# Diffusion map\n", + "sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20)\n", + "\n", + "sc.tl.diffmap(adata)\n", + "# Calculate neihbors again based on diffusionmap \n", + "sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_diffmap')" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "sc.tl.louvain(adata, resolution=0.8)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 25551 × 3000\n", + " obs: 'cell_type', 'donor_id', 'n_genes', 'n_counts_all', 'n_counts', 'louvain'\n", + " var: 'n_cells', 'mean', 'std'\n", + " uns: 'log1p', 'pca', 'neighbors', 'diffmap_evals', 'louvain'\n", + " obsm: 'X_pca', 'X_diffmap'\n", + " varm: 'PCs'\n", + " layers: 'counts', 'x_norm'\n", + " obsp: 'distances', 'connectivities'" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "sc.tl.paga(adata, groups='louvain')" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "sc.pl.paga(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: Package 'fa2' is not installed, falling back to layout 'fr'.To use the faster and better ForceAtlas2 layout, install package 'fa2' (`pip install fa2`).\n" + ] + } + ], + "source": [ + "sc.tl.draw_graph(adata, init_pos='paga', random_state=123)\n", + "sc.pl.draw_graph(adata, color='louvain', legend_loc='on data')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Metadata columns : ['cell_type', 'donor_id', 'n_genes', 'louvain', 'n_counts_all', 'n_counts']\n", + "Dimensional reduction: ['X_diffmap', 'X_draw_graph_fr', 'X_pca', 'X_umap']\n" + ] + } + ], + "source": [ + "# Check data in anndata\n", + "print(\"Metadata columns :\", list(adata.obs.columns))\n", + "print(\"Dimensional reduction: \", list(adata.obsm.keys()))" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1., 1., 1., ..., 1., 1., 2.])" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## run based on counts as suggesyed by co pipeline\n", + "adata.X = adata.layers[\"counts\"]\n", + "adata.X.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load base GRN" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize " + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: adata.X seems to be already log-transformed.\n" + ] + } + ], + "source": [ + "# Instantiate Oracle object\n", + "oracle = co.Oracle()\n", + "# Instantiate Oracle object.\n", + "oracle.import_anndata_as_raw_count(adata=adata,\n", + " cluster_column_name=\"cell_type\",\n", + " embedding_name=\"X_draw_graph_fr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "# You can load TF info dataframe with the following code.\n", + "oracle.import_TF_data(TF_info_matrix=base_GRN)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "25\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "oracle.perform_PCA()\n", + "plt.plot(np.cumsum(oracle.pca.explained_variance_ratio_)[:100])\n", + "n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]\n", + "plt.axvline(n_comps, c=\"k\")\n", + "plt.show()\n", + "print(n_comps)\n", + "n_comps = min(n_comps, 50)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cell number is :25551\n", + "Auto-selected k is :50\n" + ] + } + ], + "source": [ + "n_cell = oracle.adata.shape[0]\n", + "print(f\"cell number is :{n_cell}\")\n", + "k = min([int(0.025*n_cell), 50])\n", + "print(f\"Auto-selected k is :{k}\")\n", + "oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,\n", + " b_maxl=k*4, n_jobs=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "oracle.to_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GRN calculation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if True: # run locally\n", + " # Load file.\n", + " oracle = co.load_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')\n", + " # This step may take some time.\n", + " links = oracle.get_links(cluster_name_for_GRN_unit=\"cell_type\", alpha=10,\n", + " verbose_level=10)\n", + " links.to_hdf5(file_path=f\"{work_dir}/infer/celloracle/grn/links_3000.celloracle.links\")\n", + "else:\n", + " !python celloracle/run_grn.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Post evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Peak gene connections\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
peaktarget
0chr7_130668147_130669092COPG2
1chr3_12796310_12797168CAND2
2chr1_207052722_207053635PFKFB2
3chr10_96043175_96044011CCNJ
4chr1_161225428_161226349MIR5187
.........
21023chr1_10430097_10431027CENPS-CORT
21024chr5_149549508_149550287CSNK1A1
21025chr5_149551086_149552006CSNK1A1
21026chr20_10673798_10674620JAG1
21027chr9_122228348_122229104LHX6
\n", + "

21028 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " peak target\n", + "0 chr7_130668147_130669092 COPG2\n", + "1 chr3_12796310_12797168 CAND2\n", + "2 chr1_207052722_207053635 PFKFB2\n", + "3 chr10_96043175_96044011 CCNJ\n", + "4 chr1_161225428_161226349 MIR5187\n", + "... ... ...\n", + "21023 chr1_10430097_10431027 CENPS-CORT\n", + "21024 chr5_149549508_149550287 CSNK1A1\n", + "21025 chr5_149551086_149552006 CSNK1A1\n", + "21026 chr20_10673798_10674620 JAG1\n", + "21027 chr9_122228348_122229104 LHX6\n", + "\n", + "[21028 rows x 2 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "integrated" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "if True:\n", + " integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n", + " integrated = integrated[['peak_id','gene_short_name']]\n", + " integrated.columns = ['peak','target']\n", + " \n", + " # integrated.to_csv(f'{work_dir}/infer/celloracle/peak_gene.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "tss_annotated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')\n", + "print(len(tss_annotated))\n", + "integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n", + "print(len(integrated))\n", + "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')\n", + "print(len(peak_gene_shortlist))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unique peaks in peak-gene (17295,)\n", + "unique genes in peak-gene (16691,)\n" + ] + } + ], + "source": [ + "print('unique peaks in peak-gene', peak_gene_shortlist.peak_id.unique().shape)\n", + "print('unique genes in peak-gene', peak_gene_shortlist.gene_short_name.unique().shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of DORC genes with t of 10 0\n", + "number of DORC genes with t of 5 17\n" + ] + } + ], + "source": [ + "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n", + "peak_gene_co_n = peak_gene_shortlist.groupby('gene_short_name').apply(lambda df:df['peak_id'].shape[0])\n", + "np.max(peak_gene_co_n.values), np.median(peak_gene_co_n.values)\n", + "\n", + "# print('number of TFs ', scenicplus.TF.unique().shape[0], ' CIS ', scenicplus.Region.unique().shape[0], ' gene ', scenicplus.Gene.unique().shape[0])\n", + "print('number of DORC genes with t of 10 ', (peak_gene_co_n.values > 10).sum())\n", + "print('number of DORC genes with t of 5 ', (peak_gene_co_n.values > 5).sum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert peak to peak_id using celloracle function\n", + "tss_annotated_df = pd.DataFrame({\"peak_id\": ma.process_bed_file.df_to_list_peakstr(tss_annotated),\n", + " \"gene_short_name\": tss_annotated.gene_short_name.values})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# number of tss_annotated pairs in shortlisted peak\n", + "print(f'Percentage of proximal elements in the final peak gene pairs: {100*peak_gene_shortlist.peak_id.isin(tss_annotated_df.peak_id).sum()/len(peak_gene_shortlist)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dorc_shortlisted = peak_gene_shortlist.groupby('gene_short_name').size()\n", + "print(f\"In the short list: max peaks per gene: {dorc_shortlisted.max()}, median: {dorc_shortlisted.median()}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 10\n", + "peak_new = integrated[integrated.coaccess >= threshold]\n", + "print('number of DORC: ', (peak_new.groupby('gene_short_name').size()>10).sum())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Base GRN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tfs_co = base_GRN.columns[3:]\n", + "keeo_cols = tfs_co.insert(0, ['gene_short_name', 'peak_id'])\n", + "df = base_GRN[keeo_cols]\n", + "# Melting the DataFrame\n", + "melted_df = pd.melt(df, id_vars=['gene_short_name', 'peak_id'], var_name='TF', value_name='Link')\n", + "\n", + "# Filtering out rows where there is no link (optional, if you only want interactions)\n", + "melted_df = melted_df[melted_df['Link'] == 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f'TFs : {melted_df.TF.unique().shape} , regions : {melted_df.peak_id.unique().shape}, genes : {melted_df.gene_short_name.unique().shape}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Refined GRN " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "tag = '' #'_hvg'\n", + "\n", + "links_o = co.load_hdf5(f\"{work_dir}/infer/celloracle/grn/links{tag}.celloracle.links\") " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "links_dict = links_o.links_dict.copy()\n", + "tt = 0.05\n", + "links_dict_f = {}\n", + "for key, df in links_dict.items():\n", + " mask = df.p /tmp/params.yaml << HERE param_list: @@ -11,11 +11,7 @@ param_list: de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad" id_map: "$resources_dir/neurips-2023-data/id_map.csv" layer: clipped_sign_log10_pval - # - id: neurips-2023-kaggle - # de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad" - # de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad" - # id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv" - # layer: sign_log10_pval + output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/src/methods/scenicplus/config.vsh.yaml b/src/methods/scenicplus/config.vsh.yaml index 91147dfb2..49cd562b8 100644 --- a/src/methods/scenicplus/config.vsh.yaml +++ b/src/methods/scenicplus/config.vsh.yaml @@ -16,12 +16,12 @@ functionality: platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: janursa/scenicplus:19-08-2024 setup: - type: python packages: [ ] - type: python - git: [ https://github.com/aertslab/scenicplus ] + git: [ ] - type: native diff --git a/src/methods/scenicplus/script.py b/src/methods/scenicplus/script.py index c08659d80..a59f2a4b9 100644 --- a/src/methods/scenicplus/script.py +++ b/src/methods/scenicplus/script.py @@ -9,7 +9,7 @@ "prediction": "output/prediction.csv", } ## VIASH END -sys.path.append(meta["resources_dir"]) +# sys.path.append(meta["resources_dir"]) from main import main prediction = main(par) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4d4aefd75..ba46c1230 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -12,17 +12,7 @@ functionality: __merge__: ../../api/file_multiomics_atac_h5ad.yaml required: false direction: input - - name: --perturbation_data - __merge__: ../../api/file_perturbation_h5ad.yaml - required: true - direction: input - - name: --layer - required: true - type: string - direction: input - default: lognorm - description: Which layer to use. - + - name: Outputs arguments: @@ -37,11 +27,6 @@ functionality: required: true direction: output default: method_configs.yaml - - name: "--metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - name: "--dataset_uns" type: file required: true @@ -58,10 +43,7 @@ functionality: type: string multiple: true description: A list of method ids to run. If not specified, all methods will be run. - - name: "--metric_ids" - type: string - multiple: true - description: A list of metric ids to run. If not specified, all metric will be run. + resources: - type: nextflow_script @@ -69,6 +51,9 @@ functionality: entrypoint: run_wf - type: file path: "../../api/task_info.yaml" + dependencies: + - name: common/extract_metadata + repository: openproblemsv2 # dependencies: # # - name: common/extract_metadata # # repository: openproblemsv2 diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 1cb644811..06a5d36cf 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -1,6 +1,6 @@ // construct list of methods methods = [ - scglue + figr ] // construct list of metrics From ef4be12ec7301d805629238b0892ffe2eba6eaba Mon Sep 17 00:00:00 2001 From: matin Date: Fri, 19 Jul 2024 22:15:37 +0100 Subject: [PATCH 2/4] after merge --- .gitignore | 4 +- celloracle.ipynb | 1241 ---------------------------------------------- 2 files changed, 3 insertions(+), 1242 deletions(-) delete mode 100644 celloracle.ipynb diff --git a/.gitignore b/.gitignore index 256176b9d..c04996a27 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ resources/ resources_test/ output/ +target/ +local/ # related to python .ipynb_checkpoints @@ -21,4 +23,4 @@ work # IDE related .idea -.vscode \ No newline at end of file +.vscode diff --git a/celloracle.ipynb b/celloracle.ipynb deleted file mode 100644 index d14b4de57..000000000 --- a/celloracle.ipynb +++ /dev/null @@ -1,1241 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/root/.local/lib/python3.10/site-packages/numba/np/ufunc/parallel.py:371: NumbaWarning: \u001b[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.\u001b[0m\n", - " warnings.warn(problem)\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import os \n", - "from celloracle import motif_analysis as ma\n", - "import pandas as pd\n", - "import celloracle as co\n", - "import anndata\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import anndata as ad\n", - "from local_utils import plots" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import anndata as ad\n", - "\n", - "## VIASH START\n", - "par = {\n", - " \"multiomics_rna\": \"resources/grn-benchmark/multiomics_rna.h5ad\",\n", - " \"multiomics_atac\": \"resources/grn-benchmark/multiomics_atac.h5ad\",\n", - " \"annotation_file\": \"resources/grn-benchmark/annotation_file\",\n", - " \"motif_file\": \"resources/grn-benchmark/motif_file\",\n", - " \"prediction\": \"output/prediction.csv\",\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading input files\n" - ] - } - ], - "source": [ - "print('Reading input files', flush=True)\n", - "multiomics_rna = ad.read_h5ad(par[\"multiomics_rna\"])\n", - "multiomics_atac = ad.read_h5ad(par[\"multiomics_atac\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "que bed peaks: 135358\n", - "tss peaks in que: 21028\n" - ] - } - ], - "source": [ - "peaks = multiomics_atac.var_names.to_numpy()\n", - "\n", - "peaks = [peak.replace(':','_').replace(\"-\",'_') for peak in peaks]\n", - "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "tss_annotated['peak_id'] = tss_annotated['chr'].astype(str)+\"_\"+tss_annotated['start'].astype(str)+\"_\"+tss_annotated['end'].astype(str)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "# pd.read_csv('../perturb-multiomics-grn/output/infer/celloracle/peak_gene.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "hg38 installation: True\n", - "No motif data entered. Loading default motifs for your species ...\n", - " Default motif for vertebrate: gimme.vertebrate.v5.0. \n", - " For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html \n", - "\n", - "Initiating scanner... \n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBUG:gimme.scanner:using background: genome hg38 with size 200\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n", - "\n", - "Motif scan started .. It may take long time.\n", - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "392dd15eb4634227843443fd729b77fe", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "scanning: 0%| | 0/17276 [00:00 1 else chromsizes['Chromosome'][x] for x in range(len(chromsizes['Chromosome']))]\n", - "\n", - "chromsizes = chromsizes[['Chromosome', 'End']]\n", - "# save\n", - "chromsizes.to_csv(f'{work_dir}/cicero/chromsizes.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# check if all chr are found\n", - "chrs = df_peaks.locations.apply(lambda x:x.split('_')[0])\n", - "chrs.isin(chromsizes.Chromosome).sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Celloracle" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Base GRN\n", - "This section create base GRN given cicero results.\n", - "### Associate peaks with TSS\n", - "Each peak is associated with promotors of target genes (+- 1kbp). Celloracle should be installed for this step." - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "que bed peaks: 131047\n", - "tss peaks in que: 20898\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n", - "GL000194.1\t55749\t56580\n", - "\n", - "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n", - "GL000194.1\t55749\t56580\n", - "\n" - ] - } - ], - "source": [ - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge TSS peaks with cicero connections" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [], - "source": [ - "integrated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Shortlisten the peak-gene connections" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "peak_gene = integrated[integrated.coaccess>0.8].reset_index(drop=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create base GRN\n", - "Running interactively might take a long time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Base GRN: only proximal \n", - "This is only based on proximal cis elements and doesnt require cicero" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "import anndata as ad \n", - "adata_atac = ad.read_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "peaks = format_peakname(adata_atac.var.reset_index()).location.values" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "que bed peaks: 135418\n", - "tss peaks in que: 21028\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n", - "chr10\t100001032\t100001800\n", - "\n", - "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n", - "chr10\t100001032\t100001800\n", - "\n" - ] - } - ], - "source": [ - "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n", - "tss_annotated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [], - "source": [ - "##----- integrate_tss_peak_with_cicero\n", - "import numpy as np\n", - "from celloracle.motif_analysis.process_bed_file import df_to_list_peakstr\n", - "# 1. check tss data format and convert if needed\n", - "tss_peak=tss_annotated\n", - "tss = tss_peak.copy()\n", - "if np.all([i in tss.columns for i in [\"chr\", \"start\", \"end\"]]):\n", - " tss = pd.DataFrame({\"peak_id\": df_to_list_peakstr(tss),\n", - " \"gene_short_name\": tss.gene_short_name.values})\n", - "else:\n", - " raise ValueError(\"tss_peak format error\")\n", - "\n", - "peak_gene = tss" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [], - "source": [ - "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### TF motifs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd \n", - "from celloracle import motif_analysis as ma\n", - "import genomepy\n", - "genomes_dir='/beegfs/desy/user/nourisaj/op_multiomics_grn/output/celloracle'\n", - "peak_gene = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n", - "genomepy.install_genome(name=\"hg38\", provider=\"UCSC\", genomes_dir=genomes_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "# PLEASE make sure reference genome is correct.\n", - "ref_genome = \"hg38\"\n", - "\n", - "genome_installation = ma.is_genome_installed(ref_genome=ref_genome,\n", - " genomes_dir=genomes_dir)\n", - "print(ref_genome, \"installation: \", genome_installation)\n", - "\n", - "# Instantiate TFinfo object\n", - "tfi = ma.TFinfo(peak_data_frame=peak_gene, \n", - " ref_genome=\"hg38\",\n", - " genomes_dir=genomes_dir) \n", - "\n", - "tfi.scan(fpr=0.05, \n", - " motifs=None, # If you enter None, default motifs will be loaded.\n", - " verbose=True)\n", - "# Check motif scan results\n", - "tfi.scanned_df.head()\n", - "# Reset filtering \n", - "tfi.reset_filtering()\n", - "\n", - "# Do filtering\n", - "tfi.filter_motifs_by_score(threshold=10)\n", - "\n", - "# Format post-filtering results.\n", - "tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)\n", - "\n", - "# Format and save \n", - "df = tfi.to_dataframe()\n", - "df.head()\n", - "df.to_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## GRN construction\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preprocessing scRNA-seq" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import celloracle as co\n", - "import anndata\n", - "import scanpy as sc\n", - "adata = anndata.read_h5ad(f'{work_dir}/scRNA/adata_rna.h5ad')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 25551 × 22787\n", - " obs: 'cell_type', 'donor_id', 'n_genes'\n", - " var: 'n_cells'\n", - " layers: 'counts', 'x_norm'" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "del adata.varm \n", - "del adata.uns \n", - "del adata.obsp \n", - "del adata.obsm \n", - "del adata.obs['louvain']\n", - "adata" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [], - "source": [ - "adata.X = adata.layers['counts'].copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1., 1., 1., ..., 2., 1., 1.])" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "adata.X.data" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [], - "source": [ - "sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [], - "source": [ - "filter_result = sc.pp.filter_genes_dispersion(adata.X,\n", - " flavor='cell_ranger',\n", - " n_top_genes=3000,\n", - " log=False)\n", - "\n", - "# Subset the genes\n", - "adata = adata[:, filter_result.gene_subset]\n", - "\n", - "# Renormalize after filtering\n", - "sc.pp.normalize_per_cell(adata)" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [ - "# Log transformation and scaling\n", - "sc.pp.log1p(adata)\n", - "sc.pp.scale(adata)" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [], - "source": [ - "# PCA\n", - "sc.tl.pca(adata, svd_solver='arpack')\n", - "\n", - "# Diffusion map\n", - "sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20)\n", - "\n", - "sc.tl.diffmap(adata)\n", - "# Calculate neihbors again based on diffusionmap \n", - "sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_diffmap')" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [], - "source": [ - "sc.tl.louvain(adata, resolution=0.8)" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 25551 × 3000\n", - " obs: 'cell_type', 'donor_id', 'n_genes', 'n_counts_all', 'n_counts', 'louvain'\n", - " var: 'n_cells', 'mean', 'std'\n", - " uns: 'log1p', 'pca', 'neighbors', 'diffmap_evals', 'louvain'\n", - " obsm: 'X_pca', 'X_diffmap'\n", - " varm: 'PCs'\n", - " layers: 'counts', 'x_norm'\n", - " obsp: 'distances', 'connectivities'" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "adata" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "sc.tl.paga(adata, groups='louvain')" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [], - "source": [ - "sc.pl.paga(adata)" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: Package 'fa2' is not installed, falling back to layout 'fr'.To use the faster and better ForceAtlas2 layout, install package 'fa2' (`pip install fa2`).\n" - ] - } - ], - "source": [ - "sc.tl.draw_graph(adata, init_pos='paga', random_state=123)\n", - "sc.pl.draw_graph(adata, color='louvain', legend_loc='on data')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metadata columns : ['cell_type', 'donor_id', 'n_genes', 'louvain', 'n_counts_all', 'n_counts']\n", - "Dimensional reduction: ['X_diffmap', 'X_draw_graph_fr', 'X_pca', 'X_umap']\n" - ] - } - ], - "source": [ - "# Check data in anndata\n", - "print(\"Metadata columns :\", list(adata.obs.columns))\n", - "print(\"Dimensional reduction: \", list(adata.obsm.keys()))" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1., 1., 1., ..., 1., 1., 2.])" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "## run based on counts as suggesyed by co pipeline\n", - "adata.X = adata.layers[\"counts\"]\n", - "adata.X.data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load base GRN" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialize " - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: adata.X seems to be already log-transformed.\n" - ] - } - ], - "source": [ - "# Instantiate Oracle object\n", - "oracle = co.Oracle()\n", - "# Instantiate Oracle object.\n", - "oracle.import_anndata_as_raw_count(adata=adata,\n", - " cluster_column_name=\"cell_type\",\n", - " embedding_name=\"X_draw_graph_fr\")" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [], - "source": [ - "# You can load TF info dataframe with the following code.\n", - "oracle.import_TF_data(TF_info_matrix=base_GRN)" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "25\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "oracle.perform_PCA()\n", - "plt.plot(np.cumsum(oracle.pca.explained_variance_ratio_)[:100])\n", - "n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]\n", - "plt.axvline(n_comps, c=\"k\")\n", - "plt.show()\n", - "print(n_comps)\n", - "n_comps = min(n_comps, 50)" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cell number is :25551\n", - "Auto-selected k is :50\n" - ] - } - ], - "source": [ - "n_cell = oracle.adata.shape[0]\n", - "print(f\"cell number is :{n_cell}\")\n", - "k = min([int(0.025*n_cell), 50])\n", - "print(f\"Auto-selected k is :{k}\")\n", - "oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,\n", - " b_maxl=k*4, n_jobs=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [], - "source": [ - "oracle.to_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### GRN calculation\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if True: # run locally\n", - " # Load file.\n", - " oracle = co.load_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')\n", - " # This step may take some time.\n", - " links = oracle.get_links(cluster_name_for_GRN_unit=\"cell_type\", alpha=10,\n", - " verbose_level=10)\n", - " links.to_hdf5(file_path=f\"{work_dir}/infer/celloracle/grn/links_3000.celloracle.links\")\n", - "else:\n", - " !python celloracle/run_grn.py" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Post evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Peak gene connections\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
peaktarget
0chr7_130668147_130669092COPG2
1chr3_12796310_12797168CAND2
2chr1_207052722_207053635PFKFB2
3chr10_96043175_96044011CCNJ
4chr1_161225428_161226349MIR5187
.........
21023chr1_10430097_10431027CENPS-CORT
21024chr5_149549508_149550287CSNK1A1
21025chr5_149551086_149552006CSNK1A1
21026chr20_10673798_10674620JAG1
21027chr9_122228348_122229104LHX6
\n", - "

21028 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " peak target\n", - "0 chr7_130668147_130669092 COPG2\n", - "1 chr3_12796310_12797168 CAND2\n", - "2 chr1_207052722_207053635 PFKFB2\n", - "3 chr10_96043175_96044011 CCNJ\n", - "4 chr1_161225428_161226349 MIR5187\n", - "... ... ...\n", - "21023 chr1_10430097_10431027 CENPS-CORT\n", - "21024 chr5_149549508_149550287 CSNK1A1\n", - "21025 chr5_149551086_149552006 CSNK1A1\n", - "21026 chr20_10673798_10674620 JAG1\n", - "21027 chr9_122228348_122229104 LHX6\n", - "\n", - "[21028 rows x 2 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "integrated" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "if True:\n", - " integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n", - " integrated = integrated[['peak_id','gene_short_name']]\n", - " integrated.columns = ['peak','target']\n", - " \n", - " # integrated.to_csv(f'{work_dir}/infer/celloracle/peak_gene.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "tss_annotated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')\n", - "print(len(tss_annotated))\n", - "integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n", - "print(len(integrated))\n", - "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')\n", - "print(len(peak_gene_shortlist))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unique peaks in peak-gene (17295,)\n", - "unique genes in peak-gene (16691,)\n" - ] - } - ], - "source": [ - "print('unique peaks in peak-gene', peak_gene_shortlist.peak_id.unique().shape)\n", - "print('unique genes in peak-gene', peak_gene_shortlist.gene_short_name.unique().shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "number of DORC genes with t of 10 0\n", - "number of DORC genes with t of 5 17\n" - ] - } - ], - "source": [ - "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n", - "peak_gene_co_n = peak_gene_shortlist.groupby('gene_short_name').apply(lambda df:df['peak_id'].shape[0])\n", - "np.max(peak_gene_co_n.values), np.median(peak_gene_co_n.values)\n", - "\n", - "# print('number of TFs ', scenicplus.TF.unique().shape[0], ' CIS ', scenicplus.Region.unique().shape[0], ' gene ', scenicplus.Gene.unique().shape[0])\n", - "print('number of DORC genes with t of 10 ', (peak_gene_co_n.values > 10).sum())\n", - "print('number of DORC genes with t of 5 ', (peak_gene_co_n.values > 5).sum())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert peak to peak_id using celloracle function\n", - "tss_annotated_df = pd.DataFrame({\"peak_id\": ma.process_bed_file.df_to_list_peakstr(tss_annotated),\n", - " \"gene_short_name\": tss_annotated.gene_short_name.values})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# number of tss_annotated pairs in shortlisted peak\n", - "print(f'Percentage of proximal elements in the final peak gene pairs: {100*peak_gene_shortlist.peak_id.isin(tss_annotated_df.peak_id).sum()/len(peak_gene_shortlist)}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dorc_shortlisted = peak_gene_shortlist.groupby('gene_short_name').size()\n", - "print(f\"In the short list: max peaks per gene: {dorc_shortlisted.max()}, median: {dorc_shortlisted.median()}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 10\n", - "peak_new = integrated[integrated.coaccess >= threshold]\n", - "print('number of DORC: ', (peak_new.groupby('gene_short_name').size()>10).sum())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Base GRN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tfs_co = base_GRN.columns[3:]\n", - "keeo_cols = tfs_co.insert(0, ['gene_short_name', 'peak_id'])\n", - "df = base_GRN[keeo_cols]\n", - "# Melting the DataFrame\n", - "melted_df = pd.melt(df, id_vars=['gene_short_name', 'peak_id'], var_name='TF', value_name='Link')\n", - "\n", - "# Filtering out rows where there is no link (optional, if you only want interactions)\n", - "melted_df = melted_df[melted_df['Link'] == 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f'TFs : {melted_df.TF.unique().shape} , regions : {melted_df.peak_id.unique().shape}, genes : {melted_df.gene_short_name.unique().shape}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Refined GRN " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "tag = '' #'_hvg'\n", - "\n", - "links_o = co.load_hdf5(f\"{work_dir}/infer/celloracle/grn/links{tag}.celloracle.links\") " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "links_dict = links_o.links_dict.copy()\n", - "tt = 0.05\n", - "links_dict_f = {}\n", - "for key, df in links_dict.items():\n", - " mask = df.p Date: Fri, 19 Jul 2024 22:46:24 +0100 Subject: [PATCH 3/4] no viash --- src/api/comp_metric.yaml | 1 + .../{config.vsh.yaml => config.novsh.yaml} | 0 src/pre_methods/cistopic/config.novsh.yaml | 23 +++++++++++++++++++ .../{config.vsh.yaml => config.novsh.yaml} | 0 4 files changed, 24 insertions(+) rename src/methods/scenicplus/{config.vsh.yaml => config.novsh.yaml} (100%) create mode 100644 src/pre_methods/cistopic/config.novsh.yaml rename src/pre_methods/format_multiomics_R/{config.vsh.yaml => config.novsh.yaml} (100%) diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 68eef5701..f50aad02f 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -25,6 +25,7 @@ functionality: direction: input default: ridge description: name of regretion to use + multiple: true - name: --subsample type: integer direction: input diff --git a/src/methods/scenicplus/config.vsh.yaml b/src/methods/scenicplus/config.novsh.yaml similarity index 100% rename from src/methods/scenicplus/config.vsh.yaml rename to src/methods/scenicplus/config.novsh.yaml diff --git a/src/pre_methods/cistopic/config.novsh.yaml b/src/pre_methods/cistopic/config.novsh.yaml new file mode 100644 index 000000000..ea092766e --- /dev/null +++ b/src/pre_methods/cistopic/config.novsh.yaml @@ -0,0 +1,23 @@ +functionality: + name: cistopic + info: + label: cistopic + summary: "creates cistopic" + + + resources: + - type: r_script + path: script.R + + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.2 + # setup: + # - type: r + # packages: [dplyr, FNN, chromVAR, doParallel, BuenColors, FigR, BSgenome.Hsapiens.UCSC.hg38] + + - type: native + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/pre_methods/format_multiomics_R/config.vsh.yaml b/src/pre_methods/format_multiomics_R/config.novsh.yaml similarity index 100% rename from src/pre_methods/format_multiomics_R/config.vsh.yaml rename to src/pre_methods/format_multiomics_R/config.novsh.yaml From 47efd3df23c6d6a19669612b22ab2552facc70a3 Mon Sep 17 00:00:00 2001 From: matin Date: Mon, 22 Jul 2024 20:28:11 +0100 Subject: [PATCH 4/4] test resourcs added --- notebooks/create_resources.ipynb | 167 +- .../cistarget/tf_lists/allTFs_hg38.txt | 1892 +++++++++++++++++ src/api/comp_method.yaml | 6 +- 3 files changed, 2053 insertions(+), 12 deletions(-) create mode 100644 notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt diff --git a/notebooks/create_resources.ipynb b/notebooks/create_resources.ipynb index b9f8df256..de7ac8f2f 100644 --- a/notebooks/create_resources.ipynb +++ b/notebooks/create_resources.ipynb @@ -18,12 +18,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import anndata as ad\n", "import pandas as pd\n", + "\n", "import numpy as np\n", "data_dir = '../../perturb-multiomics-grn/output/'\n", "\n", @@ -161,6 +162,105 @@ "adata_bulk.write(f'{resource_dir}/perturbation_data.h5ad')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# test rresources" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "test_resource_dir = f'{resource_dir}/../../resources_test/grn-benchmark'\n", + "os.makedirs(test_resource_dir, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "adata_rna = ad.read_h5ad(f'{resource_dir}/multiomics_rna.h5ad')\n", + "adata_atac = ad.read_h5ad(f'{resource_dir}/multiomics_atac.h5ad')" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "peaks = pd.read_csv(f'{resource_dir}/peak_gene_models/granie.csv').peak.to_numpy()\n", + "hvgs = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['hvgs']\n", + "genes_multi = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['gene_names']\n", + "tfs = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['tf_list']\n", + "genes = set(tfs) & set(genes_multi)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# shorten rna \n", + "mask = adata_rna.obs.donor_id=='donor_0'\n", + "adata_rna_s = adata_rna[mask]\n", + "random_indices = np.random.choice(adata_rna_s.obs.index, 1000, replace=False)\n", + "adata_rna_s = adata_rna_s[random_indices, adata_rna_s.var_names.isin(genes)]" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "View of AnnData object with n_obs × n_vars = 1000 × 4962\n", + " obs: 'cell_type', 'donor_id'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# shorten atac\n", + "adata_atac_s = adata_atac[adata_atac.obs.index.isin(adata_rna_s.obs.index), adata_atac.var.index.isin(peaks)]\n", + "adata_atac_s" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "adata_rna_s.write(f'{test_resource_dir}/multiomics_rna.h5ad')\n", + "adata_atac_s.write(f'{test_resource_dir}/multiomics_atac.h5ad')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "# shorten perturbation\n", + "adata_bulk = ad.read_h5ad(f'{resource_dir}/perturbation_data.h5ad')\n", + "adata_bulk[:200, adata_bulk.var_names.isin(genes)].write(f'{test_resource_dir}/perturbation_data.h5ad')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -168,9 +268,16 @@ "# Prior" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## tf names\n" + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -179,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -188,13 +295,59 @@ "prior_adata.uns['tf_list'] = tf_list\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## gene names" + ] + }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "bulk_adata = ad.read_h5ad(f'{resource_dir}/perturbation_data.h5ad')\n", + "prior_adata.uns['gene_names_pert'] = bulk_adata.var_names.to_numpy()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "prior_adata.write(f'{resource_dir}/prior_data.h5ad')" + "bulk_adata = ad.read_h5ad(f'{resource_dir}/multiomics_rna.h5ad')\n", + "prior_adata.uns['gene_names'] = bulk_adata.var_names.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "bulk_adata = ad.read_h5ad(f'{resource_dir}/multiomics_atac.h5ad')\n", + "prior_adata.uns['peak'] = bulk_adata.var_names.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "prior_adata.uns['hvgs'] = np.loadtxt(f'{resource_dir}/hvgs.txt', dtype=str)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "prior_adata.write(f'{resource_dir}/prior_data.h5ad')\n" ] }, { @@ -209,9 +362,7 @@ "execution_count": 9, "metadata": {}, "outputs": [], - "source": [ - "bulk_adata = ad.read_h5ad(f'{work_dir}/preprocess/bulk_adata_integrated.h5ad')" - ] + "source": [] }, { "cell_type": "code", diff --git a/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt b/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt new file mode 100644 index 000000000..6769dac51 --- /dev/null +++ b/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt @@ -0,0 +1,1892 @@ +ZNF354C +KLF12 +ZNF143 +ZIC2 +ZNF274 +SP2 +ZBTB7A +BCL6B +ZBTB49 +ZIC1 +ZNF232 +ZNF282 +ZNF410 +ZSCAN16 +ZNF524 +ZNF713 +ZNF75A +ZSCAN4 +ZNF200 +SNAI2 +KLF1 +BCL6 +EGR2 +OVOL2 +GFI1 +GFI1B +KLF11 +WT1 +ZNF655 +FOXC1 +ARX +VSX1 +CRX +PBX4 +PHOX2B +VAX2 +VSX2 +MSX2 +ESX1 +HOXD13 +NKX2-8 +VENTX +HESX1 +PITX2 +PROP1 +ISX +NKX2-5 +SIX6 +HOXC4 +HOXB7 +PAX6 +PAX7 +PAX4 +PAX3 +POU4F3 +POU6F2 +POU3F4 +NR1H4 +NR2E3 +POU2F1 +RBPJ +FOXP1 +MAX +PHF1 +MTF2 +BCL11A +BCL11B +FOXN2 +FOXR1 +SOX4 +SOHLH2 +ZSCAN29 +PLAGL2 +VEZF1 +ZBTB44 +CENPBD1 +TIGD1 +CXXC5 +FOXN3 +HDX +DUXA +MSANTD3 +ZZZ3 +LCORL +NFATC4 +CUX2 +CUX1 +DLX3 +LHX9 +POU5F1B +NFATC2 +ZFHX3 +KDM2B +KMT2A +DNMT1 +TET1 +DMRT3 +DMRT1 +DMRTA2 +DMRT2 +E2F2 +FOXK1 +FOXG1 +GCM1 +HOXA2 +NOBOX +LHX2 +LHX6 +TLX2 +EMX1 +ZFHX2 +LBX1 +HOXB13 +ZHX1 +POU6F1 +SHOX +ANHX +MYRF +NR2E1 +NR3C2 +NR2F6 +RARG +NFATC3 +RFX2 +GMEB1 +THAP12 +GLI2 +GLI3 +GLI1 +ETS1 +NFIL3 +MZF1 +RREB1 +SPIB +FOXF2 +FOXD1 +PBX1 +IRF2 +RORA +PPARG +REL +RELA +SOX9 +SRY +TFEB +TCF4 +CEBPE +XBP1 +PRDM1 +EHF +ERG +FOXJ3 +GATA1 +MEIS2 +POU2F2 +HSF2 +MEF2C +RXRG +NFATC1 +RFX3 +RUNX3 +EOMES +TFAP2B +TFAP2C +TFAP2A +ZBED1 +MLXIPL +TFE3 +MNT +TCF3 +TFAP4 +TFEC +MLX +MYF6 +BHLHE41 +BHLHE23 +ARNTL +BHLHE40 +CLOCK +HEY2 +USF1 +HEY1 +MESP1 +NEUROD2 +NHLH1 +OLIG3 +NEUROG2 +MSC +HES7 +BHLHA15 +BHLHE22 +FIGLA +OLIG1 +HES5 +SREBF2 +OLIG2 +MGA +DBP +CREB3 +HLF +NFE2 +BATF3 +ATF4 +NRL +JDP2 +CEBPG +CREB3L1 +TEF +CEBPB +MAFF +MAFG +MAFK +CEBPD +ATF7 +YY1 +CTCF +SP4 +GLIS3 +PRDM4 +EGR1 +GLIS2 +KLF16 +EGR4 +ZNF740 +ZIC3 +ZBTB7B +SP8 +HIC2 +KLF13 +HINFP +SP3 +GLIS1 +ZIC4 +EGR3 +ZBTB18 +ZNF784 +ZBTB7C +SP1 +MTF1 +ZKSCAN3 +SCRT2 +YY2 +SCRT1 +KLF14 +CENPB +ONECUT2 +ONECUT1 +ONECUT3 +E2F1 +E2F3 +E2F8 +E2F7 +E2F4 +EBF1 +ETV1 +SPI1 +ELF4 +ETV2 +ERF +ELK3 +ETV3 +ELF1 +SPDEF +ELK1 +ELF5 +ETV6 +FLI1 +GABPA +ELK4 +ELF3 +FEV +SPIC +ETV4 +ETV5 +FOXP3 +FOXJ2 +FOXO3 +FOXO1 +FOXI1 +FOXB1 +FOXL1 +FOXC2 +FOXO4 +FOXD2 +FOXD3 +FOXO6 +GATA3 +GATA5 +GATA4 +GCM2 +GRHL1 +TFCP2 +MEOX1 +DLX6 +ALX4 +GSC2 +PITX1 +HOXA9 +RHOXF1 +MEIS3 +DLX5 +HOXA1 +HOXA13 +EVX1 +MEOX2 +PITX3 +DLX4 +CDX1 +OTX1 +DLX2 +PRRX1 +TGIF2 +HOXB5 +HOXB3 +HOXC13 +HOXC11 +HOXC12 +HOXD11 +MNX1 +BARX1 +GSC +RAX +HNF1A +LMX1B +PDX1 +BARHL2 +MEIS1 +DLX1 +HMBOX1 +VAX1 +TGIF2LX +ALX3 +ISL2 +PKNOX1 +LMX1A +EN1 +MSX1 +EN2 +UNCX +GBX1 +PHOX2A +PKNOX2 +CDX2 +OTX2 +DRGX +PRRX2 +GBX2 +SHOX2 +GSX1 +HOXD12 +EMX2 +IRX2 +HOXB2 +RAX2 +EVX2 +HOXD8 +IRX5 +TGIF1 +LBX2 +ALX1 +GSX2 +HOXC10 +MIXL1 +HMX3 +HMX2 +BSX +DMBX1 +DPRX +NOTO +HOMEZ +HMX1 +HNF1B +PAX2 +POU1F1 +POU2F3 +POU4F2 +POU4F1 +POU3F2 +POU3F1 +POU3F3 +HSF4 +HSFY2 +HSF1 +IRF3 +IRF5 +IRF4 +IRF8 +IRF7 +IRF9 +MEF2A +SRF +MEF2D +MEF2B +MYBL2 +MYBL1 +RARB +ESR1 +HNF4A +VDR +NR3C1 +ESRRB +THRA +RARA +THRB +NR4A2 +AR +ESRRA +NR2F1 +NR2C2 +RXRA +ESRRG +RXRB +TP63 +PAX1 +PAX5 +PAX9 +PROX1 +NFKB2 +NFAT5 +NFKB1 +RFX4 +RFX5 +RUNX2 +GMEB2 +NFIX +NFIB +NFIA +SMAD3 +SOX8 +SOX10 +SOX21 +SOX15 +LEF1 +TCF7L1 +SOX14 +SOX7 +SOX2 +SOX18 +TBX21 +TBX5 +TBX15 +TBX2 +TBX4 +TBR1 +TBX19 +TBX20 +TBX1 +TEAD3 +TEAD1 +TEAD4 +NRF1 +CPEB1 +PGR +NR1I3 +NR1I2 +NFE2L1 +ATF2 +ATF6 +CREB1 +ATF3 +FOSL1 +JUN +MAFB +ATF6B +CEBPA +TFAP2E +HES2 +SREBF1 +TCFL5 +USF2 +HES1 +TCF21 +MYOG +MYOD1 +MYCN +ASCL1 +TCF12 +HES6 +FERD3L +MSGN1 +NEUROD1 +HAND2 +PTF1A +NPAS2 +ATOH1 +ARNT2 +NHLH2 +ATOH7 +NEUROG1 +ASCL2 +MESP2 +CREM +BACH2 +FOSB +JUND +CREB3L4 +CREB5 +BATF +FOS +JUNB +MAF +MAFA +ZNF263 +DPF1 +ZBTB32 +ZNF76 +KLF6 +ZNF343 +KLF5 +ZNF821 +ZNF174 +KLF3 +ZNF684 +ZBTB45 +SNAI1 +ZNF384 +KLF2 +ZSCAN5A +KLF4 +ZSCAN9 +ZIC5 +ZNF787 +OSR1 +ZNF660 +ZNF385D +ZSCAN1 +KLF10 +ZNF276 +ZNF281 +KLF15 +ZNF12 +ZNF704 +OSR2 +ZNF23 +ZNF444 +ZNF597 +ZBTB43 +ZNF32 +ZNF296 +ZBTB26 +KLF17 +OVOL1 +ZNF449 +HIC1 +ZBTB33 +ZNF454 +ZFP42 +ZNF771 +ZBTB2 +ZFP41 +ZBTB20 +ZFP1 +ZBTB37 +SNAI3 +ZNF501 +ZNF396 +ZSCAN23 +ZNF177 +ZNF250 +ZNF140 +ZNF460 +ZBTB14 +ZBTB12 +ZNF580 +SP9 +ZSCAN31 +ZBTB22 +ZNF345 +MBNL2 +YBX1 +LIN28B +DMRTC2 +DMRTA1 +ETV7 +ELF2 +ETS2 +FOXA2 +FOXA1 +FOXQ1 +FOXA3 +FOXE1 +FOXL2 +FOXR2 +GATA6 +GATA2 +TFCP2L1 +UBP1 +HOXA11 +ISL1 +HOXC8 +BARX2 +LHX5 +SIX4 +HOXA5 +HOXA6 +HOXB6 +NKX3-2 +NANOG +NKX2-3 +HOXB8 +HOXB1 +LHX4 +HOXA7 +BARHL1 +SIX1 +HOXD1 +HOXD3 +HOXD9 +HOXD10 +CDX4 +RHOXF2 +SIX3 +NKX6-2 +LHX8 +TLX3 +NKX6-3 +NKX3-1 +HOXD4 +IRX1 +SIX2 +HOXB9 +TGIF2LY +IRX3 +HOXC9 +HOXB4 +ARGFX +HOXA4 +HOXA10 +LHX1 +POU5F1 +HSFY1 +HSF5 +IRF6 +PPARD +NR5A2 +NR2C1 +NR4A1 +NR1D1 +NR5A1 +RORC +NR6A1 +NR1D2 +RORB +PAX8 +RFX1 +RFX7 +SKOR2 +SMAD5 +NFIC +SOX30 +TCF7 +BBX +SOX3 +SOX12 +TBX18 +TBX3 +TBX6 +TBXT +TEAD2 +XPA +SKOR1 +FOSL2 +ZKSCAN1 +ZFP14 +ZNF415 +ZNF135 +ZFP82 +ZKSCAN7 +ZNF777 +ZNF682 +FOXP2 +SOX6 +SOX5 +SOX17 +PLAG1 +ZKSCAN2 +ZNF582 +ZNF506 +ZNF324 +ZNF671 +ZNF264 +ZNF302 +ZNF184 +ZNF419 +ZNF85 +ZNF430 +ZNF549 +ZNF211 +ZNF205 +ZNF45 +ZNF133 +ZNF484 +ZNF557 +ZNF337 +ZNF317 +ZNF331 +ZNF141 +ZNF304 +ZNF132 +ZNF189 +ZNF287 +ZIM3 +ZNF614 +ZNF300 +RBAK +ZNF157 +ZNF182 +ZNF7 +ZNF214 +ZNF547 +ZNF776 +ZNF18 +ZNF19 +ZNF222 +ZNF235 +ZNF714 +ZNF333 +ZNF382 +ZNF496 +PRDM9 +ZNF202 +ZNF3 +ZNF180 +ZNF641 +ZNF610 +ZNF528 +ZNF701 +ZNF283 +ZNF558 +ZNF30 +ZNF354A +ZNF764 +ZNF778 +ZNF212 +ZNF439 +ZNF440 +ZNF562 +ZNF561 +ZNF584 +ZIK1 +ZNF540 +ZNF570 +ZNF621 +ZNF680 +ZNF483 +ZNF417 +ZNF791 +ZNF266 +ZNF519 +ZNF25 +ZNF77 +ZNF169 +ZNF613 +ZNF620 +ZNF619 +ZNF114 +ZNF543 +ZNF354B +ZNF223 +ZNF552 +ZNF154 +ZNF816 +ZNF571 +ZNF443 +ZNF792 +ZNF707 +ZNF875 +ZNF101 +ZNF716 +ZNF708 +ZNF662 +ZNF320 +ZNF530 +ZNF730 +ZNF93 +ZFP90 +ZNF479 +ZNF445 +ZNF74 +ZNF267 +ZNF566 +ZNF529 +ZNF284 +ZNF749 +ZNF17 +ZNF555 +ZNF75D +ZNF197 +ZFP69B +ZFP69 +ZNF626 +ZNF793 +ZNF383 +ZNF669 +ZNF548 +ZNF567 +ZNF573 +ZNF527 +ZNF33A +ZNF79 +ZNF681 +ZNF766 +ZNF565 +ZNF765 +ZNF124 +ZNF605 +ZNF799 +ZNF782 +ZNF846 +ZNF136 +ZKSCAN5 +ZNF33B +ZNF431 +ZNF418 +ZNF585A +ZNF429 +ZNF100 +ZNF398 +ZNF441 +ZNF257 +ZNF785 +ZNF786 +ZNF675 +ZNF860 +ZNF695 +ZNF615 +ZNF433 +ZNF81 +ZNF780A +ZNF181 +ZNF44 +ZNF790 +ZNF823 +ZNF311 +ZNF273 +ZNF84 +ZNF667 +ZNF649 +ZNF248 +ZNF334 +ZNF485 +ZNF442 +ZNF26 +ZNF69 +ZNF480 +ZNF587 +ZNF808 +ZNF28 +ZNF627 +ZNF789 +ZNF534 +ZNF525 +ZNF805 +ZNF468 +ZNF616 +ZFP57 +ZNF783 +ZNF425 +ZNF611 +ZNF254 +ZNF90 +ZNF891 +ZNF705G +ZNF880 +ZNF492 +ZNF879 +ZNF736 +ZNF737 +ZNF324B +ZNF564 +ZNF674 +ZNF550 +ZNF432 +ZNF10 +ZNF486 +ZNF225 +ZNF285 +ZNF224 +ZIM2 +ZNF2 +ZNF8 +ZNF487 +MXI1 +MYC +ZEB1 +REST +CTCFL +E2F6 +PBX3 +STAT1 +STAT3 +STAT2 +THAP1 +TP73 +HIF1A +TWIST1 +MITF +KLF9 +ZNF24 +NFYA +TFDP1 +FOXK2 +FOXH1 +GRHL2 +PBX2 +DUX4 +IRF1 +MYB +ESR2 +HNF4G +NR2F2 +RELB +SOX13 +TCF7L2 +NFYB +BACH1 +SIX5 +TBP +ZNF416 +ZNF574 +ZNF41 +ZNF653 +ZNF35 +ZNF16 +ZNF692 +ZFP3 +ZNF322 +ZNF467 +ZSCAN22 +ZNF71 +ZFP64 +PRDM6 +ZNF37A +ZNF586 +MYNN +ZNF213 +PATZ1 +MAZ +ZNF175 +KLF7 +GTF3A +ZNF436 +FEZF1 +ZNF341 +ZNF394 +IKZF3 +ZNF513 +ZNF22 +ZNF146 +ZNF280A +ZNF768 +ZNF554 +ZNF596 +ZBTB42 +ZNF594 +ZNF329 +ZBTB6 +ZSCAN30 +ZNF490 +ZNF563 +ZNF34 +ZNF774 +ZNF502 +ZFP28 +ZNF98 +ZNF677 +ZNF121 +ZNF770 +ZSCAN5C +ZBTB48 +ZNF134 +GLI4 +ZNF260 +ZNF350 +ZNF595 +INSM1 +ARID5B +LYL1 +AHR +EPAS1 +ARNT +TAL1 +NFE2L2 +ATF1 +ZFX +MECOM +SALL4 +KLF8 +ZBTB17 +PRDM14 +IKZF1 +ZNF335 +E2F5 +FOXM1 +LHX3 +NKX2-1 +NKX6-1 +MBD2 +MECP2 +NR1H3 +PPARA +TP53 +RUNX1 +AIRE +SMAD4 +STAT5A +STAT4 +STAT6 +STAT5B +THAP11 +NFYC +ZNF711 +ARID3A +HMGA1 +HMGA2 +MYF5 +NFE2L3 +ATF5 +DDIT3 +ZEB2 +HIVEP2 +IKZF2 +ZBTB11 +ZNF423 +ZBTB16 +ZNF541 +GZF1 +ZSCAN10 +PRDM12 +ZNF236 +PRDM15 +PRDM16 +ZNF761 +ZNF148 +ZNF589 +ZNF219 +SALL2 +E4F1 +SP7 +ZNF581 +ZNF217 +ZFP92 +ZSCAN26 +ZNF628 +ZNF521 +SP5 +ZNF316 +ZNF705E +ZNF727 +ZNF735 +ZNF883 +ZNF718 +ZNF658 +SATB1 +CXXC1 +EBF4 +EBF3 +EBF2 +FOXF1 +FOXN1 +FOXJ1 +FOXD4L4 +TRPS1 +GTF2IRD1 +GTF2I +HOXA3 +NKX2-2 +SETDB1 +MTERF1 +CDC5L +SMAD9 +SMAD1 +HBP1 +SOX11 +TBX22 +LTF +DNTTIP1 +POU2AF1 +CEBPZ +GTF2B +CARF +SPZ1 +NR0B1 +BPTF +PURA +TOPORS +NFE4 +ADNP +CHAMP1 +DACH1 +DRAP1 +GATAD1 +GATAD2A +HHEX +HMG20A +HMG20B +HMGXB4 +IKZF5 +INSM2 +KAT7 +KMT2B +MBD1 +MXD3 +MXD4 +NCOA1 +NCOA3 +NFXL1 +PHF20 +PRDM10 +SKI +ZBED5 +ZBTB10 +ZBTB21 +ZBTB25 +ZBTB40 +ZBTB8A +ZFP37 +ZFP91 +ZGPAT +ZKSCAN8 +ZNF239 +ZNF362 +ZNF366 +ZNF407 +ZNF426 +ZNF48 +ZNF507 +ZNF511 +ZNF512 +ZNF518A +ZNF577 +ZNF579 +ZNF585B +ZNF592 +ZNF600 +ZNF629 +ZNF639 +ZNF644 +ZNF652 +ZNF654 +ZNF664 +ZNF697 +ZNF781 +ZNF83 +ZNF843 +ZSCAN21 +ZXDB +AFF4 +ASCC1 +BAD +CBFA2T2 +CBFB +ZNF830 +CNOT6 +NELFB +DDX20 +ENO1 +FEZF2 +FHL2 +FOXP4 +GTF2H3 +GTF3C2 +GTF3C5 +HCFC2 +HCLS1 +HDAC8 +UBE2K +HTATIP2 +ID2 +KDM5A +LARP1 +CERS4 +MAGED4 +MAGEF1 +MYEF2 +NCALD +NME1 +NMRAL1 +NUCB1 +OTUD4 +PAXIP1 +PDCD11 +PDLIM5 +PHTF1 +PIR +PLAGL1 +PQBP1 +PURG +RAB18 +RAN +RBBP5 +RBFOX2 +RFXANK +SCAND2P +SCMH1 +SEMA4A +SF1 +SMAD2 +SNAPC4 +SNAPC5 +SND1 +SSBP3 +SSX2 +SSX3 +TAF1A +TAF9 +TBPL1 +TCEAL2 +TFAM +THAP5 +MED30 +TIMELESS +TRMT1 +TSC22D4 +TSNAX +TULP1 +VPS4B +YEATS4 +ZBTB4 +ZBTB46 +ZHX3 +ZNF131 +ZNF160 +ZNF207 +RNF114 +ZNF326 +ZNF385A +ZNF503 +ZNF510 +ZNF706 +TFAP2D +BRCA1 +CREB3L2 +FUBP1 +HAND1 +HLTF +HOXC6 +ID4 +NR1H2 +NR4A3 +SMARCA1 +SMARCA5 +SOX1 +TAF1 +TLX1 +HIVEP1 +ZNF165 +NF1 +BNC2 +ZBED2 +NKX2-4 +ARID5A +BCL3 +CHD1 +CHD2 +DBX2 +DMC1 +EP300 +EZH2 +GTF2F1 +HCFC1 +HLX +HOXC5 +IRX4 +IRX6 +MTA3 +NKX1-1 +NKX1-2 +NKX2-6 +OTP +PML +RAD21 +RCOR1 +SIN3A +SMARCC1 +SMARCC2 +SMC3 +SP100 +TBL1XR1 +WRNIP1 +ZBTB3 +ZNF691 +TRAF4 +CPSF4 +MYCLP1 +TCF15 +TAF6 +GABPB1 +ILF2 +SIRT6 +ING4 +CHURC1 +MXD1 +TAL2 +RFXAP +GTF2A2 +GTF2A1 +TFDP2 +RB1 +SMAD7 +SMAD6 +DEAF1 +ARNTL2 +TRIM28 +PARP1 +TERF1 +CNOT3 +DBX1 +BRF1 +BDP1 +POLR3A +EWSR1 +CTNNB1 +FOXN4 +BCLAF1 +CCNT2 +HDAC2 +OVOL3 +ZNF536 +ZBTB5 +ZNF688 +TBX10 +FOXD4L6 +FOXE3 +RLF +SP6 +ZNF746 +FOXD4L5 +FOXD4L3 +TBPL2 +ZNF687 +ZNF438 +ZNF516 +ZSCAN18 +PRDM13 +FOXD4L1 +SALL1 +ZBTB41 +ZBTB1 +ZSCAN5B +GTF2A1L +ZBTB8B +ZNF575 +ZNF280B +ZBTB34 +IKZF4 +AEBP2 +ZNF772 +ZSCAN25 +FIZ1 +ZNF215 +SALL3 +ZNF500 +ZFY +ZBTB24 +ZNF853 +ZSCAN20 +ZNF80 +ZNF20 +ZNF630 +ZNF699 +ZNF470 +ZNF57 +ZXDC +ZNF648 +ZNF544 +ZNF546 +ZNF517 +ZFP2 +ZNF572 +ZNF66 +ZNF689 +ZNF837 +ZNF710 +ZNF625 +ZNF491 +ZNF709 +ZNF526 +ZNF676 +ZNF556 +ZNF408 +ZNF700 +ZNF286A +ZNF471 +ZFP30 +ZNF230 +ZNF233 +ZNF275 +ZNF729 +ZSCAN32 +ZNF195 +ZNF814 +ZNF878 +ZNF726 +ZNF208 +ZNF732 +ZNF99 +ZNF253 +ZNF623 +ZNF14 +ZNF705D +ZNF43 +ZNF92 +ZNF117 +ZNF138 +ZNF91 +ZXDA +ZNF155 +ZNF234 +ZNF844 +ZNF763 +ZNF569 +ZNF404 +ZNF678 +ZNF829 +ZNF672 +ZNF568 +ZNF841 +ZNF813 +ZNF836 +ZNF705A +ZNF773 +ZNF551 +ZSCAN2 +ZNF227 +ZNF497 +ZNF493 +ZNF679 +ZNF683 +ZFP62 +ZNF721 +ZNF461 +ZNF397 +ZNF420 +ZNF578 +ZNF775 +ZNF845 +ZNF560 +ZNF606 +ZNF668 +ZKSCAN4 +ZNF514 +ZNF696 +ZNF607 +ZNF599 +ZNF559 +ZNF251 +ZNF583 +ZNF665 +ZNF670 +ZNF358 +ZNF319 +ZNF70 +ZNF226 +ZNF624 +PRDM5 +ZNF112 +ZNF780B +ZBTB47 +ZBTB39 +ZNF646 +ZNF835 +ZNF107 +ZNF391 +ZSCAN12 +ZFPM1 +PEG3 +ZBTB38 +ZNF367 +ZNF256 +HDAC1 +APEX1 +CTBP1 +BANP +CRTC2 +NONO +SFPQ +ABL1 +HELT +DIDO1 +HNRNPUL1 +DPF2 +NCOA2 +ILF3 +RHOXF2B +AHDC1 +HMGXB3 +LCOR +MLLT10 +SATB2 +GPBP1L1 +ZNF280D +ZNF142 +ZNF462 +ZNF576 +ATF7-NPFF +NANOGP8 +MBTPS2 +CIC +SETBP1 +FOXL3 +SEBOX +DMRTB1 +RFX6 +TAF1L +TWIST2 +FREM1 +ARID3B +RBPJL +CREBL2 +FOXB2 +FOXD4 +SP140 +CPHXL +AHCTF1 +DNAJC21 +MYPOP +PRDM11 +PHF21A +CCDC169-SOHLH2 +MLXIP +CREBZF +TERF2 +SP110 +NFX1 +ASH2L +METTL14 +VPS72 +CERS6 +CERS3 +CERS5 +CERS2 +PRDM7 +HIF3A +BNC1 +FANK1 +IL21 +ZNF622 +NPAS4 +ZBED6 +TMEM33 +ACAA1 +ZNF800 +ADNP2 +ZNF414 +ZFP91-CNTF +ZNF587B +ZNF451 +ZNF532 +LDB1 +LMO2 +YOD1 +METTL3 +A1CF +ABCF2 +ACO1 +ADARB1 +AGAP2 +AGGF1 +AGMAT +AHRR +AKR1A1 +ANXA1 +ANXA11 +APEX2 +ARFGAP1 +ARG1 +ARG2 +ARID3C +ASAP3 +ASPSCR1 +ATOH8 +AVEN +BAX +BOLL +BORCS8-MEF2B +BRF2 +C19orf25 +CANX +CAT +CBX7 +CCDC25 +CD59 +CDK2AP1 +CELF4 +CELF5 +CELF6 +CFL2 +CKMT1B +CLK1 +CNOT4 +CPTP +CSNK2B +CSTF2 +CYB5R1 +CYCS +DAB2 +DAZAP1 +DDX4 +DDX43 +DDX53 +DGCR8 +DHX36 +DIABLO +DIS3 +DMAP1 +DNMT3A +DR1 +DTL +DUS3L +DUSP22 +DUSP26 +ECSIT +EDN1 +EEF1D +EIF5A2 +ESRP1 +ESRP2 +ETFB +EXO5 +EXOSC3 +EZR +FAAP24 +FAM127B +FBXL19 +FEZ1 +FGF19 +FIP1L1 +FOXS1 +GADD45A +GAR1 +GIT2 +GLYCTK +GOT1 +GPAM +GPANK1 +GPD1 +GRHL3 +GRHPR +GTPBP1 +GTPBP6 +H1FX +H2AFY +H2AFZ +HADHB +HDAC3 +HES4 +HEYL +HHAT +HIRIP3 +HIST1H2BN +HIST2H2AB +HIST2H2BE +HIVEP3 +HKR1 +HLCS +HMGB1 +HMGB2 +HMGB3 +HMGB4 +HNRNPA0 +HNRNPA1 +HNRNPC +HNRNPH3 +HNRNPLL +HP1BP3 +HSPA1L +HSPA5 +HUNK +ID1 +IL24 +ING3 +IVD +JAZF1 +JRK +JRKL +KCNIP1 +KDM2A +KDM4A +KDM4B +KDM4C +KDM4D +KDM4E +KDM5D +KDM7A +KIAA0907 +KIF22 +KLF18 +KLRG1 +LARP4 +LAS1L +LIN28A +LRRFIP1 +LSM6 +LUZP1 +LUZP2 +MAGEA8 +MAGOH +MAP4K2 +MAPK1 +MCTP2 +MDM2 +MELK +METTL21B +MEX3C +MIEF1 +MIOS +MKX +MORN1 +MRPL1 +MRPL2 +MRPS25 +MSI1 +MSI2 +MSRA +MSRB3 +MTHFD1 +MYCL +MYLK +NAGS +NANOS1 +NAP1L1 +NCBP2 +NCOR1 +NCOR2 +NELFA +NEUROG3 +NMI +NNT +NOC2L +NPDC1 +NUP107 +NUP133 +NXPH3 +ODC1 +P4HB +PCK2 +PDE6H +PDS5A +PGAM2 +PHF12 +PHF2 +PHF8 +PHLDA2 +PICK1 +PIK3C3 +PKM +PLG +POLD2 +POLE3 +POLE4 +POLI +POLR2A +POLR3G +PPARGC1A +PPP1R10 +PPP2R3B +PPP5C +PRDX5 +PRKAA1 +PRKAA2 +PRNP +PROX2 +PSMA6 +PSMC2 +PSMD12 +PTCD1 +PTPMT1 +PUM3 +R3HDM2 +RAB14 +RAB2A +RAB7A +RBBP9 +RBM17 +RBM22 +RBM3 +RBM42 +RBM7 +RBM8A +RBMS1 +RFC2 +RFC3 +RFX8 +RIOK2 +RNASEH2C +RNF138 +RPL35 +RPL6 +RPP25 +RPS10 +RPS4X +RPS6KA5 +RUFY3 +RUVBL1 +SCAND1 +SCX +SF3B1 +SFT2D1 +SIM1 +SIM2 +SLC18A1 +SMAP2 +SMPX +SMUG1 +SNRNP70 +SNRPB2 +SOCS4 +SOD1 +SPAG7 +SPATS2 +SPR +SRBD1 +SRP9 +SRRM3 +SSRP1 +STAU2 +STK40 +STUB1 +SUCLG1 +T +TAF7 +TAGLN2 +TCEAL6 +TCF23 +TCF24 +TFDP3 +TFF3 +THOC2 +TIA1 +TIGD2 +TIGD3 +TIGD4 +TIGD5 +TIGD6 +TIGD7 +TIMM44 +TIMM8A +TMSB4XP8 +TOB2 +TPI1 +TPPP +TRIB1 +TRIB2 +TRIB3 +TRIM21 +TRIM69 +TRIP10 +TRMO +TROVE2 +TSN +U2AF1 +UBB +UBE2V1 +UBTF +UBXN1 +UGP2 +UQCRB +USP39 +UTP18 +VAMP3 +WDR83 +WISP2 +XG +XRCC1 +YWHAE +YWHAZ +ZC3H7A +ZCCHC14 +ZCCHC17 +ZDHHC15 +ZDHHC24 +ZDHHC5 +ZHX2 +ZMAT2 +ZMAT4 +ZNF286B +ZNF355P +ZNF542P +ZNF598 +ZNF658B +ZNF702P +ZNF705CP +ZNF717 +ZNF720 +ZNF788 +ZNF806 +ZNF826P +ZNF827 +ZNF831 +ZRSR2 +ZSWIM1 diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 6ff380eae..ad847e1c9 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -12,6 +12,8 @@ functionality: __merge__: file_multiomics_rna_h5ad.yaml required: True direction: input + info: + test_default: ../../resources_test/grn-benchmark/multiomics_rna.h5ad - name: --multiomics_atac __merge__: file_multiomics_atac_h5ad.yaml required: false @@ -21,10 +23,6 @@ functionality: required: true direction: output - - - - test_resources: - type: python_script path: /src/common/component_tests/run_and_check_output.py