From 4336a112c2ec9a389a263ad4fa5a8cef0e56ee5f Mon Sep 17 00:00:00 2001
From: matin <matin@M-As-MacBook-Pro.local>
Date: Fri, 19 Jul 2024 22:08:35 +0100
Subject: [PATCH 1/4] before pull

---
 celloracle.ipynb                            | 1241 +++++++++++++++++++
 dockerfiles/scenicplus/Dockerfile           |    0
 dockerfiles/scglue/Dockerfile               |   29 +
 scripts/run_grn_inference.sh                |   12 +-
 src/methods/scenicplus/config.vsh.yaml      |    4 +-
 src/methods/scenicplus/script.py            |    2 +-
 src/workflows/run_benchmark/config.vsh.yaml |   25 +-
 src/workflows/run_benchmark/main.nf         |    2 +-
 8 files changed, 1283 insertions(+), 32 deletions(-)
 create mode 100644 celloracle.ipynb
 create mode 100644 dockerfiles/scenicplus/Dockerfile
 create mode 100644 dockerfiles/scglue/Dockerfile

diff --git a/celloracle.ipynb b/celloracle.ipynb
new file mode 100644
index 000000000..d14b4de57
--- /dev/null
+++ b/celloracle.ipynb
@@ -0,0 +1,1241 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/.local/lib/python3.10/site-packages/numba/np/ufunc/parallel.py:371: NumbaWarning: \u001b[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.\u001b[0m\n",
+      "  warnings.warn(problem)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import os \n",
+    "from celloracle import motif_analysis as ma\n",
+    "import pandas as pd\n",
+    "import celloracle as co\n",
+    "import anndata\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import anndata as ad\n",
+    "from local_utils import plots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import anndata as ad\n",
+    "\n",
+    "## VIASH START\n",
+    "par = {\n",
+    "  \"multiomics_rna\": \"resources/grn-benchmark/multiomics_rna.h5ad\",\n",
+    "  \"multiomics_atac\": \"resources/grn-benchmark/multiomics_atac.h5ad\",\n",
+    "  \"annotation_file\": \"resources/grn-benchmark/annotation_file\",\n",
+    "  \"motif_file\": \"resources/grn-benchmark/motif_file\",\n",
+    "  \"prediction\": \"output/prediction.csv\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading input files\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Reading input files', flush=True)\n",
+    "multiomics_rna = ad.read_h5ad(par[\"multiomics_rna\"])\n",
+    "multiomics_atac = ad.read_h5ad(par[\"multiomics_atac\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "que bed peaks: 135358\n",
+      "tss peaks in que: 21028\n"
+     ]
+    }
+   ],
+   "source": [
+    "peaks = multiomics_atac.var_names.to_numpy()\n",
+    "\n",
+    "peaks = [peak.replace(':','_').replace(\"-\",'_') for peak in peaks]\n",
+    "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tss_annotated['peak_id'] = tss_annotated['chr'].astype(str)+\"_\"+tss_annotated['start'].astype(str)+\"_\"+tss_annotated['end'].astype(str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pd.read_csv('../perturb-multiomics-grn/output/infer/celloracle/peak_gene.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "hg38 installation:  True\n",
+      "No motif data entered. Loading default motifs for your species ...\n",
+      " Default motif for vertebrate: gimme.vertebrate.v5.0. \n",
+      " For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html \n",
+      "\n",
+      "Initiating scanner... \n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "DEBUG:gimme.scanner:using background: genome hg38 with size 200\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n",
+      "\n",
+      "Motif scan started .. It may take long time.\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "392dd15eb4634227843443fd729b77fe",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "scanning:   0%|          | 0/17276 [00:00<?, ? sequences/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "DEBUG:gimme.scanner:Scanning\n"
+     ]
+    }
+   ],
+   "source": [
+    "peak_gene = tss_annotated\n",
+    "# PLEASE make sure reference genome is correct.\n",
+    "ref_genome = \"hg38\"\n",
+    "\n",
+    "genome_installation = ma.is_genome_installed(ref_genome=ref_genome,\n",
+    "                                             genomes_dir=None)\n",
+    "print(ref_genome, \"installation: \", genome_installation)\n",
+    "\n",
+    "# Instantiate TFinfo object\n",
+    "tfi = ma.TFinfo(peak_data_frame=peak_gene, \n",
+    "                ref_genome=\"hg38\",\n",
+    "                genomes_dir=None) \n",
+    "\n",
+    "tfi.scan(fpr=0.05, \n",
+    "         motifs=None,  # If you enter None, default motifs will be loaded.\n",
+    "         verbose=True)\n",
+    "# Check motif scan results\n",
+    "tfi.scanned_df.head()\n",
+    "# Reset filtering \n",
+    "tfi.reset_filtering()\n",
+    "\n",
+    "# Do filtering\n",
+    "tfi.filter_motifs_by_score(threshold=10)\n",
+    "\n",
+    "# Format post-filtering results.\n",
+    "tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)\n",
+    "\n",
+    "# Format and save \n",
+    "df = tfi.to_dataframe()\n",
+    "df.head()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Annotations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get chromosome sizes (for hg38 here). \n",
+    "target_url='http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'\n",
+    "chromsizes=pd.read_csv(target_url, sep='\\t', header=None)\n",
+    "chromsizes.columns=['Chromosome', 'End']\n",
+    "chromsizes['Start']=[0]*chromsizes.shape[0]\n",
+    "chromsizes=chromsizes.loc[:,['Chromosome', 'Start', 'End']]\n",
+    "# Exceptionally in this case, to agree with CellRangerARC annotations\n",
+    "chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].replace('v', '.') for x in range(len(chromsizes['Chromosome']))]\n",
+    "chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].split('_')[1] if len(chromsizes['Chromosome'][x].split('_')) > 1 else chromsizes['Chromosome'][x] for x in range(len(chromsizes['Chromosome']))]\n",
+    "\n",
+    "chromsizes = chromsizes[['Chromosome', 'End']]\n",
+    "# save\n",
+    "chromsizes.to_csv(f'{work_dir}/cicero/chromsizes.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check if all chr are found\n",
+    "chrs = df_peaks.locations.apply(lambda x:x.split('_')[0])\n",
+    "chrs.isin(chromsizes.Chromosome).sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Celloracle"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Base GRN\n",
+    "This section creates the base GRN from the cicero results.\n",
+    "###  Associate peaks with TSS\n",
+    "Each peak is associated with the promoters of target genes (+/- 1 kbp). CellOracle must be installed for this step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "que bed peaks: 131047\n",
+      "tss peaks in que: 20898\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n",
+      "GL000194.1\t55749\t56580\n",
+      "\n",
+      "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n",
+      "GL000194.1\t55749\t56580\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "###  Merge TSS peaks with cicero connections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "integrated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "###  Shortlist the peak-gene connections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peak_gene = integrated[integrated.coaccess>0.8].reset_index(drop=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create base GRN\n",
+    "Running interactively might take a long time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Base GRN: only proximal \n",
+    "This is based only on proximal cis elements and doesn't require cicero"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import anndata as ad \n",
+    "adata_atac = ad.read_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peaks = format_peakname(adata_atac.var.reset_index()).location.values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "que bed peaks: 135418\n",
+      "tss peaks in que: 21028\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n",
+      "chr10\t100001032\t100001800\n",
+      "\n",
+      "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n",
+      "chr10\t100001032\t100001800\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n",
+    "tss_annotated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##----- integrate_tss_peak_with_cicero\n",
+    "import numpy as np\n",
+    "from celloracle.motif_analysis.process_bed_file import df_to_list_peakstr\n",
+    "# 1. check tss data format and convert if needed\n",
+    "tss_peak=tss_annotated\n",
+    "tss = tss_peak.copy()\n",
+    "if np.all([i in tss.columns for i in [\"chr\", \"start\", \"end\"]]):\n",
+    "    tss = pd.DataFrame({\"peak_id\": df_to_list_peakstr(tss),\n",
+    "                        \"gene_short_name\": tss.gene_short_name.values})\n",
+    "else:\n",
+    "    raise ValueError(\"tss_peak format error\")\n",
+    "\n",
+    "peak_gene = tss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### TF motifs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd \n",
+    "from celloracle import motif_analysis as ma\n",
+    "import genomepy\n",
+    "genomes_dir='/beegfs/desy/user/nourisaj/op_multiomics_grn/output/celloracle'\n",
+    "peak_gene = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n",
+    "genomepy.install_genome(name=\"hg38\", provider=\"UCSC\", genomes_dir=genomes_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# PLEASE make sure reference genome is correct.\n",
+    "ref_genome = \"hg38\"\n",
+    "\n",
+    "genome_installation = ma.is_genome_installed(ref_genome=ref_genome,\n",
+    "                                             genomes_dir=genomes_dir)\n",
+    "print(ref_genome, \"installation: \", genome_installation)\n",
+    "\n",
+    "# Instantiate TFinfo object\n",
+    "tfi = ma.TFinfo(peak_data_frame=peak_gene, \n",
+    "                ref_genome=\"hg38\",\n",
+    "                genomes_dir=genomes_dir) \n",
+    "\n",
+    "tfi.scan(fpr=0.05, \n",
+    "         motifs=None,  # If you enter None, default motifs will be loaded.\n",
+    "         verbose=True)\n",
+    "# Check motif scan results\n",
+    "tfi.scanned_df.head()\n",
+    "# Reset filtering \n",
+    "tfi.reset_filtering()\n",
+    "\n",
+    "# Do filtering\n",
+    "tfi.filter_motifs_by_score(threshold=10)\n",
+    "\n",
+    "# Format post-filtering results.\n",
+    "tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)\n",
+    "\n",
+    "# Format and save \n",
+    "df = tfi.to_dataframe()\n",
+    "df.head()\n",
+    "df.to_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## GRN construction\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Preprocessing scRNA-seq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import celloracle as co\n",
+    "import anndata\n",
+    "import scanpy as sc\n",
+    "adata = anndata.read_h5ad(f'{work_dir}/scRNA/adata_rna.h5ad')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "AnnData object with n_obs × n_vars = 25551 × 22787\n",
+       "    obs: 'cell_type', 'donor_id', 'n_genes'\n",
+       "    var: 'n_cells'\n",
+       "    layers: 'counts', 'x_norm'"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "del adata.varm \n",
+    "del adata.uns \n",
+    "del adata.obsp \n",
+    "del adata.obsm \n",
+    "del adata.obs['louvain']\n",
+    "adata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adata.X = adata.layers['counts'].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1., 1., 1., ..., 2., 1., 1.])"
+      ]
+     },
+     "execution_count": 89,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adata.X.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filter_result = sc.pp.filter_genes_dispersion(adata.X,\n",
+    "                                              flavor='cell_ranger',\n",
+    "                                              n_top_genes=3000,\n",
+    "                                              log=False)\n",
+    "\n",
+    "# Subset the genes\n",
+    "adata = adata[:, filter_result.gene_subset]\n",
+    "\n",
+    "# Renormalize after filtering\n",
+    "sc.pp.normalize_per_cell(adata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Log transformation and scaling\n",
+    "sc.pp.log1p(adata)\n",
+    "sc.pp.scale(adata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# PCA\n",
+    "sc.tl.pca(adata, svd_solver='arpack')\n",
+    "\n",
+    "# Diffusion map\n",
+    "sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20)\n",
+    "\n",
+    "sc.tl.diffmap(adata)\n",
+    "# Calculate neighbors again based on the diffusion map\n",
+    "sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_diffmap')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sc.tl.louvain(adata, resolution=0.8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "AnnData object with n_obs × n_vars = 25551 × 3000\n",
+       "    obs: 'cell_type', 'donor_id', 'n_genes', 'n_counts_all', 'n_counts', 'louvain'\n",
+       "    var: 'n_cells', 'mean', 'std'\n",
+       "    uns: 'log1p', 'pca', 'neighbors', 'diffmap_evals', 'louvain'\n",
+       "    obsm: 'X_pca', 'X_diffmap'\n",
+       "    varm: 'PCs'\n",
+       "    layers: 'counts', 'x_norm'\n",
+       "    obsp: 'distances', 'connectivities'"
+      ]
+     },
+     "execution_count": 59,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sc.tl.paga(adata, groups='louvain')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sc.pl.paga(adata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING: Package 'fa2' is not installed, falling back to layout 'fr'.To use the faster and better ForceAtlas2 layout, install package 'fa2' (`pip install fa2`).\n"
+     ]
+    }
+   ],
+   "source": [
+    "sc.tl.draw_graph(adata, init_pos='paga', random_state=123)\n",
+    "sc.pl.draw_graph(adata, color='louvain', legend_loc='on data')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Metadata columns : ['cell_type', 'donor_id', 'n_genes', 'louvain', 'n_counts_all', 'n_counts']\n",
+      "Dimensional reduction:  ['X_diffmap', 'X_draw_graph_fr', 'X_pca', 'X_umap']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check data in anndata\n",
+    "print(\"Metadata columns :\", list(adata.obs.columns))\n",
+    "print(\"Dimensional reduction: \", list(adata.obsm.keys()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1., 1., 1., ..., 1., 1., 2.])"
+      ]
+     },
+     "execution_count": 108,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "## run based on counts as suggested by the CellOracle pipeline\n",
+    "adata.X = adata.layers[\"counts\"]\n",
+    "adata.X.data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load base GRN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initialize "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING: adata.X seems to be already log-transformed.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Instantiate Oracle object\n",
+    "oracle = co.Oracle()\n",
+    "# Import the AnnData (raw counts) together with its cluster labels and embedding.\n",
+    "oracle.import_anndata_as_raw_count(adata=adata,\n",
+    "                                   cluster_column_name=\"cell_type\",\n",
+    "                                   embedding_name=\"X_draw_graph_fr\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 113,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You can load TF info dataframe with the following code.\n",
+    "oracle.import_TF_data(TF_info_matrix=base_GRN)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "25\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "oracle.perform_PCA()\n",
+    "plt.plot(np.cumsum(oracle.pca.explained_variance_ratio_)[:100])\n",
+    "n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]\n",
+    "plt.axvline(n_comps, c=\"k\")\n",
+    "plt.show()\n",
+    "print(n_comps)\n",
+    "n_comps = min(n_comps, 50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cell number is :25551\n",
+      "Auto-selected k is :50\n"
+     ]
+    }
+   ],
+   "source": [
+    "n_cell = oracle.adata.shape[0]\n",
+    "print(f\"cell number is :{n_cell}\")\n",
+    "k = min([int(0.025*n_cell), 50])\n",
+    "print(f\"Auto-selected k is :{k}\")\n",
+    "oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,\n",
+    "                      b_maxl=k*4, n_jobs=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "oracle.to_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### GRN calculation\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if True: # run locally\n",
+    "    # Load file.\n",
+    "    oracle = co.load_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')\n",
+    "    # This step may take some time.\n",
+    "    links = oracle.get_links(cluster_name_for_GRN_unit=\"cell_type\", alpha=10,\n",
+    "                            verbose_level=10)\n",
+    "    links.to_hdf5(file_path=f\"{work_dir}/infer/celloracle/grn/links_3000.celloracle.links\")\n",
+    "else:\n",
+    "    !python celloracle/run_grn.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Post evaluation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Peak gene connections\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>peak</th>\n",
+       "      <th>target</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>chr7_130668147_130669092</td>\n",
+       "      <td>COPG2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>chr3_12796310_12797168</td>\n",
+       "      <td>CAND2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>chr1_207052722_207053635</td>\n",
+       "      <td>PFKFB2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>chr10_96043175_96044011</td>\n",
+       "      <td>CCNJ</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>chr1_161225428_161226349</td>\n",
+       "      <td>MIR5187</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21023</th>\n",
+       "      <td>chr1_10430097_10431027</td>\n",
+       "      <td>CENPS-CORT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21024</th>\n",
+       "      <td>chr5_149549508_149550287</td>\n",
+       "      <td>CSNK1A1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21025</th>\n",
+       "      <td>chr5_149551086_149552006</td>\n",
+       "      <td>CSNK1A1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21026</th>\n",
+       "      <td>chr20_10673798_10674620</td>\n",
+       "      <td>JAG1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21027</th>\n",
+       "      <td>chr9_122228348_122229104</td>\n",
+       "      <td>LHX6</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>21028 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                           peak      target\n",
+       "0      chr7_130668147_130669092       COPG2\n",
+       "1        chr3_12796310_12797168       CAND2\n",
+       "2      chr1_207052722_207053635      PFKFB2\n",
+       "3       chr10_96043175_96044011        CCNJ\n",
+       "4      chr1_161225428_161226349     MIR5187\n",
+       "...                         ...         ...\n",
+       "21023    chr1_10430097_10431027  CENPS-CORT\n",
+       "21024  chr5_149549508_149550287     CSNK1A1\n",
+       "21025  chr5_149551086_149552006     CSNK1A1\n",
+       "21026   chr20_10673798_10674620        JAG1\n",
+       "21027  chr9_122228348_122229104        LHX6\n",
+       "\n",
+       "[21028 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "integrated"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if True:\n",
+    "    integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n",
+    "    integrated = integrated[['peak_id','gene_short_name']]\n",
+    "    integrated.columns = ['peak','target']\n",
+    "    \n",
+    "    # integrated.to_csv(f'{work_dir}/infer/celloracle/peak_gene.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tss_annotated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')\n",
+    "print(len(tss_annotated))\n",
+    "integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n",
+    "print(len(integrated))\n",
+    "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')\n",
+    "print(len(peak_gene_shortlist))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "unique peaks in peak-gene (17295,)\n",
+      "unique genes in peak-gene (16691,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('unique peaks in peak-gene', peak_gene_shortlist.peak_id.unique().shape)\n",
+    "print('unique genes in peak-gene', peak_gene_shortlist.gene_short_name.unique().shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "number of DORC genes with t of 10  0\n",
+      "number of DORC genes with t of 5  17\n"
+     ]
+    }
+   ],
+   "source": [
+    "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n",
+    "peak_gene_co_n = peak_gene_shortlist.groupby('gene_short_name').apply(lambda df:df['peak_id'].shape[0])\n",
+    "np.max(peak_gene_co_n.values), np.median(peak_gene_co_n.values)\n",
+    "\n",
+    "# print('number of TFs ', scenicplus.TF.unique().shape[0], ' CIS ', scenicplus.Region.unique().shape[0], ' gene ', scenicplus.Gene.unique().shape[0])\n",
+    "print('number of DORC genes with t of 10 ', (peak_gene_co_n.values > 10).sum())\n",
+    "print('number of DORC genes with t of 5 ', (peak_gene_co_n.values > 5).sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# convert peak to peak_id using celloracle function\n",
+    "tss_annotated_df = pd.DataFrame({\"peak_id\": ma.process_bed_file.df_to_list_peakstr(tss_annotated),\n",
+    "                            \"gene_short_name\": tss_annotated.gene_short_name.values})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# number of tss_annotated pairs in shortlisted peak\n",
+    "print(f'Percentage of proximal elements in the final peak gene pairs: {100*peak_gene_shortlist.peak_id.isin(tss_annotated_df.peak_id).sum()/len(peak_gene_shortlist)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dorc_shortlisted = peak_gene_shortlist.groupby('gene_short_name').size()\n",
+    "print(f\"In the short list: max peaks per gene: {dorc_shortlisted.max()}, median: {dorc_shortlisted.median()}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "threshold = 10\n",
+    "peak_new = integrated[integrated.coaccess >= threshold]\n",
+    "print('number of DORC: ', (peak_new.groupby('gene_short_name').size()>10).sum())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Base GRN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tfs_co = base_GRN.columns[3:]\n",
+    "keeo_cols = tfs_co.insert(0, ['gene_short_name', 'peak_id'])\n",
+    "df = base_GRN[keeo_cols]\n",
+    "# Melting the DataFrame\n",
+    "melted_df = pd.melt(df, id_vars=['gene_short_name', 'peak_id'], var_name='TF', value_name='Link')\n",
+    "\n",
+    "# Filtering out rows where there is no link (optional, if you only want interactions)\n",
+    "melted_df = melted_df[melted_df['Link'] == 1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f'TFs : {melted_df.TF.unique().shape} , regions : {melted_df.peak_id.unique().shape},  genes : {melted_df.gene_short_name.unique().shape}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Refined GRN "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tag = '' #'_hvg'\n",
+    "\n",
+    "links_o = co.load_hdf5(f\"{work_dir}/infer/celloracle/grn/links{tag}.celloracle.links\") "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "links_dict =  links_o.links_dict.copy()\n",
+    "tt = 0.05\n",
+    "links_dict_f = {}\n",
+    "for key, df in links_dict.items():\n",
+    "    mask = df.p<tt\n",
+    "    df = df[mask]\n",
+    "    if key=='agg_type':\n",
+    "        key='T cells'\n",
+    "    links_dict_f[key]=df\n",
+    "for cell_type, grn in links_dict_f.items():\n",
+    "    grn.to_csv(f'{work_dir}/infer/celloracle/grn/grn_{cell_type}.csv')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "celloracle",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/dockerfiles/scenicplus/Dockerfile b/dockerfiles/scenicplus/Dockerfile
new file mode 100644
index 000000000..e69de29bb
diff --git a/dockerfiles/scglue/Dockerfile b/dockerfiles/scglue/Dockerfile
new file mode 100644
index 000000000..e9555dd7a
--- /dev/null
+++ b/dockerfiles/scglue/Dockerfile
@@ -0,0 +1,29 @@
+
+FROM nvidia/cuda:11.7.1-base-ubuntu20.04
+# Install required dependencies for the R packages
+RUN apt-get update && apt-get install -y \
+    bedtools \
+    build-essential \
+    curl \
+    bash \
+    python3 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+RUN pip install \
+    scglue==0.3.2 \
+    pyscenic==0.12.1 \
+    numpy==1.23.4 \
+    scanpy \
+    networkx \
+    pyarrow \
+    cytoolz \
+    scikit-misc \
+    cuda-python
+
+# Set the working directory
+WORKDIR /workspace
+
+# Default command
+CMD ["python"]
diff --git a/scripts/run_grn_inference.sh b/scripts/run_grn_inference.sh
index 363384f95..97d7b6c17 100644
--- a/scripts/run_grn_inference.sh
+++ b/scripts/run_grn_inference.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
-RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-resources_dir="s3://openproblems-data/resources/perturbation_prediction/datasets/"
-publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"
+RUN_ID="run_figr_$(date +%Y-%m-%d_%H-%M-%S)"
+resources_dir="s3://openproblems-data/resources/grn"
+publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"
 
 cat > /tmp/params.yaml << HERE
 param_list:
@@ -11,11 +11,7 @@ param_list:
     de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
     id_map: "$resources_dir/neurips-2023-data/id_map.csv"
     layer: clipped_sign_log10_pval
-  # - id: neurips-2023-kaggle
-  #   de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
-  #   de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
-  #   id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
-  #   layer: sign_log10_pval
+
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
diff --git a/src/methods/scenicplus/config.vsh.yaml b/src/methods/scenicplus/config.vsh.yaml
index 91147dfb2..49cd562b8 100644
--- a/src/methods/scenicplus/config.vsh.yaml
+++ b/src/methods/scenicplus/config.vsh.yaml
@@ -16,12 +16,12 @@ functionality:
 
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: janursa/scenicplus:19-08-2024
     setup:
       - type: python
         packages: [  ]
       - type: python
-        git: [ https://github.com/aertslab/scenicplus ]
+        git: [ ]
 
 
   - type: native
diff --git a/src/methods/scenicplus/script.py b/src/methods/scenicplus/script.py
index c08659d80..a59f2a4b9 100644
--- a/src/methods/scenicplus/script.py
+++ b/src/methods/scenicplus/script.py
@@ -9,7 +9,7 @@
   "prediction": "output/prediction.csv",
 }
 ## VIASH END
-sys.path.append(meta["resources_dir"])
+# sys.path.append(meta["resources_dir"])
 from main import main 
 prediction = main(par)
 
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 4d4aefd75..ba46c1230 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -12,17 +12,7 @@ functionality:
           __merge__: ../../api/file_multiomics_atac_h5ad.yaml
           required: false
           direction: input
-        - name: --perturbation_data
-          __merge__: ../../api/file_perturbation_h5ad.yaml
-          required: true
-          direction: input
-        - name: --layer
-          required: true
-          type: string
-          direction: input
-          default: lognorm
-          description: Which layer to use.
-
+        
 
     - name: Outputs
       arguments:
@@ -37,11 +27,6 @@ functionality:
           required: true
           direction: output
           default: method_configs.yaml
-        - name: "--metric_configs"
-          type: file
-          required: true
-          direction: output
-          default: metric_configs.yaml
         - name: "--dataset_uns"
           type: file
           required: true
@@ -58,10 +43,7 @@ functionality:
           type: string
           multiple: true
           description: A list of method ids to run. If not specified, all methods will be run.
-        - name: "--metric_ids"
-          type: string
-          multiple: true
-          description: A list of metric ids to run. If not specified, all metric will be run.
+
 
   resources:
     - type: nextflow_script
@@ -69,6 +51,9 @@ functionality:
       entrypoint: run_wf
     - type: file
       path: "../../api/task_info.yaml"
+  dependencies:
+  - name: common/extract_metadata
+    repository: openproblemsv2
   # dependencies:
   #   # - name: common/extract_metadata
   #   #   repository: openproblemsv2
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 1cb644811..06a5d36cf 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -1,6 +1,6 @@
 // construct list of methods
 methods = [
-  scglue
+  figr
 ]
 
 // construct list of metrics

From ef4be12ec7301d805629238b0892ffe2eba6eaba Mon Sep 17 00:00:00 2001
From: matin <matin@M-As-MacBook-Pro.local>
Date: Fri, 19 Jul 2024 22:15:37 +0100
Subject: [PATCH 2/4] after merge

---
 .gitignore       |    4 +-
 celloracle.ipynb | 1241 ----------------------------------------------
 2 files changed, 3 insertions(+), 1242 deletions(-)
 delete mode 100644 celloracle.ipynb

diff --git a/.gitignore b/.gitignore
index 256176b9d..c04996a27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@
 resources/
 resources_test/
 output/
+target/
+local/
 
 # related to python
 .ipynb_checkpoints
@@ -21,4 +23,4 @@ work
 
 # IDE related
 .idea
-.vscode
\ No newline at end of file
+.vscode
diff --git a/celloracle.ipynb b/celloracle.ipynb
deleted file mode 100644
index d14b4de57..000000000
--- a/celloracle.ipynb
+++ /dev/null
@@ -1,1241 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Imports"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/root/.local/lib/python3.10/site-packages/numba/np/ufunc/parallel.py:371: NumbaWarning: \u001b[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.\u001b[0m\n",
-      "  warnings.warn(problem)\n"
-     ]
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import os \n",
-    "from celloracle import motif_analysis as ma\n",
-    "import pandas as pd\n",
-    "import celloracle as co\n",
-    "import anndata\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "import anndata as ad\n",
-    "from local_utils import plots"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import anndata as ad\n",
-    "\n",
-    "## VIASH START\n",
-    "par = {\n",
-    "  \"multiomics_rna\": \"resources/grn-benchmark/multiomics_rna.h5ad\",\n",
-    "  \"multiomics_atac\": \"resources/grn-benchmark/multiomics_atac.h5ad\",\n",
-    "  \"annotation_file\": \"resources/grn-benchmark/annotation_file\",\n",
-    "  \"motif_file\": \"resources/grn-benchmark/motif_file\",\n",
-    "  \"prediction\": \"output/prediction.csv\",\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading input files\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('Reading input files', flush=True)\n",
-    "multiomics_rna = ad.read_h5ad(par[\"multiomics_rna\"])\n",
-    "multiomics_atac = ad.read_h5ad(par[\"multiomics_atac\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "que bed peaks: 135358\n",
-      "tss peaks in que: 21028\n"
-     ]
-    }
-   ],
-   "source": [
-    "peaks = multiomics_atac.var_names.to_numpy()\n",
-    "\n",
-    "peaks = [peak.replace(':','_').replace(\"-\",'_') for peak in peaks]\n",
-    "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tss_annotated['peak_id'] = tss_annotated['chr'].astype(str)+\"_\"+tss_annotated['start'].astype(str)+\"_\"+tss_annotated['end'].astype(str)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# pd.read_csv('../perturb-multiomics-grn/output/infer/celloracle/peak_gene.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "hg38 installation:  True\n",
-      "No motif data entered. Loading default motifs for your species ...\n",
-      " Default motif for vertebrate: gimme.vertebrate.v5.0. \n",
-      " For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html \n",
-      "\n",
-      "Initiating scanner... \n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "DEBUG:gimme.scanner:using background: genome hg38 with size 200\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n",
-      "\n",
-      "Motif scan started .. It may take long time.\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "392dd15eb4634227843443fd729b77fe",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "scanning:   0%|          | 0/17276 [00:00<?, ? sequences/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "DEBUG:gimme.scanner:Scanning\n"
-     ]
-    }
-   ],
-   "source": [
-    "peak_gene = tss_annotated\n",
-    "# PLEASE make sure reference genome is correct.\n",
-    "ref_genome = \"hg38\"\n",
-    "\n",
-    "genome_installation = ma.is_genome_installed(ref_genome=ref_genome,\n",
-    "                                             genomes_dir=None)\n",
-    "print(ref_genome, \"installation: \", genome_installation)\n",
-    "\n",
-    "# Instantiate TFinfo object\n",
-    "tfi = ma.TFinfo(peak_data_frame=peak_gene, \n",
-    "                ref_genome=\"hg38\",\n",
-    "                genomes_dir=None) \n",
-    "\n",
-    "tfi.scan(fpr=0.05, \n",
-    "         motifs=None,  # If you enter None, default motifs will be loaded.\n",
-    "         verbose=True)\n",
-    "# Check motif scan results\n",
-    "tfi.scanned_df.head()\n",
-    "# Reset filtering \n",
-    "tfi.reset_filtering()\n",
-    "\n",
-    "# Do filtering\n",
-    "tfi.filter_motifs_by_score(threshold=10)\n",
-    "\n",
-    "# Format post-filtering results.\n",
-    "tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)\n",
-    "\n",
-    "# Format and save \n",
-    "df = tfi.to_dataframe()\n",
-    "df.head()\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Annotations"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get chromosome sizes (for hg38 here). \n",
-    "target_url='http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'\n",
-    "chromsizes=pd.read_csv(target_url, sep='\\t', header=None)\n",
-    "chromsizes.columns=['Chromosome', 'End']\n",
-    "chromsizes['Start']=[0]*chromsizes.shape[0]\n",
-    "chromsizes=chromsizes.loc[:,['Chromosome', 'Start', 'End']]\n",
-    "# Exceptionally in this case, to agree with CellRangerARC annotations\n",
-    "chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].replace('v', '.') for x in range(len(chromsizes['Chromosome']))]\n",
-    "chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].split('_')[1] if len(chromsizes['Chromosome'][x].split('_')) > 1 else chromsizes['Chromosome'][x] for x in range(len(chromsizes['Chromosome']))]\n",
-    "\n",
-    "chromsizes = chromsizes[['Chromosome', 'End']]\n",
-    "# save\n",
-    "chromsizes.to_csv(f'{work_dir}/cicero/chromsizes.csv', index=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# check if all chr are found\n",
-    "chrs = df_peaks.locations.apply(lambda x:x.split('_')[0])\n",
-    "chrs.isin(chromsizes.Chromosome).sum()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Celloracle"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Base GRN\n",
-    "This section create base GRN given cicero results.\n",
-    "###  Associate peaks with TSS\n",
-    "Each peak is associated with promotors of target genes (+- 1kbp). Celloracle should be installed for this step."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 51,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "que bed peaks: 131047\n",
-      "tss peaks in que: 20898\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n",
-      "GL000194.1\t55749\t56580\n",
-      "\n",
-      "***** WARNING: File /tmp/pybedtools.f78i1zv8.tmp has inconsistent naming convention for record:\n",
-      "GL000194.1\t55749\t56580\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###  Merge TSS peaks with cicero connections"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 81,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 82,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "integrated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###  Shortlisten the peak-gene connections"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "peak_gene = integrated[integrated.coaccess>0.8].reset_index(drop=True)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Create base GRN\n",
-    "Running interactively might take a long time."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Base GRN: only proximal \n",
-    "This is only based on proximal cis elements and doesnt require cicero"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 54,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import anndata as ad \n",
-    "adata_atac = ad.read_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 55,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "peaks = format_peakname(adata_atac.var.reset_index()).location.values"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 56,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "que bed peaks: 135418\n",
-      "tss peaks in que: 21028\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n",
-      "chr10\t100001032\t100001800\n",
-      "\n",
-      "***** WARNING: File /tmp/pybedtools.dxm99xoa.tmp has inconsistent naming convention for record:\n",
-      "chr10\t100001032\t100001800\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=\"hg38\")\n",
-    "tss_annotated.to_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "##----- integrate_tss_peak_with_cicero\n",
-    "import numpy as np\n",
-    "from celloracle.motif_analysis.process_bed_file import df_to_list_peakstr\n",
-    "# 1. check tss data format and convert if needed\n",
-    "tss_peak=tss_annotated\n",
-    "tss = tss_peak.copy()\n",
-    "if np.all([i in tss.columns for i in [\"chr\", \"start\", \"end\"]]):\n",
-    "    tss = pd.DataFrame({\"peak_id\": df_to_list_peakstr(tss),\n",
-    "                        \"gene_short_name\": tss.gene_short_name.values})\n",
-    "else:\n",
-    "    raise ValueError(\"tss_peak format error\")\n",
-    "\n",
-    "peak_gene = tss"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 78,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "peak_gene.to_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### TF motifs"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd \n",
-    "from celloracle import motif_analysis as ma\n",
-    "import genomepy\n",
-    "genomes_dir='/beegfs/desy/user/nourisaj/op_multiomics_grn/output/celloracle'\n",
-    "peak_gene = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n",
-    "genomepy.install_genome(name=\"hg38\", provider=\"UCSC\", genomes_dir=genomes_dir)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 48,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# PLEASE make sure reference genome is correct.\n",
-    "ref_genome = \"hg38\"\n",
-    "\n",
-    "genome_installation = ma.is_genome_installed(ref_genome=ref_genome,\n",
-    "                                             genomes_dir=genomes_dir)\n",
-    "print(ref_genome, \"installation: \", genome_installation)\n",
-    "\n",
-    "# Instantiate TFinfo object\n",
-    "tfi = ma.TFinfo(peak_data_frame=peak_gene, \n",
-    "                ref_genome=\"hg38\",\n",
-    "                genomes_dir=genomes_dir) \n",
-    "\n",
-    "tfi.scan(fpr=0.05, \n",
-    "         motifs=None,  # If you enter None, default motifs will be loaded.\n",
-    "         verbose=True)\n",
-    "# Check motif scan results\n",
-    "tfi.scanned_df.head()\n",
-    "# Reset filtering \n",
-    "tfi.reset_filtering()\n",
-    "\n",
-    "# Do filtering\n",
-    "tfi.filter_motifs_by_score(threshold=10)\n",
-    "\n",
-    "# Format post-filtering results.\n",
-    "tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)\n",
-    "\n",
-    "# Format and save \n",
-    "df = tfi.to_dataframe()\n",
-    "df.head()\n",
-    "df.to_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## GRN construction\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Preprocessing scRNA-seq"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import celloracle as co\n",
-    "import anndata\n",
-    "import scanpy as sc\n",
-    "adata = anndata.read_h5ad(f'{work_dir}/scRNA/adata_rna.h5ad')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "AnnData object with n_obs × n_vars = 25551 × 22787\n",
-       "    obs: 'cell_type', 'donor_id', 'n_genes'\n",
-       "    var: 'n_cells'\n",
-       "    layers: 'counts', 'x_norm'"
-      ]
-     },
-     "execution_count": 87,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "del adata.varm \n",
-    "del adata.uns \n",
-    "del adata.obsp \n",
-    "del adata.obsm \n",
-    "del adata.obs['louvain']\n",
-    "adata"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 88,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "adata.X = adata.layers['counts'].copy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 89,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([1., 1., 1., ..., 2., 1., 1.])"
-      ]
-     },
-     "execution_count": 89,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "adata.X.data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 90,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 91,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "filter_result = sc.pp.filter_genes_dispersion(adata.X,\n",
-    "                                              flavor='cell_ranger',\n",
-    "                                              n_top_genes=3000,\n",
-    "                                              log=False)\n",
-    "\n",
-    "# Subset the genes\n",
-    "adata = adata[:, filter_result.gene_subset]\n",
-    "\n",
-    "# Renormalize after filtering\n",
-    "sc.pp.normalize_per_cell(adata)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 95,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Log transformation and scaling\n",
-    "sc.pp.log1p(adata)\n",
-    "sc.pp.scale(adata)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 100,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# PCA\n",
-    "sc.tl.pca(adata, svd_solver='arpack')\n",
-    "\n",
-    "# Diffusion map\n",
-    "sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20)\n",
-    "\n",
-    "sc.tl.diffmap(adata)\n",
-    "# Calculate neihbors again based on diffusionmap \n",
-    "sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_diffmap')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 101,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sc.tl.louvain(adata, resolution=0.8)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 59,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "AnnData object with n_obs × n_vars = 25551 × 3000\n",
-       "    obs: 'cell_type', 'donor_id', 'n_genes', 'n_counts_all', 'n_counts', 'louvain'\n",
-       "    var: 'n_cells', 'mean', 'std'\n",
-       "    uns: 'log1p', 'pca', 'neighbors', 'diffmap_evals', 'louvain'\n",
-       "    obsm: 'X_pca', 'X_diffmap'\n",
-       "    varm: 'PCs'\n",
-       "    layers: 'counts', 'x_norm'\n",
-       "    obsp: 'distances', 'connectivities'"
-      ]
-     },
-     "execution_count": 59,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "adata"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 102,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sc.tl.paga(adata, groups='louvain')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 103,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sc.pl.paga(adata)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 104,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING: Package 'fa2' is not installed, falling back to layout 'fr'.To use the faster and better ForceAtlas2 layout, install package 'fa2' (`pip install fa2`).\n"
-     ]
-    }
-   ],
-   "source": [
-    "sc.tl.draw_graph(adata, init_pos='paga', random_state=123)\n",
-    "sc.pl.draw_graph(adata, color='louvain', legend_loc='on data')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Metadata columns : ['cell_type', 'donor_id', 'n_genes', 'louvain', 'n_counts_all', 'n_counts']\n",
-      "Dimensional reduction:  ['X_diffmap', 'X_draw_graph_fr', 'X_pca', 'X_umap']\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Check data in anndata\n",
-    "print(\"Metadata columns :\", list(adata.obs.columns))\n",
-    "print(\"Dimensional reduction: \", list(adata.obsm.keys()))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 108,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([1., 1., 1., ..., 1., 1., 2.])"
-      ]
-     },
-     "execution_count": 108,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "## run based on counts as suggesyed by co pipeline\n",
-    "adata.X = adata.layers[\"counts\"]\n",
-    "adata.X.data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Load base GRN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 106,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Initialize "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 109,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING: adata.X seems to be already log-transformed.\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Instantiate Oracle object\n",
-    "oracle = co.Oracle()\n",
-    "# Instantiate Oracle object.\n",
-    "oracle.import_anndata_as_raw_count(adata=adata,\n",
-    "                                   cluster_column_name=\"cell_type\",\n",
-    "                                   embedding_name=\"X_draw_graph_fr\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 113,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# You can load TF info dataframe with the following code.\n",
-    "oracle.import_TF_data(TF_info_matrix=base_GRN)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 114,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "25\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "oracle.perform_PCA()\n",
-    "plt.plot(np.cumsum(oracle.pca.explained_variance_ratio_)[:100])\n",
-    "n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]\n",
-    "plt.axvline(n_comps, c=\"k\")\n",
-    "plt.show()\n",
-    "print(n_comps)\n",
-    "n_comps = min(n_comps, 50)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 115,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "cell number is :25551\n",
-      "Auto-selected k is :50\n"
-     ]
-    }
-   ],
-   "source": [
-    "n_cell = oracle.adata.shape[0]\n",
-    "print(f\"cell number is :{n_cell}\")\n",
-    "k = min([int(0.025*n_cell), 50])\n",
-    "print(f\"Auto-selected k is :{k}\")\n",
-    "oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,\n",
-    "                      b_maxl=k*4, n_jobs=4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 117,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "oracle.to_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### GRN calculation\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if True: # run locally\n",
-    "    # Load file.\n",
-    "    oracle = co.load_hdf5(f'{work_dir}/infer/celloracle/grn/presaved_3000.celloracle.oracle')\n",
-    "    # This step may take some time.\n",
-    "    links = oracle.get_links(cluster_name_for_GRN_unit=\"cell_type\", alpha=10,\n",
-    "                            verbose_level=10)\n",
-    "    links.to_hdf5(file_path=f\"{work_dir}/infer/celloracle/grn/links_3000.celloracle.links\")\n",
-    "else:\n",
-    "    !python celloracle/run_grn.py"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Post evaluation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Peak gene connections\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>peak</th>\n",
-       "      <th>target</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>chr7_130668147_130669092</td>\n",
-       "      <td>COPG2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>chr3_12796310_12797168</td>\n",
-       "      <td>CAND2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>chr1_207052722_207053635</td>\n",
-       "      <td>PFKFB2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>chr10_96043175_96044011</td>\n",
-       "      <td>CCNJ</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>chr1_161225428_161226349</td>\n",
-       "      <td>MIR5187</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21023</th>\n",
-       "      <td>chr1_10430097_10431027</td>\n",
-       "      <td>CENPS-CORT</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21024</th>\n",
-       "      <td>chr5_149549508_149550287</td>\n",
-       "      <td>CSNK1A1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21025</th>\n",
-       "      <td>chr5_149551086_149552006</td>\n",
-       "      <td>CSNK1A1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21026</th>\n",
-       "      <td>chr20_10673798_10674620</td>\n",
-       "      <td>JAG1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21027</th>\n",
-       "      <td>chr9_122228348_122229104</td>\n",
-       "      <td>LHX6</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>21028 rows × 2 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                           peak      target\n",
-       "0      chr7_130668147_130669092       COPG2\n",
-       "1        chr3_12796310_12797168       CAND2\n",
-       "2      chr1_207052722_207053635      PFKFB2\n",
-       "3       chr10_96043175_96044011        CCNJ\n",
-       "4      chr1_161225428_161226349     MIR5187\n",
-       "...                         ...         ...\n",
-       "21023    chr1_10430097_10431027  CENPS-CORT\n",
-       "21024  chr5_149549508_149550287     CSNK1A1\n",
-       "21025  chr5_149551086_149552006     CSNK1A1\n",
-       "21026   chr20_10673798_10674620        JAG1\n",
-       "21027  chr9_122228348_122229104        LHX6\n",
-       "\n",
-       "[21028 rows x 2 columns]"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "integrated"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if True:\n",
-    "    integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n",
-    "    integrated = integrated[['peak_id','gene_short_name']]\n",
-    "    integrated.columns = ['peak','target']\n",
-    "    \n",
-    "    # integrated.to_csv(f'{work_dir}/infer/celloracle/peak_gene.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tss_annotated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/tss_annotated.csv')\n",
-    "print(len(tss_annotated))\n",
-    "integrated = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/integrated.csv')\n",
-    "print(len(integrated))\n",
-    "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv')\n",
-    "print(len(peak_gene_shortlist))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "unique peaks in peak-gene (17295,)\n",
-      "unique genes in peak-gene (16691,)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('unique peaks in peak-gene', peak_gene_shortlist.peak_id.unique().shape)\n",
-    "print('unique genes in peak-gene', peak_gene_shortlist.gene_short_name.unique().shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "number of DORC genes with t of 10  0\n",
-      "number of DORC genes with t of 5  17\n"
-     ]
-    }
-   ],
-   "source": [
-    "peak_gene_shortlist = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/peak_gene.csv', index_col=0)\n",
-    "peak_gene_co_n = peak_gene_shortlist.groupby('gene_short_name').apply(lambda df:df['peak_id'].shape[0])\n",
-    "np.max(peak_gene_co_n.values), np.median(peak_gene_co_n.values)\n",
-    "\n",
-    "# print('number of TFs ', scenicplus.TF.unique().shape[0], ' CIS ', scenicplus.Region.unique().shape[0], ' gene ', scenicplus.Gene.unique().shape[0])\n",
-    "print('number of DORC genes with t of 10 ', (peak_gene_co_n.values > 10).sum())\n",
-    "print('number of DORC genes with t of 5 ', (peak_gene_co_n.values > 5).sum())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# convert peak to peak_id using celloracle function\n",
-    "tss_annotated_df = pd.DataFrame({\"peak_id\": ma.process_bed_file.df_to_list_peakstr(tss_annotated),\n",
-    "                            \"gene_short_name\": tss_annotated.gene_short_name.values})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# number of tss_annotated pairs in shortlisted peak\n",
-    "print(f'Percentage of proximal elements in the final peak gene pairs: {100*peak_gene_shortlist.peak_id.isin(tss_annotated_df.peak_id).sum()/len(peak_gene_shortlist)}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dorc_shortlisted = peak_gene_shortlist.groupby('gene_short_name').size()\n",
-    "print(f\"In the short list: max peaks per gene: {dorc_shortlisted.max()}, median: {dorc_shortlisted.median()}\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "threshold = 10\n",
-    "peak_new = integrated[integrated.coaccess >= threshold]\n",
-    "print('number of DORC: ', (peak_new.groupby('gene_short_name').size()>10).sum())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Base GRN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "base_GRN = pd.read_csv(f'{work_dir}/infer/celloracle/baseGRN/grn_celloracle_base.csv', index_col=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tfs_co = base_GRN.columns[3:]\n",
-    "keeo_cols = tfs_co.insert(0, ['gene_short_name', 'peak_id'])\n",
-    "df = base_GRN[keeo_cols]\n",
-    "# Melting the DataFrame\n",
-    "melted_df = pd.melt(df, id_vars=['gene_short_name', 'peak_id'], var_name='TF', value_name='Link')\n",
-    "\n",
-    "# Filtering out rows where there is no link (optional, if you only want interactions)\n",
-    "melted_df = melted_df[melted_df['Link'] == 1]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(f'TFs : {melted_df.TF.unique().shape} , regions : {melted_df.peak_id.unique().shape},  genes : {melted_df.gene_short_name.unique().shape}')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Refined GRN "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tag = '' #'_hvg'\n",
-    "\n",
-    "links_o = co.load_hdf5(f\"{work_dir}/infer/celloracle/grn/links{tag}.celloracle.links\") "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "links_dict =  links_o.links_dict.copy()\n",
-    "tt = 0.05\n",
-    "links_dict_f = {}\n",
-    "for key, df in links_dict.items():\n",
-    "    mask = df.p<tt\n",
-    "    df = df[mask]\n",
-    "    if key=='agg_type':\n",
-    "        key='T cells'\n",
-    "    links_dict_f[key]=df\n",
-    "for cell_type, grn in links_dict_f.items():\n",
-    "    grn.to_csv(f'{work_dir}/infer/celloracle/grn/grn_{cell_type}.csv')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "celloracle",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

From 1d083dd8f7f75ecc60644dc8b46633bb0fcee8f1 Mon Sep 17 00:00:00 2001
From: matin <matin@M-As-MacBook-Pro.local>
Date: Fri, 19 Jul 2024 22:46:24 +0100
Subject: [PATCH 3/4] no viash

---
 src/api/comp_metric.yaml                      |  1 +
 .../{config.vsh.yaml => config.novsh.yaml}    |  0
 src/pre_methods/cistopic/config.novsh.yaml    | 23 +++++++++++++++++++
 .../{config.vsh.yaml => config.novsh.yaml}    |  0
 4 files changed, 24 insertions(+)
 rename src/methods/scenicplus/{config.vsh.yaml => config.novsh.yaml} (100%)
 create mode 100644 src/pre_methods/cistopic/config.novsh.yaml
 rename src/pre_methods/format_multiomics_R/{config.vsh.yaml => config.novsh.yaml} (100%)

diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
index 68eef5701..f50aad02f 100644
--- a/src/api/comp_metric.yaml
+++ b/src/api/comp_metric.yaml
@@ -25,6 +25,7 @@ functionality:
       direction: input
       default: ridge
       description: name of regretion to use
+      multiple: true
     - name: --subsample
       type: integer
       direction: input
diff --git a/src/methods/scenicplus/config.vsh.yaml b/src/methods/scenicplus/config.novsh.yaml
similarity index 100%
rename from src/methods/scenicplus/config.vsh.yaml
rename to src/methods/scenicplus/config.novsh.yaml
diff --git a/src/pre_methods/cistopic/config.novsh.yaml b/src/pre_methods/cistopic/config.novsh.yaml
new file mode 100644
index 000000000..ea092766e
--- /dev/null
+++ b/src/pre_methods/cistopic/config.novsh.yaml
@@ -0,0 +1,23 @@
+functionality:
+  name: cistopic
+  info:
+    label: cistopic
+    summary: "creates cistopic"
+
+
+  resources:
+    - type: r_script
+      path: script.R
+
+
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_r:1.0.2
+    # setup:
+    #   - type: r
+    #     packages: [dplyr, FNN, chromVAR, doParallel, BuenColors, FigR, BSgenome.Hsapiens.UCSC.hg38]
+
+  - type: native
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/pre_methods/format_multiomics_R/config.vsh.yaml b/src/pre_methods/format_multiomics_R/config.novsh.yaml
similarity index 100%
rename from src/pre_methods/format_multiomics_R/config.vsh.yaml
rename to src/pre_methods/format_multiomics_R/config.novsh.yaml

From 47efd3df23c6d6a19669612b22ab2552facc70a3 Mon Sep 17 00:00:00 2001
From: matin <matin@M-As-MacBook-Pro.local>
Date: Mon, 22 Jul 2024 20:28:11 +0100
Subject: [PATCH 4/4] test resources added

---
 notebooks/create_resources.ipynb              |  167 +-
 .../cistarget/tf_lists/allTFs_hg38.txt        | 1892 +++++++++++++++++
 src/api/comp_method.yaml                      |    6 +-
 3 files changed, 2053 insertions(+), 12 deletions(-)
 create mode 100644 notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt

diff --git a/notebooks/create_resources.ipynb b/notebooks/create_resources.ipynb
index b9f8df256..de7ac8f2f 100644
--- a/notebooks/create_resources.ipynb
+++ b/notebooks/create_resources.ipynb
@@ -18,12 +18,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "import anndata as ad\n",
     "import pandas as pd\n",
+    "\n",
     "import numpy as np\n",
     "data_dir = '../../perturb-multiomics-grn/output/'\n",
     "\n",
@@ -161,6 +162,105 @@
     "adata_bulk.write(f'{resource_dir}/perturbation_data.h5ad')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# test resources"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "test_resource_dir = f'{resource_dir}/../../resources_test/grn-benchmark'\n",
+    "os.makedirs(test_resource_dir, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adata_rna = ad.read_h5ad(f'{resource_dir}/multiomics_rna.h5ad')\n",
+    "adata_atac = ad.read_h5ad(f'{resource_dir}/multiomics_atac.h5ad')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "peaks = pd.read_csv(f'{resource_dir}/peak_gene_models/granie.csv').peak.to_numpy()\n",
+    "hvgs = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['hvgs']\n",
+    "genes_multi = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['gene_names']\n",
+    "tfs = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['tf_list']\n",
+    "genes = set(tfs) & set(genes_multi)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# shorten rna \n",
+    "mask = adata_rna.obs.donor_id=='donor_0'\n",
+    "adata_rna_s = adata_rna[mask]\n",
+    "random_indices = np.random.choice(adata_rna_s.obs.index, 1000, replace=False)\n",
+    "adata_rna_s = adata_rna_s[random_indices, adata_rna_s.var_names.isin(genes)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "View of AnnData object with n_obs × n_vars = 1000 × 4962\n",
+       "    obs: 'cell_type', 'donor_id'"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# shorten atac\n",
+    "adata_atac_s = adata_atac[adata_atac.obs.index.isin(adata_rna_s.obs.index), adata_atac.var.index.isin(peaks)]\n",
+    "adata_atac_s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adata_rna_s.write(f'{test_resource_dir}/multiomics_rna.h5ad')\n",
+    "adata_atac_s.write(f'{test_resource_dir}/multiomics_atac.h5ad')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# shorten perturbation\n",
+    "adata_bulk = ad.read_h5ad(f'{resource_dir}/perturbation_data.h5ad')\n",
+    "adata_bulk[:200, adata_bulk.var_names.isin(genes)].write(f'{test_resource_dir}/perturbation_data.h5ad')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -168,9 +268,16 @@
     "# Prior"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## tf names\n"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -179,7 +286,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -188,13 +295,59 @@
     "prior_adata.uns['tf_list'] = tf_list\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## gene names"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bulk_adata = ad.read_h5ad(f'{resource_dir}/perturbation_data.h5ad')\n",
+    "prior_adata.uns['gene_names_pert'] = bulk_adata.var_names.to_numpy()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
-    "prior_adata.write(f'{resource_dir}/prior_data.h5ad')"
+    "bulk_adata = ad.read_h5ad(f'{resource_dir}/multiomics_rna.h5ad')\n",
+    "prior_adata.uns['gene_names'] = bulk_adata.var_names.to_numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bulk_adata = ad.read_h5ad(f'{resource_dir}/multiomics_atac.h5ad')\n",
+    "prior_adata.uns['peak'] = bulk_adata.var_names.to_numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prior_adata.uns['hvgs'] = np.loadtxt(f'{resource_dir}/hvgs.txt', dtype=str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prior_adata.write(f'{resource_dir}/prior_data.h5ad')\n"
    ]
   },
   {
@@ -209,9 +362,7 @@
    "execution_count": 9,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "bulk_adata = ad.read_h5ad(f'{work_dir}/preprocess/bulk_adata_integrated.h5ad')"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
diff --git a/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt b/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt
new file mode 100644
index 000000000..6769dac51
--- /dev/null
+++ b/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt
@@ -0,0 +1,1892 @@
+ZNF354C
+KLF12
+ZNF143
+ZIC2
+ZNF274
+SP2
+ZBTB7A
+BCL6B
+ZBTB49
+ZIC1
+ZNF232
+ZNF282
+ZNF410
+ZSCAN16
+ZNF524
+ZNF713
+ZNF75A
+ZSCAN4
+ZNF200
+SNAI2
+KLF1
+BCL6
+EGR2
+OVOL2
+GFI1
+GFI1B
+KLF11
+WT1
+ZNF655
+FOXC1
+ARX
+VSX1
+CRX
+PBX4
+PHOX2B
+VAX2
+VSX2
+MSX2
+ESX1
+HOXD13
+NKX2-8
+VENTX
+HESX1
+PITX2
+PROP1
+ISX
+NKX2-5
+SIX6
+HOXC4
+HOXB7
+PAX6
+PAX7
+PAX4
+PAX3
+POU4F3
+POU6F2
+POU3F4
+NR1H4
+NR2E3
+POU2F1
+RBPJ
+FOXP1
+MAX
+PHF1
+MTF2
+BCL11A
+BCL11B
+FOXN2
+FOXR1
+SOX4
+SOHLH2
+ZSCAN29
+PLAGL2
+VEZF1
+ZBTB44
+CENPBD1
+TIGD1
+CXXC5
+FOXN3
+HDX
+DUXA
+MSANTD3
+ZZZ3
+LCORL
+NFATC4
+CUX2
+CUX1
+DLX3
+LHX9
+POU5F1B
+NFATC2
+ZFHX3
+KDM2B
+KMT2A
+DNMT1
+TET1
+DMRT3
+DMRT1
+DMRTA2
+DMRT2
+E2F2
+FOXK1
+FOXG1
+GCM1
+HOXA2
+NOBOX
+LHX2
+LHX6
+TLX2
+EMX1
+ZFHX2
+LBX1
+HOXB13
+ZHX1
+POU6F1
+SHOX
+ANHX
+MYRF
+NR2E1
+NR3C2
+NR2F6
+RARG
+NFATC3
+RFX2
+GMEB1
+THAP12
+GLI2
+GLI3
+GLI1
+ETS1
+NFIL3
+MZF1
+RREB1
+SPIB
+FOXF2
+FOXD1
+PBX1
+IRF2
+RORA
+PPARG
+REL
+RELA
+SOX9
+SRY
+TFEB
+TCF4
+CEBPE
+XBP1
+PRDM1
+EHF
+ERG
+FOXJ3
+GATA1
+MEIS2
+POU2F2
+HSF2
+MEF2C
+RXRG
+NFATC1
+RFX3
+RUNX3
+EOMES
+TFAP2B
+TFAP2C
+TFAP2A
+ZBED1
+MLXIPL
+TFE3
+MNT
+TCF3
+TFAP4
+TFEC
+MLX
+MYF6
+BHLHE41
+BHLHE23
+ARNTL
+BHLHE40
+CLOCK
+HEY2
+USF1
+HEY1
+MESP1
+NEUROD2
+NHLH1
+OLIG3
+NEUROG2
+MSC
+HES7
+BHLHA15
+BHLHE22
+FIGLA
+OLIG1
+HES5
+SREBF2
+OLIG2
+MGA
+DBP
+CREB3
+HLF
+NFE2
+BATF3
+ATF4
+NRL
+JDP2
+CEBPG
+CREB3L1
+TEF
+CEBPB
+MAFF
+MAFG
+MAFK
+CEBPD
+ATF7
+YY1
+CTCF
+SP4
+GLIS3
+PRDM4
+EGR1
+GLIS2
+KLF16
+EGR4
+ZNF740
+ZIC3
+ZBTB7B
+SP8
+HIC2
+KLF13
+HINFP
+SP3
+GLIS1
+ZIC4
+EGR3
+ZBTB18
+ZNF784
+ZBTB7C
+SP1
+MTF1
+ZKSCAN3
+SCRT2
+YY2
+SCRT1
+KLF14
+CENPB
+ONECUT2
+ONECUT1
+ONECUT3
+E2F1
+E2F3
+E2F8
+E2F7
+E2F4
+EBF1
+ETV1
+SPI1
+ELF4
+ETV2
+ERF
+ELK3
+ETV3
+ELF1
+SPDEF
+ELK1
+ELF5
+ETV6
+FLI1
+GABPA
+ELK4
+ELF3
+FEV
+SPIC
+ETV4
+ETV5
+FOXP3
+FOXJ2
+FOXO3
+FOXO1
+FOXI1
+FOXB1
+FOXL1
+FOXC2
+FOXO4
+FOXD2
+FOXD3
+FOXO6
+GATA3
+GATA5
+GATA4
+GCM2
+GRHL1
+TFCP2
+MEOX1
+DLX6
+ALX4
+GSC2
+PITX1
+HOXA9
+RHOXF1
+MEIS3
+DLX5
+HOXA1
+HOXA13
+EVX1
+MEOX2
+PITX3
+DLX4
+CDX1
+OTX1
+DLX2
+PRRX1
+TGIF2
+HOXB5
+HOXB3
+HOXC13
+HOXC11
+HOXC12
+HOXD11
+MNX1
+BARX1
+GSC
+RAX
+HNF1A
+LMX1B
+PDX1
+BARHL2
+MEIS1
+DLX1
+HMBOX1
+VAX1
+TGIF2LX
+ALX3
+ISL2
+PKNOX1
+LMX1A
+EN1
+MSX1
+EN2
+UNCX
+GBX1
+PHOX2A
+PKNOX2
+CDX2
+OTX2
+DRGX
+PRRX2
+GBX2
+SHOX2
+GSX1
+HOXD12
+EMX2
+IRX2
+HOXB2
+RAX2
+EVX2
+HOXD8
+IRX5
+TGIF1
+LBX2
+ALX1
+GSX2
+HOXC10
+MIXL1
+HMX3
+HMX2
+BSX
+DMBX1
+DPRX
+NOTO
+HOMEZ
+HMX1
+HNF1B
+PAX2
+POU1F1
+POU2F3
+POU4F2
+POU4F1
+POU3F2
+POU3F1
+POU3F3
+HSF4
+HSFY2
+HSF1
+IRF3
+IRF5
+IRF4
+IRF8
+IRF7
+IRF9
+MEF2A
+SRF
+MEF2D
+MEF2B
+MYBL2
+MYBL1
+RARB
+ESR1
+HNF4A
+VDR
+NR3C1
+ESRRB
+THRA
+RARA
+THRB
+NR4A2
+AR
+ESRRA
+NR2F1
+NR2C2
+RXRA
+ESRRG
+RXRB
+TP63
+PAX1
+PAX5
+PAX9
+PROX1
+NFKB2
+NFAT5
+NFKB1
+RFX4
+RFX5
+RUNX2
+GMEB2
+NFIX
+NFIB
+NFIA
+SMAD3
+SOX8
+SOX10
+SOX21
+SOX15
+LEF1
+TCF7L1
+SOX14
+SOX7
+SOX2
+SOX18
+TBX21
+TBX5
+TBX15
+TBX2
+TBX4
+TBR1
+TBX19
+TBX20
+TBX1
+TEAD3
+TEAD1
+TEAD4
+NRF1
+CPEB1
+PGR
+NR1I3
+NR1I2
+NFE2L1
+ATF2
+ATF6
+CREB1
+ATF3
+FOSL1
+JUN
+MAFB
+ATF6B
+CEBPA
+TFAP2E
+HES2
+SREBF1
+TCFL5
+USF2
+HES1
+TCF21
+MYOG
+MYOD1
+MYCN
+ASCL1
+TCF12
+HES6
+FERD3L
+MSGN1
+NEUROD1
+HAND2
+PTF1A
+NPAS2
+ATOH1
+ARNT2
+NHLH2
+ATOH7
+NEUROG1
+ASCL2
+MESP2
+CREM
+BACH2
+FOSB
+JUND
+CREB3L4
+CREB5
+BATF
+FOS
+JUNB
+MAF
+MAFA
+ZNF263
+DPF1
+ZBTB32
+ZNF76
+KLF6
+ZNF343
+KLF5
+ZNF821
+ZNF174
+KLF3
+ZNF684
+ZBTB45
+SNAI1
+ZNF384
+KLF2
+ZSCAN5A
+KLF4
+ZSCAN9
+ZIC5
+ZNF787
+OSR1
+ZNF660
+ZNF385D
+ZSCAN1
+KLF10
+ZNF276
+ZNF281
+KLF15
+ZNF12
+ZNF704
+OSR2
+ZNF23
+ZNF444
+ZNF597
+ZBTB43
+ZNF32
+ZNF296
+ZBTB26
+KLF17
+OVOL1
+ZNF449
+HIC1
+ZBTB33
+ZNF454
+ZFP42
+ZNF771
+ZBTB2
+ZFP41
+ZBTB20
+ZFP1
+ZBTB37
+SNAI3
+ZNF501
+ZNF396
+ZSCAN23
+ZNF177
+ZNF250
+ZNF140
+ZNF460
+ZBTB14
+ZBTB12
+ZNF580
+SP9
+ZSCAN31
+ZBTB22
+ZNF345
+MBNL2
+YBX1
+LIN28B
+DMRTC2
+DMRTA1
+ETV7
+ELF2
+ETS2
+FOXA2
+FOXA1
+FOXQ1
+FOXA3
+FOXE1
+FOXL2
+FOXR2
+GATA6
+GATA2
+TFCP2L1
+UBP1
+HOXA11
+ISL1
+HOXC8
+BARX2
+LHX5
+SIX4
+HOXA5
+HOXA6
+HOXB6
+NKX3-2
+NANOG
+NKX2-3
+HOXB8
+HOXB1
+LHX4
+HOXA7
+BARHL1
+SIX1
+HOXD1
+HOXD3
+HOXD9
+HOXD10
+CDX4
+RHOXF2
+SIX3
+NKX6-2
+LHX8
+TLX3
+NKX6-3
+NKX3-1
+HOXD4
+IRX1
+SIX2
+HOXB9
+TGIF2LY
+IRX3
+HOXC9
+HOXB4
+ARGFX
+HOXA4
+HOXA10
+LHX1
+POU5F1
+HSFY1
+HSF5
+IRF6
+PPARD
+NR5A2
+NR2C1
+NR4A1
+NR1D1
+NR5A1
+RORC
+NR6A1
+NR1D2
+RORB
+PAX8
+RFX1
+RFX7
+SKOR2
+SMAD5
+NFIC
+SOX30
+TCF7
+BBX
+SOX3
+SOX12
+TBX18
+TBX3
+TBX6
+TBXT
+TEAD2
+XPA
+SKOR1
+FOSL2
+ZKSCAN1
+ZFP14
+ZNF415
+ZNF135
+ZFP82
+ZKSCAN7
+ZNF777
+ZNF682
+FOXP2
+SOX6
+SOX5
+SOX17
+PLAG1
+ZKSCAN2
+ZNF582
+ZNF506
+ZNF324
+ZNF671
+ZNF264
+ZNF302
+ZNF184
+ZNF419
+ZNF85
+ZNF430
+ZNF549
+ZNF211
+ZNF205
+ZNF45
+ZNF133
+ZNF484
+ZNF557
+ZNF337
+ZNF317
+ZNF331
+ZNF141
+ZNF304
+ZNF132
+ZNF189
+ZNF287
+ZIM3
+ZNF614
+ZNF300
+RBAK
+ZNF157
+ZNF182
+ZNF7
+ZNF214
+ZNF547
+ZNF776
+ZNF18
+ZNF19
+ZNF222
+ZNF235
+ZNF714
+ZNF333
+ZNF382
+ZNF496
+PRDM9
+ZNF202
+ZNF3
+ZNF180
+ZNF641
+ZNF610
+ZNF528
+ZNF701
+ZNF283
+ZNF558
+ZNF30
+ZNF354A
+ZNF764
+ZNF778
+ZNF212
+ZNF439
+ZNF440
+ZNF562
+ZNF561
+ZNF584
+ZIK1
+ZNF540
+ZNF570
+ZNF621
+ZNF680
+ZNF483
+ZNF417
+ZNF791
+ZNF266
+ZNF519
+ZNF25
+ZNF77
+ZNF169
+ZNF613
+ZNF620
+ZNF619
+ZNF114
+ZNF543
+ZNF354B
+ZNF223
+ZNF552
+ZNF154
+ZNF816
+ZNF571
+ZNF443
+ZNF792
+ZNF707
+ZNF875
+ZNF101
+ZNF716
+ZNF708
+ZNF662
+ZNF320
+ZNF530
+ZNF730
+ZNF93
+ZFP90
+ZNF479
+ZNF445
+ZNF74
+ZNF267
+ZNF566
+ZNF529
+ZNF284
+ZNF749
+ZNF17
+ZNF555
+ZNF75D
+ZNF197
+ZFP69B
+ZFP69
+ZNF626
+ZNF793
+ZNF383
+ZNF669
+ZNF548
+ZNF567
+ZNF573
+ZNF527
+ZNF33A
+ZNF79
+ZNF681
+ZNF766
+ZNF565
+ZNF765
+ZNF124
+ZNF605
+ZNF799
+ZNF782
+ZNF846
+ZNF136
+ZKSCAN5
+ZNF33B
+ZNF431
+ZNF418
+ZNF585A
+ZNF429
+ZNF100
+ZNF398
+ZNF441
+ZNF257
+ZNF785
+ZNF786
+ZNF675
+ZNF860
+ZNF695
+ZNF615
+ZNF433
+ZNF81
+ZNF780A
+ZNF181
+ZNF44
+ZNF790
+ZNF823
+ZNF311
+ZNF273
+ZNF84
+ZNF667
+ZNF649
+ZNF248
+ZNF334
+ZNF485
+ZNF442
+ZNF26
+ZNF69
+ZNF480
+ZNF587
+ZNF808
+ZNF28
+ZNF627
+ZNF789
+ZNF534
+ZNF525
+ZNF805
+ZNF468
+ZNF616
+ZFP57
+ZNF783
+ZNF425
+ZNF611
+ZNF254
+ZNF90
+ZNF891
+ZNF705G
+ZNF880
+ZNF492
+ZNF879
+ZNF736
+ZNF737
+ZNF324B
+ZNF564
+ZNF674
+ZNF550
+ZNF432
+ZNF10
+ZNF486
+ZNF225
+ZNF285
+ZNF224
+ZIM2
+ZNF2
+ZNF8
+ZNF487
+MXI1
+MYC
+ZEB1
+REST
+CTCFL
+E2F6
+PBX3
+STAT1
+STAT3
+STAT2
+THAP1
+TP73
+HIF1A
+TWIST1
+MITF
+KLF9
+ZNF24
+NFYA
+TFDP1
+FOXK2
+FOXH1
+GRHL2
+PBX2
+DUX4
+IRF1
+MYB
+ESR2
+HNF4G
+NR2F2
+RELB
+SOX13
+TCF7L2
+NFYB
+BACH1
+SIX5
+TBP
+ZNF416
+ZNF574
+ZNF41
+ZNF653
+ZNF35
+ZNF16
+ZNF692
+ZFP3
+ZNF322
+ZNF467
+ZSCAN22
+ZNF71
+ZFP64
+PRDM6
+ZNF37A
+ZNF586
+MYNN
+ZNF213
+PATZ1
+MAZ
+ZNF175
+KLF7
+GTF3A
+ZNF436
+FEZF1
+ZNF341
+ZNF394
+IKZF3
+ZNF513
+ZNF22
+ZNF146
+ZNF280A
+ZNF768
+ZNF554
+ZNF596
+ZBTB42
+ZNF594
+ZNF329
+ZBTB6
+ZSCAN30
+ZNF490
+ZNF563
+ZNF34
+ZNF774
+ZNF502
+ZFP28
+ZNF98
+ZNF677
+ZNF121
+ZNF770
+ZSCAN5C
+ZBTB48
+ZNF134
+GLI4
+ZNF260
+ZNF350
+ZNF595
+INSM1
+ARID5B
+LYL1
+AHR
+EPAS1
+ARNT
+TAL1
+NFE2L2
+ATF1
+ZFX
+MECOM
+SALL4
+KLF8
+ZBTB17
+PRDM14
+IKZF1
+ZNF335
+E2F5
+FOXM1
+LHX3
+NKX2-1
+NKX6-1
+MBD2
+MECP2
+NR1H3
+PPARA
+TP53
+RUNX1
+AIRE
+SMAD4
+STAT5A
+STAT4
+STAT6
+STAT5B
+THAP11
+NFYC
+ZNF711
+ARID3A
+HMGA1
+HMGA2
+MYF5
+NFE2L3
+ATF5
+DDIT3
+ZEB2
+HIVEP2
+IKZF2
+ZBTB11
+ZNF423
+ZBTB16
+ZNF541
+GZF1
+ZSCAN10
+PRDM12
+ZNF236
+PRDM15
+PRDM16
+ZNF761
+ZNF148
+ZNF589
+ZNF219
+SALL2
+E4F1
+SP7
+ZNF581
+ZNF217
+ZFP92
+ZSCAN26
+ZNF628
+ZNF521
+SP5
+ZNF316
+ZNF705E
+ZNF727
+ZNF735
+ZNF883
+ZNF718
+ZNF658
+SATB1
+CXXC1
+EBF4
+EBF3
+EBF2
+FOXF1
+FOXN1
+FOXJ1
+FOXD4L4
+TRPS1
+GTF2IRD1
+GTF2I
+HOXA3
+NKX2-2
+SETDB1
+MTERF1
+CDC5L
+SMAD9
+SMAD1
+HBP1
+SOX11
+TBX22
+LTF
+DNTTIP1
+POU2AF1
+CEBPZ
+GTF2B
+CARF
+SPZ1
+NR0B1
+BPTF
+PURA
+TOPORS
+NFE4
+ADNP
+CHAMP1
+DACH1
+DRAP1
+GATAD1
+GATAD2A
+HHEX
+HMG20A
+HMG20B
+HMGXB4
+IKZF5
+INSM2
+KAT7
+KMT2B
+MBD1
+MXD3
+MXD4
+NCOA1
+NCOA3
+NFXL1
+PHF20
+PRDM10
+SKI
+ZBED5
+ZBTB10
+ZBTB21
+ZBTB25
+ZBTB40
+ZBTB8A
+ZFP37
+ZFP91
+ZGPAT
+ZKSCAN8
+ZNF239
+ZNF362
+ZNF366
+ZNF407
+ZNF426
+ZNF48
+ZNF507
+ZNF511
+ZNF512
+ZNF518A
+ZNF577
+ZNF579
+ZNF585B
+ZNF592
+ZNF600
+ZNF629
+ZNF639
+ZNF644
+ZNF652
+ZNF654
+ZNF664
+ZNF697
+ZNF781
+ZNF83
+ZNF843
+ZSCAN21
+ZXDB
+AFF4
+ASCC1
+BAD
+CBFA2T2
+CBFB
+ZNF830
+CNOT6
+NELFB
+DDX20
+ENO1
+FEZF2
+FHL2
+FOXP4
+GTF2H3
+GTF3C2
+GTF3C5
+HCFC2
+HCLS1
+HDAC8
+UBE2K
+HTATIP2
+ID2
+KDM5A
+LARP1
+CERS4
+MAGED4
+MAGEF1
+MYEF2
+NCALD
+NME1
+NMRAL1
+NUCB1
+OTUD4
+PAXIP1
+PDCD11
+PDLIM5
+PHTF1
+PIR
+PLAGL1
+PQBP1
+PURG
+RAB18
+RAN
+RBBP5
+RBFOX2
+RFXANK
+SCAND2P
+SCMH1
+SEMA4A
+SF1
+SMAD2
+SNAPC4
+SNAPC5
+SND1
+SSBP3
+SSX2
+SSX3
+TAF1A
+TAF9
+TBPL1
+TCEAL2
+TFAM
+THAP5
+MED30
+TIMELESS
+TRMT1
+TSC22D4
+TSNAX
+TULP1
+VPS4B
+YEATS4
+ZBTB4
+ZBTB46
+ZHX3
+ZNF131
+ZNF160
+ZNF207
+RNF114
+ZNF326
+ZNF385A
+ZNF503
+ZNF510
+ZNF706
+TFAP2D
+BRCA1
+CREB3L2
+FUBP1
+HAND1
+HLTF
+HOXC6
+ID4
+NR1H2
+NR4A3
+SMARCA1
+SMARCA5
+SOX1
+TAF1
+TLX1
+HIVEP1
+ZNF165
+NF1
+BNC2
+ZBED2
+NKX2-4
+ARID5A
+BCL3
+CHD1
+CHD2
+DBX2
+DMC1
+EP300
+EZH2
+GTF2F1
+HCFC1
+HLX
+HOXC5
+IRX4
+IRX6
+MTA3
+NKX1-1
+NKX1-2
+NKX2-6
+OTP
+PML
+RAD21
+RCOR1
+SIN3A
+SMARCC1
+SMARCC2
+SMC3
+SP100
+TBL1XR1
+WRNIP1
+ZBTB3
+ZNF691
+TRAF4
+CPSF4
+MYCLP1
+TCF15
+TAF6
+GABPB1
+ILF2
+SIRT6
+ING4
+CHURC1
+MXD1
+TAL2
+RFXAP
+GTF2A2
+GTF2A1
+TFDP2
+RB1
+SMAD7
+SMAD6
+DEAF1
+ARNTL2
+TRIM28
+PARP1
+TERF1
+CNOT3
+DBX1
+BRF1
+BDP1
+POLR3A
+EWSR1
+CTNNB1
+FOXN4
+BCLAF1
+CCNT2
+HDAC2
+OVOL3
+ZNF536
+ZBTB5
+ZNF688
+TBX10
+FOXD4L6
+FOXE3
+RLF
+SP6
+ZNF746
+FOXD4L5
+FOXD4L3
+TBPL2
+ZNF687
+ZNF438
+ZNF516
+ZSCAN18
+PRDM13
+FOXD4L1
+SALL1
+ZBTB41
+ZBTB1
+ZSCAN5B
+GTF2A1L
+ZBTB8B
+ZNF575
+ZNF280B
+ZBTB34
+IKZF4
+AEBP2
+ZNF772
+ZSCAN25
+FIZ1
+ZNF215
+SALL3
+ZNF500
+ZFY
+ZBTB24
+ZNF853
+ZSCAN20
+ZNF80
+ZNF20
+ZNF630
+ZNF699
+ZNF470
+ZNF57
+ZXDC
+ZNF648
+ZNF544
+ZNF546
+ZNF517
+ZFP2
+ZNF572
+ZNF66
+ZNF689
+ZNF837
+ZNF710
+ZNF625
+ZNF491
+ZNF709
+ZNF526
+ZNF676
+ZNF556
+ZNF408
+ZNF700
+ZNF286A
+ZNF471
+ZFP30
+ZNF230
+ZNF233
+ZNF275
+ZNF729
+ZSCAN32
+ZNF195
+ZNF814
+ZNF878
+ZNF726
+ZNF208
+ZNF732
+ZNF99
+ZNF253
+ZNF623
+ZNF14
+ZNF705D
+ZNF43
+ZNF92
+ZNF117
+ZNF138
+ZNF91
+ZXDA
+ZNF155
+ZNF234
+ZNF844
+ZNF763
+ZNF569
+ZNF404
+ZNF678
+ZNF829
+ZNF672
+ZNF568
+ZNF841
+ZNF813
+ZNF836
+ZNF705A
+ZNF773
+ZNF551
+ZSCAN2
+ZNF227
+ZNF497
+ZNF493
+ZNF679
+ZNF683
+ZFP62
+ZNF721
+ZNF461
+ZNF397
+ZNF420
+ZNF578
+ZNF775
+ZNF845
+ZNF560
+ZNF606
+ZNF668
+ZKSCAN4
+ZNF514
+ZNF696
+ZNF607
+ZNF599
+ZNF559
+ZNF251
+ZNF583
+ZNF665
+ZNF670
+ZNF358
+ZNF319
+ZNF70
+ZNF226
+ZNF624
+PRDM5
+ZNF112
+ZNF780B
+ZBTB47
+ZBTB39
+ZNF646
+ZNF835
+ZNF107
+ZNF391
+ZSCAN12
+ZFPM1
+PEG3
+ZBTB38
+ZNF367
+ZNF256
+HDAC1
+APEX1
+CTBP1
+BANP
+CRTC2
+NONO
+SFPQ
+ABL1
+HELT
+DIDO1
+HNRNPUL1
+DPF2
+NCOA2
+ILF3
+RHOXF2B
+AHDC1
+HMGXB3
+LCOR
+MLLT10
+SATB2
+GPBP1L1
+ZNF280D
+ZNF142
+ZNF462
+ZNF576
+ATF7-NPFF
+NANOGP8
+MBTPS2
+CIC
+SETBP1
+FOXL3
+SEBOX
+DMRTB1
+RFX6
+TAF1L
+TWIST2
+FREM1
+ARID3B
+RBPJL
+CREBL2
+FOXB2
+FOXD4
+SP140
+CPHXL
+AHCTF1
+DNAJC21
+MYPOP
+PRDM11
+PHF21A
+CCDC169-SOHLH2
+MLXIP
+CREBZF
+TERF2
+SP110
+NFX1
+ASH2L
+METTL14
+VPS72
+CERS6
+CERS3
+CERS5
+CERS2
+PRDM7
+HIF3A
+BNC1
+FANK1
+IL21
+ZNF622
+NPAS4
+ZBED6
+TMEM33
+ACAA1
+ZNF800
+ADNP2
+ZNF414
+ZFP91-CNTF
+ZNF587B
+ZNF451
+ZNF532
+LDB1
+LMO2
+YOD1
+METTL3
+A1CF
+ABCF2
+ACO1
+ADARB1
+AGAP2
+AGGF1
+AGMAT
+AHRR
+AKR1A1
+ANXA1
+ANXA11
+APEX2
+ARFGAP1
+ARG1
+ARG2
+ARID3C
+ASAP3
+ASPSCR1
+ATOH8
+AVEN
+BAX
+BOLL
+BORCS8-MEF2B
+BRF2
+C19orf25
+CANX
+CAT
+CBX7
+CCDC25
+CD59
+CDK2AP1
+CELF4
+CELF5
+CELF6
+CFL2
+CKMT1B
+CLK1
+CNOT4
+CPTP
+CSNK2B
+CSTF2
+CYB5R1
+CYCS
+DAB2
+DAZAP1
+DDX4
+DDX43
+DDX53
+DGCR8
+DHX36
+DIABLO
+DIS3
+DMAP1
+DNMT3A
+DR1
+DTL
+DUS3L
+DUSP22
+DUSP26
+ECSIT
+EDN1
+EEF1D
+EIF5A2
+ESRP1
+ESRP2
+ETFB
+EXO5
+EXOSC3
+EZR
+FAAP24
+FAM127B
+FBXL19
+FEZ1
+FGF19
+FIP1L1
+FOXS1
+GADD45A
+GAR1
+GIT2
+GLYCTK
+GOT1
+GPAM
+GPANK1
+GPD1
+GRHL3
+GRHPR
+GTPBP1
+GTPBP6
+H1FX
+H2AFY
+H2AFZ
+HADHB
+HDAC3
+HES4
+HEYL
+HHAT
+HIRIP3
+HIST1H2BN
+HIST2H2AB
+HIST2H2BE
+HIVEP3
+HKR1
+HLCS
+HMGB1
+HMGB2
+HMGB3
+HMGB4
+HNRNPA0
+HNRNPA1
+HNRNPC
+HNRNPH3
+HNRNPLL
+HP1BP3
+HSPA1L
+HSPA5
+HUNK
+ID1
+IL24
+ING3
+IVD
+JAZF1
+JRK
+JRKL
+KCNIP1
+KDM2A
+KDM4A
+KDM4B
+KDM4C
+KDM4D
+KDM4E
+KDM5D
+KDM7A
+KIAA0907
+KIF22
+KLF18
+KLRG1
+LARP4
+LAS1L
+LIN28A
+LRRFIP1
+LSM6
+LUZP1
+LUZP2
+MAGEA8
+MAGOH
+MAP4K2
+MAPK1
+MCTP2
+MDM2
+MELK
+METTL21B
+MEX3C
+MIEF1
+MIOS
+MKX
+MORN1
+MRPL1
+MRPL2
+MRPS25
+MSI1
+MSI2
+MSRA
+MSRB3
+MTHFD1
+MYCL
+MYLK
+NAGS
+NANOS1
+NAP1L1
+NCBP2
+NCOR1
+NCOR2
+NELFA
+NEUROG3
+NMI
+NNT
+NOC2L
+NPDC1
+NUP107
+NUP133
+NXPH3
+ODC1
+P4HB
+PCK2
+PDE6H
+PDS5A
+PGAM2
+PHF12
+PHF2
+PHF8
+PHLDA2
+PICK1
+PIK3C3
+PKM
+PLG
+POLD2
+POLE3
+POLE4
+POLI
+POLR2A
+POLR3G
+PPARGC1A
+PPP1R10
+PPP2R3B
+PPP5C
+PRDX5
+PRKAA1
+PRKAA2
+PRNP
+PROX2
+PSMA6
+PSMC2
+PSMD12
+PTCD1
+PTPMT1
+PUM3
+R3HDM2
+RAB14
+RAB2A
+RAB7A
+RBBP9
+RBM17
+RBM22
+RBM3
+RBM42
+RBM7
+RBM8A
+RBMS1
+RFC2
+RFC3
+RFX8
+RIOK2
+RNASEH2C
+RNF138
+RPL35
+RPL6
+RPP25
+RPS10
+RPS4X
+RPS6KA5
+RUFY3
+RUVBL1
+SCAND1
+SCX
+SF3B1
+SFT2D1
+SIM1
+SIM2
+SLC18A1
+SMAP2
+SMPX
+SMUG1
+SNRNP70
+SNRPB2
+SOCS4
+SOD1
+SPAG7
+SPATS2
+SPR
+SRBD1
+SRP9
+SRRM3
+SSRP1
+STAU2
+STK40
+STUB1
+SUCLG1
+T
+TAF7
+TAGLN2
+TCEAL6
+TCF23
+TCF24
+TFDP3
+TFF3
+THOC2
+TIA1
+TIGD2
+TIGD3
+TIGD4
+TIGD5
+TIGD6
+TIGD7
+TIMM44
+TIMM8A
+TMSB4XP8
+TOB2
+TPI1
+TPPP
+TRIB1
+TRIB2
+TRIB3
+TRIM21
+TRIM69
+TRIP10
+TRMO
+TROVE2
+TSN
+U2AF1
+UBB
+UBE2V1
+UBTF
+UBXN1
+UGP2
+UQCRB
+USP39
+UTP18
+VAMP3
+WDR83
+WISP2
+XG
+XRCC1
+YWHAE
+YWHAZ
+ZC3H7A
+ZCCHC14
+ZCCHC17
+ZDHHC15
+ZDHHC24
+ZDHHC5
+ZHX2
+ZMAT2
+ZMAT4
+ZNF286B
+ZNF355P
+ZNF542P
+ZNF598
+ZNF658B
+ZNF702P
+ZNF705CP
+ZNF717
+ZNF720
+ZNF788
+ZNF806
+ZNF826P
+ZNF827
+ZNF831
+ZRSR2
+ZSWIM1
diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
index 6ff380eae..ad847e1c9 100644
--- a/src/api/comp_method.yaml
+++ b/src/api/comp_method.yaml
@@ -12,6 +12,8 @@ functionality:
       __merge__: file_multiomics_rna_h5ad.yaml
       required: True
       direction: input
+      info:
+        test_default: ../../resources_test/grn-benchmark/multiomics_rna.h5ad
     - name: --multiomics_atac
       __merge__: file_multiomics_atac_h5ad.yaml
       required: false
@@ -21,10 +23,6 @@ functionality:
       required: true
       direction: output
 
-    
-    
-
-      
   test_resources:
     - type: python_script
       path: /src/common/component_tests/run_and_check_output.py