Merge branch 'main' of github.com:openproblems-bio/task_grn_inference
janursa committed Sep 10, 2024
2 parents eff9176 + ca5efb9 commit e1e727a
Showing 13 changed files with 93 additions and 47 deletions.
12 changes: 12 additions & 0 deletions dockerfiles/scsgl/Dockerfile
@@ -0,0 +1,12 @@
# Use the base image
FROM python:3.8

# Install dependencies
RUN apt-get update
RUN apt-get install -y r-base time
RUN Rscript -e "install.packages('pcaPP')"

# Install scSGL
RUN git clone https://github.com/Single-Cell-Graph-Learning/scSGL
WORKDIR "/scSGL"
ENV SCSGL_PATH /scSGL
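
For context, a minimal sketch (not part of this commit) of how a component script running inside an image built from this Dockerfile can locate the baked-in scSGL checkout through SCSGL_PATH; the '/scSGL' fallback is an assumption mirroring the WORKDIR above.

# Sketch: resolve the scSGL checkout from SCSGL_PATH and import its pysrc package.
import os
import sys

scsgl_path = os.environ.get('SCSGL_PATH', '/scSGL')  # assumed fallback matching the WORKDIR above
sys.path.append(scsgl_path)

from pysrc.graphlearning import learn_signed_graph  # shipped with the cloned scSGL repository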
5 changes: 5 additions & 0 deletions src/methods/single_omics/ennet/config.vsh.yaml
@@ -9,6 +9,11 @@ functionality:
  description: |
    GRN inference using ENNET.
  documentation_url: https://doi.org/10.1186/1752-0509-7-106
  arguments:
    - name: --M
      type: integer
      default: 100
      description: "Number of iterations."
  resources:
    - type: r_script
      path: script.R
5 changes: 3 additions & 2 deletions src/methods/single_omics/ennet/script.R
@@ -8,7 +8,8 @@ par <- list(
"tf_all" = 'resources/prior/tf_all.csv',
"prediction" = 'output/ennet/prediction.csv',
"temp_dir" = 'output/ennet',
"max_n_links" = 50000
"max_n_links" = 50000,
"M" = 100
)
## VIASH END

@@ -34,7 +35,7 @@ Tf <- which(gene_names %in% dat$V1)

# Run GRN inference method
K <- matrix(0,nrow(X),ncol(X))
grn <- ennet(E = X, K = K, Tf = Tf)
grn <- ennet(E = X, K = K, Tf = Tf, M = par$M)

# Re-format output
df <- as.data.frame(as.table(grn))
2 changes: 1 addition & 1 deletion src/methods/single_omics/ennet/test.sh
@@ -1 +1 @@
viash run src/methods/single_omics/ennet/config.novsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/ennet/prediction.csv
viash run src/methods/single_omics/ennet/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/prior/tf_all.csv --prediction output/ennet/prediction.csv
10 changes: 8 additions & 2 deletions src/methods/single_omics/pidc/script.py
@@ -22,11 +22,11 @@
df_rna = pd.DataFrame(adata_rna.X.toarray(), index=adata_rna.obs.index, columns=adata_rna.var.index)

# Remove genes with >=75% of zeros
mask = (np.mean(df_rna.to_numpy() == 0, axis=0) >= 0.9)
mask = (np.mean(df_rna.to_numpy() == 0, axis=0) >= 0.75)
df_rna = df_rna.loc[:, ~mask]

# Remove samples with >=75% of zeros
mask = (np.mean(df_rna.to_numpy() == 0, axis=1) >= 0.9)
mask = (np.mean(df_rna.to_numpy() == 0, axis=1) >= 0.75)
df_rna = df_rna.loc[~mask, :]

# (genes x samples) format is needed
@@ -46,8 +46,14 @@
os.path.join(par['temp_dir'], 'output.tsv'),
])

# Load list of putative TFs
df = pd.read_csv(par['tf_all'], header=None, names=['gene_name'])
tfs = set(list(df['gene_name']))

# Re-format output
df = pd.read_csv(os.path.join(par['temp_dir'], 'output.tsv'), header=None, names=['source', 'target', 'weight'], sep='\t')
mask = np.asarray([(gene_name in tfs) for gene_name in df['source']], dtype=bool)
df = df[mask]
df = df.head(par['max_n_links'])
df.to_csv(par['prediction'], header=True, sep=',')

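As a design note on the TF filter added above, the same selection can be written with pandas' isin; a small self-contained toy example (not part of this commit):

import pandas as pd

# Toy data standing in for the PIDC output; tfs mirrors the putative-TF set above.
tfs = {'TF1', 'TF2'}
df = pd.DataFrame({
    'source': ['TF1', 'GENE5', 'TF2'],
    'target': ['GENE7', 'GENE8', 'GENE9'],
    'weight': [0.9, 0.8, 0.7],
})
df = df[df['source'].isin(tfs)]  # equivalent to the boolean np.asarray mask above
print(df)  # keeps only the TF1 and TF2 rows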
3 changes: 2 additions & 1 deletion src/methods/single_omics/pidc/test.sh
@@ -1 +1,2 @@
viash run src/methods/single_omics/pidc/config.novsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv
#viash run src/methods/single_omics/pidc/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv
viash run src/methods/single_omics/pidc/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --prediction output/pidc/prediction.csv
8 changes: 1 addition & 7 deletions src/methods/single_omics/scsgl/config.vsh.yaml
@@ -9,20 +9,14 @@ functionality:
  description: |
    GRN inference using SCSGL.
  documentation_url: https://doi.org/10.1101/2021.07.08.451697

  resources:
    - type: python_script
      path: script.py

platforms:
  - type: docker
    image: python:3.8
    image: apassemi/scsgl
    setup:
      - type: docker
        run: |
          apt-get update && \
          apt-get install -y r-base time && \
          Rscript -e "install.packages('pcaPP')"
      - type: python
        packages: [ anndata, numba==0.53.1, scipy==1.6.3, pandas==1.2.4, rpy2==3.4.4, numpy==1.20.2, scikit-learn==0.24.1, PyYAML==6.0.2 ]
  - type: native
33 changes: 23 additions & 10 deletions src/methods/single_omics/scsgl/script.py
@@ -2,10 +2,12 @@
import sys
import contextlib
import subprocess
from itertools import permutations

import anndata
import numpy as np
import scipy.sparse
from scipy.spatial.distance import squareform
import pandas as pd


@@ -18,24 +20,26 @@
}
## VIASH END


# Load scRNA-seq data
adata_rna = anndata.read_h5ad(par['multiomics_rna'])
gene_names = adata_rna.var.gene_ids.index.to_numpy()
X = adata_rna.X.toarray() if scipy.sparse.issparse(adata_rna.X) else adata_rna.X

# Download scSGL
SCSGL_FOLDER = os.path.join(par['temp_dir'], 'scSGL')
os.makedirs(par['temp_dir'], exist_ok=True)
if not os.path.exists(os.path.join(par['temp_dir'], 'scSGL', 'pysrc')):
    subprocess.run(['git', 'clone', 'https://github.com/Single-Cell-Graph-Learning/scSGL', SCSGL_FOLDER])

# Import pysrc locally (from the cloned repository)
sys.path.append(SCSGL_FOLDER)
# Import pysrc locally
sys.path.append(os.environ['SCSGL_PATH'])
from pysrc.graphlearning import learn_signed_graph
from pysrc.evaluation import auc

# Remove genes with >=75% of zeros
mask = (np.mean(X == 0, axis=0) >= 0.75)
X = X[:, ~mask]
gene_names = gene_names[~mask]

# Remove samples with >=75% of zeros
mask = (np.mean(X == 0, axis=1) >= 0.75)
X = X[~mask, :]

# Run scSGL
print('Starting scSGL', flush=True)
df = learn_signed_graph(
X.T,
pos_density=0.45,
@@ -49,6 +53,15 @@
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True, errors='ignore')

# Load list of putative TFs
df_tfs = pd.read_csv(par['tf_all'], header=None, names=['gene_name'])
tfs = set(list(df_tfs['gene_name']))

# Ensure first gene in pair is a putative TF
print(df)
mask = np.asarray([(gene_name in tfs) for gene_name in df['source']], dtype=bool)
df = df[mask]

# Sort values
df = df.sort_values(by='weight', key=abs, ascending=False)

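One note on the sorting step above: sort_values with key=abs requires pandas >= 1.1, which the pandas==1.2.4 pin in the config above satisfies. A small self-contained check (not part of this commit):

import pandas as pd

# Edges sorted by absolute weight, strongest first, as in the scSGL script above.
df = pd.DataFrame({'source': ['TF1', 'TF2', 'TF3'], 'weight': [-0.9, 0.5, -0.1]})
print(df.sort_values(by='weight', key=abs, ascending=False))  # TF1, then TF2, then TF3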
3 changes: 2 additions & 1 deletion src/methods/single_omics/scsgl/test.sh
@@ -1 +1,2 @@
viash run src/methods/single_omics/scsgl/config.novsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/scsgl/prediction.csv
#viash build src/methods/single_omics/scsgl/config.vsh.yaml --setup cachedbuild
viash run src/methods/single_omics/scsgl/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --prediction output/scsgl/prediction.csv
6 changes: 5 additions & 1 deletion src/methods/single_omics/tigress/config.vsh.yaml
@@ -9,7 +9,11 @@ functionality:
  description: |
    GRN inference using TIGRESS.
  documentation_url: https://rdrr.io/github/jpvert/tigress/man/tigress.html

  arguments:
    - name: --nsplit
      type: integer
      default: 25
      description: "Number of sample splits to perform in stability selection."
  resources:
    - type: r_script
      path: script.R
14 changes: 10 additions & 4 deletions src/methods/single_omics/tigress/script.R
@@ -8,7 +8,8 @@ par <- list(
"tf_all" = 'resources/prior/tf_all.csv',
"prediction" = 'output/tigress/prediction.csv',
"temp_dir" = 'output/tigress',
"max_n_links" = 50000
"max_n_links" = 50000,
"nsplit" = 25
)
## VIASH END

@@ -34,7 +35,14 @@ dat <- read.csv(par$tf_all, header = FALSE)
Tf <- intersect(gene_names, dat$V1)

# Run GRN inference method
grn = tigress(X, tflist = Tf, targetlist = gene_names, allsteps=FALSE, verb=FALSE, usemulticore=TRUE)
start.time <- Sys.time()
grn = tigress(
X, tflist = gene_names, targetlist = gene_names,
nstepsLARS = 5,
nsplit = par$nsplit,
allsteps=FALSE, verb=TRUE, usemulticore=TRUE
)
time.taken <- Sys.time() - start.time

# Re-format output
df <- as.data.frame(as.table(grn))
@@ -48,8 +56,6 @@ df <- cbind(index = 1:nrow(df), df)
# Keep top links
df <- head(df, par$max_n_links)

print(df)

# Save results
write.table(df, par$prediction, sep = ",", quote = FALSE, row.names = FALSE)

3 changes: 2 additions & 1 deletion src/methods/single_omics/tigress/test.sh
@@ -1 +1,2 @@
viash run src/methods/single_omics/tigress/config.novsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tfs resources/prior/tf_all.csv --prediction output/tigress/prediction.csv
#viash run src/methods/single_omics/tigress/config.vsh.yaml -- --multiomics_rna resources_test/grn-benchmark/multiomics_rna.h5ad --tf_all resources/prior/tf_all.csv --prediction output/tigress/prediction.csv
viash run src/methods/single_omics/tigress/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --tf_all resources/prior/tf_all.csv --prediction output/tigress/prediction.csv
36 changes: 19 additions & 17 deletions src/metrics/regression_2/main.py
@@ -92,9 +92,9 @@ def cross_validate_gene(
y_pred = np.concatenate(y_pred, axis=0)
y_target = np.concatenate(y_target, axis=0)

# results['r2'] = float(r2_score(y_target, y_pred))
results['avg-r2'] = float(np.mean(r2s))

results['r2'] = float(np.clip(r2_score(y_target, y_pred), 0, 1))
results['avg-r2'] = float(np.clip(np.mean(r2s), 0, 1))
return results


@@ -117,18 +117,19 @@ def learn_background_distribution(
for _ in range(N_POINTS_TO_ESTIMATE_BACKGROUND):
    j = rng.integers(low=0, high=n_genes)
    random_grn[:, j] = rng.random(size=n_genes)
    res = cross_validate_gene(
        estimator_t,
        X,
        groups,
        random_grn,
        j,
        n_features=n_features,
        random_state=SEED,
        n_jobs=n_jobs
    )
    scores.append(res['avg-r2'])

    if n_features > 0:
        res = cross_validate_gene(
            estimator_t,
            X,
            groups,
            random_grn,
            j,
            n_features=n_features,
            random_state=SEED,
            n_jobs=n_jobs
        )
        scores.append(res['avg-r2'])
background[n_features] = (np.mean(scores), max(0.001, np.std(scores)))
background['max'] = background[max_n_regulators]
return background
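
Purely as an illustration (not code from this repository), the per-n_features (mean, std) pairs stored above could be used to standardize an observed score against the random-GRN background; the z_score helper below is hypothetical.

# Hypothetical helper, not taken from this repository: standardize an observed
# avg-r2 against the background (mean, std) for the matching regulator count.
def z_score(observed_avg_r2: float, n_regulators: int, background: dict) -> float:
    mean, std = background.get(n_regulators, background['max'])
    return (observed_avg_r2 - mean) / std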
@@ -156,8 +157,9 @@ def cross_validate(
# Perform cross-validation for each gene
results = []
for j in tqdm.tqdm(range(n_genes), desc=f'{estimator_t} CV'):
    res = cross_validate_gene(estimator_t, X, groups, grn, j, n_features=int(n_features[j]), n_jobs=n_jobs)
    results.append(res)
    if n_features[j] > 0:
        res = cross_validate_gene(estimator_t, X, groups, grn, j, n_features=int(n_features[j]), n_jobs=n_jobs)
        results.append(res)

return {
'gene_names': list(gene_names),
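
To make the clipping introduced in cross_validate_gene concrete: scikit-learn's r2_score can be arbitrarily negative for a poor predictor, and the metric now maps such scores into [0, 1] before averaging. A small standalone check:

import numpy as np
from sklearn.metrics import r2_score

# A deliberately bad, anti-correlated prediction yields a negative R^2.
y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([4.0, 3.0, 2.0, 1.0])

raw = r2_score(y_true, y_pred)       # -3.0 for this toy example
clipped = float(np.clip(raw, 0, 1))  # 0.0 after clipping, as in the updated metric
print(raw, clipped)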