Skip to content

Commit

Permalink
Merge pull request #6 from chrarnold/main
Browse files Browse the repository at this point in the history
New version of GRaNIE integration
  • Loading branch information
janursa authored Aug 21, 2024
2 parents cbd7398 + 0e1073f commit 329c6b8
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 89 deletions.
31 changes: 8 additions & 23 deletions dockerfiles/granie/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,6 @@
FROM bioconductor/bioconductor_docker:devel-R-4.4.1


# Install required dependencies for the R packages
#RUN apt-get update && apt-get install -y \
# libcurl4-openssl-dev \
# libxml2-dev \
# libssl-dev \
# libcairo2-dev
# libxt-dev \
# libopenblas-dev


#Install R packages
RUN R -e "install.packages(c('devtools'))"

Expand All @@ -21,19 +11,14 @@ WORKDIR /workspace
# Default command
CMD ["R"]

RUN R -e "remotes::install_version('Matrix', version = '1.6-3')"

#Version 1.9.4 of GRaNIE
RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIE@855bf3def33ad7af9353db79c7e21c9279035fb8', host = 'git.embl.de', subdir = 'src/GRaNIE', dependencies = TRUE, upgrade = 'never')"

#Version 0.2.1 of GRaNIEverse
RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIEverse@3224b042f25f0085eeaf9194042bcb89103ea962', host = 'git.embl.de', upgrade = 'never', dependencies = TRUE)"

#There seems to be an incompatibility with the Matrix and irlba package, downgrading Matrix seems to work

RUN R -e "install.packages('irlba',type='source')"

RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79', 'BSgenome.Mmusculus.UCSC.mm39'))"
#Version 1.9.4 of GRaNIE.
RUN R -e "devtools::install_gitlab('grp-zaugg/[email protected]', host = 'git.embl.de', subdir = 'src/GRaNIE', dependencies = TRUE, upgrade = 'never')"

#Version 0.3.1 of GRaNIEverse
RUN R -e "devtools::install_gitlab('grp-zaugg/[email protected]', host = 'git.embl.de', upgrade = 'never', dependencies = TRUE)"

#There is an incompatibility with the Matrix and irlba package, see https://github.com/satijalab/seurat/issues/8000
RUN R -e "install.packages('irlba',type='source', rebuild = TRUE)"

# biovizbase is only needed to create the ATAC assay along with annotation
RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79', 'BSgenome.Mmusculus.UCSC.mm39', 'biovizBase', 'Signac', 'glmGamPoi'))"
79 changes: 68 additions & 11 deletions src/methods/multi_omics/granie/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,63 @@ functionality:
required: false
direction: input
description: "Path to the RNA data file (e.g., rna.rds)."
default: resources_test/grn-benchmark/multiomics_rna.rds
default: resources_test/grn-benchmark/multiomics_r/rna.rds

- name: --file_atac
type: file
required: false
direction: input
description: "Path to the ATAC data file (e.g., atac.rds)."
default: resources_test/grn-benchmark/multiomics_atac.rds
default: resources_test/grn-benchmark/multiomics_r/atac.rds

- name: --normRNA
type: string
required: false
direction: input
description: Normalization method for RNA data.
default: "SCT"

- name: --normATAC
type: string
required: false
direction: input
description: Normalization method for ATAC data.
default: "LSI"

- name: --LSI_featureCutoff
type: string
required: false
direction: input
description: Feature cutoff for LSI normalization.
default: "q0"

- name: --nDimensions_ATAC
type: integer
required: false
direction: input
description: Number of dimensions for ATAC modality
default: 50

- name: --integrationMethod
type: string
required: false
direction: input
description: Method used for data integration.
default: "WNN"

- name: --WNN_knn
type: integer
required: false
direction: input
description: Number of nearest neighbors for WNN integration.
default: 20

- name: --minCellsPerCluster
type: integer
required: false
direction: input
description: Minimum number of cells required per cluster.
default: 25


- name: --preprocessing_clusteringMethod
Expand All @@ -40,12 +89,12 @@ functionality:
description: Resolution for clustering, typically between 5 and 20.
default: 14

- name: --preprocessing_SCT_nDimensions
- name: --preprocessing_RNA_nDimensions
type: integer
required: false
direction: input
default: 50
description: Number of dimensions for SCT, default is 50.
description: Number of dimensions for RNA reduction, default is 50.

- name: --genomeAssembly
type: string
Expand Down Expand Up @@ -89,12 +138,13 @@ functionality:
direction: input
description: FDR threshold for peak-gene connections (default is 0.2).

- name: --peak_gene
type: file
required: false
direction: output
description: Path to the peak-gene output file (e.g., peak_gene.csv). Not yet implemented.
default: output/granie/peak_gene.csv
# Existance of output files is checked, not yet implemented
# - name: --peak_gene
# type: file
# required: false
# direction: output
# description: Path to the peak-gene output file (e.g., peak_gene.csv). Not yet implemented.
# default: output/granie/peak_gene.csv

- name: --useWeightingLinks
type: boolean
Expand All @@ -110,14 +160,21 @@ functionality:
direction: input
description: "Flag to force rerun of the analysis regardless of existing results."

- name: --subset
type: boolean
required: false
default: false
direction: input
description: "Flag for testing purposes to subset the data for faster running times"

resources:
- type: r_script
path: script.R


platforms:
- type: docker
image: chrarnold84/granieverse:latest
image: chrarnold84/granieverse:v1.3

- type: native
- type: nextflow
Expand Down
125 changes: 70 additions & 55 deletions src/methods/multi_omics/granie/script.R
Original file line number Diff line number Diff line change
@@ -1,48 +1,55 @@

## VIASH START
# par <- list(
# file_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds",
# file_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds",
# preprocessing_clusteringMethod = 1, # Seurat::FindClusters: (1 = original Louvain algorithm, 2 = Louvain algorithm with multilevel refinement, 3 = SLM algorithm, 4 = Leiden algorithm)
# preprocessing_clusterResolution = 14, # Typically between 5 and 20
# preprocessing_RNA_nDimensions = 50, # Default 50
# genomeAssembly = "hg38",
# GRaNIE_corMethod = "spearman",
# GRaNIE_includeSexChr = TRUE,
# GRaNIE_promoterRange = 250000,
# GRaNIE_TF_peak_fdr_threshold = 0.2,
# GGRaNIE_peak_gene_fdr_threshold = 0.2,
# num_workers = 4,
# peak_gene = "output/granie/peak_gene.csv", # not yet implemented, should I?
# prediction= "output/granie/prediction.csv",
# useWeightingLinks = FALSE,
# forceRerun = FALSE
# )

# meta <- list(
# functionality_name = "my_method_r"
# )
## VIASH END

set.seed(42)
suppressPackageStartupMessages(library(Seurat))

library(Seurat)
library(Signac)
library(Matrix)
suppressPackageStartupMessages(library(Signac))
suppressPackageStartupMessages(library(Matrix))
library(GRaNIEverse)
library(GRaNIE)
library(qs)
library(BSgenome.Hsapiens.UCSC.hg38)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(BSgenome.Mmusculus.UCSC.mm39)
library(dplyr)
suppressPackageStartupMessages(library(qs))
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38))
suppressPackageStartupMessages(library(EnsDb.Hsapiens.v86))
suppressPackageStartupMessages(library(EnsDb.Mmusculus.v79))
suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm39))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(SummarizedExperiment))


cat("Content of par list:")
str(par)

#qs::qsave(par, "/home/carnold/par.qs")

## VIASH START
par <- list(
file_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds",
file_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds",
temp_dir = "output/granie/",
preprocessing_clusteringMethod = 1, # Seurat::FindClusters: (1 = original Louvain algorithm, 2 = Louvain algorithm with multilevel refinement, 3 = SLM algorithm, 4 = Leiden algorithm)
preprocessing_clusterResolution = 14, # Typically between 5 and 20
preprocessing_SCT_nDimensions = 50, # Default 50
genomeAssembly = "hg38",
GRaNIE_corMethod = "spearman",
GRaNIE_includeSexChr = TRUE,
GRaNIE_promoterRange = 250000,
GRaNIE_TF_peak_fdr_threshold = 0.2,
GRaNIE_peak_gene_fdr_threshold = 0.2,
num_workers = 4,
peak_gene = "output/granie/peak_gene.csv", # not yet implemented, should I?
prediction= "output/granie/prediction.csv",
useWeightingLinks = FALSE,
forceRerun = FALSE
)
## VIASH END
print(par)
# meta <- list(
# functionality_name = "my_method_r"
# )


#### STANDARD ASSIGNMENTS ###
file_seurat = "seurat_granie.qs"
outputDir = par$temp_dir
outputDir = dirname(par$prediction)

if (!dir.exists(outputDir)) {
dir.create(outputDir, recursive = TRUE)
Expand All @@ -67,7 +74,7 @@ if (!file.exists(destfile)) {
# Define the directory to extract the files to
exdir <- "PWMScan_HOCOMOCOv12_H12INVIVO"

GRaNIE_TFBSFolder = paste0(exdir, "/PWMScan_HOCOMOCOv12/H12INVIVO")
GRaNIE_TFBSFolder = paste0(exdir, "/H12INVIVO")

if (!file.exists(GRaNIE_TFBSFolder)) {
untar(destfile, exdir = exdir)
Expand All @@ -91,7 +98,7 @@ if (!file.exists(file_RNA)) {
# Preprocess data #
###################

if (par.l$forceRerun | !file.exists(file_seurat)) {
if (par$forceRerun | !file.exists(file_seurat)) {

# Sparse matrix
rna.m = readRDS(par$file_rna)
Expand Down Expand Up @@ -146,12 +153,6 @@ if (par.l$forceRerun | !file.exists(file_seurat)) {
}

output_seuratProcessed = paste0(outputDir, "/seuratObject.qs")
if (!file.exists(output_seuratProcessed)) {
prepareData = TRUE
} else {
prepareData = FALSE
}


###################
# Preprocess data #
Expand All @@ -162,24 +163,38 @@ GRaNIE_file_peaks = paste0(outputDir, "/atac.pseudobulkFromClusters_res", par$pr
GRaNIE_file_rna = paste0(outputDir, "/rna.pseudobulkFromClusters_res", par$preprocessing_clusterResolution, "_mean.tsv.gz")
GRaNIE_file_metadata = paste0(outputDir, "/metadata_res", par$preprocessing_clusterResolution, "_mean.tsv.gz")

if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.exists(GRaNIE_file_rna) & !par.l$forceRerun) {
if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.exists(GRaNIE_file_rna) & !par$forceRerun) {

cat("Preprocessing skipped because all files alreadx exist anf forceRerun = FALSE.")
cat("Preprocessing skipped because all files already exist anf forceRerun = FALSE.")

} else {


# Subset for testing purposes
if (par$subset == TRUE) {
cat("SUBSET cells\n")
random_cells <- sample(Cells(seurat_object), size = 5000, replace = FALSE)
seurat_object = subset(seurat_object, cells = random_cells)
cat("SUBSET peaks\n")
peak_names <- rownames(seurat_object[["peaks"]])
selected_peaks <- sample(peak_names, size = 50000, replace = FALSE)
seurat_object[["peaks"]] = subset(seurat_object[["peaks"]], features = selected_peaks)
}

seurat_object = prepareSeuratData_GRaNIE(seurat_object,
outputDir = par$outputDir,
file_RNA_features = file_RNA,
assayName_RNA_raw = "RNA", assayName_ATAC = "peaks",
prepareData = prepareData,
SCT_nDimensions = par$preprocessing_SCT_nDimensions,
dimensionsToIgnore_LSI_ATAC = 1,
outputDir = outputDir,
saveSeuratObject = TRUE,
genome = par$genomeAssembly,
assayName_RNA = "RNA", normRNA = "SCT", nDimensions_RNA = par$preprocessing_RNA_nDimensions, recalculateVariableFeatures = NULL,
assayName_ATAC_raw = "peaks",
normATAC = "LSI", LSI_featureCutoff = "q0", nDimensions_ATAC = 50, dimensionsToIgnore_LSI_ATAC = 1,
integrationMethod = "WNN", WNN_knn = 20,
pseudobulk_source = "cluster",
countAggregation = "mean",
clusteringAlgorithm = par$preprocessing_clusteringMethod,
clusterResolutions = par$preprocessing_clusterResolution,
saveSeuratObject = TRUE)
clusterResolutions = par$preprocessing_clusterResolution,
minCellsPerCluster = 25,
forceRerun = FALSE
)

}

Expand All @@ -190,7 +205,7 @@ if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.ex
##############

GRN = runGRaNIE(
dir_output = par$temp_dir,
dir_output = outputDir,
datasetName = "undescribed",
GRaNIE_file_peaks,
GRaNIE_file_rna,
Expand Down

0 comments on commit 329c6b8

Please sign in to comment.