diff --git a/dockerfiles/granie/Dockerfile b/dockerfiles/granie/Dockerfile index 1cf5ed313..ee59a1b08 100644 --- a/dockerfiles/granie/Dockerfile +++ b/dockerfiles/granie/Dockerfile @@ -2,16 +2,6 @@ FROM bioconductor/bioconductor_docker:devel-R-4.4.1 -# Install required dependencies for the R packages -#RUN apt-get update && apt-get install -y \ -# libcurl4-openssl-dev \ -# libxml2-dev \ -# libssl-dev \ -# libcairo2-dev -# libxt-dev \ -# libopenblas-dev - - #Install R packages RUN R -e "install.packages(c('devtools'))" @@ -21,19 +11,14 @@ WORKDIR /workspace # Default command CMD ["R"] -RUN R -e "remotes::install_version('Matrix', version = '1.6-3')" - -#Version 1.9.4 of GRaNIE -RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIE@855bf3def33ad7af9353db79c7e21c9279035fb8', host = 'git.embl.de', subdir = 'src/GRaNIE', dependencies = TRUE, upgrade = 'never')" - -#Version 0.2.1 of GRaNIEverse -RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIEverse@3224b042f25f0085eeaf9194042bcb89103ea962', host = 'git.embl.de', upgrade = 'never', dependencies = TRUE)" - -#There seems to be an incompatibility with the Matrix and irlba package, downgrading Matrix seems to work - -RUN R -e "install.packages('irlba',type='source')" - -RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79', 'BSgenome.Mmusculus.UCSC.mm39'))" +#Version 1.9.4 of GRaNIE. +RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIE@v1.9.4', host = 'git.embl.de', subdir = 'src/GRaNIE', dependencies = TRUE, upgrade = 'never')" +#Version 0.3.1 of GRaNIEverse +RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIEverse@v1.3.1', host = 'git.embl.de', upgrade = 'never', dependencies = TRUE)" +#There is an incompatibility with the Matrix and irlba package, see https://github.com/satijalab/seurat/issues/8000 +RUN R -e "install.packages('irlba',type='source', rebuild = TRUE)" +# biovizbase is only needed to create the ATAC assay along with annotation +RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79', 'BSgenome.Mmusculus.UCSC.mm39', 'biovizBase', 'Signac', 'glmGamPoi'))" diff --git a/src/methods/multi_omics/granie/config.vsh.yaml b/src/methods/multi_omics/granie/config.vsh.yaml index 55daf328d..151c545cc 100644 --- a/src/methods/multi_omics/granie/config.vsh.yaml +++ b/src/methods/multi_omics/granie/config.vsh.yaml @@ -16,14 +16,63 @@ functionality: required: false direction: input description: "Path to the RNA data file (e.g., rna.rds)." - default: resources_test/grn-benchmark/multiomics_rna.rds + default: resources_test/grn-benchmark/multiomics_r/rna.rds - name: --file_atac type: file required: false direction: input description: "Path to the ATAC data file (e.g., atac.rds)." - default: resources_test/grn-benchmark/multiomics_atac.rds + default: resources_test/grn-benchmark/multiomics_r/atac.rds + + - name: --normRNA + type: string + required: false + direction: input + description: Normalization method for RNA data. + default: "SCT" + + - name: --normATAC + type: string + required: false + direction: input + description: Normalization method for ATAC data. + default: "LSI" + + - name: --LSI_featureCutoff + type: string + required: false + direction: input + description: Feature cutoff for LSI normalization. + default: "q0" + + - name: --nDimensions_ATAC + type: integer + required: false + direction: input + description: Number of dimensions for ATAC modality + default: 50 + + - name: --integrationMethod + type: string + required: false + direction: input + description: Method used for data integration. + default: "WNN" + + - name: --WNN_knn + type: integer + required: false + direction: input + description: Number of nearest neighbors for WNN integration. + default: 20 + + - name: --minCellsPerCluster + type: integer + required: false + direction: input + description: Minimum number of cells required per cluster. + default: 25 - name: --preprocessing_clusteringMethod @@ -40,12 +89,12 @@ functionality: description: Resolution for clustering, typically between 5 and 20. default: 14 - - name: --preprocessing_SCT_nDimensions + - name: --preprocessing_RNA_nDimensions type: integer required: false direction: input default: 50 - description: Number of dimensions for SCT, default is 50. + description: Number of dimensions for RNA reduction, default is 50. - name: --genomeAssembly type: string @@ -89,12 +138,13 @@ functionality: direction: input description: FDR threshold for peak-gene connections (default is 0.2). - - name: --peak_gene - type: file - required: false - direction: output - description: Path to the peak-gene output file (e.g., peak_gene.csv). Not yet implemented. - default: output/granie/peak_gene.csv + # Existance of output files is checked, not yet implemented + # - name: --peak_gene + # type: file + # required: false + # direction: output + # description: Path to the peak-gene output file (e.g., peak_gene.csv). Not yet implemented. + # default: output/granie/peak_gene.csv - name: --useWeightingLinks type: boolean @@ -110,6 +160,13 @@ functionality: direction: input description: "Flag to force rerun of the analysis regardless of existing results." + - name: --subset + type: boolean + required: false + default: false + direction: input + description: "Flag for testing purposes to subset the data for faster running times" + resources: - type: r_script path: script.R @@ -117,7 +174,7 @@ functionality: platforms: - type: docker - image: chrarnold84/granieverse:latest + image: chrarnold84/granieverse:v1.3 - type: native - type: nextflow diff --git a/src/methods/multi_omics/granie/script.R b/src/methods/multi_omics/granie/script.R index 3df0aeb83..c8709be90 100644 --- a/src/methods/multi_omics/granie/script.R +++ b/src/methods/multi_omics/granie/script.R @@ -1,48 +1,55 @@ +## VIASH START +# par <- list( +# file_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds", +# file_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds", +# preprocessing_clusteringMethod = 1, # Seurat::FindClusters: (1 = original Louvain algorithm, 2 = Louvain algorithm with multilevel refinement, 3 = SLM algorithm, 4 = Leiden algorithm) +# preprocessing_clusterResolution = 14, # Typically between 5 and 20 +# preprocessing_RNA_nDimensions = 50, # Default 50 +# genomeAssembly = "hg38", +# GRaNIE_corMethod = "spearman", +# GRaNIE_includeSexChr = TRUE, +# GRaNIE_promoterRange = 250000, +# GRaNIE_TF_peak_fdr_threshold = 0.2, +# GGRaNIE_peak_gene_fdr_threshold = 0.2, +# num_workers = 4, +# peak_gene = "output/granie/peak_gene.csv", # not yet implemented, should I? +# prediction= "output/granie/prediction.csv", +# useWeightingLinks = FALSE, +# forceRerun = FALSE +# ) + +# meta <- list( +# functionality_name = "my_method_r" +# ) +## VIASH END + set.seed(42) +suppressPackageStartupMessages(library(Seurat)) -library(Seurat) -library(Signac) -library(Matrix) +suppressPackageStartupMessages(library(Signac)) +suppressPackageStartupMessages(library(Matrix)) library(GRaNIEverse) library(GRaNIE) -library(qs) -library(BSgenome.Hsapiens.UCSC.hg38) -library(EnsDb.Hsapiens.v86) -library(EnsDb.Mmusculus.v79) -library(BSgenome.Mmusculus.UCSC.mm39) -library(dplyr) +suppressPackageStartupMessages(library(qs)) +suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38)) +suppressPackageStartupMessages(library(EnsDb.Hsapiens.v86)) +suppressPackageStartupMessages(library(EnsDb.Mmusculus.v79)) +suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm39)) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(SummarizedExperiment)) + + +cat("Content of par list:") +str(par) + +#qs::qsave(par, "/home/carnold/par.qs") -## VIASH START -par <- list( - file_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds", - file_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds", - temp_dir = "output/granie/", - preprocessing_clusteringMethod = 1, # Seurat::FindClusters: (1 = original Louvain algorithm, 2 = Louvain algorithm with multilevel refinement, 3 = SLM algorithm, 4 = Leiden algorithm) - preprocessing_clusterResolution = 14, # Typically between 5 and 20 - preprocessing_SCT_nDimensions = 50, # Default 50 - genomeAssembly = "hg38", - GRaNIE_corMethod = "spearman", - GRaNIE_includeSexChr = TRUE, - GRaNIE_promoterRange = 250000, - GRaNIE_TF_peak_fdr_threshold = 0.2, - GRaNIE_peak_gene_fdr_threshold = 0.2, - num_workers = 4, - peak_gene = "output/granie/peak_gene.csv", # not yet implemented, should I? - prediction= "output/granie/prediction.csv", - useWeightingLinks = FALSE, - forceRerun = FALSE -) -## VIASH END -print(par) -# meta <- list( -# functionality_name = "my_method_r" -# ) #### STANDARD ASSIGNMENTS ### file_seurat = "seurat_granie.qs" -outputDir = par$temp_dir +outputDir = dirname(par$prediction) if (!dir.exists(outputDir)) { dir.create(outputDir, recursive = TRUE) @@ -67,7 +74,7 @@ if (!file.exists(destfile)) { # Define the directory to extract the files to exdir <- "PWMScan_HOCOMOCOv12_H12INVIVO" -GRaNIE_TFBSFolder = paste0(exdir, "/PWMScan_HOCOMOCOv12/H12INVIVO") +GRaNIE_TFBSFolder = paste0(exdir, "/H12INVIVO") if (!file.exists(GRaNIE_TFBSFolder)) { untar(destfile, exdir = exdir) @@ -91,7 +98,7 @@ if (!file.exists(file_RNA)) { # Preprocess data # ################### -if (par.l$forceRerun | !file.exists(file_seurat)) { +if (par$forceRerun | !file.exists(file_seurat)) { # Sparse matrix rna.m = readRDS(par$file_rna) @@ -146,12 +153,6 @@ if (par.l$forceRerun | !file.exists(file_seurat)) { } output_seuratProcessed = paste0(outputDir, "/seuratObject.qs") -if (!file.exists(output_seuratProcessed)) { - prepareData = TRUE -} else { - prepareData = FALSE -} - ################### # Preprocess data # @@ -162,24 +163,38 @@ GRaNIE_file_peaks = paste0(outputDir, "/atac.pseudobulkFromClusters_res", par$pr GRaNIE_file_rna = paste0(outputDir, "/rna.pseudobulkFromClusters_res", par$preprocessing_clusterResolution, "_mean.tsv.gz") GRaNIE_file_metadata = paste0(outputDir, "/metadata_res", par$preprocessing_clusterResolution, "_mean.tsv.gz") -if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.exists(GRaNIE_file_rna) & !par.l$forceRerun) { +if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.exists(GRaNIE_file_rna) & !par$forceRerun) { - cat("Preprocessing skipped because all files alreadx exist anf forceRerun = FALSE.") + cat("Preprocessing skipped because all files already exist anf forceRerun = FALSE.") } else { - + + # Subset for testing purposes + if (par$subset == TRUE) { + cat("SUBSET cells\n") + random_cells <- sample(Cells(seurat_object), size = 5000, replace = FALSE) + seurat_object = subset(seurat_object, cells = random_cells) + cat("SUBSET peaks\n") + peak_names <- rownames(seurat_object[["peaks"]]) + selected_peaks <- sample(peak_names, size = 50000, replace = FALSE) + seurat_object[["peaks"]] = subset(seurat_object[["peaks"]], features = selected_peaks) + } + seurat_object = prepareSeuratData_GRaNIE(seurat_object, - outputDir = par$outputDir, - file_RNA_features = file_RNA, - assayName_RNA_raw = "RNA", assayName_ATAC = "peaks", - prepareData = prepareData, - SCT_nDimensions = par$preprocessing_SCT_nDimensions, - dimensionsToIgnore_LSI_ATAC = 1, + outputDir = outputDir, + saveSeuratObject = TRUE, + genome = par$genomeAssembly, + assayName_RNA = "RNA", normRNA = "SCT", nDimensions_RNA = par$preprocessing_RNA_nDimensions, recalculateVariableFeatures = NULL, + assayName_ATAC_raw = "peaks", + normATAC = "LSI", LSI_featureCutoff = "q0", nDimensions_ATAC = 50, dimensionsToIgnore_LSI_ATAC = 1, + integrationMethod = "WNN", WNN_knn = 20, pseudobulk_source = "cluster", countAggregation = "mean", clusteringAlgorithm = par$preprocessing_clusteringMethod, - clusterResolutions = par$preprocessing_clusterResolution, - saveSeuratObject = TRUE) + clusterResolutions = par$preprocessing_clusterResolution, + minCellsPerCluster = 25, + forceRerun = FALSE + ) } @@ -190,7 +205,7 @@ if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.ex ############## GRN = runGRaNIE( - dir_output = par$temp_dir, + dir_output = outputDir, datasetName = "undescribed", GRaNIE_file_peaks, GRaNIE_file_rna,