From b3ea5d99f043dbfeb5b9370eb025903e8a38942a Mon Sep 17 00:00:00 2001 From: Christian Arnold Date: Fri, 2 Aug 2024 15:29:22 +0200 Subject: [PATCH] Added GRaNIE script --- src/methods/granie/script.R | 241 ++++++++++++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 src/methods/granie/script.R diff --git a/src/methods/granie/script.R b/src/methods/granie/script.R new file mode 100644 index 000000000..088d7b59f --- /dev/null +++ b/src/methods/granie/script.R @@ -0,0 +1,241 @@ + +set.seed(42) + +library(Seurat) +library(Signac) +library(Matrix) +library(GRaNIEverse) +library(GRaNIE) +library(qs) +library(BSgenome.Hsapiens.UCSC.hg38) +library(EnsDb.Hsapiens.v86) +library(EnsDb.Mmusculus.v79) +library(BSgenome.Mmusculus.UCSC.mm39) +library(dplyr) + +## VIASH START +par <- list( + file_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds", + files_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds", + temp_dir = "output/granie/", + preprocessing_clusteringMethod = 1, # Seurat::FindClusters: (1 = original Louvain algorithm, 2 = Louvain algorithm with multilevel refinement, 3 = SLM algorithm, 4 = Leiden algorithm) + preprocessing_clusterResolution = 14, # Typically between 5 and 20 + preprocessing_SCT_nDimensions = 50, # Default 50 + genomeAssembly = "hg38", + GRaNIE_corMethod = "spearman", + GRaNIE_includeSexChr = TRUE, + GRaNIE_promoterRange = 250000, + GRaNIE_TF_peak.fdr.threshold = 0.2, + GRaNIE_peak_gene.fdr.threshold = 0.2, + GRaNIE_nCores = 4, + peak_gene = "output/granie/peak_gene.csv", # not yet implemented, should I? + prediction= "output/granie/prediction.csv", + useWeightingLinks = FALSE, + forceRerun = FALSE +) + +print(par) +# meta <- list( +# functionality_name = "my_method_r" +# ) +## VIASH END + +#### STANDARD ASSIGNMENTS ### +file_seurat = "seurat_granie.qs" +outputDir = par$temp_dir + +if (!dir.exists(outputDir)) { + dir.create(outputDir, recursive = TRUE) +} + +setwd(outputDir) + + +######################### +# Downloading resources # +######################### +file_hocomoco_v12 = "https://s3.embl.de/zaugg-web/GRaNIE/TFBS/hg38/PWMScan_HOCOMOCOv12_H12INVIVO.tar.gz" + +destfile <- "PWMScan_HOCOMOCOv12_H12INVIVO.tar.gz" + +if (!file.exists(destfile)) { + + options(timeout = 1200) + download.file(file_hocomoco_v12, destfile) +} + +# Define the directory to extract the files to +exdir <- "PWMScan_HOCOMOCOv12_H12INVIVO" + +GRaNIE_TFBSFolder = paste0(exdir, "/PWMScan_HOCOMOCOv12/H12INVIVO") + +if (!file.exists(GRaNIE_TFBSFolder)) { + untar(destfile, exdir = exdir) +} + +if (par$genomeAssembly == "hg38"){ + file_RNA_URL = "https://s3.embl.de/zaugg-web/GRaNIEverse/features_RNA_hg38.tsv.gz" + +} else if (par$genomeAssembly == "mm10") { + file_RNA_URL = "https://s3.embl.de/zaugg-web/GRaNIEverse/features_RNA_mm10.tsv.gz" +} + +file_RNA <- paste0("features_RNA_", par$genomeAssembly, ".tsv.gz") +if (!file.exists(file_RNA)) { + options(timeout = 1200) + download.file(file_RNA_URL, file_RNA) +} + + +################### +# Preprocess data # +################### + +if (par.l$forceRerun | !file.exists(file_seurat)) { + + # Sparse matrix + rna.m = readRDS(par$file_rna) + + seurat_object <- CreateSeuratObject(count = rna.m, project = "PBMC", min.cells = 1, min.features = 1, assay = "RNA") + + # RangedSummarizedExperiment + atac = readRDS(par$file_atac) + + # Extract counts and metadata from the RangedSummarizedExperiment + atac_counts <- assays(atac)$counts + + rownames(atac_counts) = paste0(seqnames(rowRanges(atac)) %>% as.character(), ":", start(rowRanges(atac)), "-", end(rowRanges(atac))) + + # Create a ChromatinAssay + chrom_assay <- CreateChromatinAssay( + counts = atac_counts, + sep = c(":", "-"), + genome = 'hg38', + fragments = NULL, + min.cells = 1, + min.features = 1, + colData = DataFrame(colData(atac)) + ) + + if (par$genomeAssembly == "hg38"){ + annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) + + } else if (par$genomeAssembly == "mm10") { + annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79) + } + + seqlevelsStyle(annotations) <- "UCSC" + genome(annotations) <- par$genomeAssembly + Annotation(chrom_assay) <- annotations + + # Unify cells + # Identify the common cells between the RNA and ATAC assays + common_cells <- intersect(colnames(seurat_object[["RNA"]]), colnames(chrom_assay)) + + # Subset the Seurat object to include only the common cells + chrom_assay <- subset(chrom_assay, cells = common_cells) + + seurat_object[["peaks"]] = chrom_assay + + qs::qsave(seurat_object, "seurat_granie.qs") + +} else { + + seurat_object = qs::qread(file_seurat) + +} + +output_seuratProcessed = paste0(outputDir, "/seuratObject.qs") +if (!file.exists(output_seuratProcessed)) { + prepareData = TRUE +} else { + prepareData = FALSE +} + + +################### +# Preprocess data # +################### + +# Take output from preprocessing steps +GRaNIE_file_peaks = paste0(outputDir, "/atac.pseudobulkFromClusters_res", par$preprocessing_clusterResolution, "_mean.tsv.gz") +GRaNIE_file_rna = paste0(outputDir, "/rna.pseudobulkFromClusters_res", par$preprocessing_clusterResolution, "_mean.tsv.gz") +GRaNIE_file_metadata = paste0(outputDir, "/metadata_res", par$preprocessing_clusterResolution, "_mean.tsv.gz") + +if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.exists(GRaNIE_file_rna) & !par.l$forceRerun) { + + cat("Preprocessing skipped because all files alreadx exist anf forceRerun = FALSE.") + +} else { + + seurat_object = prepareSeuratData_GRaNIE(seurat_object, + outputDir = par$outputDir, + file_RNA_features = file_RNA, + assayName_RNA_raw = "RNA", assayName_ATAC = "peaks", + prepareData = prepareData, + SCT_nDimensions = par$preprocessing_SCT_nDimensions, + dimensionsToIgnore_LSI_ATAC = 1, + pseudobulk_source = "cluster", + countAggregation = "mean", + clusteringAlgorithm = par$preprocessing_clusteringMethod, + clusterResolutions = par$preprocessing_clusterResolution, + saveSeuratObject = TRUE) + +} + + + +############## +# Run GRaNIE # +############## + +GRN = runGRaNIE( + dir_output = par$temp_dir, + datasetName = "undescribed", + GRaNIE_file_peaks, + GRaNIE_file_rna, + GRaNIE_file_metadata, + TFBS_source = "custom", + TFBS_folder = GRaNIE_TFBSFolder, + genomeAssembly = par$genomeAssembly, + normalization_peaks = "none", + idColumn_peaks = "peakID", + normalization_rna = "none", + idColumn_RNA = "ENSEMBL", + includeSexChr = par$GRaNIE_includeSexChr, + minCV = 0, + minNormalizedMean_peaks = NULL, + minNormalizedMean_RNA = NULL, + minSizePeaks = 5, + corMethod = par$GRaNIE_corMethod, + promoterRange = par$GRaNIE_promoterRange, + useGCCorrection = FALSE, + TF_peak.fdr.threshold = par$GRaNIE_TF_peak.fdr.threshold, + peak_gene.fdr.threshold = par$GRaNIE_peak_gene.fdr.threshold, + runTFClassification = FALSE, + runNetworkAnalyses = FALSE, + nCores = par$GRaNIE_nCores, + forceRerun = TRUE +) + +# Post-process GRN +connections.df = getGRNConnections(GRN, + include_TF_gene_correlations = TRUE, + include_peakMetadata = TRUE, + include_TFMetadata = TRUE, + include_geneMetadata = TRUE) + +final.df = connections.df %>% + dplyr::select(TF.name, gene.name, TF_gene.r) %>% + dplyr::rename(source = TF.name, target = gene.name) + +if (par$useWeightingLinks) { + final.df = dplyr::mutate(final.df, weight = abs(.data$TF_gene.r)) +} else { + final.df = dplyr::mutate(final.df, weight = 1) +} + +final.df %>% + dplyr::select(source, target, weight) %>% + readr::write_csv(par$prediction) +