diff --git a/.Rbuildignore b/.Rbuildignore index 475c93d..c57d8d6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,7 @@ ^LICENSE\.md$ ^\.github$ ^data-raw$ +^vignettes/prerender_figures\.R$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index d46a617..09b1b65 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] name: R-CMD-check.yaml diff --git a/.gitignore b/.gitignore index 198dbfd..90cd220 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ rsconnect/ .Rdata .DS_Store .quarto +inst/doc +docs diff --git a/DESCRIPTION b/DESCRIPTION index 10510ef..177895e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,11 +16,13 @@ Authors@R: c( Description: This package contains functions that help in manipulating tables and generating plots for multi-omics analysis including genomics, transcriptomics, proteomics, methylomics and immunoinformatics. License: CC BY-NC-SA 4.0 Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: dplyr, ggplot2, tsne, + cowplot, + graphics, magrittr, rlang, stats, @@ -32,10 +34,17 @@ Imports: patchwork Suggests: biomaRt, - cowplot, + circlize, + cluster, + ComplexHeatmap, dbscan, + knitr, + rmarkdown, + DESeq2, ggnewscale, + ggraph, ggrepel, + igraph, grDevices, matrixStats, openxlsx, @@ -43,9 +52,15 @@ Suggests: readr, survival, survminer, + tidygraph, tidyr, - tidyselect + tidyselect, + tm Roxygen: list(markdown = TRUE) Depends: R (>= 3.5) LazyData: true +LazyDataCompression: xz +VignetteBuilder: knitr +URL: https://bigmindlab.github.io/OmicsKit + diff --git a/NAMESPACE b/NAMESPACE index c7c52e6..44d21fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,24 +1,36 @@ # Generated by roxygen2: do not edit by hand export(add_annotations) -export(barplot_GSEA) +export(addgenesPA) export(detect_filter) +export(do_clust) +export(geneset_similarity) export(get_annotations) +export(get_network_communities) export(get_stars) -export(heatmap_GSEA) -export(merge_GSEA) +export(get_superterm) +export(getgenesPA) +export(heatmap_PA) +export(heatmap_path_PA) +export(list_gmts) +export(merge_PA) +export(multiplot_PA) +export(network_clust) +export(network_clust_gg) export(nice_KM) export(nice_PCA) export(nice_UMAP) export(nice_VSB) +export(nice_VSB_DEseq2) export(nice_Volcano) export(nice_tSNE) -export(plot_GSEA) export(power_analysis) export(save_results) export(split_cases) +export(splot_PA) export(tpm) import(ggplot2) importFrom(magrittr,"%>%") importFrom(patchwork,plot_layout) importFrom(rlang,.data) +importFrom(utils,modifyList) diff --git a/R/add_annotations.R b/R/add_annotations.R index 7fb4985..4b75e06 100644 --- a/R/add_annotations.R +++ b/R/add_annotations.R @@ -8,6 +8,42 @@ #' @param reference A reference table with the annotations including a column named "geneID". #' @param variables Character vector of columns in `reference` to add. If NULL (default), all columns except geneID are used. #' @param data_frame Logical; if TRUE, coerce `object` to a data.frame first. Default: FALSE. +#' +#' @return The input `object` as a data frame with additional columns from +#' `reference` joined by Ensembl gene ID. A `geneID` column is added +#' containing the row names of the original object. +#' +#' @examples +#' \dontrun{ +#' data(norm_counts) +#' +#' # Requires a reference table with a "geneID" column. +#' # Use get_annotations() to generate it: +#' annotations <- get_annotations( +#' ensembl_ids = rownames(norm_counts), +#' mode = "genes" +#' ) +#' +#' # Add gene symbol and biotype columns to the counts matrix +#' norm_counts_annot <- add_annotations( +#' object = norm_counts, +#' reference = annotations, +#' variables = c("symbol", "biotype") +#' ) +#' +#' # Inspect result +#' head(norm_counts_annot[, c("geneID", "symbol", "biotype")]) +#' +#' # Add all annotation columns (variables = NULL uses everything) +#' norm_counts_full <- add_annotations( +#' object = norm_counts, +#' reference = annotations +#' ) +#' } +#' +#' @seealso [get_annotations()] to generate the `reference` table; +#' [norm_counts] for an example input matrix. +#' #' @export add_annotations <- function(object, reference, variables = NULL, data_frame = FALSE){ diff --git a/R/barplot_GSEA.R b/R/barplot_GSEA.R deleted file mode 100644 index 8339beb..0000000 --- a/R/barplot_GSEA.R +++ /dev/null @@ -1,57 +0,0 @@ -######################### -# Function barplot_GSEA # -######################### - -#' Create and save a customized barplot for GSEA results -#' -#' This function generates a customized barplot with: -#' * Grouped bars. -#' * Adjusted aesthetics. -#' * Personalized axis labels. -#' * Optionally save the result in SVG format. -#' -#' @param data A data frame containing GSEA results with columns such as `datatype`, `NES`, `-Log10FDR`, and `New_name`. -#' @param output_path The file path where the barplot will be saved (SVG format). -#' @param custom_labels A named vector of custom expressions for x-axis labels. -#' @param axis_y Name of the column to use for the y-axis aesthetic, as a string. Default: "NES". -#' @import ggplot2 -#' @importFrom rlang .data -#' @export - -barplot_GSEA <- function(data, output_path, custom_labels, axis_y = "NES") - -{ - # Generate the barplot - barplot <- ggplot(data, aes(x = .data$datatype, y = .data[[axis_y]], fill = .data[["-Log10FDR"]])) + - geom_bar(stat = "identity", color = "black", size = 0.5, width = 0.6) + - scale_fill_gradient( - low = "white", high = "red", na.value = "white", - limits = c(0, 5.5), # Fixed legend limits - oob = scales::squish, # Squish out-of-bounds values - guide = guide_colorbar(barwidth = 3, barheight = 18) - ) + - labs(x = "Comparisons", y = axis_y) + - theme_bw() + - theme( - axis.line.x = element_blank(), - axis.line = element_line(color = "black", size = 0.5), - axis.title.x = element_text(size = 45, face = "bold", margin = margin(t = 25)), - axis.title.y = element_text(size = 45, face = "bold"), - axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.5, size = 30), - axis.text.y.left = element_text(size = 50), - legend.title = element_text(size = 50), - legend.text = element_text(size = 50), - panel.spacing = grid::unit(0.6, "lines"), - panel.border = element_blank(), - strip.background = element_rect(fill = "white", color = "white"), - strip.text.y.left = element_text(size = 50, angle = 0, hjust = 0.5), - strip.placement = "outside" - ) + - geom_hline(yintercept = 0, color = "black", size = 2) + - expand_limits(y = 0) + - ylim(-3.4, 3.2) + - facet_wrap(~ .data$New_name, ncol = 2, strip.position = "left", scales = "free_y") + - scale_x_discrete(labels = custom_labels) - - return(barplot) -} diff --git a/R/dataclust_PA.R b/R/dataclust_PA.R new file mode 100644 index 0000000..ff04feb --- /dev/null +++ b/R/dataclust_PA.R @@ -0,0 +1,71 @@ +#################### +# Example datasets # +#################### + +#' Example gene set list for pathway analysis clustering +#' +#' A named list of 40 curated gene sets spanning four biological themes: +#' apoptosis & cell death, cell cycle & DNA damage, immune response & +#' inflammation, and metabolism. Gene set names follow standard database +#' conventions (`KEGG_`, `HALLMARK_`, `GO_`) and gene symbols are real human +#' genes. Designed to be used as input to [geneset_similarity()]. +#' +#' @format A named list of 40 elements. Each element is a character vector of +#' human gene symbols (HGNC) belonging to that gene set. Gene set sizes range +#' from 11 to 20 genes. +#' +#' @source Curated manually for OmicsKit examples, based on KEGG, MSigDB +#' Hallmark, and Gene Ontology gene set collections. +#' +#' @examples +#' data(geneset_list) +#' +#' # How many gene sets? +#' length(geneset_list) +#' +#' # Inspect one gene set +#' geneset_list[["KEGG_APOPTOSIS"]] +#' +#' # Use with geneset_similarity() +#' data(camera_results) +#' jac <- geneset_similarity(geneset_list, camera_results, fdr_th = 0.05) +#' +#' @seealso [geneset_similarity()], [camera_results] +"geneset_list" + + +#' Example CAMERA enrichment results for pathway analysis clustering +#' +#' A data frame simulating the output of a CAMERA differential expression +#' analysis, containing significance values for the 40 gene sets in +#' [geneset_list]. Approximately 60% of gene sets have FDR < 0.05, providing +#' enough significant sets for meaningful clustering. Designed to be used +#' alongside [geneset_list] as input to [geneset_similarity()]. +#' +#' @format A data frame with 40 rows and 4 columns: +#' \describe{ +#' \item{GeneSet}{Character. Gene set name, matching the names in +#' [geneset_list].} +#' \item{Direction}{Character. Enrichment direction: `"Up"` or `"Down"`.} +#' \item{PValue}{Numeric. Raw p-value from the simulated CAMERA test.} +#' \item{FDR}{Numeric. Benjamini-Hochberg adjusted p-value.} +#' } +#' +#' @source Simulated with `set.seed(1905)` in `data-raw/example_PA.R` for +#' OmicsKit examples. +#' +#' @examples +#' data(camera_results) +#' +#' # Overview +#' head(camera_results) +#' +#' # How many gene sets are significant? +#' sum(camera_results$FDR < 0.05) +#' +#' # Use with geneset_similarity() +#' data(geneset_list) +#' jac <- geneset_similarity(geneset_list, camera_results, fdr_th = 0.05) +#' +#' @seealso [geneset_similarity()], [geneset_list] +"camera_results" diff --git a/R/deseq2_results.R b/R/deseq2_results.R new file mode 100644 index 0000000..5c1f89c --- /dev/null +++ b/R/deseq2_results.R @@ -0,0 +1,267 @@ +###################### +# deseq2_results # +###################### + +#' DESeq2 differential expression results for TCGA-LUAD +#' +#' DESeq2 results from a differential expression analysis +#' comparing primary lung adenocarcinoma tumors versus normal tissue using +#' TCGA-LUAD RNA-seq data. Contains 21330 genes to produce informative +#' visualizations with [nice_Volcano()], and also suitable as input for +#' [detect_filter()], and [add_annotations()] +#' and related plotting functions. +#' +#' @format A data frame with 21,330 rows and 7 columns: +#' \describe{ +#' \item{gene_id}{Character. Ensembl gene ID (e.g., `"ENSG00000141510"`).} +#' \item{baseMean}{Numeric. Mean of normalized counts across all samples.} +#' \item{log2FoldChange}{Numeric. Shrunken log2 fold change +#' (tumor vs. normal).} +#' \item{lfcSE}{Numeric. Standard error of the log2 fold change estimate.} +#' \item{stat}{Numeric. Wald test statistic.} +#' \item{pvalue}{Numeric. Raw p-value.} +#' \item{padj}{Numeric. Benjamini-Hochberg adjusted p-value (FDR).} +#' } +#' +#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal +#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +#' DESeq2 analysis performed with default settings; results generated by +#' `data-raw/deseq2_results.R`. +#' +#' @examples +#' data(deseq2_results) +#' +#' # Overview +#' head(deseq2_results) +#' +#' # Significant genes +#' sum(deseq2_results$padj < 0.05, na.rm = TRUE) +#' +#' # Volcano plot +#' nice_Volcano( +#' results = deseq2_results, +#' x_var = "log2FoldChange", +#' y_var = "padj", +#' label_var = "gene_id", +#' title = "TCGA-LUAD: Tumor vs Normal" +#' ) +#' \dontrun{ +#' # detect_filter (required: "ensembl" column in results) +#' deseq2_res <- deseq2_results +#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +#' rownames(deseq2_res) <- deseq2_res$ensembl +#' +#' # Get sample IDs per group from sampledata +#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] +#' +#' detected <- detect_filter( +#' norm.counts = as.data.frame(norm_counts), +#' df.BvsA = deseq2_res, +#' samples.baseline = samples_normal, +#' samples.condition1 = samples_tumor, +#' cutoffs = c(50, 50, 0) +#' ) +#' +#' # Number of detectable genes +#' length(detected$DetectGenes) +#' +#' # Subset results to detectable genes +#' head(detected$Comparison1) +#' } +#' @seealso [nice_Volcano()], [raw_counts], [sampledata] +"deseq2_results" + + +##################### +# norm_counts data # +##################### + +#' Normalized counts matrix for TCGA-LUAD +#' +#' DESeq2 size-factor normalized counts derived from the TCGA-LUAD RNA-seq +#' dataset (16 tumor samples, 16 normal samples). Counts are divided by +#' DESeq2 size factors to correct for differences in library size across +#' samples, but remain in counts scale (not log-transformed). +#' +#' Suitable as input for [nice_VSB()], [detect_filter()], and +#' [add_annotations()]. For dimensionality reduction methods ([nice_PCA()], +#' [nice_UMAP()], [nice_tSNE()]) use [vst_counts] instead, which removes the +#' mean-variance dependence of RNA-seq data. +#' +#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +#' \describe{ +#' \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).} +#' \item{columns}{Sample IDs matching the `patient_id` column of +#' [sampledata].} +#' \item{values}{Non-negative numeric. Size-factor normalized counts. +#' Range: \[0, 1,889,573\].} +#' } +#' +#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal +#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +#' Normalized with DESeq2::counts() (`normalized = TRUE`); generated by +#' `data-raw/deseq2_results.R`. +#' +#' @examples +#' data(norm_counts) +#' data(sampledata) +#' +#' # Dimensions +#' dim(norm_counts) +#' +#' # Value range +#' range(norm_counts) +#' +#' # Expression of a specific gene across samples +#' norm_counts["ENSG00000141510", ] +#' +#' # Violin-Scatter-Box plot for one gene +#' nice_VSB( +#' object = norm_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' genename = "ENSG00000141510", +#' categories = c("normal", "tumor"), +#' labels = c("Normal", "Tumor"), +#' colors = c("steelblue", "firebrick") +#' ) +#' +#' \dontrun{ +#' # detect_filter: (required: "ensembl" column in results) +#' deseq2_res <- deseq2_results +#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +#' rownames(deseq2_res) <- deseq2_res$ensembl +#' +#' # Get sample IDs per group from sampledata +#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] +#' +#' detected <- detect_filter( +#' norm.counts = as.data.frame(norm_counts), +#' df.BvsA = deseq2_res, +#' samples.baseline = samples_normal, +#' samples.condition1 = samples_tumor, +#' cutoffs = c(50, 50, 0) +#' ) +#' +#' # Number of detectable genes +#' length(detected$DetectGenes) +#' +#' # Subset results to detectable genes +#' head(detected$Comparison1) +#' +#' # add_annotations: add gene symbols +#' # Required: reference df with geneID + annotation columns +#' # Example using biomaRt to fetch gene symbols +#' library(biomaRt) +#' mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl") +#' ref <- getBM( +#' attributes = c("ensembl_gene_id", "hgnc_symbol", "gene_biotype"), +#' filters = "ensembl_gene_id", +#' values = rownames(norm_counts), +#' mart = mart +#' ) +#' colnames(ref)[1] <- "geneID" +#' +#' norm_counts_annot <- add_annotations( +#' object = norm_counts, +#' reference = ref, +#' variables = c("hgnc_symbol", "gene_biotype") +#' ) +#' +#' head(norm_counts_annot[, c("geneID", "hgnc_symbol", "gene_biotype")]) +#' } +#' +#' @seealso [vst_counts], [deseq2_results], [sampledata], [nice_VSB()], +#' [detect_filter()], [add_annotations()] +"norm_counts" + + +#################### +# vst_counts data # +#################### + +#' Variance-stabilized counts matrix for TCGA-LUAD +#' +#' Variance Stabilizing Transformation (VST) applied to the TCGA-LUAD RNA-seq +#' dataset (16 tumor samples, 16 normal samples) using [DESeq2::vst()] with +#' `blind = TRUE`. VST removes the mean-variance dependence characteristic of +#' RNA-seq count data, placing all genes on a comparable log2-like scale. This +#' makes it the appropriate input for sample-level dimensionality reduction and +#' clustering methods. +#' +#' Suitable as input for [nice_PCA()], [nice_UMAP()], and [nice_tSNE()]. For +#' gene-level expression plots ([nice_VSB()]) or filtering ([detect_filter()]) +#' use [norm_counts] instead. +#' +#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +#' \describe{ +#' \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).} +#' \item{columns}{Sample IDs matching the `patient_id` column of +#' [sampledata].} +#' \item{values}{Numeric. VST-transformed expression values on a log2-like +#' scale. Range: \[1.78, 20.85\].} +#' } +#' +#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal +#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +#' Transformed with [DESeq2::vst()] (`blind = TRUE`); generated by +#' `data-raw/deseq2_results.R`. +#' +#' @examples +#' data(vst_counts) +#' data(sampledata) +#' +#' # Dimensions +#' dim(vst_counts) +#' +#' # Value range (log2-like scale) +#' range(vst_counts) +#' +#' # PCA plot colored by sample type +#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +# +#' nice_PCA( +#' object = vst_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD PCA" +#' ) +#' +#' \dontrun{ +#' # UMAP plot +#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +#' +#' nice_UMAP( +#' object = vst_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD UMAP" +#' ) +#' +#' # tSNE plot +#' # perplexity must be lower than the number of samples divided by 3 +#' +#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +#' nice_tSNE( +#' object = vst_counts, +#' annotations = sampledata, +#' perplexity = 5, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD tSNE" +#' ) +#' } +#' +#' @seealso [norm_counts], [deseq2_results], [sampledata], [nice_PCA()], +#' [nice_UMAP()], [nice_tSNE()] +"vst_counts" diff --git a/R/detect_filter.R b/R/detect_filter.R index f3f5c85..24db9b1 100644 --- a/R/detect_filter.R +++ b/R/detect_filter.R @@ -17,6 +17,48 @@ #' @param samples.condition2 Vector of Sample IDs or indexes corresponding to the second condition (optional). #' @param samples.condition3 Vector of Sample IDs or indexes corresponding to the third condition (optional). #' @param cutoffs Vector containing threshold values for baseMean, mean normalized counts and Log2 Fold Change; respectively. Default: c(50, 50, 0). +#' +#' @return A named list. Always contains: +#' * `$Comparison1`: Data frame of detectable genes from `df.BvsA`. +#' * `$DetectGenes`: Character vector of unique detectable gene IDs across +#' all comparisons. +#' +#' If `df.CvsA` is provided, also contains `$Comparison2`. If `df.DvsA` is +#' provided, also contains `$Comparison3`. +#' +#' @examples +#' \dontrun{ +#' data(norm_counts) +#' data(deseq2_results) +#' data(sampledata) +#' +#' # detect_filter requires an "ensembl" column in the results data frame +#' res <- deseq2_results +#' colnames(res)[colnames(res) == "gene_id"] <- "ensembl" +#' rownames(res) <- res$ensembl +#' +#' # Get sample IDs per group +#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] +#' +#' detected <- detect_filter( +#' norm.counts = as.data.frame(norm_counts), +#' df.BvsA = res, +#' samples.baseline = samples_normal, +#' samples.condition1 = samples_tumor, +#' cutoffs = c(50, 50, 0) +#' ) +#' +#' # Number of detectable genes +#' length(detected$DetectGenes) +#' +#' # Subset results +#' head(detected$Comparison1) +#' } +#' +#' @seealso [nice_VSB()] to plot expression of detected genes; +#' [norm_counts] for an example normalized counts matrix. +#' #' @export @@ -28,27 +70,27 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, if (length(cutoffs) != 3) { stop("Cutoffs vector must contain three values: baseMean, mean normalized counts, and Log2 Fold Change thresholds.") } - + # Create an empty vector to store genes temporary genes_vector <- c() - + # Obtain mean normalized counts per phenotype norm.counts$Mean.Baseline <- rowMeans(norm.counts[, samples.baseline]) norm.counts$Mean.Condition1 <- rowMeans(norm.counts[, samples.condition1]) - + # Filter data frames by baseMean df.BvsA.det <- df.BvsA[df.BvsA$baseMean > cutoffs[1], ] - + if (!is.null(df.CvsA)) { norm.counts$Mean.Condition2 <- rowMeans(norm.counts[, samples.condition2]) df.CvsA.det <- df.CvsA[df.CvsA$baseMean > cutoffs[1], ] } - + if (!is.null(df.DvsA)) { norm.counts$Mean.Condition3 <- rowMeans(norm.counts[, samples.condition3]) df.DvsA.det <- df.DvsA[df.DvsA$baseMean > cutoffs[1], ] } - + # Get detectable genes for (i in 1:length(df.BvsA.det[, "ensembl"])) { if (df.BvsA.det[i, "log2FoldChange"] > cutoffs[3]) { @@ -61,7 +103,7 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, } } } - + if (!is.null(df.CvsA)) { for (i in 1:length(df.CvsA.det[, "ensembl"])) { if (df.CvsA.det[i, "log2FoldChange"] > cutoffs[3]) { @@ -75,7 +117,7 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, } } } - + if (!is.null(df.DvsA)) { for (i in 1:length(df.DvsA.det[, "ensembl"])) { if (df.DvsA.det[i, "log2FoldChange"] > cutoffs[3]) { @@ -89,23 +131,23 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, } } } - + # Remove duplicates genes_vector <- unique(genes_vector) - + # Subset data frames to only include detectable genes df.BvsA.det <- df.BvsA.det[df.BvsA.det$ensembl %in% genes_vector, ] detected_genes <- list(Comparison1 = df.BvsA.det, DetectGenes = genes_vector) - + if (!is.null(df.CvsA)) { df.CvsA.det <- df.CvsA.det[df.CvsA.det$ensembl %in% genes_vector, ] detected_genes$Comparison2 <- df.CvsA.det } - + if (!is.null(df.DvsA)) { df.DvsA.det <- df.DvsA.det[df.DvsA.det$ensembl %in% genes_vector, ] detected_genes$Comparison3 <- df.DvsA.det } - + return(detected_genes) } diff --git a/R/doclust_PA.R b/R/doclust_PA.R new file mode 100644 index 0000000..485e190 --- /dev/null +++ b/R/doclust_PA.R @@ -0,0 +1,601 @@ +# ============================================================================= +# doclust_PA.R +# Pathway Analysis — Jaccard similarity, hierarchical clustering, +# community detection, and super-term generation. +# +# Functions: +# geneset_similarity — Compute Jaccard similarity & distance matrices +# do_clust — Hierarchical clustering with silhouette selection +# get_superterm — TF-IDF super-term labels for gene set communities +# get_network_communities — Community detection + super-terms in one call +# ============================================================================= + +######################## +# Function geneset_similarity # +######################## + +#' Compute Jaccard similarity and distance matrices for gene sets +#' +#' Filters a named list of gene sets by a significance threshold and computes +#' pairwise Jaccard similarity and distance matrices for the retained sets. +#' The output object can be passed directly to [do_clust()], +#' [get_network_communities()], [network_clust()], or [network_clust_gg()], +#' or its individual slots can be used independently (e.g., `$dist_mat` for +#' UMAP, `$jaccard_sim` for custom visualizations). +#' +#' @param geneset_list A named list where each element is a character vector of +#' gene symbols belonging to that gene set. Typically the output of +#' [list_gmts()]. +#' @param results A data frame with at least two columns: `GeneSet` (gene set +#' names) and `FDR` (adjusted p-values). +#' @param fdr_th Numeric. FDR cutoff to retain significant gene sets. +#' Default: `0.05`. +#' +#' @return An object of class `JaccardResult`, a named list with three slots: +#' * `$jaccard_sim`: Numeric matrix of pairwise Jaccard similarities. +#' * `$dist_mat`: A `dist` object of 1 - Jaccard similarity, suitable for +#' clustering or UMAP. +#' * `$geneset_list`: Named list of gene sets retained after FDR filtering. +#' +#' @examples +#' geneset_list <- list( +#' KEGG_APOPTOSIS = c("TP53", "BCL2", "CASP3", "BAX"), +#' KEGG_CELL_CYCLE = c("CDK2", "CCND1", "TP53", "RB1"), +#' HALLMARK_HYPOXIA = c("HIF1A", "VEGFA", "LDHA", "BNIP3"), +#' HALLMARK_GLYCOLYSIS = c("LDHA", "ENO1", "PKM", "HIF1A") +#' ) +#' +#' results <- data.frame( +#' GeneSet = names(geneset_list), +#' FDR = c(0.01, 0.03, 0.04, 0.20) +#' ) +#' +#' # Only the first three gene sets pass the FDR threshold +#' jac <- geneset_similarity(geneset_list, results, fdr_th = 0.05) +#' +#' jac$jaccard_sim # similarity matrix +#' jac$dist_mat # distance object (usable in UMAP, clustering, etc.) +#' jac$geneset_list # filtered gene sets +#' +#' @seealso [list_gmts()], [do_clust()], [get_network_communities()], +#' [network_clust()], [network_clust_gg()] +#' @importFrom magrittr %>% +#' @importFrom rlang .data +#' @export + +geneset_similarity <- function(geneset_list, results, fdr_th = 0.05) { + + if (!is.list(geneset_list) || is.null(names(geneset_list))) { + stop("`geneset_list` must be a named list of character vectors.", call. = FALSE) + } + if (!is.data.frame(results)) { + stop("`results` must be a data frame.", call. = FALSE) + } + if (!all(c("GeneSet", "FDR") %in% colnames(results))) { + stop("`results` must contain columns named `GeneSet` and `FDR`.", call. = FALSE) + } + if (!is.numeric(fdr_th) || fdr_th <= 0 || fdr_th > 1) { + stop("`fdr_th` must be a numeric value between 0 and 1.", call. = FALSE) + } + + # Filter gene sets by FDR threshold + selected_sets <- results %>% + dplyr::filter(.data[["FDR"]] < fdr_th) %>% + dplyr::pull(.data[["GeneSet"]]) + + selected_sets <- intersect(selected_sets, names(geneset_list)) + + if (length(selected_sets) == 0) { + stop( + "No gene sets remain after FDR filtering. ", + "Consider increasing `fdr_th`.", + call. = FALSE + ) + } + + geneset_list <- geneset_list[selected_sets] + n <- length(geneset_list) + + # Build pairwise Jaccard similarity matrix + jaccard_sim <- matrix( + 0, + nrow = n, + ncol = n, + dimnames = list(names(geneset_list), names(geneset_list)) + ) + + for (i in seq_len(n)) { + for (j in seq_len(n)) { + a <- geneset_list[[i]] + b <- geneset_list[[j]] + u <- length(union(a, b)) + jaccard_sim[i, j] <- if (u == 0) 0 else length(intersect(a, b)) / u + } + } + + dist_mat <- stats::as.dist(1 - jaccard_sim) + + result <- list( + jaccard_sim = jaccard_sim, + dist_mat = dist_mat, + geneset_list = geneset_list + ) + class(result) <- "JaccardResult" + return(result) +} + + +##################### +# Function do_clust # +##################### + +#' Hierarchical clustering of gene sets with silhouette-based optimization +#' +#' Performs hierarchical clustering on a Jaccard distance matrix, selects the +#' optimal number of clusters by maximizing average silhouette width, and +#' returns cluster assignments, a silhouette ggplot2 object, and a +#' ComplexHeatmap with dendrogram. +#' +#' @param x A `JaccardResult` object (output of [geneset_similarity()]) or an +#' object of class `dist`. +#' @param method Agglomeration method passed to [stats::hclust()]. +#' Default: `"ward.D2"`. +#' @param max_k Maximum number of clusters to evaluate in silhouette analysis. +#' Default: `NULL`, which sets it automatically to `max(1, floor(n / 2))`. +#' +#' @return A named list with five elements: +#' * `$hclust`: The [stats::hclust()] object. +#' * `$cluster_assignments`: A [tibble::tibble()] with columns `NAME` and +#' `cluster`. +#' * `$optimal_k`: Integer. The optimal number of clusters. +#' * `$silhouette_plot`: A ggplot2 object of average silhouette width vs. k. +#' * `$heatmap`: A `ComplexHeatmap` object. Display with +#' `ComplexHeatmap::draw(result$heatmap)`. +#' +#' @examples +#' \dontrun{ +#' # Requires ComplexHeatmap and cluster packages +#' geneset_list <- list( +#' KEGG_APOPTOSIS = c("TP53", "BCL2", "CASP3", "BAX"), +#' KEGG_CELL_CYCLE = c("CDK2", "CCND1", "TP53", "RB1"), +#' HALLMARK_HYPOXIA = c("HIF1A", "VEGFA", "LDHA", "BNIP3"), +#' HALLMARK_GLYCOLYSIS = c("LDHA", "ENO1", "PKM", "HIF1A"), +#' KEGG_P53_PATHWAY = c("TP53", "MDM2", "CDKN1A", "BAX") +#' ) +#' +#' results <- data.frame( +#' GeneSet = names(geneset_list), +#' FDR = c(0.01, 0.02, 0.03, 0.04, 0.01) +#' ) +#' +#' jac <- geneset_similarity(geneset_list, results) +#' clust <- do_clust(jac) +#' +#' clust$silhouette_plot # ggplot2 silhouette curve +#' ComplexHeatmap::draw(clust$heatmap) # Jaccard heatmap with dendrogram +#' clust$optimal_k # selected number of clusters +#' clust$cluster_assignments # tibble: NAME | cluster +#' } +#' +#' @seealso [geneset_similarity()], [get_network_communities()], +#' [network_clust()], [network_clust_gg()] +#' @import ggplot2 +#' @importFrom rlang .data +#' @export + +do_clust <- function(x, method = "ward.D2", max_k = NULL) { + + if (!requireNamespace("cluster", quietly = TRUE)) { + stop("Package \"cluster\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) { + stop("Package \"ComplexHeatmap\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("circlize", quietly = TRUE)) { + stop("Package \"circlize\" must be installed to use this function.", call. = FALSE) + } + + # Extract distance and similarity matrices + if (inherits(x, "JaccardResult")) { + dist_mat <- x$dist_mat + jaccard_sim <- x$jaccard_sim + } else if (inherits(x, "dist")) { + dist_mat <- x + jaccard_sim <- 1 - as.matrix(x) + } else { + stop( + "`x` must be a `JaccardResult` object (output of `geneset_similarity()`) ", + "or an object of class `dist`.", + call. = FALSE + ) + } + + n <- attr(dist_mat, "Size") + if (n < 3) stop("At least 3 gene sets are needed for clustering.", call. = FALSE) + + if (is.null(max_k)) max_k <- max(1L, floor(n / 2L)) + if (max_k >= n) max_k <- n - 1L + if (max_k < 2L) max_k <- 2L + + # Hierarchical clustering + hc <- stats::hclust(dist_mat, method = method) + + # Silhouette optimisation across k = 2 ... max_k + sil_scores <- vapply(2:max_k, function(k) { + cl <- stats::cutree(hc, k = k) + mean(cluster::silhouette(cl, dist_mat)[, 3]) + }, numeric(1)) + + optimal_k <- which.max(sil_scores) + 1L + + # Silhouette ggplot2 object + df_sil <- tibble::tibble(k = 2:max_k, silhouette = sil_scores) + + p_sil <- ggplot(df_sil, aes(x = .data[["k"]], y = .data[["silhouette"]])) + + geom_line(color = "gray40") + + geom_point(color = "gray40") + + geom_point( + data = df_sil[df_sil$k == optimal_k, ], + color = "red", size = 3 + ) + + geom_text( + data = df_sil[df_sil$k == optimal_k, ], + aes(label = paste0("k = ", .data[["k"]])), + vjust = -1, color = "red", size = 4 + ) + + labs( + x = "Number of clusters (k)", + y = "Average silhouette width", + title = "Silhouette analysis" + ) + + theme_bw() + + theme(plot.title = element_text(size = 16, hjust = 0.5)) + + # Cluster assignments tibble + cluster_assignments <- tibble::tibble( + NAME = labels(dist_mat), + cluster = stats::cutree(hc, k = optimal_k) + ) + + # Heatmap with dendrogram + ht <- ComplexHeatmap::Heatmap( + as.matrix(jaccard_sim), + name = "Jaccard", + cluster_rows = hc, + cluster_columns = hc, + show_row_names = FALSE, + show_column_names = FALSE, + col = circlize::colorRamp2(c(0, 1), c("white", "blue")) + ) + + return(list( + hclust = hc, + cluster_assignments = cluster_assignments, + optimal_k = optimal_k, + silhouette_plot = p_sil, + heatmap = ht + )) +} + + +######################## +# Function get_superterm # +######################## + +#' Generate representative super-term labels for gene set communities +#' +#' For each community in a gene set network, applies **TF-IDF** (Term +#' Frequency-Inverse Document Frequency) weighting to the words present in gene +#' set names to produce a short, representative label called a *super-term*. +#' +#' **How TF-IDF works here:** each gene set name is treated as a document and +#' each word as a term. TF-IDF upweights words that are frequent within a +#' community but rare across all communities, making the resulting label +#' specific to that cluster rather than generic. A frequency-based fallback is +#' used when TF-IDF returns no terms (e.g., very small communities). +#' +#' Common pathway words (`"pathway"`, `"signaling"`, `"regulation"`, etc.) and +#' standard English stopwords are removed before scoring. +#' +#' **Note:** this function is most easily used through +#' [get_network_communities()], which handles community detection and calls +#' `get_superterm()` internally. If you prefer to call it directly, you need a +#' community membership vector: +#' +#' ```r +#' adj <- (jac$jaccard_sim > 0.3) * 1 +#' g <- igraph::graph_from_adjacency_matrix(adj, mode = "undirected") +#' comm <- igraph::cluster_louvain(g) +#' membership <- igraph::membership(comm) +#' +#' st <- get_superterm( +#' geneset_names = names(membership), +#' community_membership = membership +#' ) +#' ``` +#' +#' @param geneset_names Character vector of gene set names (nodes in the +#' network). +#' @param community_membership A named numeric or integer vector mapping each +#' gene set to its community ID. Typically the output of +#' [igraph::membership()] applied to a community detection result (e.g., +#' [igraph::cluster_louvain()]). Must have the same length as +#' `geneset_names`. See [get_network_communities()] for a simpler workflow. +#' @param n_terms Integer. Number of top TF-IDF terms to include in each label. +#' Default: `3`. +#' @param remove_prefix Logical. If `TRUE`, removes the text before the first +#' underscore in gene set names (e.g., strips the `"KEGG_"` prefix from +#' `"KEGG_GLYCOLYSIS"`). Default: `TRUE`. +#' +#' @return A named list with two elements: +#' * `$mapping`: A [tibble::tibble()] with columns `geneset`, `community`, +#' and `superterm` — one row per gene set, sorted by community. +#' * `$summary`: A [tibble::tibble()] with columns `community`, `superterm`, +#' and `n_genesets` — one row per community, sorted by decreasing size. +#' +#' @examples +#' \dontrun{ +#' # Recommended: use get_network_communities() which calls this internally +#' net <- get_network_communities(jac, threshold = 0.3) +#' net$superterms$mapping +#' net$superterms$summary +#' +#' # Direct usage with a pre-computed membership vector +#' adj <- (jac$jaccard_sim > 0.3) * 1 +#' g <- igraph::graph_from_adjacency_matrix(adj, mode = "undirected") +#' comm <- igraph::cluster_louvain(g) +#' membership <- igraph::membership(comm) +#' +#' st <- get_superterm( +#' geneset_names = names(membership), +#' community_membership = membership, +#' n_terms = 3, +#' remove_prefix = TRUE +#' ) +#' +#' st$mapping # per-gene-set labels +#' st$summary # per-community summary +#' } +#' +#' @seealso [get_network_communities()], [network_clust()], [network_clust_gg()] +#' @importFrom magrittr %>% +#' @importFrom rlang .data +#' @export + +get_superterm <- function(geneset_names, community_membership, + n_terms = 3, remove_prefix = TRUE) { + + if (!requireNamespace("tm", quietly = TRUE)) { + stop("Package \"tm\" must be installed to use this function.", call. = FALSE) + } + if (!is.character(geneset_names) || length(geneset_names) == 0) { + stop("`geneset_names` must be a non-empty character vector.", call. = FALSE) + } + if (!is.numeric(community_membership) && !is.integer(community_membership)) { + stop("`community_membership` must be a numeric or integer vector.", call. = FALSE) + } + if (length(geneset_names) != length(community_membership)) { + stop( + "`geneset_names` and `community_membership` must have the same length.", + call. = FALSE + ) + } + + # Pathway-specific stopwords removed before TF-IDF scoring + pathway_stopwords <- c( + "pathway", "pathways", "signaling", "signal", "regulation", + "positive", "negative", "activity", "process", "involved", + "via", "mediated", "induced", "related", "associated", + "dependent", "independent" + ) + + unique_comms <- sort(unique(community_membership)) + + superterms <- vapply(unique_comms, function(comm_id) { + + nodes <- geneset_names[community_membership == comm_id] + + if (length(nodes) == 0) return("Empty_Cluster") + + # Single gene set: split name into words directly + if (length(nodes) == 1) { + clean <- if (remove_prefix) sub("^[^_]+_", "", nodes) else nodes + words <- strsplit(clean, "[_[:space:]]+")[[1]] + return(paste(utils::head(words, 3), collapse = "_")) + } + + # Multiple gene sets: TF-IDF pipeline + cleaned <- if (remove_prefix) gsub("^[^_]+_", "", nodes) else nodes + cleaned <- tolower(gsub("_", " ", cleaned)) + + corpus <- tm::Corpus(tm::VectorSource(cleaned)) + corpus <- tm::tm_map(corpus, tm::content_transformer(tolower)) + corpus <- tm::tm_map(corpus, tm::removePunctuation) + corpus <- tm::tm_map(corpus, tm::removeNumbers) + corpus <- tm::tm_map(corpus, tm::removeWords, + c(tm::stopwords("english"), pathway_stopwords)) + corpus <- tm::tm_map(corpus, tm::stripWhitespace) + + dtm <- tm::DocumentTermMatrix(corpus, + control = list(weighting = tm::weightTfIdf)) + term_freq <- colSums(as.matrix(dtm)) + top_terms <- utils::head(sort(term_freq, decreasing = TRUE), n_terms) + + # Fallback: frequency-based when TF-IDF returns nothing + if (length(top_terms) == 0) { + all_words <- unlist(strsplit(cleaned, "\\s+")) + all_words <- all_words[nchar(all_words) > 3] + if (length(all_words) == 0) return("Cluster") + word_freq <- sort(table(all_words), decreasing = TRUE) + top_term_names <- names(utils::head(word_freq, n_terms)) + } else { + top_term_names <- names(top_terms) + } + + # Capitalize first letter of each term + top_term_names <- vapply(top_term_names, function(x) { + paste0(toupper(substring(x, 1, 1)), substring(x, 2)) + }, character(1)) + + paste(top_term_names, collapse = "/") + + }, character(1)) + + names(superterms) <- as.character(unique_comms) + + mapping <- tibble::tibble( + geneset = geneset_names, + community = as.integer(community_membership), + superterm = superterms[as.character(community_membership)] + ) %>% + dplyr::arrange(.data[["community"]], .data[["geneset"]]) + + summary_tbl <- mapping %>% + dplyr::group_by(.data[["community"]], .data[["superterm"]]) %>% + dplyr::summarise(n_genesets = dplyr::n(), .groups = "drop") %>% + dplyr::arrange(dplyr::desc(.data[["n_genesets"]])) + + return(list( + mapping = mapping, + summary = summary_tbl + )) +} + + +################################ +# Function get_network_communities # +################################ + +#' Detect gene set communities and generate super-term labels +#' +#' Convenience wrapper that builds a binary adjacency network from a Jaccard +#' similarity matrix, runs a community-detection algorithm, and optionally +#' generates super-term labels for each community via [get_superterm()]. +#' Designed to be the single step between [geneset_similarity()] and the network +#' plotting functions [network_clust()] / [network_clust_gg()]. +#' +#' @param x A `JaccardResult` object (output of [geneset_similarity()]). +#' @param threshold Numeric between 0 and 1. Gene set pairs with a Jaccard +#' similarity above this value are connected in the network. Default: `0.3`. +#' @param method Character. Community detection algorithm to use. One of: +#' * `"louvain"` — [igraph::cluster_louvain()]: fast, recommended for most +#' use cases. Default. +#' * `"fast_greedy"` — [igraph::cluster_fast_greedy()]: optimizes modularity +#' greedily, works well on mid-size networks. +#' * `"walktrap"` — [igraph::cluster_walktrap()]: random-walk approach, +#' tends to find smaller, tighter communities. +#' @param superterms Logical. If `TRUE`, calls [get_superterm()] and includes +#' its output in `$superterms`. Default: `TRUE`. +#' @param n_terms Integer. Number of top TF-IDF terms per super-term label. +#' Passed to [get_superterm()]. Default: `3`. +#' @param remove_prefix Logical. Remove database prefix before the first +#' underscore (e.g., `"KEGG_"`). Passed to [get_superterm()]. Default: `TRUE`. +#' @param seed Integer. Random seed for reproducible community detection. +#' Default: `174`. +#' +#' @return A named list with four elements: +#' * `$communities`: The igraph communities object. +#' * `$membership`: Named integer vector of community IDs, one per gene set. +#' * `$adjacency_matrix`: Binary matrix (`1` if Jaccard > `threshold`). +#' * `$superterms`: Output of [get_superterm()] with `$mapping` and +#' `$summary`. `NULL` if `superterms = FALSE`. +#' +#' @examples +#' \dontrun{ +#' gsl <- list_gmts("path/to/gmt_folder/") +#' res <- read.csv("path/to/results.csv") +#' +#' # Full workflow +#' jac <- geneset_similarity(gsl, res, fdr_th = 0.05) +#' clust <- do_clust(jac) +#' net <- get_network_communities(jac, threshold = 0.3, method = "louvain") +#' +#' net$membership # community ID per gene set +#' net$superterms$mapping # gene set -> superterm +#' net$superterms$summary # community sizes and labels +#' +#' # Pass results to network plots +#' plots <- network_clust_gg( +#' jac, +#' clust_result = clust, +#' superterms = TRUE, +#' superterm_data = net$superterms +#' ) +#' plots$combined +#' } +#' +#' @seealso [geneset_similarity()], [do_clust()], [get_superterm()], +#' [network_clust()], [network_clust_gg()] +#' @importFrom magrittr %>% +#' @export + +get_network_communities <- function(x, + threshold = 0.3, + method = "louvain", + superterms = TRUE, + n_terms = 3, + remove_prefix = TRUE, + seed = 174) { + + if (!requireNamespace("igraph", quietly = TRUE)) { + stop("Package \"igraph\" must be installed to use this function.", call. = FALSE) + } + if (!inherits(x, "JaccardResult")) { + stop( + "`x` must be a `JaccardResult` object (output of `geneset_similarity()`).", + call. = FALSE + ) + } + if (!is.numeric(threshold) || threshold <= 0 || threshold >= 1) { + stop("`threshold` must be a numeric value strictly between 0 and 1.", call. = FALSE) + } + + method <- match.arg(method, c("louvain", "fast_greedy", "walktrap")) + + # Build binary adjacency matrix + adjacency_matrix <- (x$jaccard_sim > threshold) * 1L + diag(adjacency_matrix) <- 0L + + g <- igraph::graph_from_adjacency_matrix(adjacency_matrix, mode = "undirected") + + if (igraph::vcount(g) == 0) { + stop("The network has no nodes. Check that `x` contains gene sets.", call. = FALSE) + } + + # Community detection + set.seed(seed) + communities <- switch( + method, + louvain = igraph::cluster_louvain(g), + fast_greedy = igraph::cluster_fast_greedy(g), + walktrap = igraph::cluster_walktrap(g) + ) + + membership <- igraph::membership(communities) + + # Super-terms + st_result <- NULL + if (superterms) { + if (!requireNamespace("tm", quietly = TRUE)) { + warning( + "Package \"tm\" is required to generate super-terms. ", + "Install it with install.packages(\"tm\") or set `superterms = FALSE`.", + call. = FALSE + ) + } else { + st_result <- get_superterm( + geneset_names = names(membership), + community_membership = membership, + n_terms = n_terms, + remove_prefix = remove_prefix + ) + } + } + + return(list( + communities = communities, + membership = membership, + adjacency_matrix = adjacency_matrix, + superterms = st_result + )) +} diff --git a/R/get_annotations.R b/R/get_annotations.R index 6d89c0f..ff25e15 100644 --- a/R/get_annotations.R +++ b/R/get_annotations.R @@ -18,6 +18,43 @@ #' @param format The output is saved in .csv or .xlsx formats. Default = csv. #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @return A data frame with one row per input ID and the following columns: +#' `geneID`, `symbol`, `biotype`, `chromosome`, `gene_start`, `gene_end`, +#' `gene_length`, `description`. For `mode = "transcripts"`, an additional +#' `transcriptID` column is included. The data frame is also saved to disk +#' as a `.csv` or `.xlsx` file (see `filename` and `format`). +#' +#' @examples +#' \dontrun{ +#' # Annotate genes from Normalized counts (requires internet connection) +#' data(norm_counts) +#' +#' # Requires a reference table with a "geneID" column. +#' # Use get_annotations() to generate it: +#' annotations <- get_annotations( +#' ensembl_ids = rownames(norm_counts), +#' mode = "genes" +#' ) +#' +#' head(annotations) +#' +#' # Use with add_annotations() +#' norm_counts_annot <- add_annotations( +#' object = norm_counts, +#' reference = annotations, +#' variables = c("symbol", "biotype") +#' ) +#' } +#' +#' @note Requires an active internet connection to query the Ensembl BioMart. +#' `gene_length` is computed as `gene_end - gene_start + 1` (genomic length). +#' For TPM calculation with [tpm()], this is an approximation, +#' use transcript-level lengths for higher accuracy. +#' +#' @seealso [add_annotations()] to join annotations to a counts matrix; +#' [tpm()] which requires gene lengths from this function. +#' #' @export get_annotations <- function(ensembl_ids, mode = "genes", filename = "gene_annotations", version = "Current", format = "csv") { @@ -92,9 +129,10 @@ get_annotations <- function(ensembl_ids, mode = "genes", filename = "gene_annota ) } - openxlsx::write.xlsx(df, file = paste0(filename, ".xlsx"), colNames = T, rowNames = F, append = F) + openxlsx::write.xlsx(df, file = paste0(filename, ".xlsx"), + colNames = TRUE, rowNames = FALSE, append = FALSE) } else { - utils::write.csv(df, rowNames = F, file = paste0(filename, ".csv")) + utils::write.csv(df, row.names = FALSE, file = paste0(filename, ".csv")) } return(df) diff --git a/R/get_genes_PA.R b/R/get_genes_PA.R new file mode 100644 index 0000000..92ca2c3 --- /dev/null +++ b/R/get_genes_PA.R @@ -0,0 +1,344 @@ +###################### +# Function getgenesPA # +###################### + +#' Extract gene members from pathway analysis results +#' +#' For each gene set in a pathway analysis results table, retrieves leading +#' edge genes, a user-defined top fraction of genes, all genes in the gene +#' set, or any combination. All gene lists are ordered by their rank in the +#' provided ranked gene list. +#' +#' **Three extraction modes:** +#' +#' * `"le"`: **GSEA output only.** Leading edge genes: the subset of genes +#' that drives the enrichment signal. Size is computed as +#' `round(SIZE * tags)`, where `tags` is the fraction of gene hits before +#' (positive ES) or after (negative ES) the peak in the running enrichment +#' score. Definition from the GSEA User Guide: *"The percentage of gene hits +#' before (for positive ES) or after (for negative ES) the peak in the +#' running enrichment score. This gives an indication of the percentage of +#' genes contributing to the enrichment score."* +#' (\url{https://docs.gsea-msigdb.org/#GSEA/GSEA_User_Guide/}). +#' Requires columns `SIZE` and `tags` in `pa_data`, produced automatically +#' by [merge_PA()]. +#' +#' * `"top"`: **Any enrichment result (GSEA, CAMERA, PADOG, etc.).** +#' A user-defined top fraction of genes ordered by rank. Size is computed as +#' `round(SIZE * top)`, where `top` is a numeric value between 0 and 1 +#' provided in a `top` column of `pa_data`. This does **not** represent true +#' leading edge genes: it is a flexible, rank-based selection suitable for +#' exploratory visualization with any pathway analysis method. +#' Requires columns `SIZE` and `top` in `pa_data`. +#' +#' * `"all"`: All genes in the gene set, ordered by rank. +#' +#' @param pa_data A data frame of pathway analysis results. Must always +#' contain: +#' * `NAME`: gene set name. +#' +#' Additionally required depending on `genes`: +#' * `SIZE`: number of genes in the gene set. Required for `"le"` and +#' `"top"`. +#' * `tags`: numeric fraction (0-1) of genes contributing to the +#' enrichment score (GSEA leading edge). Produced automatically by +#' [merge_PA()]. Required for `genes = "le"`. +#' * `top`: numeric fraction (0-1) defining the proportion of top-ranked +#' genes to extract. Set manually by the user (e.g., +#' `pa_data$top <- 0.25` for the top 25%). Required for `genes = "top"`. +#' +#' Typically the output of [merge_PA()]. +#' @param geneset_list A named list of gene sets, where each element is a +#' character vector of gene symbols. Typically the output of [list_gmts()], +#' or use the built-in [geneset_list] for quick testing. +#' @param ranked_genes A character vector of gene symbols ordered by their +#' ranking metric (e.g., DESeq2 `stat`, log2FC, or signal-to-noise ratio), +#' from most positive to most negative. Non-significant genes fall in the +#' middle of the list. Used to order genes within each extracted set. +#' @param genes Character vector specifying which extraction mode(s) to use. +#' Any combination of `"all"`, `"le"`, and `"top"`. Default: +#' `c("all", "le")`. +#' +#' @return Depends on `genes`: +#' * Single mode (e.g., `genes = "le"`): a named list where each element +#' is a character vector of gene symbols. The list has an attribute +#' `genes_type` used by [addgenesPA()] to name the output column. +#' * Multiple modes (e.g., `genes = c("all", "le", "top")`): a named list +#' with one element per requested mode: +#' * `$all`: named list of all gene set members. +#' * `$le`: named list of leading edge genes (GSEA only). +#' * `$top`: named list of top-ranked genes. +#' +#' @examples +#' \dontrun{ +#' data(gsea_results) +#' data(geneset_list) +#' data(deseq2_results) +#' +#' #or +#' gsl <- list_gmts("path/to/gmt_folder/") +#' +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' +#' # ── GSEA results: all three modes available +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le", "top")) +#' +#' # But first add the top column (e.g. top 30% of genes by rank) +#' pa_single$top <- 0.30 +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le", "top")) +#' +#' gene_lists$le[["KEGG_APOPTOSIS"]] # leading edge genes +#' gene_lists$top[["KEGG_APOPTOSIS"]] # top 30% by rank +#' gene_lists$all[["KEGG_APOPTOSIS"]] # all genes +#' +#' pa_annot <- addgenesPA(pa_single, gene_lists) +#' head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) +#' +#' # ── CAMERA results: use "top" (no leading edge available) ─── +#' data(camera_results) +#' camera_pa <- camera_results +#' colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +#' camera_pa$SIZE <- sapply(camera_pa$NAME, +#' function(x) length(geneset_list[[x]])) +#' camera_pa$top <- 0.25 # top 25% by rank +#' +#' gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, +#' genes = c("all", "top")) +#' pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +#' head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +#' } +#' +#' @seealso [addgenesPA()] to append gene columns to pa_data; +#' [heatmap_PA()] for heatmap visualization; +#' [list_gmts()] to generate `geneset_list`; +#' [merge_PA()] to generate `pa_data` with the required `tags` column. +#' +#' @export + +getgenesPA <- function(pa_data, geneset_list, ranked_genes, + genes = c("all", "le")) { + + genes <- match.arg(genes, choices = c("all", "le", "top"), several.ok = TRUE) + + # --- Input validation ---- + if (!is.data.frame(pa_data)) { + stop("`pa_data` must be a data frame.", call. = FALSE) + } + if (!"NAME" %in% colnames(pa_data)) { + stop("`pa_data` must contain a column named 'NAME'.", call. = FALSE) + } + if ("le" %in% genes && !all(c("SIZE", "tags") %in% colnames(pa_data))) { + stop( + "`pa_data` must contain columns 'SIZE' and 'tags' when genes = 'le'. ", + "These are produced by merge_PA() from GSEA output. ", + "For non-GSEA results, use genes = 'top' and add: ", + "pa_data$SIZE <- ...; pa_data$top <- 0.25", + call. = FALSE + ) + } + if ("top" %in% genes && !all(c("SIZE", "top") %in% colnames(pa_data))) { + stop( + "`pa_data` must contain columns 'SIZE' and 'top' when genes = 'top'. ", + "Add them manually: pa_data$SIZE <- ...; pa_data$top <- 0.25", + call. = FALSE + ) + } + if (!is.list(geneset_list) || is.null(names(geneset_list))) { + stop("`geneset_list` must be a named list. Use list_gmts() to generate it.", + call. = FALSE) + } + if (!is.character(ranked_genes) || length(ranked_genes) == 0) { + stop("`ranked_genes` must be a non-empty character vector.", call. = FALSE) + } + + common_sets <- intersect(pa_data$NAME, names(geneset_list)) + if (length(common_sets) == 0) { + stop( + "No gene set names in `pa_data$NAME` match `geneset_list`. ", + "Check that both use the same naming convention.", + call. = FALSE + ) + } + + # --- Internal helper: order genes by rank and take top n - + .extract_top_n <- function(all_g, n, ranked_genes) { + if (is.na(n) || n <= 0) return(character(0)) + gene_ranks <- match(all_g, ranked_genes) + ordered_g <- all_g[order(gene_ranks, na.last = TRUE)] + utils::head(ordered_g, n) + } + + result <- list() + + # --- all: every gene in the gene set ordered by rank + if ("all" %in% genes) { + result$all <- stats::setNames( + lapply(common_sets, function(gs) { + all_g <- geneset_list[[gs]] + gene_ranks <- match(all_g, ranked_genes) + all_g[order(gene_ranks, na.last = TRUE)] + }), + common_sets + ) + } + + # --- le: leading edge genes (GSEA only, uses tags column) + if ("le" %in% genes) { + result$le <- stats::setNames( + lapply(common_sets, function(gs) { + row <- pa_data[pa_data$NAME == gs, ][1, ] + all_g <- geneset_list[[gs]] + # Leading edge size from tags (fraction contributing to ES peak) + le_size <- round(as.numeric(row$SIZE) * as.numeric(row$tags)) + .extract_top_n(all_g, le_size, ranked_genes) + }), + common_sets + ) + } + + # --- top: user-defined top fraction by rank (any enrichment method) - + if ("top" %in% genes) { + result$top <- stats::setNames( + lapply(common_sets, function(gs) { + row <- pa_data[pa_data$NAME == gs, ][1, ] + all_g <- geneset_list[[gs]] + top_size <- round(as.numeric(row$SIZE) * as.numeric(row$top)) + .extract_top_n(all_g, top_size, ranked_genes) + }), + common_sets + ) + } + + # --- Simplify output if only one mode requested ----- + if (length(genes) == 1) { + out <- result[[genes]] + attr(out, "genes_type") <- genes + return(out) + } + + return(result) +} + + +###################### +# Function addgenesPA # +###################### + +#' Add gene columns to pathway analysis results +#' +#' Appends `all_genes`, `le_genes`, and/or `top_genes` columns to a pathway +#' analysis results data frame based on the output of [getgenesPA()]. +#' Gene symbols within each cell are comma-separated. Automatically detects +#' which column(s) to add based on the structure of the input. +#' +#' @param pa_data A data frame of pathway analysis results containing a `NAME` +#' column. Typically the output of [merge_PA()]. +#' @param gene_lists Output of [getgenesPA()]. Can be: +#' * A list with `$all`, `$le`, and/or `$top` slots: when multiple modes +#' are requested (e.g., `getgenesPA(..., genes = c("all", "le", "top"))`). +#' Adds the corresponding columns. +#' * A flat named list with attribute `genes_type`: when a single mode is +#' requested. Adds the corresponding column (`all_genes`, `le_genes`, or +#' `top_genes`). +#' +#' @return The input `pa_data` data frame with one or more additional columns: +#' * `all_genes`: comma-separated string of all gene set members ordered by +#' rank. +#' * `le_genes`: comma-separated string of leading edge genes (GSEA only), +#' ordered by rank. +#' * `top_genes`: comma-separated string of top-ranked genes based on the +#' user-defined `top` fraction. +#' +#' Gene sets not found in `gene_lists` receive `NA`. +#' +#' @examples +#' \dontrun{ +#' data(gsea_results) +#' data(geneset_list) +#' data(deseq2_results) +#' +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' pa_single$top <- 0.30 +#' +#' # Add all three columns +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le", "top")) +#' pa_annot <- addgenesPA(pa_single, gene_lists) +#' head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) +#' +#' # Add only leading edge genes +#' le_only <- getgenesPA(pa_single, geneset_list, ranked, genes = "le") +#' pa_annot <- addgenesPA(pa_single, le_only) +#' head(pa_annot[, c("NAME", "le_genes")]) +#' +#' # CAMERA: add only top and all (no leading edge) +#' data(camera_results) +#' camera_pa <- camera_results +#' colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +#' camera_pa$SIZE <- sapply(camera_pa$NAME, +#' function(x) length(geneset_list[[x]])) +#' camera_pa$top <- 0.25 +#' gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, +#' genes = c("all", "top")) +#' pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +#' head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +#' } +#' +#' @seealso [getgenesPA()] to generate `gene_lists`; +#' [heatmap_PA()] for heatmap visualization; +#' [save_results()] to export the annotated results. +#' +#' @export + +addgenesPA <- function(pa_data, gene_lists) { + + if (!is.data.frame(pa_data)) { + stop("`pa_data` must be a data frame.", call. = FALSE) + } + if (!"NAME" %in% colnames(pa_data)) { + stop("`pa_data` must contain a column named 'NAME'.", call. = FALSE) + } + if (!is.list(gene_lists)) { + stop("`gene_lists` must be a list, output of getgenesPA().", call. = FALSE) + } + + # Helper: collapse gene vector to comma-separated string per gene set + .collapse <- function(named_list, set_names) { + vapply(set_names, function(gs) { + if (!gs %in% names(named_list)) return(NA_character_) + g <- named_list[[gs]] + if (length(g) == 0) return(NA_character_) + paste(g, collapse = ",") + }, character(1)) + } + + # Column name mapping per genes_type + col_map <- c(all = "all_genes", le = "le_genes", top = "top_genes") + + has_all_slot <- "all" %in% names(gene_lists) && is.list(gene_lists$all) + has_le_slot <- "le" %in% names(gene_lists) && is.list(gene_lists$le) + has_top_slot <- "top" %in% names(gene_lists) && is.list(gene_lists$top) + genes_type <- attr(gene_lists, "genes_type") + + if (has_all_slot || has_le_slot || has_top_slot) { + if (has_all_slot) pa_data$all_genes <- .collapse(gene_lists$all, pa_data$NAME) + if (has_le_slot) pa_data$le_genes <- .collapse(gene_lists$le, pa_data$NAME) + if (has_top_slot) pa_data$top_genes <- .collapse(gene_lists$top, pa_data$NAME) + } else if (!is.null(genes_type) && genes_type %in% names(col_map)) { + pa_data[[col_map[[genes_type]]]] <- .collapse(gene_lists, pa_data$NAME) + } else { + stop( + "`gene_lists` structure not recognized. ", + "Pass the direct output of getgenesPA().", + call. = FALSE + ) + } + + return(pa_data) +} diff --git a/R/get_stars.R b/R/get_stars.R index a2a2298..82a7f83 100644 --- a/R/get_stars.R +++ b/R/get_stars.R @@ -9,6 +9,41 @@ #' @param geneID Ensembl ID of the gene of interest. #' @param object DESeq2 results object of a comparison. #' @param thresholds Vector with 4 values of significance. Default c(0.001, 0.01, 0.1, 0.25). +#' +#' @return A single character string: `"****"`, `"***"`, `"**"`, `"*"`, +#' `"ns"` (not significant), or `"Gene ID not found"` if the gene is absent +#' from `object`. +#' +#' @examples +#' data(deseq2_results) +#' +#' # get_stars expects a column named "ensembl" +#' res <- deseq2_results +#' colnames(res)[colnames(res) == "gene_id"] <- "ensembl" +#' +#' # Get significance stars for the most significant gene +#' get_stars( +#' geneID = res$ensembl[1], +#' object = res +#' ) +#' +#' # Custom thresholds +#' get_stars( +#' geneID = res$ensembl[1], +#' object = res, +#' thresholds = c(0.001, 0.01, 0.05, 0.10) +#' ) +#' +#' # Non-significant gene +#' get_stars( +#' geneID = res$ensembl[nrow(res)], +#' object = res +#' ) +#' +#' @seealso [detect_filter()] to identify detectable genes before annotating; +#' [nice_VSB()] where significance stars can be added to plots; +#' [deseq2_results] for an example input. +#' #' @export get_stars <- function(geneID, object, thresholds = c(0.001, 0.01, 0.1, 0.25)) diff --git a/R/gsea_results.R b/R/gsea_results.R new file mode 100644 index 0000000..0959e3e --- /dev/null +++ b/R/gsea_results.R @@ -0,0 +1,86 @@ +###################### +# gsea_results data # +###################### + +#' Simulated GSEA pathway analysis results for TCGA-LUAD +#' +#' A simulated data frame representing the output of [merge_PA()] for three +#' pairwise comparisons of TCGA-LUAD samples across 40 gene sets from three +#' MSigDB collections (HALLMARK, KEGG, GO). Gene sets and gene memberships are +#' derived from [geneset_list]. NES values and FDR are simulated with +#' `set.seed(174)` to produce realistic enrichment patterns, where ~60% of +#' gene sets per comparison are significant (FDR < 0.05). +#' +#' This dataset is designed to demonstrate [splot_PA()], [multiplot_PA()], +#' [getgenesPA()], [addgenesPA()], and [heatmap_PA()] without requiring +#' external GSEA output files. +#' +#' @format A data frame with 120 rows (40 gene sets x 3 comparisons) and 15 +#' columns: +#' \describe{ +#' \item{NAME}{Character. Gene set name, matching the names in +#' [geneset_list].} +#' \item{SIZE}{Integer. Number of genes in the gene set.} +#' \item{ES}{Numeric. Enrichment score.} +#' \item{NES}{Numeric. Normalized enrichment score.} +#' \item{NOM p-val}{Numeric. Nominal p-value.} +#' \item{FDR}{Numeric. False discovery rate. Approximately 60% of gene +#' sets per comparison have FDR < 0.05.} +#' \item{FWER p-val}{Numeric. Family-wise error rate.} +#' \item{RANK AT MAX}{Integer. Gene rank at maximum enrichment score.} +#' \item{Log10FDR}{Numeric. `-log10(FDR)`.} +#' \item{tags}{Numeric. Fraction of gene set in the leading edge (0-1).} +#' \item{list}{Numeric. Fraction of the ranked list used (0-1).} +#' \item{signal}{Numeric. Enrichment signal strength (0-1).} +#' \item{LEADING EDGE}{Character. Leading edge string in GSEA format +#' (e.g., `"tags=20%, list=35%, signal=15%"`).} +#' \item{COLLECTION}{Character. MSigDB collection name: `"HALLMARK"`, +#' `"KEGG"`, or `"GO"`.} +#' \item{COMPARISON}{Character. Comparison name: `"TumorVsNormal"`, +#' `"MetastasisVsNormal"`, or `"MetastasisVsTumor"`.} +#' } +#' +#' @source Simulated with `set.seed(174)` in `data-raw/gsea_results.R`. +#' Gene set names and memberships derived from [geneset_list]. NES values +#' and significance are simulated to reflect realistic GSEA output patterns. +#' +#' @examples +#' data(gsea_results) +#' +#' # Overview +#' dim(gsea_results) +#' table(gsea_results$COMPARISON) +#' table(gsea_results$COLLECTION) +#' +#' # How many gene sets are significant per comparison? +#' tapply(gsea_results$FDR < 0.05, gsea_results$COMPARISON, sum) +#' +#' # Single comparison plot +#' single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' \dontrun{ +#' splot_PA( +#' data = single, +#' geneset_col = "NAME", +#' collection_col = "COLLECTION", +#' nes_col = "NES", +#' fdr_col = "FDR" +#' ) +#' } +#' +#' # Multi-comparison plot +#' \dontrun{ +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "NAME", +#' fdr_col = "FDR", +#' comparison_order = c("TumorVsNormal", "MetastasisVsNormal", +#' "MetastasisVsTumor") +#' ) +#' } +#' +#' @seealso [merge_PA()] which produces this format from real GSEA output; +#' [splot_PA()], [multiplot_PA()] for visualization; +#' [getgenesPA()], [addgenesPA()] for gene-level annotation; +#' [geneset_list] for the gene set memberships used here. +"gsea_results" diff --git a/R/heatmap_GSEA.R b/R/heatmap_GSEA.R deleted file mode 100644 index 40ca338..0000000 --- a/R/heatmap_GSEA.R +++ /dev/null @@ -1,165 +0,0 @@ -######################### -# Function heatmap_GSEA # -######################### - -#' Plot leading edge heatmaps from GSEA results. -#' -#' Generates heatmaps of leading edge genes for each gene set from GSEA output. -#' -#' @param main_dir Optional base directory. If supplied, it will be prepended to all relative file paths. -#' @param expression_file Path to the expression data file (tab-delimited) or relative to main_dir. -#' @param metadata_file Path to the metadata file (Excel) or relative to main_dir. -#' @param gmt_file Path to the GMT file defining gene sets or relative to main_dir. -#' @param ranked_genes_file Path to the ranked genes list file or relative to main_dir. -#' @param gsea_file Path to the GSEA results file with leading edge genes or relative to main_dir. -#' @param output_dir Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps". -#' @param sample_col Name of the sample ID column in metadata; default "Sample". -#' @param group_col Name of the group column in metadata; default "group". -#' @param save_dataframe Logical; if TRUE, saves the merged data frame as TSV before plotting. -#' @return Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. -#' @export - -heatmap_GSEA <- function(main_dir = NULL, expression_file, metadata_file, gmt_file, - ranked_genes_file, gsea_file, output_dir = "leading_edge_heatmaps", - sample_col = "Sample", group_col = "group", save_dataframe = FALSE) -{ - # Ensure required packages are installed - if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("grDevices", quietly = TRUE)) stop("Package \"grDevices\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("tidyselect", quietly = TRUE)) stop("Package \"tidyselect\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("openxlsx", quietly = TRUE)) stop("Package \"openxlsx\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package \"pheatmap\" must be installed to use this function.", call. = FALSE) - - # Prepend base directory if provided - if (!is.null(main_dir)) { - expression_file <- file.path(main_dir, expression_file) - metadata_file <- file.path(main_dir, metadata_file) - gmt_file <- file.path(main_dir, gmt_file) - ranked_genes_file <- file.path(main_dir, ranked_genes_file) - gsea_file <- file.path(main_dir, gsea_file) - output_dir <- file.path(main_dir, output_dir) - } - - # 1) Read and process GMT - gmt_data <- readLines(gmt_file) %>% - strsplit("\t") %>% - lapply(function(x) data.frame(NAME = x[1], DESCRIPTION = x[2], GENES = paste(x[-c(1,2)], collapse = ","), stringsAsFactors = FALSE)) %>% - dplyr::bind_rows() - - # 2) Read GSEA results and join genes - gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>% - dplyr::left_join(gmt_data %>% dplyr::select(NAME, GENES), by = "NAME") - - # 3) Read ranked genes list - ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE) - ranked_vector <- ranked_df[[1]] - - # 4) Internal helper: extract top-n genes from leading edge - extract_top_n <- function(genes_str, n) { - if (is.na(genes_str) || n <= 0) return(NA_character_) - glist <- unlist(strsplit(genes_str, ",")) - glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)] - paste(utils::head(glist, n), collapse = ",") - } - - # 5) Compute leading edge size and genes - gsea_df <- gsea_df %>% - dplyr::mutate(L.EDGE_size = ifelse(is.na(SIZE * tags), NA, ifelse((SIZE * tags) %% 1 <= 0.5, floor(SIZE * tags), ceiling(SIZE * tags)))) %>% - dplyr::rowwise() %>% dplyr::mutate(LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size)) %>% - dplyr::ungroup() - - # Save intermediate dataframe if requested - if (save_dataframe) { - if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) - intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv") - readr::write_tsv(gsea_df, intermediate_file) - message("Saved data frame to: ", intermediate_file) - } - - # 6) Read metadata and prepare annotation - meta <- openxlsx::read.xlsx(metadata_file) %>% - dplyr::select(tidyselect::all_of(c(sample_col, group_col))) %>% - dplyr::rename(Sample = tidyselect::all_of(sample_col), Group = tidyselect::all_of(group_col)) %>% - as.data.frame() - rownames(meta) <- meta$Sample - - # 7) Read expression data - expr_raw <- utils::read.table(expression_file, header = TRUE, sep = "\t", - stringsAsFactors = FALSE, check.names = FALSE) - # Determine gene-name column - if ("NAME" %in% colnames(expr_raw)) { - rownames(expr_raw) <- expr_raw$NAME - expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE] - } else { - gene_col <- colnames(expr_raw)[1] - rownames(expr_raw) <- expr_raw[[gene_col]] - expr_mat <- expr_raw[, -1, drop = FALSE] - } - # Clean sample names - colnames(expr_mat) <- sub("^X", "", colnames(expr_mat)) - - # Ensure output directory exists - if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) - - # 8) Loop through each gene set and plot heatmap - for (i in seq_len(nrow(gsea_df))) { - geneset_name <- gsea_df$NAME[i] - leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ",")) - genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)] - if (length(genes_present) == 0) next - - heatmap_mat <- expr_mat[genes_present, , drop = FALSE] - common_samps <- intersect(colnames(heatmap_mat), rownames(meta)) - if (length(common_samps) == 0) next - - heatmap_mat <- heatmap_mat[, common_samps, drop = FALSE] - annot_col <- data.frame(Group = meta[common_samps, "Group"]) - rownames(annot_col) <- common_samps - - # Dynamic sizing - w <- 10 - h <- max(5, nrow(heatmap_mat) * 0.1 + 2) - - # PDF output - grDevices::pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")), width = w, height = h) - pheatmap::pheatmap( - heatmap_mat, - main = geneset_name, - color = grDevices::colorRampPalette(c("blue","white","red"))(30), - scale = "row", - clustering_distance_rows = "euclidean", - cluster_cols = FALSE, - clustering_method = "complete", - fontsize_row = 6, - fontsize_col = 7, - annotation_col = annot_col, - border_color = NA, - cellheight = 5, - cellwidth = 8 - ) - grDevices::dev.off() - - # JPG output - grDevices::jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")), - width = w * 100, height = h * 100, res = 150) - pheatmap::pheatmap( - heatmap_mat, - main = geneset_name, - color = grDevices::colorRampPalette(c("blue","white","red"))(30), - scale = "row", - clustering_distance_rows = "euclidean", - cluster_cols = FALSE, - clustering_method = "complete", - fontsize_row = 6, - fontsize_col = 7, - annotation_col = annot_col, - border_color = NA, - cellheight = 5, - cellwidth = 8 - ) - grDevices::dev.off() - } - - message("Heatmaps saved in: ", normalizePath(output_dir)) - return(TRUE) -} diff --git a/R/list_gmts.R b/R/list_gmts.R new file mode 100644 index 0000000..1368364 --- /dev/null +++ b/R/list_gmts.R @@ -0,0 +1,78 @@ +###################### +# Function list_gmts # +###################### + +#' Read GMT files from a directory into a named gene set list +#' +#' Scans a directory for `.gmt` files, parses them, and returns a single named +#' list where each element is a character vector of gene symbols for one gene +#' set. The output is ready to be passed directly to [geneset_similarity()]. +#' +#' **GMT format:** each row contains the gene set name in column 1, an optional +#' description in column 2, and gene symbols from column 3 onward. Empty fields +#' are automatically removed. This is the standard format used by MSigDB +#' (Molecular Signatures Database) and other gene set annotation resources. +#' +#' @param dir Character. Path to the directory containing one or more `.gmt` +#' files. The function searches the directory non-recursively. +#' +#' @return A named list where each element is a character vector of gene symbols +#' for one gene set. Names correspond to gene set names as defined in column 1 +#' of the GMT files. If the same gene set name appears in multiple files, the +#' last occurrence overwrites the earlier one. +#' +#' @examples +#' \dontrun{ +#' # Read all GMT files from a directory +#' geneset_list <- list_gmts("path/to/gmt_files/") +#' +#' # Inspect output +#' length(geneset_list) # number of gene sets +#' names(geneset_list)[1:5] # first five gene set names +#' geneset_list[["KEGG_APOPTOSIS"]] # genes in a specific set +#' +#' # Pass directly to geneset_similarity +#' jac <- geneset_similarity(geneset_list, results_df, fdr_th = 0.05) +#' } +#' +#' @seealso [geneset_similarity()] +#' @export + +list_gmts <- function(dir) { + + if (!is.character(dir) || length(dir) != 1) { + stop("`dir` must be a single character string with the path to a directory.", + call. = FALSE) + } + if (!dir.exists(dir)) { + stop("Directory not found: ", dir, call. = FALSE) + } + + gmt_files <- list.files(dir, pattern = "\\.gmt$", full.names = TRUE) + + if (length(gmt_files) == 0) { + stop("No .gmt files found in: ", dir, call. = FALSE) + } + + geneset_list <- list() + + for (f in gmt_files) { + gmt <- utils::read.delim(f, header = FALSE, stringsAsFactors = FALSE) + for (i in seq_len(nrow(gmt))) { + name <- gmt$V1[i] + genes <- as.character(gmt[i, 3:ncol(gmt)]) + genes <- genes[genes != ""] + geneset_list[[name]] <- genes + } + } + + if (length(geneset_list) == 0) { + stop("No gene sets could be parsed from the .gmt files in: ", dir, + call. = FALSE) + } + + message("Loaded ", length(geneset_list), " gene sets from ", + length(gmt_files), " GMT file(s).") + + return(geneset_list) +} diff --git a/R/merge_GSEA.R b/R/merge_GSEA.R deleted file mode 100644 index 990d6b7..0000000 --- a/R/merge_GSEA.R +++ /dev/null @@ -1,69 +0,0 @@ -####################### -# Function merge_GSEA # -####################### - -#' Merge GSEA results data frames. -#' -#' After running GSEA_all.sh from GSEA.sh, merge_GSEA function joins .tsv files to a single file -#' -#' @param input_directory The directory containing the GSEA collection results in TSV format. -#' @param output_file The output file to save the merged data. If not provided, the file will be saved in the input directory. -#' @importFrom magrittr %>% -#' @export - - -merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_data.tsv") { - - if (!requireNamespace("dplyr", quietly = TRUE)) stop("Package \"dplyr\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("tidyr", quietly = TRUE)) stop("Package \"tidyr\" must be installed to use this function.", call. = FALSE) - - # Validate input directory and check for TSV files - if (!dir.exists(input_directory)) { - stop("The specified directory does not exist: ", input_directory) - } - files <- list.files(path = input_directory, pattern = "\\.tsv$", full.names = TRUE) - if (length(files) == 0) { - stop("No TSV files found in ", input_directory) - } - - # Function to read each file and add a column with the modified file name - read_file <- function(file) { - data <- readr::read_tsv(file) - file_name <- basename(file) - file_name <- sub("_all.tsv$", "", file_name) # Change the pattern if necessary - numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") - data <- data %>% - dplyr::mutate(dplyr::across(tidyselect::all_of(numeric_cols), as.numeric)) - data$COLLECTION <- file_name - return(data) - } - - # Read and combine all TSV files into a single data frame, remove empty columns - gsea_data <- lapply(files, read_file) %>% dplyr::bind_rows() %>% dplyr::select(-`...12`) - - # Find problematic values in numeric columns - gsea_data %>% - dplyr::filter(dplyr::if_any(tidyselect::all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% - print() - - # Data processing: selection, separation, mutation, and renaming of columns - gsea_data <- gsea_data %>% - dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% - tidyr::separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% - dplyr::mutate( - tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), - list = 0.01 * as.numeric(sub("%", "", sub("list=", "", list))), - signal = 0.01 * as.numeric(sub("%", "", sub("signal=", "", signal))), - `FDR q-val` = ifelse(`FDR q-val` == 0, 0.001, `FDR q-val`), - `Log10FDR` = -log10(`FDR q-val`) - ) %>% - dplyr::relocate(`Log10FDR`, .after = `FWER p-val`) %>% - dplyr::rename(COMPARISON = Comparison, FDR = `FDR q-val`) - - # Save the processed data to a TSV file - readr::write_tsv(gsea_data, output_file) - message("GSEA data saved to:", output_file, "\n") - - return(TRUE) -} diff --git a/R/merge_PA.R b/R/merge_PA.R new file mode 100644 index 0000000..b5cb9ca --- /dev/null +++ b/R/merge_PA.R @@ -0,0 +1,205 @@ +####################### +# Function merge_PA # +####################### + +utils::globalVariables(c( + "LEADING EDGE", "tags", "signal", + "FDR q-val", "Log10FDR", "FWER p-val", "Comparison", "...12" +)) + +#' Merge GSEA result files into a single data frame +#' +#' Reads all `.tsv` files produced by `GSEA_merge.sh` (from the GSEA.sh +#' pipeline) from a directory, standardizes numeric columns, parses the +#' leading edge string, computes `-log10(FDR)`, and returns a single merged +#' data frame ready for downstream use with [splot_PA()], [multiplot_PA()] , [getgenesPA()], and +#' [heatmap_PA()]. +#' +#' **Input file format:** Each `.tsv` file corresponds to one MSigDB collection +#' (e.g., `H.tsv`, `C2.tsv`) and must follow the standard GSEA output +#' format with the following columns: +#' * `NAME`: gene set name. +#' * `SIZE`: number of genes in the gene set. +#' * `ES`: enrichment score. +#' * `NES`: normalized enrichment score. +#' * `NOM p-val`: nominal p-value. +#' * `FDR q-val`: false discovery rate. Values of exactly `0` indicate +#' that no permutation produced an equally extreme NES (i.e., the true +#' FDR is below the permutation resolution `1 / n_permutations`). These +#' are replaced by `fdr_replace` to avoid `-Inf` in log-transforms. +#' * `FWER p-val`: family-wise error rate. +#' * `RANK AT MAX`: gene rank at maximum enrichment score. +#' * `LEADING EDGE`: string encoding the leading edge statistics in the +#' format `"tags=XX%, list=XX%, signal=XX%"`. Parsed into three numeric +#' columns: `tags` (fraction of gene set in leading edge), `list` +#' (fraction of ranked list used), and `signal` (enrichment signal +#' strength). +#' * `Comparison`: name of the comparison (e.g., `"TumorVsNormal"`). +#' Renamed to `COMPARISON` in the output. Required for visualization +#' with [splot_PA()] or [multiplot_PA()] . +#' * `GS
follow link to MSigDB` and `GS DETAILS`: removed automatically. +#' +#' **Output columns:** All input columns (minus the two removed above) plus: +#' * `COLLECTION`: name of the MSigDB collection, derived from the file name +#' by removing the `.tsv` suffix. +#' * `tags`, `list`, `signal`: numeric leading edge components (0-1 scale). +#' * `Log10FDR`: `-log10(FDR)` computed after applying `fdr_replace`. +#' * `FDR`: renamed from `FDR q-val`. +#' * `COMPARISON`: renamed from `Comparison`. +#' +#' @param input_directory Character. Path to the directory containing one or +#' more GSEA collection result files in `.tsv` format (e.g., output of +#' `GSEA_merge.sh`). Each file must end in `.tsv`. +#' @param fdr_replace Numeric. Value used to replace `FDR q-val = 0`. This +#' occurs when no permutation produced an NES as extreme as the observed +#' one, meaning the true FDR is below `1 / n_permutations`. With the +#' standard 1,000 permutations, the recommended value is `0.001`. Adjust +#' to `1 / n_permutations` if a different number of permutations was used. +#' Default: `0.001`. +#' +#' @note The input `.tsv` files must contain a `Comparison` column identifying +#' each comparison (e.g., `"TumorVsNormal"`). This column is renamed to +#' `COMPARISON` in the output and is required by [splot_PA()] or [multiplot_PA()] to operate in +#' multi-comparison mode. If your files come from a single comparison and +#' do not have this column, add it manually to each file before merging: +#' `your_data$Comparison <- "YourComparisonName"`. +#' +#' @return A data frame (`gsea_data`) containing all merged and processed GSEA +#' results with standardized column names. +#' +#' @examples +#' \dontrun{ +#' # Merge all GSEA collection TSV files from a directory +#' gsea_data <- merge_PA( +#' input_directory = "path/to/gsea_results/", +#' fdr_replace = 0.001 # standard for 1000 permutations +#' ) +#' +#' # Inspect result +#' head(gsea_data) +#' colnames(gsea_data) +#' +#' # Use directly in downstream functions +#' gsl <- list_gmts("path/to/gmt_folder/") +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' gene_lists <- getgenesPA(gsea_data, gsl, ranked) +#' pa_annot <- addgenesPA(gsea_data, gene_lists) +#' +#' plot_PA(gsea_data, comparison_col = "COMPARISON") +#' } +#' +#' @seealso [splot_PA()] or [multiplot_PA()] for visualization of merged results; +#' [getgenesPA()] and [addgenesPA()] for gene-level annotation; +#' [heatmap_PA()] for leading edge heatmaps; +#' [list_gmts()] to load gene sets. +#' +#' @importFrom magrittr %>% +#' @export + +merge_PA <- function(input_directory, + fdr_replace = 0.001) { + + if (!requireNamespace("readr", quietly = TRUE)) { + stop("Package \"readr\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("tidyr", quietly = TRUE)) { + stop("Package \"tidyr\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("tidyselect", quietly = TRUE)) { + stop("Package \"tidyselect\" must be installed to use this function.", call. = FALSE) + } + + # --- Input validation -- + if (!dir.exists(input_directory)) { + stop("The specified directory does not exist: ", input_directory, call. = FALSE) + } + + files <- list.files(path = input_directory, pattern = "\\.tsv$", + full.names = TRUE) + if (length(files) == 0) { + stop("No TSV files found in: ", input_directory, call. = FALSE) + } + + if (!is.numeric(fdr_replace) || fdr_replace <= 0 || fdr_replace >= 1) { + stop("`fdr_replace` must be a numeric value between 0 and 1.", call. = FALSE) + } + + # --- Numeric columns present in every GSEA output file + numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", + "FDR q-val", "FWER p-val", "RANK AT MAX") + + # --- Read and combine all TSV files + read_one <- function(file) { + data <- readr::read_tsv(file, show_col_types = FALSE) + coll_name <- sub("\\.tsv$", "", basename(file)) + data <- data %>% + dplyr::mutate( + dplyr::across(tidyselect::all_of(numeric_cols), as.numeric) + ) + data$COLLECTION <- coll_name + return(data) + } + + gsea_data <- lapply(files, read_one) %>% + dplyr::bind_rows() + + # Remove empty trailing column if present + if ("...12" %in% colnames(gsea_data)) { + gsea_data <- dplyr::select(gsea_data, -`...12`) + } + + # --- Report any non-numeric values in numeric columns -- + problematic <- gsea_data %>% + dplyr::filter( + dplyr::if_any(tidyselect::all_of(numeric_cols), + ~ !grepl("^-?[0-9.]+([eE][+-]?[0-9]+)?$", + as.character(.))) + ) + if (nrow(problematic) > 0) { + message("Warning: ", nrow(problematic), + " row(s) with non-numeric values in numeric columns:") + print(problematic) + } + + # --- Check Comparison column --- + if (!"Comparison" %in% colnames(gsea_data)) { + stop( + "No 'Comparison' column found in the TSV files. ", + "Add it manually to each file before merging: ", + "your_data$Comparison <- 'YourComparisonName'.", + call. = FALSE + ) + } + + # --- Process and standardize --- + gsea_data <- gsea_data %>% + dplyr::select(-dplyr::any_of(c("GS
follow link to MSigDB", + "GS DETAILS"))) %>% + tidyr::separate( + col = `LEADING EDGE`, + into = c("tags", "list", "signal"), + sep = ",", + remove = FALSE + ) %>% + dplyr::mutate( + tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), + list = 0.01 * as.numeric(sub("%", "", sub("list=", "", list))), + signal = 0.01 * as.numeric(sub("%", "", sub("signal=", "", signal))), + # Replace FDR = 0 to avoid -Inf in log-transforms. + # FDR = 0 means no permutation matched the observed NES, + # so the true FDR < 1/n_permutations (typically 0.001 for 1000 perms). + `FDR q-val` = ifelse(`FDR q-val` == 0, fdr_replace, `FDR q-val`), + Log10FDR = -log10(`FDR q-val`) + ) %>% + dplyr::relocate(Log10FDR, .after = `FWER p-val`) %>% + dplyr::rename(COMPARISON = Comparison, FDR = `FDR q-val`) + + # --- Message and return + message("Merged GSEA data processed: ", + nrow(gsea_data), " gene sets from ", + length(unique(gsea_data$COLLECTION)), " collection(s) and ", + length(unique(gsea_data$COMPARISON)), " comparison(s).") + + return(gsea_data) +} diff --git a/R/nice_KM.R b/R/nice_KM.R index 8ceb8a2..02d32e5 100644 --- a/R/nice_KM.R +++ b/R/nice_KM.R @@ -26,6 +26,18 @@ #' @param returnData Logical. If `TRUE`, returns a list with `km_fit` and `plot`; if `FALSE`, returns only the `ggplot` object. Default `FALSE`. #' @return A `ggplot` object (or a list with `km_fit` and `plot` if `returnData = TRUE`). #' @import ggplot2 +#' +#' @return A ggplot2 object if `returnData = FALSE` (default). If +#' `returnData = TRUE`, a named list with two elements: +#' * `$km_fit`: The [survival::survfit()] object. +#' * `$plot`: The ggplot2 survival curve. +#' +#' @references +#' Kaplan, E. L., & Meier, P. (1958). Nonparametric estimation from +#' incomplete observations. *Journal of the American Statistical +#' Association*, 53(282), 457–481. +#' \doi{10.1080/01621459.1958.10501452} +#' #' @export nice_KM <- function(data, gene, time_var, event_var, coord = NULL, title_prefix = "Mut ", colors = c("#1F8FFF", "#ED4D4D"), diff --git a/R/nice_PCA.R b/R/nice_PCA.R index a0e48d7..400f38f 100644 --- a/R/nice_PCA.R +++ b/R/nice_PCA.R @@ -36,6 +36,45 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @return A ggplot2 object if `returnData = FALSE` (default). If +#' `returnData = TRUE`, a numeric matrix of PCA coordinates with dimensions +#' samples × `outPCs`, with a `percentVar` attribute containing the +#' proportion of variance explained per component. +#' +#' @examples +#' data(vst_counts) +#' data(sampledata) +#' +#' # nice_PCA joins by a column named "id" in annotations +#' sampledata_pca <- sampledata +#' colnames(sampledata_pca)[colnames(sampledata_pca) == "patient_id"] <- "id" +#' +#' nice_PCA( +#' object = vst_counts, +#' annotations = sampledata_pca, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD PCA" +#' ) +#' +#' # Return PCA coordinates instead of plot +#' pca_data <- nice_PCA( +#' object = vst_counts, +#' annotations = sampledata_pca, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' returnData = TRUE +#' ) +#' head(pca_data) +#' +#' @seealso [nice_UMAP()], [nice_tSNE()] for other alternatives; +#' [vst_counts] for the recommended input matrix. +#' #' @export nice_PCA <- function(object, annotations = NULL, PCs = c(1,2), ntop = NULL, @@ -51,7 +90,7 @@ nice_PCA <- function(object, annotations = NULL, PCs = c(1,2), ntop = NULL, object <- as.matrix(object) expr <- if (transform) log2(object + 0.001) else object - + if (scale) { expr <- expr[matrixStats::rowVars(expr) > 0, , drop = FALSE] } diff --git a/R/nice_UMAP.R b/R/nice_UMAP.R index 48bbb2f..2bce45f 100644 --- a/R/nice_UMAP.R +++ b/R/nice_UMAP.R @@ -29,6 +29,42 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @return A ggplot2 object if `returnData = FALSE` (default). If +#' `returnData = TRUE`, a data frame with UMAP coordinates and sample +#' annotations. +#' +#' @examples +#' \dontrun{ +#' data(vst_counts) +#' data(sampledata) +#' +#' sampledata_u <- sampledata +#' colnames(sampledata_u)[colnames(sampledata_u) == "patient_id"] <- "id" +#' +#' nice_UMAP( +#' object = vst_counts, +#' annotations = sampledata_u, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD UMAP", +#' neighbors = 5, +#' epochs = 1000, +#' seed = 1905 +#' ) +#' } +#' +#' @seealso [nice_PCA()], [nice_tSNE()] for alternative dimensionality +#' reduction methods; [vst_counts] for the recommended input matrix. +#' +#' @references +#' McInnes, L., Healy, J., & Melville, J. (2018). Umap: Uniform Manifold +#' Approximation and Projection for Dimension Reduction. +#' *arXiv preprint arXiv:1802.03426*. +#' \url{https://arxiv.org/abs/1802.03426} +#' #' @export nice_UMAP <- function(object, annotations = NULL, neighbors = 5, components = 2, epochs = 10000, seed = 0, diff --git a/R/nice_VSB.R b/R/nice_VSB.R index 97be5c6..fc06fcc 100644 --- a/R/nice_VSB.R +++ b/R/nice_VSB.R @@ -2,16 +2,17 @@ # Function nice_VSB # ##################### -#' Function to make Violin-Scatter-Box plots. +#' Function to make Violin-Scatter-Box plots from data frames. #' #' This function will make a Boxplot, using a DEseq object. #' It will show the data points on top with a small deviation (jitter) for a better visualization. #' -#' @param object A DEseq object already transformed with the variance stabilizing or rlog transformations. +#' @param object A data frame object with normalized counts genes(in rows) across samples(in columns). #' @param annotations Data frame with annotations. #' @param variables To indicate the variables to be used as Shape and Fill of the markers. #' @param genename The gene name to be used for the plot. -#' @param symbol The gene symbol to be used for the plot. +#' @param symbol The gene symbol to display in the plot title. To obtain +#' gene symbols from Ensembl IDs, use [get_annotations()]. #' @param labels A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M"). #' @param categories A vector containing the labels for the legend. Default: c("normal", "primary", "recurrence", "metastasis"). #' @param colors Vector of colors to be used for the categories of the variable assigned as Marker Fill. @@ -25,6 +26,29 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @return A ggplot2 object. +#' +#' @examples +#' data(norm_counts) +#' data(sampledata) +#' +#' nice_VSB( +#' object = norm_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' genename = rownames(norm_counts)[1], +#' categories = c("normal", "tumor"), +#' labels = c("Normal", "Tumor"), +#' colors = c("steelblue", "firebrick"), +#' shapes = 21, +#' markersize = 3 +#' ) +#' +#' @seealso [nice_Volcano()] for genome-wide visualization; [detect_filter()] +#' to identify reliably expressed genes; [get_stars()] to add significance +#' annotations; [norm_counts] for an example input matrix. +#' #' @export nice_VSB <- function (object = NULL, annotations, variables = c(fill = "VarFill", shape = "VarShape"), diff --git a/R/nice_VSB_DEseq2.R b/R/nice_VSB_DEseq2.R new file mode 100644 index 0000000..940aa51 --- /dev/null +++ b/R/nice_VSB_DEseq2.R @@ -0,0 +1,116 @@ +############################## +# Function nice_VSB_DEseq2.R # +############################# + +#' Function to make Box-Scatter-Violin plots from DEseq2 output directly. +#' +#' This function will make a Boxplot, using a DEseq object. +#' It will show the data points on top with a small deviation (jitter) for a better visualization. +#' +#' @param object A DEseq object already transformed with the variance stabilizing or rlog transformations. + +#' @param variables To indicate the variables to be used as Shape and Fill of the markers. +#' @param genename The gene name to be used for the plot. +#' @param symbol The gene symbol to display in the plot title. To obtain +#' gene symbols from Ensembl IDs, use [get_annotations()]. +#' @param labels A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M"). +#' @param categories A vector containing the labels for the legend. Default: c("normal", "primary", "recurrence", "metastasis"). +#' @param colors Vector of colors to be used for the categories of the variable assigned as Marker Fill. +#' @param shapes Vector of shapes to be used for the categories of the variable assigned as Marker Shape. +#' @param markersize Size of the marker. +#' @param alpha Transparency of the marker, which goes from 0 (transparent) to 1 (no transparent). Default: 0.8. +#' @param width Width of the plot. +#' @param height Height of the plot. +#' @param jitter Random deviation added to the dots. Default: 0.2. +#' @param dpi DPI of the plot. Default: 150. +#' @param save To save the plot. Default: FALSE. +#' @param title_size Font of the title and axis names. Default: c(axis = 20, fig = 24). +#' @param label_size Font of the labels (x-axis) and numbers (y-axis). Default: c(x = 20, y = 16). +#' @param legend_size Font of the title and elements of the legend. Default: c(title = 14, elements = 12). +#' @import ggplot2 + +#' @importFrom rlang .data +#' +#' @examples +#' \dontrun{ +#' # requires a DESeq2 object +#' +#' data(sampledata) +#' +#' nice_VSB_DEseq2( +#' object = vst, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' genename = rownames(norm_counts)[1], +#' categories = c("normal", "tumor"), +#' labels = c("Normal", "Tumor"), +#' colors = c("steelblue", "firebrick"), +#' shapes = 21, +#' markersize = 3 +#' ) +#' } +#' @export + +nice_VSB_DEseq2 <- function (object = NULL, variables = c(fill = "VarFill", shape = "VarShape"), + genename = NULL, symbol = NULL, labels = c("N", "P", "R", "M"), + categories = c("normal", "primary", "recurrence", "metastasis"), + colors = NULL, shapes = NULL, markersize = NULL, alpha = 0.8, + width = NULL, height = NULL, jitter = 0.2, dpi = 150, save = FALSE, + title_size = c(axis = 20, fig = 24), label_size = c(x = 20, y = 16), + legend_size = c(title = 14, elements = 12)) { + + if (!requireNamespace("DESeq2", quietly = TRUE)) { + stop( + "Package \"DESeq2\" must be installed to use this function.", + call. = FALSE + ) + } + + # Extracting the vector of counts for that gene + gene_counts <- DESeq2::counts(object, normalized = TRUE)[genename, ] + log2_gc <- log2(gene_counts) + + # Making a dataframe for the plot + df.box <- data.frame(object@colData[, c("id", "sample_type", variables)], log2_gc) + + # Re-ordering sample_type for the plot + df.box[, "sample_type"] <- factor(df.box[, "sample_type"], + levels = categories, + labels = labels) + + # Plot + p.bs <- ggplot(df.box, aes(x = .data$sample_type, y = log2_gc)) + theme_bw() + + geom_violin(alpha = 0.1, scale = "width", fill = "yellow", color = "peru", + show.legend = FALSE, trim = TRUE) + + geom_boxplot(width = 0.6, fill = "gray90") + + labs(title = paste("Gene:", genename, symbol), + x = expression("Sample Type"), + y = expression("log"[2]*"(Normalized Gene Counts)")) + + theme(plot.title = element_text(size = title_size["fig"]), + axis.title = element_text(size = title_size["axis"]), + axis.text.x = element_text(size = label_size["x"]), + axis.text.y = element_text(size = label_size["y"]), + legend.title = element_text(size = legend_size["title"]), + legend.text=element_text(size = legend_size["elements"])) + + if (length(variables) == 1) { + p.bs <- p.bs + geom_point(data = df.box, aes_string(fill = variables["fill"]), shape = 21, + size = markersize, alpha = alpha, color = "black", + position = position_jitter(width = jitter)) + + } else if (length(variables) == 2) { + p.bs <- p.bs + geom_point(data = df.box, aes_string(fill = variables["fill"], shape = variables["shape"]), + size = markersize, alpha = alpha, color = "black", + position = position_jitter(width = jitter)) + + scale_shape_manual(name = variables["shape"], values = shapes) + + } else { return("Up to two variables allowed") } + + p.bs <- p.bs + scale_fill_manual(name = variables[1], values = colors, + guide = guide_legend(override.aes = aes(shape = 21, size = 7))) + + if (save == T) { + ggsave(paste0(symbol,".jpg"), plot = p.bs, width = width, height = height, dpi = dpi) + + } else { return(p.bs) } +} diff --git a/R/nice_Volcano.R b/R/nice_Volcano.R index cd8d4b6..8c09860 100644 --- a/R/nice_Volcano.R +++ b/R/nice_Volcano.R @@ -12,7 +12,9 @@ #' @param results A data frame containing at least one column of effect sizes (e.g. log₂FC) and one column of significance (e.g. FDR). #' @param x_var Name of the column in `results` to plot on the x-axis (e.g. log₂FC). #' @param y_var Name of the column in `results` to plot on the y-axis (e.g. FDR). -#' @param label_var to be defined. +#' @param label_var Name of the column in `results` to use as point labels +#' (e.g. gene IDs or HGNC symbols). To use gene symbols, first run +#' [get_annotations()] and join the `symbol` column to your results table. #' @param legend Logical. Control legend display. Default: TRUE. #' @param title title. #' @param colors colors. @@ -25,6 +27,38 @@ #' @param genes Vector of genes to label in the plot. Default: NULL. #' @import ggplot2 #' @importFrom rlang .data +#' +#' @return A ggplot2 object +#' +#' @examples +#' data(deseq2_results) +#' +#' nice_Volcano( +#' results = deseq2_results, +#' x_var = "log2FoldChange", +#' y_var = "padj", +#' label_var = "gene_id", +#' title = "TCGA-LUAD: Tumor vs Normal", +#' cutoff_y = 0.05, +#' cutoff_x = 1, +#' x_range = 8, +#' y_max = 10 +#' ) +#' +#' # Highlight specific genes +#' nice_Volcano( +#' results = deseq2_results, +#' x_var = "log2FoldChange", +#' y_var = "padj", +#' label_var = "gene_id", +#' title = "TCGA-LUAD: Tumor vs Normal", +#' genes = deseq2_results$gene_id[1:5] +#' ) +#' +#' @seealso [nice_VSB()] for gene-level expression visualization; +#' [detect_filter()] to filter detectable genes before plotting; +#' [deseq2_results] for an example input dataset. +#' #' @export nice_Volcano <- function(results, x_range = 9, y_max = 8, cutoff_y = 0.05, cutoff_x = 1, diff --git a/R/nice_tSNE.R b/R/nice_tSNE.R index a38d27f..598fa54 100644 --- a/R/nice_tSNE.R +++ b/R/nice_tSNE.R @@ -28,6 +28,41 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @return A ggplot2 object if `returnData = FALSE` (default). If +#' `returnData = TRUE`, a data frame with tSNE coordinates and sample +#' annotations. +#' +#' @examples +#' \dontrun{ +#' data(vst_counts) +#' data(sampledata) +#' +#' sampledata_t <- sampledata +#' colnames(sampledata_t)[colnames(sampledata_t) == "patient_id"] <- "id" +#' +#' # perplexity must be < n_samples / 3; with 32 samples use perplexity = 5 +#' nice_tSNE( +#' object = vst_counts, +#' annotations = sampledata_t, +#' perplexity = 5, +#' max_iterations = 1000, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD tSNE", +#' seed = 1905 +#' ) +#' } +#' @seealso [nice_PCA()], [nice_UMAP()] for alternative dimensionality +#' reduction methods; [vst_counts] for the recommended input matrix. +#' +#' @references +#' van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. +#' *Journal of Machine Learning Research*, 9, 2579–2605. +#' \url{https://jmlr.org/papers/v9/vandermaaten08a.html} +#' #' @export nice_tSNE <- function(object, annotations = NULL, perplexity = 3, max_iterations = 10000, seed = 0, diff --git a/R/plot_GSEA.R b/R/plot_GSEA.R deleted file mode 100644 index 3be4b21..0000000 --- a/R/plot_GSEA.R +++ /dev/null @@ -1,117 +0,0 @@ -###################### -# Function plot_GSEA # -###################### - -#' Plot global GSEA results -#' -#' Generates a composite plot displaying NES values, pathway labels, -#' and a \emph{logFDR} legend, organized by MSigDB collections. -#' -#' @param data Data frame containing the GSEA results. -#' @param geneset_col Name of the column containing the genesets. -#' @param collection_col Name of the column containing the collections. -#' @param nes_col Name of the column containing the NES values. -#' @param logfdr_col Name of the column containing \eqn{-\log_{10}(FDR)} values. -#' @param text_size_genesets Text size for the geneset labels. -#' @param text_size_collection Text size for the collection labels. -#' @import ggplot2 -#' @importFrom patchwork plot_layout -#' @return GSEA barplots arranged in a grid. -#' @export - -plot_GSEA <- function(data, geneset_col, collection_col, nes_col, logfdr_col, - text_size_genesets = 5, text_size_collection = 5) -{ - - if (!requireNamespace("patchwork", quietly = TRUE)) stop("Package \"patchwork\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("cowplot", quietly = TRUE)) stop("Package \"cowplot\" must be installed to use this function.", call. = FALSE) - - # Rename columns dynamically - data <- data[, c(geneset_col, collection_col, nes_col, logfdr_col)] - colnames(data) <- c("Geneset", "Collection", "NES", "logFDR") - - # Order data by NES value (descending) - data <- data[order(data$NES, decreasing = TRUE), ] - - # Ensure Geneset and Collection are factors with ordered levels - data$Geneset <- factor(data$Geneset, levels = rev(unique(data$Geneset))) - data$Collection <- factor(data$Collection, levels = unique(data$Collection)) - - # Right-side label: "MSigDB" vertically centered, in bold and italic - plot_text_msigdb <- ggplot() + - annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ - theme_void() - - # Lef-side label: "Pathways" vertically centered, in bold and italic - plot_text_pathways <- ggplot() + - annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ - theme_void() - - # Right panel: Collection labels (without repetition) - plot_right <- ggplot(data, aes(y = Geneset, x = 1.5, label = Collection)) + - geom_text(aes(label = ifelse(duplicated(Collection), "", Collection)), - hjust = 0.5, size = 0, fontface = "bold") + - facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + - theme_void() + - theme(strip.text.y = element_text(angle = 0, hjust = 1, size = text_size_collection), - panel.spacing = grid::unit(1, "lines")) - - # Center panel: NES bar plot - plot_center <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + - geom_col(color = "black", size = 1) + - scale_fill_gradient(low = "white", high = "red", - limits = c(0,3), breaks = seq(0,3,1)) + - scale_y_discrete(position = "right") + - facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + - theme_bw() + - labs(x = "NES", y = "") + - theme(axis.text.y = element_blank(), - strip.background = element_rect(fill = "white", color = "black",linewidth = 1 ), - axis.ticks.y = element_line(color = "black", size = 1.5), - axis.ticks.length = grid::unit(0.3, "cm"), - strip.text.y = element_text(size = 1, margin = margin(0, 0, 0, 0)),# element_blank(), - legend.position = "none", - axis.title.x = element_text(size = 49), - axis.text.x = element_text(size = 45), - panel.spacing = grid::unit(4, "lines") - ) - - # Left panel: Pathays labels - plot_left <- ggplot(data, aes(y = Geneset, x = 0, label = Geneset)) + - geom_text(hjust = 1, size = text_size_genesets) + - theme_void() + - theme(axis.text.y = element_blank(), - plot.margin = margin(0, 0, 0, -50)) - - # Legend panel - plot_legend <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + - geom_tile() + - scale_fill_gradient(low = "white", high = "red", - name = expression(-log[10] ~ FDR), # log10FDR with subscrip, - limits = c(0,3), breaks = seq(0,3,1), - guide = guide_colorbar(ticks.colour = "black", # Make ticks black - ticks.linewidth = 1.5, # Make ticks thicker - draw.ulim = TRUE, # Draw upper limit tick - draw.llim = TRUE)) + # Draw lower limit tick - theme_bw() + - theme(legend.position = "right", - legend.box = "vertical", - legend.title = element_text(size = 44, hjust = 0.5, face = "bold"), # Bigger title - legend.text = element_text(size = 30), # Bigger legend text - legend.key.size = grid::unit(1.5, "cm"), # Bigger color key size - legend.key.height = grid::unit(2, "cm"), # Increase the height of the legend box - legend.spacing = grid::unit(3.5, "cm"), # More space between title and legend - legend.box.margin = margin(10, 20, 10, 10)) # 5, 5, 10, 5)) # Adjust internal spacing - - plot_legend <- plot_legend + theme(legend.box = "vertical") - plot_right_legend <- cowplot::get_legend(plot_legend) - - # Extract legend - #plot_right_legend <- get_legend(plot_legend) - - # Combine all plots - final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + plot_text_msigdb + plot_right_legend + - patchwork::plot_layout(ncol = 6, widths = c(4, 25, 15, 3, 10, 3)) - - return(final_plot) -} diff --git a/R/plot_PA.R b/R/plot_PA.R new file mode 100644 index 0000000..0499106 --- /dev/null +++ b/R/plot_PA.R @@ -0,0 +1,1000 @@ +#################### +# Function splot_PA # +#################### + +utils::globalVariables(c( + "NAME", "GENES", "SIZE", "tags", "L.EDGE_size", + "numeric_cols", "LEADING EDGE", "signal", + "FDR q-val", "Log10FDR", "FWER p-val", "Comparison" +)) + +#' Pathway analysis visualization for a single comparison +#' +#' Generates a publication-quality multi-panel pathway enrichment plot for a +#' single comparison using patchwork. Gene sets appear on the y-axis grouped +#' by MSigDB collection, NES on the x-axis, and -log10(FDR) as fill color. +#' Six panels are assembled side by side: a "Pathways" label, gene set names, +#' the NES bar chart, collection labels, a "MSigDB" label, and the color legend. +#' +#' For visualizing enrichment across multiple comparisons, use +#' [multiplot_PA()] instead. +#' +#' @param data A data frame of pathway analysis results for a single +#' comparison. Typically the output of [merge_PA()] filtered to one value +#' of the `COMPARISON` column, or results from a single CAMERA/GSEA run. +#' Must contain the columns specified by `geneset_col`, `collection_col`, +#' `nes_col`, and `fdr_col`. +#' @param geneset_col Name of the column containing gene set labels shown on +#' the y-axis. Default: `"NAME"`. +#' @param collection_col Name of the column containing MSigDB collection +#' labels used to group gene sets (e.g., `"KEGG"`, `"HALLMARK"`, `"GO"`). +#' Default: `"COLLECTION"`. +#' @param nes_col Name of the column containing NES values (x-axis). +#' Default: `"NES"`. +#' @param fdr_col Name of the column containing FDR values. `-log10(FDR)` is +#' computed internally and used as the fill color. Default: `"FDR"`. +#' @param order One of `"desc"` or `"asc"`. Sort order for NES values on the +#' y-axis. Default: `"desc"`. +#' @param fill_limits Numeric vector of length 2 setting the color scale range +#' for `-log10(FDR)`. Values outside this range are clamped to the nearest +#' limit. For example, `fill_limits = c(0, 5)` maps all gene sets with +#' `-log10(FDR) >= 5` (i.e., FDR <= 0.00001) to the maximum color (red), +#' and any value below 0 to the minimum color (white). Useful when a few +#' gene sets have extreme significance that washes out color variation in the +#' rest. Default: `NULL` (auto uses the actual data range). +#' @param fill_palette Character vector of two colors for the fill gradient +#' (low to high -log10(FDR)). Default: `c("white", "red")`. +#' @param theme_params Named list to override default theme parameters. +#' See Details. +#' +#' @details +#' `theme_params` accepts any of the following named elements: +#' \describe{ +#' \item{`side_label_size`}{Size for "Pathways" and "MSigDB" labels. +#' Default: `35`.} +#' \item{`geneset_text_size`}{Text size for gene set labels. Default: `5`.} +#' \item{`collection_text_size`}{Text size for collection labels. +#' Default: `5`.} +#' \item{`panel_widths`}{Patchwork relative widths for the 6 panels. +#' Default: `c(4, 25, 15, 3, 10, 3)`.} +#' \item{`col_size`}{Border linewidth for `geom_col`. Default: `1`.} +#' \item{`axis_title_size`}{Font size for axis titles. Default: `45`.} +#' \item{`axis_text_size_x`}{Font size for x-axis labels. Default: `30`.} +#' \item{`tick_size`}{Linewidth for axis ticks. Default: `1.5`.} +#' \item{`tick_length`}{Length of axis ticks in cm. Default: `0.3`.} +#' \item{`panel_spacing_single`}{Spacing between facets. Default: `4`.} +#' } +#' +#' @return A `patchwork` object combining six ggplot2 panels. +#' +#' @examples +#' \dontrun{ +#' gsea_results <- merge_PA("path/to/gsea_results/") +#' +#' # Filter to one comparison +#' single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' +#' splot_PA( +#' data = single, +#' geneset_col = "NAME", +#' collection_col = "COLLECTION", +#' nes_col = "NES", +#' fdr_col = "FDR" +#' ) +#' +#' # Cap color scale at -log10(FDR) = 5 so subtle differences are visible +#' # (gene sets with FDR <= 0.00001 all get the same max red color) +#' splot_PA( +#' data = single, +#' geneset_col = "NAME", collection_col = "COLLECTION", +#' nes_col = "NES", fdr_col = "FDR", +#' fill_limits = c(0, 5) +#' ) +#' } +#' +#' @seealso [multiplot_PA()] for multi-comparison faceted barplots; +#' [merge_PA()] to generate the input data frame; +#' [camera_results] for a minimal example dataset. +#' +#' @import ggplot2 +#' @importFrom rlang .data +#' @importFrom patchwork plot_layout +#' @importFrom utils modifyList +#' @export + +splot_PA <- function(data, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + order = "desc", + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list()) { + + if (!requireNamespace("patchwork", quietly = TRUE)) { + stop("Package \"patchwork\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("cowplot", quietly = TRUE)) { + stop("Package \"cowplot\" must be installed to use this function.", call. = FALSE) + } + + if (!is.data.frame(data)) stop("`data` must be a data frame.", call. = FALSE) + + for (col in c(geneset_col, collection_col, nes_col, fdr_col)) { + if (!col %in% colnames(data)) { + stop("Column '", col, "' not found in `data`.", call. = FALSE) + } + } + + order <- match.arg(order, c("desc", "asc")) + + defaults <- list( + side_label_size = 35, + geneset_text_size = 5, + collection_text_size = 5, + panel_widths = c(4, 25, 15, 3, 10, 3), + col_size = 1, + axis_title_size = 45, + axis_text_size_x = 30, + tick_size = 1.5, + tick_length = 0.3, + panel_spacing_single = 4 + ) + params <- utils::modifyList(defaults, theme_params) + + # Always compute -log10(FDR) internally + data$tmp_log10FDR <- -log10(data[[fdr_col]]) + + data <- data[order(data[[nes_col]], decreasing = (order == "desc")), ] + + df <- data[, c(geneset_col, collection_col, nes_col, "tmp_log10FDR")] + colnames(df) <- c("Geneset", "Collection", "NES", "tmp_log10FDR") + df$Geneset <- factor(df$Geneset, levels = rev(unique(df$Geneset))) + df$Collection <- factor(df$Collection, levels = unique(df$Collection)) + + plot_text_pathways <- ggplot() + + annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, + size = params$side_label_size, x = 0, y = 0.5) + + theme_void() + + plot_left <- ggplot(df, aes(y = .data$Geneset, x = 0)) + + geom_text(aes(label = .data$Geneset), hjust = 1, + size = params$geneset_text_size) + + theme_void() + + theme(axis.text.y = element_blank(), plot.margin = margin(0, 0, 0, -50)) + + plot_center <- ggplot(df, aes(x = .data$NES, y = .data$Geneset, + fill = .data$tmp_log10FDR)) + + geom_col(color = "black", linewidth = params$col_size) + + scale_fill_gradient(low = fill_palette[1], high = fill_palette[2], + limits = fill_limits, + breaks = scales::pretty_breaks()) + + scale_y_discrete(position = "right") + + facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + + theme_bw() + labs(x = "NES", y = "") + + theme( + axis.text.y = element_blank(), + strip.background = element_rect(fill = "white", color = "black", linewidth = 1), + axis.ticks.y = element_line(linewidth = params$tick_size), + axis.ticks.length = grid::unit(params$tick_length, "cm"), + strip.text.y = element_text(size = 1), + legend.position = "none", + axis.title.x = element_text(size = params$axis_title_size), + axis.text.x = element_text(size = params$axis_text_size_x), + panel.spacing = grid::unit(params$panel_spacing_single, "lines") + ) + + plot_text_msigdb <- ggplot() + + annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, + size = params$side_label_size, x = 0, y = 0.5) + + theme_void() + + plot_right <- ggplot(df, aes(y = .data$Geneset, x = 1.5)) + + geom_text( + aes(label = ifelse(duplicated(.data$Collection), "", + as.character(.data$Collection))), + hjust = 0.5, size = params$collection_text_size, fontface = "bold" + ) + + facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + + theme_void() + + theme(strip.text.y = element_text(size = params$collection_text_size), + panel.spacing = grid::unit(1, "lines")) + + plot_legend <- ggplot(df, aes(x = .data$NES, y = .data$Geneset, + fill = .data$tmp_log10FDR)) + + geom_tile() + + scale_fill_gradient( + low = fill_palette[1], high = fill_palette[2], + name = expression(-log[10] ~ FDR), limits = fill_limits, + guide = guide_colorbar(ticks.colour = "black", ticks.linewidth = 1.5, + draw.ulim = TRUE, draw.llim = TRUE) + ) + + theme_bw() + + theme( + legend.title = element_text(size = 44, face = "bold"), + legend.text = element_text(size = 30), + legend.key.size = grid::unit(1.5, "cm"), + legend.key.height = grid::unit(2, "cm"), + legend.spacing = grid::unit(3.5, "cm"), + legend.box.margin = margin(10, 20, 10, 10) + ) + + plot_right_legend <- cowplot::get_legend(plot_legend) + + final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + + plot_text_msigdb + plot_right_legend + + patchwork::plot_layout(ncol = 6, widths = params$panel_widths) + + return(final_plot) +} + + +######################## +# Function multiplot_PA # +######################## + +#' Pathway analysis visualization across multiple comparisons +#' +#' Generates a faceted barplot showing NES values across multiple comparisons +#' for a set of gene sets. Each facet represents one gene set and bars +#' represent the NES per comparison, colored by -log10(FDR). This layout makes +#' it easy to compare how enrichment of gene sets changes across conditions +#' (e.g., TumorVsNormal, MetastasisVsNormal). +#' +#' All comparisons must be combined in a single data frame with a column +#' identifying each comparison as produced by [merge_PA()]. +#' +#' For visualizing a single comparison with full collection grouping, use +#' [splot_PA()] instead. +#' +#' @param data A data frame of pathway analysis results containing two or more +#' comparisons. Typically the output of [merge_PA()]. +#' @param comparison_col Name of the column identifying each comparison. +#' Appears on the x-axis of each facet. Default: `"COMPARISON"`. +#' @param facet_col Name of the column used to define facets one facet per +#' unique value. Can be the original gene set name column (e.g., `"NAME"`) +#' or a manually curated column with cleaner or shorter labels +#' (e.g., `"clean_name"`). Default: `"NAME"`. +#' @param axis_y Name of the column to use for the y-axis. Default: `"NES"`. +#' @param fdr_col Name of the column containing FDR values. `-log10(FDR)` is +#' computed internally and used as the fill color. Default: `"FDR"`. +#' @param comparison_order Character vector specifying the left-to-right order +#' of comparisons on the x-axis of each facet. For example, +#' `comparison_order = c("BvsA", "CvsA")` places `BvsA` on the left and +#' `CvsA` on the right. If `NULL` (default), the order follows the factor +#' levels of `comparison_col` as they appear in `data`. +#' @param custom_labels Named character vector of x-axis tick labels. Useful +#' for shortening comparison names on the axis. For example, +#' `custom_labels = c(TumorVsNormal = "Tumor", MetastasisVsNormal = "Mets")`. +#' Default: `NULL`. +#' @param ncol_wrap Integer. Number of columns in `facet_wrap`. Default: `2`. +#' @param free_y Logical. If `TRUE`, each facet uses its own y-axis scale. +#' Default: `TRUE`. +#' @param fill_limits Numeric vector of length 2 setting the color scale range +#' for `-log10(FDR)`. Values outside this range are clamped to the nearest +#' limit. For example, `fill_limits = c(0, 5)` maps all gene sets with +#' `-log10(FDR) >= 5` (FDR <= 0.00001) to maximum red, and any value below +#' 0 to white. Useful when one gene set has extreme significance that makes +#' the rest appear uniform. Default: `NULL` (auto). +#' @param fill_palette Character vector of two colors for the fill gradient +#' (low to high -log10(FDR)). Default: `c("white", "red")`. +#' @param theme_params Named list to override default theme parameters. +#' See Details. +#' +#' @details +#' `theme_params` accepts any of the following named elements: +#' \describe{ +#' \item{`bar_col`}{Bar border color. Default: `"black"`.} +#' \item{`bar_size`}{Bar border linewidth. Default: `0.5`.} +#' \item{`bar_width`}{Bar width. Default: `0.6`.} +#' \item{`hline_size`}{Linewidth for horizontal line at y = 0. Default: `2`.} +#' \item{`axis_title_size`}{Font size for axis titles. Default: `45`.} +#' \item{`axis_text_size_x`}{Font size for x-axis labels. Default: `30`.} +#' \item{`axis_text_size_y`}{Font size for y-axis labels. Default: `50`.} +#' \item{`tick_size`}{Linewidth for axis ticks. Default: `1.5`.} +#' \item{`tick_length`}{Length of axis ticks in cm. Default: `0.3`.} +#' \item{`strip_text_size`}{Font size for facet strip labels. Default: `50`.} +#' \item{`panel_spacing_multi`}{Spacing between facets. Default: `0.6`.} +#' } +#' +#' @return A ggplot2 object. +#' +#' @examples +#' \dontrun{ +#' gsea_results <- merge_PA("path/to/gsea_results/") +#' +#' # Basic multi-comparison plot +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "NAME", +#' fdr_col = "FDR", +#' ncol_wrap = 3 +#' ) +#' +#' # Control left-to-right order of comparisons on the x-axis +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "NAME", +#' fdr_col = "FDR", +#' comparison_order = c("BvsA", "CvsA") # BvsA on the left, CvsA on the right +#' ) +#' +#' # Use cleaner facet labels and shorten x-axis tick names +#' gsea_results$clean_name <- gsub("_", " ", gsea_results$NAME) +#' +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "clean_name", +#' fdr_col = "FDR", +#' comparison_order = c("BvsA", "CvsA"), +#' custom_labels = c(BvsA = "Tumor", CvsA = "Metastasis") +#' ) +#' } +#' +#' @seealso [splot_PA()] for single-comparison patchwork plots; +#' [merge_PA()] to generate the input data frame; +#' [camera_results] for a minimal example dataset. +#' +#' @import ggplot2 +#' @importFrom rlang .data +#' @importFrom utils modifyList +#' @export + +multiplot_PA <- function(data, + comparison_col = "COMPARISON", + facet_col = "NAME", + axis_y = "NES", + fdr_col = "FDR", + comparison_order = NULL, + custom_labels = NULL, + ncol_wrap = 2, + free_y = TRUE, + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list()) { + + if (!is.data.frame(data)) stop("`data` must be a data frame.", call. = FALSE) + + for (col in c(comparison_col, axis_y, fdr_col)) { + if (!col %in% colnames(data)) { + stop("Column '", col, "' not found in `data`.", call. = FALSE) + } + } + + if (!facet_col %in% colnames(data)) { + stop( + "Column '", facet_col, "' not found in `data`. ", + "`facet_col` can be the original gene set name column (e.g., 'NAME') ", + "or a manually curated column with cleaner labels.", + call. = FALSE + ) + } + + defaults <- list( + bar_col = "black", + bar_size = 0.5, + bar_width = 0.6, + hline_size = 2, + axis_title_size = 45, + axis_text_size_x = 30, + axis_text_size_y = 50, + tick_size = 1.5, + tick_length = 0.3, + strip_text_size = 50, + panel_spacing_multi = 0.6 + ) + params <- utils::modifyList(defaults, theme_params) + + # Always compute -log10(FDR) internally + data$tmp_log10FDR <- -log10(data[[fdr_col]]) + + # Apply comparison order if specified + if (!is.null(comparison_order)) { + missing_comps <- setdiff(comparison_order, unique(data[[comparison_col]])) + if (length(missing_comps) > 0) { + warning( + "The following values in `comparison_order` were not found in `", + comparison_col, "`: ", + paste(missing_comps, collapse = ", "), + call. = FALSE + ) + } + data[[comparison_col]] <- factor(data[[comparison_col]], + levels = comparison_order) + } + + p <- ggplot(data, aes(x = .data[[comparison_col]], + y = .data[[axis_y]], + fill = .data$tmp_log10FDR)) + + geom_bar(stat = "identity", + color = params$bar_col, + linewidth = params$bar_size, + width = params$bar_width) + + scale_fill_gradient( + low = fill_palette[1], high = fill_palette[2], + limits = fill_limits, oob = scales::squish, + name = expression(-log[10] ~ FDR), + guide = guide_colorbar(barwidth = 3, barheight = 18) + ) + + labs(x = "Comparisons", y = axis_y) + + theme_bw() + + theme( + axis.line.x = element_blank(), + axis.line = element_line(linewidth = 0.5), + axis.title.x = element_text(size = params$axis_title_size), + axis.title.y = element_text(size = params$axis_title_size), + axis.text.x = element_text(size = params$axis_text_size_x), + axis.text.y = element_text(size = params$axis_text_size_y), + axis.ticks = element_line(linewidth = params$tick_size), + axis.ticks.length = grid::unit(params$tick_length, "cm"), + strip.text = element_text(size = params$strip_text_size), + panel.spacing = grid::unit(params$panel_spacing_multi, "lines") + ) + + geom_hline(yintercept = 0, linewidth = params$hline_size) + + expand_limits(y = 0) + + facet_wrap(~ .data[[facet_col]], ncol = ncol_wrap, + scales = if (free_y) "free_y" else "fixed") + + if (!is.null(custom_labels)) { + p <- p + scale_x_discrete(labels = custom_labels) + } + + return(p) +} + + +######################## +# Function heatmap_PA # +######################## + +#' Plot leading edge heatmaps from pathway analysis results +#' +#' Generates heatmaps of gene expression for each gene set in `pa_data_annot`, +#' using the `all_genes`, `le_genes` (GSEA output only), and/or `top_genes` +#' columns produced by [addgenesPA()]. Genes within each heatmap are ordered +#' by their position in `ranked_genes`. +#' +#' The recommended workflow before calling this function is: +#' ```r +#' gsl <- list_gmts("path/to/gmt/") +#' pa_data <- merge_PA("path/to/pa_data/") +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' gene_lists <- getgenesPA(pa_data, gsl, ranked, genes = c("all", "le")) +#' pa_annot <- addgenesPA(pa_data, gene_lists) +#' +#' heatmap_PA( +#' expression_data = vst_counts, +#' metadata = sampledata, +#' pa_data_annot = pa_annot, +#' ranked_genes = ranked, +#' plot_genes = c("all_genes", "le_genes") +#' ) +#' ``` +#' +#' @param expression_data A numeric matrix or data frame of expression values +#' with gene symbols or Ensembl IDs as row names and sample IDs as column +#' names. Recommended input: VST-transformed counts from [vst_counts] or +#' normalized coutns [norm_counts]. +#' @param metadata A data frame of sample annotations. Must contain a column +#' matching `sample_col` (sample IDs) and a column matching `group_col` +#' (condition labels, e.g., `"Control"`, `"Treatment"`). +#' @param pa_data_annot A data frame of pathway analysis results enriched with +#' gene columns. Must contain the column `NAME` and at least one of +#' `all_genes`, `le_genes`, or `top_genes` (comma-separated gene symbols per +#' gene set). Typically the output of [addgenesPA()]. +#' @param ranked_genes A character vector of gene symbols ordered by their +#' ranking metric (e.g., stat, log2FC or signal-to-noise ratio), used to sort +#' genes within each heatmap row. +#' @param plot_genes Character vector specifying which gene columns to plot. +#' One or both of `"all_genes"` and `"le_genes"`, and `"top_genes"`. Each +#' selection produces its own set of output files in a dedicated subfolder. +#' Default: `c("all_genes", "le_genes")`. +#' @param sample_col Name of the sample ID column in `metadata`. +#' Default: `"Sample"`. +#' @param group_col Name of the condition/group column in `metadata` +#' (e.g., `"Control"` vs `"Treatment"`). Used for heatmap column +#' annotations. Default: `"group"`. +#' @param out_dir Character. Path to the output directory. Subdirectories are +#' created automatically based on `pdf`, `jpg`, and `plot_genes`: +#' * `/pdf/all_genes/` +#' * `/pdf/le_genes/` +#' * `/pdf/top_genes/` +#' * `/jpg/all_genes/` +#' * `/jpg/le_genes/` +#' * `/jpg/top_genes/` +#' Default: `"heatmaps_PA"`. +#' @param pdf Logical. If `TRUE`, saves PDF heatmaps. Default: `TRUE`. +#' @param jpg Logical. If `TRUE`, saves JPG heatmaps. Default: `TRUE`. +#' +#' @return Invisibly returns `TRUE` upon completion. Saves heatmap files to +#' the corresponding subdirectories under `out_dir`. +#' +#' @examples +#' \dontrun{ +#' data(vst_counts) +#' data(sampledata) +#' data(deseq2_results) +#' data(gsea_results) +#' data(geneset_list) +#' +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' +#' # ── Example 1: GSEA results (all_genes + le_genes) ──── +#' pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le")) +#' pa_annot <- addgenesPA(pa_single, gene_lists) +#' +#' heatmap_PA( +#' expression_data = vst_counts, +#' metadata = sampledata, +#' pa_data_annot = pa_annot, +#' ranked_genes = ranked, +#' plot_genes = c("all_genes", "le_genes"), +#' sample_col = "patient_id", +#' group_col = "sample_type", +#' out_dir = "heatmaps_gsea", +#' pdf = TRUE, +#' jpg = TRUE +#' ) +#' # Creates: +#' # heatmaps_gsea/pdf/all_genes/_heatmap.pdf +#' # heatmaps_gsea/pdf/le_genes/_heatmap.pdf +#' # heatmaps_gsea/jpg/all_genes/_heatmap.jpg +#' # heatmaps_gsea/jpg/le_genes/_heatmap.jpg +#' +#' # ── Example 2: CAMERA results (all_genes + top_genes) +#' # camera_results does not contain leading edge information. +#' # Use genes = "top" with a manually set top fraction instead. +#' # Note: top_genes are rank-based and do NOT represent true leading edge genes. +#' data(camera_results) +#' camera_pa <- camera_results +#' colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +#' camera_pa$SIZE <- sapply(camera_pa$NAME, +#' function(x) length(geneset_list[[x]])) +#' camera_pa$top <- 0.25 +#' +#' gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, +#' genes = c("all", "top")) +#' pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +#' +#' heatmap_PA( +#' expression_data = vst_counts, +#' metadata = sampledata, +#' pa_data_annot = pa_annot_cam, +#' ranked_genes = ranked, +#' plot_genes = c("all_genes", "top_genes"), +#' sample_col = "patient_id", +#' group_col = "sample_type", +#' out_dir = "heatmaps_camera" +#' ) +#' } +#' +#' @seealso [getgenesPA()] for gene extraction; +#' [addgenesPA()] to generate `pa_data_annot`; +#' [list_gmts()] to generate the geneset list; +#' [merge_PA()] to generate `pa_data`; +#' [vst_counts] for an example expression matrix. +#' +#' @export + +heatmap_PA <- function(expression_data, + metadata, + pa_data_annot, + ranked_genes, + plot_genes = c("all_genes", "le_genes"), + sample_col = "Sample", + group_col = "group", + out_dir = "heatmaps_PA", + pdf = TRUE, + jpg = TRUE) { + + if (!requireNamespace("pheatmap", quietly = TRUE)) { + stop("Package \"pheatmap\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("grDevices", quietly = TRUE)) { + stop("Package \"grDevices\" must be installed to use this function.", call. = FALSE) + } + + # --- Input validation ---- + if (!is.matrix(expression_data) && !is.data.frame(expression_data)) { + stop("`expression_data` must be a matrix or data frame.", call. = FALSE) + } + if (!is.data.frame(metadata)) { + stop("`metadata` must be a data frame.", call. = FALSE) + } + if (!sample_col %in% colnames(metadata)) { + stop("`sample_col` ('", sample_col, "') not found in `metadata`.", call. = FALSE) + } + if (!group_col %in% colnames(metadata)) { + stop("`group_col` ('", group_col, "') not found in `metadata`.", call. = FALSE) + } + if (!is.data.frame(pa_data_annot)) { + stop("`pa_data_annot` must be a data frame.", call. = FALSE) + } + if (!"NAME" %in% colnames(pa_data_annot)) { + stop("`pa_data_annot` must contain a column named 'NAME'.", call. = FALSE) + } + + plot_genes <- match.arg(plot_genes, + choices = c("all_genes", "le_genes", "top_genes"), + several.ok = TRUE) + + missing_cols <- setdiff(plot_genes, colnames(pa_data_annot)) + if (length(missing_cols) > 0) { + stop( + "Column(s) not found in `pa_data_annot`: ", + paste(missing_cols, collapse = ", "), + ". Run addgenesPA() first.", + call. = FALSE + ) + } + + if (!is.character(ranked_genes) || length(ranked_genes) == 0) { + stop("`ranked_genes` must be a non-empty character vector.", call. = FALSE) + } + + # --- Prepare metadata ---- + meta <- metadata[, c(sample_col, group_col), drop = FALSE] + colnames(meta) <- c("Sample", "Group") + rownames(meta) <- meta$Sample + + expr_mat <- as.matrix(expression_data) + common_samps <- intersect(colnames(expr_mat), rownames(meta)) + + if (length(common_samps) == 0) { + stop( + "No common samples between `expression_data` columns and `metadata` rows. ", + "Check that `sample_col` matches the column names of `expression_data`.", + call. = FALSE + ) + } + + # --- Internal heatmap drawer -- + .draw_heatmap <- function(mat, title) { + annot_col <- data.frame(Group = meta[colnames(mat), "Group"]) + rownames(annot_col) <- colnames(mat) + pheatmap::pheatmap( + mat, + main = title, + color = grDevices::colorRampPalette( + c("blue", "white", "red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + } + + # --- Loop over requested gene column types --- + n_plotted <- 0L + + for (gene_col in plot_genes) { + + if (pdf) { + pdf_sub <- file.path(out_dir, "pdf", gene_col) + if (!dir.exists(pdf_sub)) dir.create(pdf_sub, recursive = TRUE) + } + if (jpg) { + jpg_sub <- file.path(out_dir, "jpg", gene_col) + if (!dir.exists(jpg_sub)) dir.create(jpg_sub, recursive = TRUE) + } + + for (i in seq_len(nrow(pa_data_annot))) { + + gs <- pa_data_annot$NAME[i] + genes_str <- pa_data_annot[[gene_col]][i] + + if (is.na(genes_str) || nchar(genes_str) == 0) next + + genes_vec <- strsplit(genes_str, ",")[[1]] + genes_in_mat <- genes_vec[genes_vec %in% rownames(expr_mat)] + + if (length(genes_in_mat) == 0) next + + gene_ranks <- match(genes_in_mat, ranked_genes) + genes_ordered <- genes_in_mat[order(gene_ranks, na.last = TRUE)] + # Order samples by group so conditions are grouped in the heatmap + sample_groups <- meta[common_samps, "Group"] + samps_ordered <- common_samps[order(sample_groups)] + heatmap_mat <- expr_mat[genes_ordered, samps_ordered, drop = FALSE] + + w <- 10 + h <- max(5, nrow(heatmap_mat) * 0.1 + 2) + + if (pdf) { + grDevices::pdf(file.path(pdf_sub, paste0(gs, "_heatmap.pdf")), + width = w, height = h) + .draw_heatmap(heatmap_mat, gs) + grDevices::dev.off() + } + + if (jpg) { + grDevices::jpeg(file.path(jpg_sub, paste0(gs, "_heatmap.jpg")), + width = w * 100, height = h * 100, res = 150) + .draw_heatmap(heatmap_mat, gs) + grDevices::dev.off() + } + + n_plotted <- n_plotted + 1L + } + } + + message("Done. ", n_plotted, " heatmap(s) saved in: ", + normalizePath(out_dir)) + invisible(TRUE) +} + +############################ +# Function heatmap_path_PA # +############################ + +utils::globalVariables(c( + "NAME", "GENES", "SIZE", "tags", "L.EDGE_size" +)) + +#' Plot leading edge heatmaps from GSEA analysis results using file paths +#' +#' Generates one heatmap per gene set from GSEA/CAMERA/PADOG output by reading +#' all required inputs from file paths. For each gene set, the leading edge +#' genes are extracted, ordered by their rank in the ranked gene list, and +#' plotted as a scaled row heatmap against the expression matrix. +#' +#' This function is the file-path-based alternative to [heatmap_PA()], which +#' accepts R objects directly. Use this version when working from raw output +#' files on disk (e.g., directly after running `GSEA_merge.sh`). +#' +#' @param main_dir Character or `NULL`. Optional base directory prepended to +#' all relative file paths. If `NULL` (default), all paths are used as-is. +#' @param expression_file Character. Path to a tab-delimited expression data +#' file. Rows are genes (first column or a column named `NAME` used as row +#' names), columns are sample IDs. Recommended input: VST-transformed counts. +#' @param metadata_file Character. Path to an Excel (`.xlsx`) metadata file. +#' Must contain a column matching `sample_col` (sample IDs) and a column +#' matching `group_col` (condition labels, e.g., `"Control"`, +#' `"Treatment"`). +#' @param gmt_file Character. Path to a `.gmt` file defining gene sets. Each +#' row contains: gene set name (column 1), description (column 2, ignored), +#' and gene symbols (columns 3+). +#' @param ranked_genes_file Character. Path to a tab-delimited file where the +#' first column contains gene symbols ordered by their ranking metric (e.g., +#' log2FC or signal-to-noise ratio), from most positive to most negative. +#' Used to order leading edge genes within each heatmap. +#' @param gsea_file Character. Path to a GSEA results `.tsv` file containing +#' at least the columns `NAME`, `SIZE`, and `tags` (from the `LEADING EDGE` +#' column parsed by [merge_PA()]). +#' @param output_dir Character. Directory where heatmap files are saved. +#' Created automatically if it does not exist. Default: +#' `"leading_edge_heatmaps"`. +#' @param sample_col Name of the sample ID column in the metadata file. +#' Default: `"Sample"`. +#' @param group_col Name of the condition/group column in the metadata file +#' (e.g., `"Control"` vs `"Treatment"`). Used for heatmap column +#' annotations. Default: `"group"`. +#' @param save_dataframe Logical. If `TRUE`, saves the intermediate data frame +#' (gene sets with computed leading edge genes) as a `.tsv` file in +#' `output_dir` before plotting. Useful for inspection or reuse. +#' Default: `FALSE`. +#' +#' @return Invisibly returns `TRUE` upon completion. Saves two files per gene +#' set in `output_dir`: +#' * `_heatmap.pdf` +#' * `_heatmap.jpg` +#' +#' If `save_dataframe = TRUE`, also saves +#' `/leading_edge_genes_df.tsv`. +#' +#' @note For a more flexible workflow that accepts R objects directly (avoiding +#' repeated file reads), use [heatmap_PA()] instead, which takes +#' `expression_data`, `metadata`, and `pa_data_annot` as R objects and +#' integrates with [getgenesPA()] and [addgenesPA()]. +#' +#' @examples +#' \dontrun{ +#' # Run with all files in a single base directory +#' heatmap_path_PA( +#' main_dir = "path/to/analysis/", +#' expression_file = "vst_expression.tsv", +#' metadata_file = "metadata.xlsx", +#' gmt_file = "genesets.gmt", +#' ranked_genes_file = "ranked_genes.tsv", +#' gsea_file = "gsea_results.tsv", +#' output_dir = "leading_edge_heatmaps", +#' sample_col = "Sample", +#' group_col = "group", +#' save_dataframe = TRUE +#' ) +#' # Saves: +#' # leading_edge_heatmaps/_heatmap.pdf +#' # leading_edge_heatmaps/_heatmap.jpg +#' # leading_edge_heatmaps/leading_edge_genes_df.tsv (if save_dataframe = TRUE) +#' +#' # Run with absolute paths (no main_dir) +#' heatmap_path_PA( +#' expression_file = "/data/vst_counts.tsv", +#' metadata_file = "/data/metadata.xlsx", +#' gmt_file = "/data/h.all.v2023.gmt", +#' ranked_genes_file = "/data/ranked_genes.tsv", +#' gsea_file = "/data/gsea_results.tsv" +#' ) +#' } +#' +#' @seealso [heatmap_PA()] for the R-object-based alternative; +#' [getgenesPA()] and [addgenesPA()] for extracting leading edge genes +#' from R objects; [merge_PA()] to generate the GSEA results input; +#' [list_gmts()] to load GMT files as R objects. +#' +#' @importFrom magrittr %>% +#' @export + +heatmap_path_PA <- function(main_dir = NULL, + expression_file, + metadata_file, + gmt_file, + ranked_genes_file, + gsea_file, + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = FALSE) { + + if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("grDevices", quietly = TRUE)) stop("Package \"grDevices\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("tidyselect", quietly = TRUE)) stop("Package \"tidyselect\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("openxlsx", quietly = TRUE)) stop("Package \"openxlsx\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package \"pheatmap\" must be installed to use this function.", call. = FALSE) + + # Prepend base directory if provided + if (!is.null(main_dir)) { + expression_file <- file.path(main_dir, expression_file) + metadata_file <- file.path(main_dir, metadata_file) + gmt_file <- file.path(main_dir, gmt_file) + ranked_genes_file <- file.path(main_dir, ranked_genes_file) + gsea_file <- file.path(main_dir, gsea_file) + output_dir <- file.path(main_dir, output_dir) + } + + # 1) Read and process GMT + gmt_data <- readLines(gmt_file) %>% + strsplit("\t") %>% + lapply(function(x) data.frame( + NAME = x[1], + GENES = paste(x[-c(1, 2)], collapse = ","), + stringsAsFactors = FALSE + )) %>% + dplyr::bind_rows() + + # 2) Read GSEA results and join genes + gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>% + dplyr::left_join( + dplyr::select(gmt_data, NAME, GENES), + by = "NAME" + ) + + # 3) Read ranked genes list + ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE) + ranked_vector <- ranked_df[[1]] + + # 4) Internal helper: extract top-n genes ordered by rank + extract_top_n <- function(genes_str, n) { + if (is.na(genes_str) || n <= 0) return(NA_character_) + glist <- unlist(strsplit(genes_str, ",")) + glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)] + paste(utils::head(glist, n), collapse = ",") + } + + # 5) Compute leading edge size and extract genes + gsea_df <- gsea_df %>% + dplyr::mutate( + L.EDGE_size = ifelse( + is.na(SIZE * tags), NA, + ifelse((SIZE * tags) %% 1 <= 0.5, + floor(SIZE * tags), + ceiling(SIZE * tags)) + ) + ) %>% + dplyr::rowwise() %>% + dplyr::mutate(LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size)) %>% + dplyr::ungroup() + + # Optionally save intermediate data frame + if (save_dataframe) { + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv") + readr::write_tsv(gsea_df, intermediate_file) + message("Saved intermediate data frame to: ", intermediate_file) + } + + # 6) Read metadata and prepare annotation + meta <- openxlsx::read.xlsx(metadata_file) %>% + dplyr::select(tidyselect::all_of(c(sample_col, group_col))) %>% + dplyr::rename( + Sample = tidyselect::all_of(sample_col), + Group = tidyselect::all_of(group_col) + ) %>% + as.data.frame() + rownames(meta) <- meta$Sample + + # 7) Read expression matrix + expr_raw <- utils::read.table(expression_file, header = TRUE, sep = "\t", + stringsAsFactors = FALSE, check.names = FALSE) + + if ("NAME" %in% colnames(expr_raw)) { + rownames(expr_raw) <- expr_raw$NAME + expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE] + } else { + gene_col <- colnames(expr_raw)[1] + rownames(expr_raw) <- expr_raw[[gene_col]] + expr_mat <- expr_raw[, -1, drop = FALSE] + } + + # Clean sample names (remove leading "X" added by R) + colnames(expr_mat) <- sub("^X", "", colnames(expr_mat)) + + # Ensure output directory exists + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + + # 8) Loop through each gene set and plot heatmap + for (i in seq_len(nrow(gsea_df))) { + + geneset_name <- gsea_df$NAME[i] + leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ",")) + genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)] + + if (length(genes_present) == 0) next + + common_samps <- intersect(colnames(expr_mat), rownames(meta)) + if (length(common_samps) == 0) next + + heatmap_mat <- expr_mat[genes_present, common_samps, drop = FALSE] + annot_col <- data.frame(Group = meta[common_samps, "Group"]) + rownames(annot_col) <- common_samps + + w <- 10 + h <- max(5, nrow(heatmap_mat) * 0.1 + 2) + + .draw <- function() { + pheatmap::pheatmap( + heatmap_mat, + main = geneset_name, + color = grDevices::colorRampPalette(c("blue", "white", "red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + } + + grDevices::pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")), + width = w, height = h) + .draw() + grDevices::dev.off() + + grDevices::jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")), + width = w * 100, height = h * 100, res = 150) + .draw() + grDevices::dev.off() + } + + message("Heatmaps saved in: ", normalizePath(output_dir)) + invisible(TRUE) +} + diff --git a/R/plotclust_PA.R b/R/plotclust_PA.R new file mode 100644 index 0000000..ebd333c --- /dev/null +++ b/R/plotclust_PA.R @@ -0,0 +1,637 @@ +# ============================================================================= +# plotclust_PA.R +# Pathway Analysis — Gene set network visualization. +# +# Functions: +# network_clust — Network plots using igraph base R graphics +# network_clust_gg — Network plots using ggraph / ggplot2 (returns objects) +# ============================================================================= + +######################## +# Function network_clust # +######################## + +#' Gene set network clustering with igraph base R graphics +#' +#' Builds a weighted gene set network from a Jaccard similarity matrix, applies +#' Louvain community detection, and draws up to four network visualizations +#' using base R igraph graphics directly to the active graphics device. +#' Optionally overlays super-term community labels generated by +#' [get_superterm()]. +#' +#' For a ggplot2-based version that returns plot objects instead of drawing +#' them, see [network_clust_gg()]. +#' +#' @param x A `JaccardResult` object (output of [geneset_similarity()]). +#' @param clust_result A list returned by [do_clust()], used to color nodes by +#' hierarchical cluster assignment. +#' @param jaccard_threshold Numeric. Minimum Jaccard similarity required for an +#' edge to be included in the network. Default: `0.50`. +#' @param min_degree Integer. Minimum node degree for a node to be retained in +#' the network. Default: `2`. +#' @param superterms Logical. If `TRUE`, overlays super-term labels at community +#' centroids on the relevant plots. Requires `superterm_data`. Default: +#' `FALSE`. +#' @param superterm_data A list returned by [get_superterm()]. Required when +#' `superterms = TRUE`. +#' @param type Character. Which plot(s) to draw. One of: +#' * `"clean"` : colored nodes, no labels. +#' * `"superterms"` : colored nodes with community super-term labels. +#' * `"combined"` : colored nodes with super-term labels and individual node +#' labels. +#' * `"individual"` : colored nodes with individual node labels only. +#' * `"all"` : all four plots drawn sequentially. Default. +#' @param seed Integer. Random seed for the Fruchterman-Reingold layout and +#' Louvain community detection. Default: `174`. +#' +#' @return Invisibly, a named list with: +#' * `$graph`: The filtered `igraph` graph object (`g_clean`). +#' * `$layout`: Numeric matrix with the normalized Fruchterman-Reingold node +#' coordinates. +#' * `$node_attributes`: A [tibble::tibble()] with columns `NAME`, `cluster`, +#' `community`, `degree`, `betweenness`, and `closeness` (plus `superterm` +#' if `superterms = TRUE`). +#' * `$superterm_report`: A [tibble::tibble()] with columns `community`, +#' `superterm`, `n_genesets`, and `geneset_members`. Only present if +#' `superterms = TRUE`. +#' +#' @examples +#' \dontrun{ +#' # Requires igraph +#' gsl <- list_gmts("path/to/gmt_folder/") +#' res <- read.csv("path/to/results.csv") +#' +#' jac <- geneset_similarity(gsl, res, fdr_th = 0.05) +#' clust <- do_clust(jac) +#' net <- get_network_communities(jac, threshold = 0.3) +#' +#' # Draw all four plots directly to the active graphics device +#' result <- network_clust( +#' jac, +#' clust_result = clust, +#' superterms = TRUE, +#' superterm_data = net$superterms, +#' type = "all", +#' seed = 174 +#' ) +#' +#' # Draw only the clean version (no labels) +#' network_clust(jac, clust, type = "clean") +#' +#' # Access node-level metrics from the invisible return +#' result$node_attributes +#' result$superterm_report +#' +#' # Save a specific plot to PDF +#' pdf("network_superterms.pdf", width = 14, height = 14) +#' network_clust(jac, clust, superterms = TRUE, +#' superterm_data = net$superterms, type = "superterms") +#' dev.off() +#' } +#' +#' @seealso [geneset_similarity()], [do_clust()], [get_network_communities()], +#' [get_superterm()], [network_clust_gg()] +#' @importFrom magrittr %>% +#' @importFrom rlang .data +#' @export + +network_clust <- function(x, clust_result, + jaccard_threshold = 0.50, + min_degree = 2, + superterms = FALSE, + superterm_data = NULL, + type = "all", + seed = 174) { + + if (!requireNamespace("igraph", quietly = TRUE)) { + stop("Package \"igraph\" must be installed to use this function.", call. = FALSE) + } + + # --- Input validation --------------------------------------------------- + if (!inherits(x, "JaccardResult")) { + stop( + "`x` must be a `JaccardResult` object (output of `geneset_similarity()`).", + call. = FALSE + ) + } + if (!is.list(clust_result) || + !all(c("cluster_assignments", "hclust") %in% names(clust_result))) { + stop("`clust_result` must be the output of `do_clust()`.", call. = FALSE) + } + + type <- match.arg(type, c("clean", "superterms", "combined", "individual", "all")) + + if (superterms && is.null(superterm_data)) { + warning( + "`superterms = TRUE` but no `superterm_data` was provided. ", + "Run `get_superterm()` first and pass its output to `superterm_data`. ", + "Falling back to `superterms = FALSE`.", + call. = FALSE + ) + superterms <- FALSE + } + + if (superterms && type %in% c("clean", "individual")) { + warning( + "Super-term labels are not displayed for `type = \"", type, "\"`. ", + "Use `type = \"superterms\"`, `\"combined\"`, or `\"all\"` to show them.", + call. = FALSE + ) + } + + # --- Build graph -------------------------------------------------------- + adj_mat <- x$jaccard_sim + adj_mat[adj_mat < jaccard_threshold] <- 0 + diag(adj_mat) <- 0 + + g <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "undirected", weighted = TRUE) + g_clean <- igraph::induced_subgraph(g, igraph::V(g)[igraph::degree(g) > min_degree]) + + if (igraph::vcount(g_clean) == 0) { + stop( + "No nodes remain after filtering. ", + "Try lowering `jaccard_threshold` or `min_degree`.", + call. = FALSE + ) + } + + node_names <- igraph::V(g_clean)$name + + # --- Community detection ------------------------------------------------ + set.seed(seed) + communities <- igraph::cluster_louvain(g_clean) + community_membership <- igraph::membership(communities) + + # --- Layout ------------------------------------------------------------- + set.seed(seed) + layout_norm <- igraph::norm_coords( + igraph::layout_with_fr(g_clean), + xmin = -1, xmax = 1, ymin = -1, ymax = 1 + ) + + # --- Node colors (from hierarchical clusters) --------------------------- + node_clusters <- clust_result$cluster_assignments$cluster[ + match(node_names, clust_result$cluster_assignments$NAME) + ] + n_clusters <- length(unique(stats::na.omit(node_clusters))) + cluster_pal <- grDevices::rainbow(n_clusters, s = 0.7, v = 0.9) + node_colors <- cluster_pal[as.factor(node_clusters)] + + # --- Super-term setup --------------------------------------------------- + if (superterms) { + st_mapping <- superterm_data$mapping + node_superterms <- st_mapping$superterm[match(node_names, st_mapping$geneset)] + + unique_comms <- sort(unique(community_membership)) + + community_centroids <- do.call(rbind, lapply(unique_comms, function(cid) { + idx <- which(community_membership == cid) + if (length(idx) == 1L) layout_norm[idx, ] + else colMeans(layout_norm[idx, , drop = FALSE]) + })) + + plot_labels <- vapply(unique_comms, function(cid) { + lbl <- superterm_data$summary$superterm[superterm_data$summary$community == cid] + if (length(lbl) == 0) NA_character_ else lbl[[1]] + }, character(1)) + } + + # --- Internal plot helper ----------------------------------------------- + .draw_network <- function(show_labels = FALSE, + label_cex = 0.5, + superterm_overlay = FALSE, + main_title = "Gene Set Network") { + + graphics::par(mar = c(1, 1, 3, 1)) + + plot( + g_clean, + layout = layout_norm, + vertex.size = if (show_labels) 5 else 6, + vertex.color = node_colors, + vertex.label = if (show_labels) node_names else NA, + vertex.label.cex = label_cex, + vertex.label.color = "gray20", + vertex.label.dist = 0.3, + vertex.frame.color = "gray30", + vertex.frame.width = 0.5, + edge.width = igraph::E(g_clean)$weight * 3, + edge.color = grDevices::rgb(0.5, 0.5, 0.5, 0.3), + main = main_title, + rescale = FALSE, + xlim = c(-1.4, 1.4), + ylim = c(-1.4, 1.4), + asp = 0 + ) + + if (superterm_overlay && superterms) { + for (i in seq_along(unique_comms)) { + lbl <- plot_labels[[i]] + if (is.na(lbl)) next + cx <- community_centroids[i, 1] + cy <- community_centroids[i, 2] + sw <- graphics::strwidth(lbl, cex = 1.2) * 1.3 + sh <- graphics::strheight(lbl, cex = 1.2) * 2.5 + graphics::rect( + cx - sw / 2, cy - sh / 2, cx + sw / 2, cy + sh / 2, + col = grDevices::rgb(1, 1, 1, 0.9), + border = "gray40", + lwd = 1.5 + ) + } + graphics::text( + x = community_centroids[, 1], + y = community_centroids[, 2], + labels = plot_labels, + cex = 1.2, + font = 2, + col = "black" + ) + } + } + + # --- Draw requested plots ----------------------------------------------- + if (type %in% c("clean", "all")) + .draw_network(show_labels = FALSE, superterm_overlay = FALSE, + main_title = "Gene Set Network - Clean") + + if (type %in% c("superterms", "all")) + .draw_network(show_labels = FALSE, superterm_overlay = TRUE, + main_title = "Gene Set Network - Super-terms") + + if (type %in% c("combined", "all")) + .draw_network(show_labels = TRUE, label_cex = 0.4, superterm_overlay = TRUE, + main_title = "Gene Set Network - Super-terms + Individual Labels") + + if (type %in% c("individual", "all")) + .draw_network(show_labels = TRUE, label_cex = 0.6, superterm_overlay = FALSE, + main_title = "Gene Set Network - Individual Labels") + + # --- Node attributes ---------------------------------------------------- + node_attributes <- tibble::tibble( + NAME = node_names, + cluster = node_clusters, + community = as.integer(community_membership), + degree = igraph::degree(g_clean), + betweenness = igraph::betweenness(g_clean), + closeness = igraph::closeness(g_clean) + ) + + if (superterms) node_attributes$superterm <- node_superterms + + # --- Build return object ------------------------------------------------ + result <- list( + graph = g_clean, + layout = layout_norm, + node_attributes = node_attributes + ) + + if (superterms) { + result$superterm_report <- superterm_data$mapping %>% + dplyr::group_by(.data[["community"]], .data[["superterm"]]) %>% + dplyr::summarise( + n_genesets = dplyr::n(), + geneset_members = paste(.data[["geneset"]], collapse = " | "), + .groups = "drop" + ) %>% + dplyr::arrange(dplyr::desc(.data[["n_genesets"]])) + } + + invisible(result) +} +########################## +# Function network_clust_gg # +########################## + +#' Gene set network clustering with ggraph graphics +#' +#' Builds a weighted gene set network from a Jaccard similarity matrix, applies +#' Louvain community detection, and returns up to four ggplot2 network +#' visualizations using ggraph. Each plot is a standard ggplot2 object that +#' can be further customized, saved with [ggplot2::ggsave()], or combined +#' with patchwork. +#' +#' Optionally overlays super-term community labels generated by +#' [get_superterm()]. +#' +#' For a base R igraph version that draws directly to the active graphics +#' device, see [network_clust()]. +#' +#' @param x A `JaccardResult` object (output of [geneset_similarity()]). +#' @param clust_result A list returned by [do_clust()], used to color nodes by +#' hierarchical cluster assignment. +#' @param jaccard_threshold Numeric. Minimum Jaccard similarity required for an +#' edge to be included in the network. Default: `0.50`. +#' @param min_degree Integer. Minimum node degree for a node to be retained in +#' the network. Default: `2`. +#' @param superterms Logical. If `TRUE`, overlays super-term labels at community +#' centroids on the relevant plots. Requires `superterm_data`. Default: +#' `FALSE`. +#' @param superterm_data A list returned by [get_network_communities()] (slot +#' `$superterms`) or directly by [get_superterm()]. Required when +#' `superterms = TRUE`. +#' @param type Character. Which plot(s) to return. One of: +#' * `"clean"` : colored nodes, no labels. +#' * `"superterms"` : colored nodes with community super-term labels. +#' * `"combined"` : colored nodes with super-term labels and individual node +#' labels. +#' * `"individual"` : colored nodes with individual node labels only. +#' * `"all"` : all four plots. Default. +#' @param seed Integer. Random seed for the Fruchterman-Reingold layout and +#' Louvain community detection. Default: `174`. +#' +#' @return A named list. Depending on `type`, the list may contain any +#' combination of the ggplot2 elements `$clean`, `$superterms`, `$combined`, +#' and `$individual`. The list always contains: +#' * `$node_attributes`: A [tibble::tibble()] with columns `NAME`, `cluster`, +#' `community`, `degree`, `betweenness`, and `closeness` (plus `superterm` +#' if `superterms = TRUE`). +#' * `$superterm_report`: A [tibble::tibble()] with columns `community`, +#' `superterm`, `n_genesets`, and `geneset_members`. Only present if +#' `superterms = TRUE`. +#' +#' @examples +#' \dontrun{ +#' # Requires igraph, ggraph, tidygraph +#' gsl <- list_gmts("path/to/gmt_folder/") +#' res <- read.csv("path/to/results.csv") +#' +#' jac <- geneset_similarity(gsl, res, fdr_th = 0.05) +#' clust <- do_clust(jac) +#' net <- get_network_communities(jac, threshold = 0.3) +#' +#' # Return all four plots as ggplot2 objects +#' plots <- network_clust_gg( +#' jac, +#' clust_result = clust, +#' superterms = TRUE, +#' superterm_data = net$superterms, +#' type = "all", +#' seed = 174 +#' ) +#' +#' # Display individual plots +#' plots$clean # no labels +#' plots$superterms # community labels only +#' plots$combined # community + node labels +#' plots$individual # node labels only +#' +#' # Save with ggsave (full ggplot2 objects) +#' ggplot2::ggsave("network_combined.pdf", plots$combined, +#' width = 14, height = 14) +#' +#' # Access node-level metrics +#' plots$node_attributes +#' plots$superterm_report +#' +#' # Combine two plots with patchwork +#' library(patchwork) +#' plots$clean + plots$superterms +#' } +#' +#' @seealso [geneset_similarity()], [do_clust()], [get_network_communities()], +#' [get_superterm()], [network_clust()] +#' @import ggplot2 +#' @importFrom magrittr %>% +#' @importFrom rlang .data +#' @export + +network_clust_gg <- function(x, clust_result, + jaccard_threshold = 0.50, + min_degree = 2, + superterms = FALSE, + superterm_data = NULL, + type = "all", + seed = 174) { + + if (!requireNamespace("igraph", quietly = TRUE)) { + stop("Package \"igraph\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("ggraph", quietly = TRUE)) { + stop("Package \"ggraph\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("tidygraph", quietly = TRUE)) { + stop("Package \"tidygraph\" must be installed to use this function.", call. = FALSE) + } + + # --- Input validation --------------------------------------------------- + if (!inherits(x, "JaccardResult")) { + stop( + "`x` must be a `JaccardResult` object (output of `geneset_similarity()`).", + call. = FALSE + ) + } + if (!is.list(clust_result) || + !all(c("cluster_assignments", "hclust") %in% names(clust_result))) { + stop("`clust_result` must be the output of `do_clust()`.", call. = FALSE) + } + + type <- match.arg(type, c("clean", "superterms", "combined", "individual", "all")) + + if (superterms && is.null(superterm_data)) { + warning( + "`superterms = TRUE` but no `superterm_data` was provided. ", + "Run `get_superterm()` first and pass its output to `superterm_data`. ", + "Falling back to `superterms = FALSE`.", + call. = FALSE + ) + superterms <- FALSE + } + + if (superterms && type %in% c("clean", "individual")) { + warning( + "Super-term labels are not displayed for `type = \"", type, "\"`. ", + "Use `type = \"superterms\"`, `\"combined\"`, or `\"all\"` to show them.", + call. = FALSE + ) + } + + # --- Build graph -------------------------------------------------------- + adj_mat <- x$jaccard_sim + adj_mat[adj_mat < jaccard_threshold] <- 0 + diag(adj_mat) <- 0 + + g <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "undirected", weighted = TRUE) + g_clean <- igraph::induced_subgraph(g, igraph::V(g)[igraph::degree(g) > min_degree]) + + if (igraph::vcount(g_clean) == 0) { + stop( + "No nodes remain after filtering. ", + "Try lowering `jaccard_threshold` or `min_degree`.", + call. = FALSE + ) + } + + node_names <- igraph::V(g_clean)$name + + # --- Community detection ------------------------------------------------ + set.seed(seed) + communities <- igraph::cluster_louvain(g_clean) + community_membership <- igraph::membership(communities) + + # --- Node colors -------------------------------------------------------- + node_clusters <- clust_result$cluster_assignments$cluster[ + match(node_names, clust_result$cluster_assignments$NAME) + ] + n_clusters <- length(unique(stats::na.omit(node_clusters))) + cluster_pal <- grDevices::rainbow(n_clusters, s = 0.7, v = 0.9) + + # --- Build tbl_graph with node metadata --------------------------------- + tg <- tidygraph::as_tbl_graph(g_clean) %>% + tidygraph::activate("nodes") %>% + dplyr::mutate( + cluster = factor(node_clusters), + community = as.integer(community_membership) + ) + + if (igraph::ecount(g_clean) > 0) { + tg <- tg %>% + tidygraph::activate("edges") %>% + dplyr::mutate(weight = igraph::E(g_clean)$weight) + } + + # --- Compute shared layout (FR) ----------------------------------------- + set.seed(seed) + layout_obj <- ggraph::create_layout(tg, layout = "fr") + + # --- Community centroids for super-term labels -------------------------- + centroid_data <- NULL + node_superterms <- NULL + + if (superterms) { + st_mapping <- superterm_data$mapping + node_superterms <- st_mapping$superterm[match(node_names, st_mapping$geneset)] + + centroid_data <- layout_obj %>% + dplyr::mutate(community = as.integer(community_membership)) %>% + dplyr::group_by(.data[["community"]]) %>% + dplyr::summarise( + x = mean(.data[["x"]]), + y = mean(.data[["y"]]), + .groups = "drop" + ) %>% + dplyr::left_join( + superterm_data$summary %>% + dplyr::select(.data[["community"]], .data[["superterm"]]), + by = "community" + ) + } + + # --- Internal ggraph plot builder --------------------------------------- + .build_gg <- function(show_node_labels = FALSE, + label_size = 3, + show_superterms = FALSE, + plot_title = "Gene Set Network") { + + has_edges <- igraph::ecount(g_clean) > 0 + + p <- ggraph::ggraph(layout_obj) + + {if (has_edges) + ggraph::geom_edge_link( + ggplot2::aes(width = .data[["weight"]], alpha = .data[["weight"]]), + color = "gray50", + show.legend = FALSE + )} + + ggraph::geom_node_point( + ggplot2::aes(fill = .data[["cluster"]], color = .data[["cluster"]]), + size = 5, + shape = 21, + stroke = 0.5, + show.legend = FALSE + ) + + ggraph::scale_edge_width(range = c(0.3, 2)) + + ggraph::scale_edge_alpha(range = c(0.1, 0.5)) + + ggplot2::scale_fill_manual(values = cluster_pal, na.value = "gray80") + + ggplot2::scale_color_manual(values = cluster_pal, na.value = "gray80") + + ggplot2::labs(title = plot_title) + + ggplot2::theme_void() + + ggplot2::theme( + plot.title = ggplot2::element_text(size = 16, hjust = 0.5, face = "bold"), + plot.margin = ggplot2::margin(10, 10, 10, 10) + ) + + if (show_node_labels) { + p <- p + ggraph::geom_node_text( + ggplot2::aes(label = .data[["name"]]), + size = label_size, + repel = TRUE, + color = "gray20" + ) + } + + if (show_superterms && superterms && !is.null(centroid_data)) { + p <- p + + ggplot2::geom_label( + data = centroid_data, + ggplot2::aes( + x = .data[["x"]], + y = .data[["y"]], + label = .data[["superterm"]] + ), + size = 4, + fontface = "bold", + fill = grDevices::rgb(1, 1, 1, 0.9), + color = "black", + label.size = 0.4, + inherit.aes = FALSE + ) + } + + return(p) + } + + # --- Build requested plots ---------------------------------------------- + plots <- list() + + if (type %in% c("clean", "all")) + plots$clean <- .build_gg( + show_node_labels = FALSE, show_superterms = FALSE, + plot_title = "Gene Set Network - Clean" + ) + + if (type %in% c("superterms", "all")) + plots$superterms <- .build_gg( + show_node_labels = FALSE, show_superterms = TRUE, + plot_title = "Gene Set Network - Super-terms" + ) + + if (type %in% c("combined", "all")) + plots$combined <- .build_gg( + show_node_labels = TRUE, label_size = 2.5, show_superterms = TRUE, + plot_title = "Gene Set Network - Super-terms + Individual Labels" + ) + + if (type %in% c("individual", "all")) + plots$individual <- .build_gg( + show_node_labels = TRUE, label_size = 3, show_superterms = FALSE, + plot_title = "Gene Set Network - Individual Labels" + ) + + # --- Node attributes ---------------------------------------------------- + node_attributes <- tibble::tibble( + NAME = node_names, + cluster = node_clusters, + community = as.integer(community_membership), + degree = igraph::degree(g_clean), + betweenness = igraph::betweenness(g_clean), + closeness = igraph::closeness(g_clean) + ) + + if (superterms) node_attributes$superterm <- node_superterms + + plots$node_attributes <- node_attributes + + # --- Superterm report --------------------------------------------------- + if (superterms) { + plots$superterm_report <- superterm_data$mapping %>% + dplyr::group_by(.data[["community"]], .data[["superterm"]]) %>% + dplyr::summarise( + n_genesets = dplyr::n(), + geneset_members = paste(.data[["geneset"]], collapse = " | "), + .groups = "drop" + ) %>% + dplyr::arrange(dplyr::desc(.data[["n_genesets"]])) + } + + return(plots) +} diff --git a/R/power_analysis.R b/R/power_analysis.R index 33f033d..ca594cb 100644 --- a/R/power_analysis.R +++ b/R/power_analysis.R @@ -11,9 +11,42 @@ #' @param alpha Numeric. Desired FDR (type I error rate). #' @param power_target Numeric. Desired statistical power (1 – β). #' @param max_n Integer. Maximum sample size per group to explore. -#' @param plot Logical. If TRUE, draws the power curve; if FALSE, skips plotting. +#' @param plot Logical. If TRUE, draws the power curve; if FALSE, skips plotting. #' @import ggplot2 #' @importFrom rlang .data +#' +#' @return A named list. If `plot = TRUE`, contains three elements: +#' * `$min_sample_size`: Integer. Minimum sample size per group to reach +#' `power_target`. +#' * `$power_table`: A data frame with columns `SampleSize` and `Power`. +#' * `$plot`: A ggplot2 object of the power curve. +#' +#' If `plot = FALSE`, returns only `$min_sample_size` and `$power_table`. +#' +#' @examples +#' # Basic power analysis with default parameters +#' result <- power_analysis( +#' effect_size = 1, +#' dispersion = 0.1, +#' n_genes = 20000, +#' prop_de = 0.05, +#' alpha = 0.05, +#' power_target = 0.8, +#' max_n = 20 +#' ) +#' +#' # Minimum sample size to reach 80% power +#' result$min_sample_size +#' +#' # Full power table +#' head(result$power_table) +#' +#' # Higher effect size requires fewer samples +#' power_analysis(effect_size = 2, dispersion = 0.1, plot = FALSE)$min_sample_size +#' +#' # See plot +#' #power_analysis$plot +#' #' @export power_analysis <- function( @@ -55,8 +88,7 @@ power_analysis <- function( min_n_required <- min(df$SampleSize[df$Power >= power_target], na.rm = TRUE) # Plot - if (plot) { - + if (plot == TRUE) { p <- ggplot(df, aes(x = .data[["SampleSize"]], y = .data[["Power"]])) + geom_line(linewidth = 1.2, color = "#2c3e50") + geom_hline(yintercept = power_target, linetype = "dashed", color = "red") + @@ -69,8 +101,12 @@ power_analysis <- function( x = "Sample Size per Group", y = "Statistical Power") + theme_bw(base_size = 14) - print(p) - } + return(list(min_sample_size = min_n_required, + power_table = df, + plot = p)) + }else{ - return(list(min_sample_size = min_n_required, power_table = df)) + return(list(min_sample_size = min_n_required, + power_table = df)) + } } diff --git a/R/save_results.R b/R/save_results.R index e7dd63d..9c02721 100644 --- a/R/save_results.R +++ b/R/save_results.R @@ -15,6 +15,34 @@ #' @param l2fc The cut-off of Log2(Fold Change) for the over- and under-expressed tables. Default = 0. #' @param cutoff_alpha The cut-off of the False Discovery Rate (FDR o padj). Default = 0.25. #' @importFrom rlang .data +#' +#' @return Invisibly returns `NULL`. Saves three `.xlsx` files to the working +#' directory: +#' * `_full.xlsx` : all genes. +#' * `_up_log2FC>_FDR<.xlsx` : upregulated genes. +#' * `_down_log2FC<_FDR<.xlsx` : downregulated genes. +#' +#' @examples +#' \dontrun{ +#' data(deseq2_results) +#' +#' # Save full results + over/under-expressed tables as .xlsx files +#' save_results( +#' df = deseq2_results, +#' name = "TCGA_LUAD_TumorVsNormal", +#' l2fc = 1, +#' cutoff_alpha = 0.05 +#' ) +#' +#' # Creates: +#' # TCGA_LUAD_TumorVsNormal_full.xlsx +#' # TCGA_LUAD_TumorVsNormal_up_log2FC>1_FDR<0.05.xlsx +#' # TCGA_LUAD_TumorVsNormal_down_log2FC<1_FDR<0.05.xlsx +#' } +#' +#' @seealso [detect_filter()] to further filter saved results; +#' [deseq2_results] for an example input. +#' #' @export save_results <- function(df, name, l2fc = 0, cutoff_alpha = 0.25){ @@ -23,7 +51,7 @@ save_results <- function(df, name, l2fc = 0, cutoff_alpha = 0.25){ stop( "Package \"openxlsx\" must be installed to use this function.", call. = FALSE - ) + ) } names(df)[names(df) == "padj"] <- "FDR" @@ -33,18 +61,14 @@ save_results <- function(df, name, l2fc = 0, cutoff_alpha = 0.25){ file = paste0(name, "_full.xlsx"), overwrite = T) #Saving over-expressed genes: - #df.sig.fold_over <- subset(df, ((FDR < cutoff_alpha) & !is.na(FDR)) & - # log2FoldChange >= l2fc) df.sig.fold_over <- df[df$FDR < cutoff_alpha & !is.na(df$FDR) & df$log2FoldChange >= l2fc, ] openxlsx::write.xlsx(df.sig.fold_over, colNames = T, rowNames = F, append = F, - file = paste0(name, "_Overexp.xlsx"), overwrite = T) + file = paste0(name, "_up_log2FC>",l2fc,"_FDR<", cutoff_alpha, ".xlsx"), overwrite = T) #Saving under-expressed genes: - #df.sig.fold_under <- subset(df, ((FDR < cutoff_alpha) & !is.na(FDR)) & - # log2FoldChange <= -l2fc) df.sig.fold_under <- df[df$FDR < cutoff_alpha & !is.na(df$FDR) & df$log2FoldChange <= -l2fc, ] openxlsx::write.xlsx(df.sig.fold_under, colNames = T, rowNames = F, append = F, - file = paste0(name, "_Underexp.xlsx"), overwrite = T) + file = paste0(name, "_down_log2FC<",l2fc,"_FDR<", cutoff_alpha, ".xlsx"), overwrite = T) } diff --git a/R/split_cases.R b/R/split_cases.R index 6e187e0..569dad2 100644 --- a/R/split_cases.R +++ b/R/split_cases.R @@ -8,7 +8,7 @@ #' including the baseline, there are 10 mutually exclusive cases where genes can #' fall into. This function allows us to obtain these 10 cases and saves them #' into a list. -#' +#' #' @param df.BvsA Data frame comparing the first condition to the baseline. #' @param df.CvsA Data frame comparing the second condition to the baseline. #' @param df.BvsC Data frame comparing the two conditions of the study. @@ -17,236 +17,296 @@ #' @param significance_cutoff Cut-off of the significance variable. #' @param change_var Variable that indicates the direction of the change (i.e. log2FoldChange in DESeq2, NES in GSEA). #' @param change_cutoff The values of the change variable will be filtered by |change_var| > change_cutoff. Default: 0. +#' +#' @return A named list of 10 data frames (`$Case1` through `$Case10`), each +#' containing the genes belonging to that mutually exclusive expression +#' pattern. Cases 1–6 contain a `trend` column (`"up"` or `"dn"`). Case 10 +#' contains genes not significant in any comparison. +#' +#' @details +#' The 10 cases are: +#' * **Case 1** : Ladder: significant in all 3, same direction. +#' * **Case 2** : Stronger in condition 1: significant in all 3, direction +#' reverses between conditions. +#' * **Case 3** : Stronger in condition 2. +#' * **Case 4** : Marker of condition 2: significant in CvsA and BvsC only. +#' * **Case 5** : Marker of condition 1: significant in BvsA and BvsC only. +#' * **Case 6** : Shared: significant in BvsA and CvsA only. +#' * **Cases 7-9** : Significant in only one comparison. +#' * **Case 10** : Not significant in any comparison. +#' +#' @examples +#' \dontrun{ +#' # split_cases requires three DESeq2 comparisons. +#' # Simulate a 3-phenotype study: Normal (A), Primary (B), Metastasis (C) +#' set.seed(174) +#' n_genes <- 500 +#' +#' make_res <- function(seed) { +#' set.seed(seed) +#' data.frame( +#' ensembl = paste0("ENSG", sprintf("%011d", seq_len(n_genes))), +#' log2FoldChange = rnorm(n_genes, 0, 1.5), +#' padj = runif(n_genes, 0, 0.5), +#' stringsAsFactors = FALSE +#' ) +#' } +#' +#' df_BvsA <- make_res(1) +#' df_CvsA <- make_res(2) +#' df_BvsC <- make_res(3) +#' +#' cases <- split_cases( +#' df.BvsA = df_BvsA, +#' df.CvsA = df_CvsA, +#' df.BvsC = df_BvsC, +#' unique_id = "ensembl", +#' significance_var = "padj", +#' significance_cutoff = 0.25, +#' change_var = "log2FoldChange", +#' change_cutoff = 0 +#' ) +#' +#' # Number of genes per case +#' sapply(cases, nrow) +#' +#' # Inspect Case 1 (ladder genes: significant in all 3 comparisons) +#' head(cases$Case1) +#' } +#' +#' @seealso [detect_filter()] to pre-filter genes before splitting; +#' [nice_Volcano()] to visualize individual comparison results. +#' #' @export split_cases <- function (df.BvsA = NULL, df.CvsA = NULL, df.BvsC = NULL, unique_id = "ensembl", significance_var = "padj", significance_cutoff = 0.25, change_var = "log2FoldChange", change_cutoff = 0) - + { # Set row names as unique identifiers if (!all(rownames(df.BvsA) == df.BvsA[, unique_id])) { rownames(df.BvsA) <- df.BvsA[, unique_id] } - + if (!all(rownames(df.CvsA) == df.CvsA[, unique_id])) { rownames(df.CvsA) <- df.CvsA[, unique_id] } - + if (!all(rownames(df.BvsC) == df.BvsC[, unique_id])) { rownames(df.BvsC) <- df.BvsC[, unique_id] } - + # Create subsets by significance df.BvsA.sig <- df.BvsA[df.BvsA[, significance_var] < significance_cutoff & !is.na(df.BvsA[, significance_var]), ] df.CvsA.sig <- df.CvsA[df.CvsA[, significance_var] < significance_cutoff & !is.na(df.CvsA[, significance_var]), ] df.BvsC.sig <- df.BvsC[df.BvsC[, significance_var] < significance_cutoff & !is.na(df.BvsC[, significance_var]), ] - + df.BvsA.nsig <- df.BvsA[df.BvsA[, significance_var] >= significance_cutoff | is.na(df.BvsA[, significance_var]), ] df.CvsA.nsig <- df.CvsA[df.CvsA[, significance_var] >= significance_cutoff | is.na(df.CvsA[, significance_var]), ] df.BvsC.nsig <- df.BvsC[df.BvsC[, significance_var] >= significance_cutoff | is.na(df.BvsC[, significance_var]), ] - - + + # Defining cases - + ####################### Significant in all comparisons ####################### - + ############################# # Case 1: Ladders up/down ############################# # These genes would show a progressively increasing or decreasing expression, # particularly useful when comparing conditions over time, intensity or evolution - - + + case1.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case1.up <- as.data.frame(df.BvsA.sig[case1.up.genes, ]) case1.up$trend <- rep("up", nrow(case1.up)) - + case1.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case1.dn <- as.data.frame(df.BvsA.sig[case1.dn.genes, ]) case1.dn$trend <- rep("dn", nrow(case1.dn)) - + ################################## - # Case 2: Stronger in condition 1 + # Case 2: Stronger in condition 1 ################################## # Genes with stronger dysregulation in the first condition, which means that # they change their expression trend while still being significant - + case2.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case2.up <- as.data.frame(df.BvsA.sig[case2.up.genes, ]) case2.up$trend <- rep("up", nrow(case2.up)) - + case2.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case2.dn <- as.data.frame(df.BvsA.sig[case2.dn.genes, ]) case2.dn$trend <- rep("dn", nrow(case2.dn)) - + ################################## # Case 3: Stronger in condition 2 ################################## # Genes with stronger dysregulation in the second condition, which means that # they change their expression trend while still being significant - + case3.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case3.up <- as.data.frame(df.BvsA.sig[case3.up.genes, ]) case3.up$trend <- rep("up", nrow(case3.up)) - + case3.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case3.dn <- as.data.frame(df.BvsA.sig[case3.dn.genes, ]) case3.dn$trend <- rep("dn", nrow(case3.dn)) - - + + ##################### Significant in only two comparisons ##################### - + ########################################### # Case 4: Significant in data 2 and data 3 ########################################### # These genes would be consider markers of the second condition since they are # dysregulated compared to the baseline and to the first condition only - + case4.up.genes <- Reduce(intersect, list(df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id], df.BvsA.nsig[, unique_id])) - + case4.up <- as.data.frame(df.CvsA.sig[case4.up.genes, ]) case4.up$trend <- rep("up", nrow(case4.up)) - + case4.dn.genes <- Reduce(intersect, list(df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id], df.BvsA.nsig[, unique_id])) - + case4.dn <- as.data.frame(df.CvsA.sig[case4.dn.genes, ]) case4.dn$trend <- rep("dn", nrow(case4.dn)) - + ########################################### # Case 5: Significant in data 1 and data 3 ########################################### # These genes would be consider markers of the first condition since they are # dysregulated compared to the baseline and to the second condition only - + case5.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.nsig[, unique_id])) - + case5.up <- as.data.frame(df.BvsA.sig[case5.up.genes, ]) case5.up$trend <- rep("up", nrow(case5.up)) - + case5.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id], df.CvsA.nsig[, unique_id])) - + case5.dn <- as.data.frame(df.BvsA.sig[case5.dn.genes, ]) case5.dn$trend <- rep("dn", nrow(case5.dn)) - + ########################################### # Case 6: Significant in data 1 and data 2 ########################################### # According to the study's design, lets consider that genes dysregulated # in both conditions when compared to the baseline are the ones to focus on - + case6.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.nsig[, unique_id])) - + case6.up <- as.data.frame(df.BvsA.sig[case6.up.genes, ]) case6.up$trend <- rep("up", nrow(case6.up)) - + case6.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.nsig[, unique_id])) - + case6.dn <- as.data.frame(df.BvsA.sig[case6.dn.genes, ]) case6.dn$trend <- rep("dn", nrow(case6.dn)) - - + + ################ Significant in only one comparison or neither ################ - + # None of these cases are insightful or provide relevant information for the # analysis performed - + ######################################## # Case 7: Significant in data 3 only ######################################## - + case7.up.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.CvsA.nsig[, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case7.up <- as.data.frame(df.BvsC.sig[case7.up.genes, ]) case7.up$trend <- rep("up", nrow(case7.up)) - + case7.dn.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.CvsA.nsig[, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case7.dn <- as.data.frame(df.BvsC.sig[case7.dn.genes, ]) case7.dn$trend <- rep("dn", nrow(case7.dn)) - + ######################################## # Case 8: Significant in data 2 only ######################################## - + case8.up.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id])) - + case8.up <- as.data.frame(df.CvsA.sig[case8.up.genes, ]) case8.up$trend <- rep("up", nrow(case8.up)) - + case8.dn.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id])) - + case8.dn <- as.data.frame(df.CvsA.sig[case8.dn.genes, ]) case8.dn$trend <- rep("dn", nrow(case8.dn)) - + ######################################## # Case 9: Significant in data 1 only ######################################## - + case9.up.genes <- Reduce(intersect, list(df.CvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id])) - + case9.up <- as.data.frame(df.BvsA.sig[case9.up.genes, ]) case9.up$trend <- rep("up", nrow(case9.up)) - + case9.dn.genes <- Reduce(intersect, list(df.CvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id])) - + case9.dn <- as.data.frame(df.BvsA.sig[case9.dn.genes, ]) case9.dn$trend <- rep("dn", nrow(case9.dn)) - + ######################################## # Case 10: Significant in none ######################################## - + case10.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.CvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id])) - + case10 <- as.data.frame(df.BvsA.nsig[case10.genes, ]) - + # Create data frames of results per cases case1 <- rbind(case1.up, case1.dn) case2 <- rbind(case2.up, case2.dn) @@ -257,8 +317,8 @@ split_cases <- function (df.BvsA = NULL, df.CvsA = NULL, df.BvsC = NULL, unique_ case7 <- rbind(case7.up, case7.dn) case8 <- rbind(case8.up, case8.dn) case9 <- rbind(case9.up, case9.dn) - + return(list(Case1 = case1, Case2 = case2, Case3 = case3, Case4 = case4, Case5 = case5, Case6 = case6, Case7 = case7, Case8 = case8, Case9 = case9, Case10 = case10)) - + } diff --git a/R/tpm.R b/R/tpm.R index 6a0a04b..4959d52 100644 --- a/R/tpm.R +++ b/R/tpm.R @@ -12,6 +12,44 @@ #' #' @param raw_counts A table with the gene counts. #' @param gene_lengths A column with the gene lengths. +#' +#' @return A numeric matrix of the same dimensions as `raw_counts` with TPM +#' values. Column sums equal 1,000,000 by definition. +#' +#' @note TPM normalizes for both sequencing depth and gene length, making +#' values comparable between genes within a sample. It is not appropriate +#' for differential expression analysis, use DESeq2 normalized counts +#' ([norm_counts]) for that purpose. Gene lengths from [get_annotations()] +#' are genomic lengths (including introns); for higher accuracy use +#' transcript-level lengths. +#' +#' @examples +#' \dontrun{ +#' data(raw_counts) +#' data(deseq2_results) +#' +#' # Gene lengths are needed, retrieve from get_annotations() or use +#' # pre-fetched lengths. Here we use the gene_length column if available. +#' annotations <- get_annotations( +#' ensembl_ids = rownames(raw_counts), +#' mode = "genes" +#' ) +#' +#' # Match gene lengths to raw_counts row order +#' gene_lengths <- annotations$gene_length[ +#' match(rownames(raw_counts), annotations$geneID) +#' ] +#' +#' # Calculate TPM +#' tpm_matrix <- tpm(raw_counts, gene_lengths) +#' +#' # Check: column sums should all be 1,000,000 +#' round(colSums(tpm_matrix)[1:3]) +#' } +#' +#' @seealso [get_annotations()] to obtain `gene_lengths`; +#' [norm_counts] for DESeq2 size-factor normalized counts. +#' #' @export tpm <- function(raw_counts, gene_lengths) { diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..33e8d42 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,144 @@ +package: OmicsKit +title: "OmicsKit: A bioinformatics toolset for multiomics analysis" +url: https://danielgarbozo.github.io/OmicsKit + +navbar: + structure: + left: [reference, articles] + right: [github] + components: + github: + icon: fab fa-github + aria-label: GitHub + href: https://github.com/BigMindLab/OmicsKit + +home: + title: "OmicsKit" + description: > + Streamlined utilities for multi-omics analysis and publication-ready visuals. + Covers DEA, pathway analysis, dimensionality reduction, and more. + links: + - text: DEA Workflow + href: articles/DEA_workflow.html + - text: Pathway Analysis Workflow + href: articles/PA_workflow.html + - text: PA Clustering Workflow + href: articles/PA_clustering.html + sidebar: + structure: [links, license, authors, dev] + +authors: + David Requena: + href: https://orcid.org/0000-0002-5968-1133 + Daniel Guevara: + href: https://orcid.org/0009-0001-2786-8729 + Daniel Garbozo: + href: https://orcid.org/0009-0003-2495-6568 + Angela Alarcon: + href: https://orcid.org/0000-0003-0293-5603 + + +footer: + structure: + left: [developed_by] + right: [built_with] + components: + developed_by: | + Developed by the + BigMind Lab. + Contact: david.requena@nyulangone.org + +reference: + - title: "Differential Expression Analysis (DEA)" + desc: > + Functions for data quality control, normalization, dimensionality reduction, + annotation, and differential expression visualization. + contents: + - power_analysis + - tpm + - nice_PCA + - nice_UMAP + - nice_tSNE + - get_annotations + - add_annotations + - save_results + - split_cases + - nice_Volcano + - detect_filter + - get_stars + - nice_VSB + - nice_VSB_DEseq2 + + - title: "Genomics" + desc: "Survival analysis and genomics visualization utilities." + contents: + - nice_KM + + - title: "Pathway Analysis (PA)" + desc: > + Tools for loading, merging, and visualizing GSEA / pathway analysis results, + including single- and multi-comparison plots and heatmaps. + contents: + - list_gmts + - merge_PA + - getgenesPA + - addgenesPA + - multiplot_PA + - splot_PA + - heatmap_PA + - heatmap_path_PA + + - title: "PA Clustering" + desc: > + Pathway clustering via Jaccard similarity, hierarchical clustering, + and network-based community detection. + contents: + - geneset_similarity + - do_clust + - get_superterm + - get_network_communities + - network_clust + - network_clust_gg + - title: "Example datasets" + desc: "Built-in datasets for reproducible examples and vignettes." + contents: + - camera_results + - deseq2_results + - geneset_list + - gsea_results + - norm_counts + - raw_counts + - sampledata + - vst_counts + +articles: + - title: "Workflows" + desc: "End-to-end analysis examples with OmicsKit." + contents: + - DEA_workflow + - PA_workflow + - PA_clustering + +template: + bootstrap: 5 + bootswatch: cosmo + bslib: + primary: "#6000C6" + secondary: "#B3A3FF" + body-bg: "#FFFFFF" + navbar-bg: "#6000C6" + navbar-fg: "#FFFFFF" + link-color: "#6000C6" + pre-bg: "#F7F7FB" + includes: + in_header: | + + +development: + mode: release + +toc: + depth: 2 diff --git a/data-raw/deseq2_results.R b/data-raw/deseq2_results.R new file mode 100644 index 0000000..daa9840 --- /dev/null +++ b/data-raw/deseq2_results.R @@ -0,0 +1,97 @@ +## data-raw/deseq2_results.R +## Generates deseq2_results, , norm_counts, and vst_counts from TCGA-LUAD raw_counts and sampledata. +## Run once to regenerate the three .rda files in data/. +## Requires: DESeq2, dplyr, tibble + +# ============================================================================= +# DESeq2 pipeline - TCGA LUAD +# Comparison: Tumor vs Normal (sample_type column) +# Source: GDC Data Portal - TCGA-LUAD STAR Counts +# Outputs: +# data/deseq2_results.rda : DESeq2 results table (all genes, FDR filtered) +# data/norm_counts.rda : Normalized counts matrix (counts(dds, normalized=TRUE)) +# data/vst_counts.rda : VST-transformed matrix (assay(vst(dds))) +# ============================================================================= + +library(DESeq2) +library(dplyr) +library(tibble) + +data(raw_counts) +data(sampledata) + +# Inspect sample_type levels +cat("sample_type levels:\n") +print(table(sampledata$sample_type)) + +# Reference = "normal" (baseline), comparison = "tumor" + +sampledata$sample_type <- factor( + sampledata$sample_type, + levels = c("normal", "tumor") +) + +# Build DESeq2 object +dds <- DESeqDataSetFromMatrix( + countData = round(raw_counts), + colData = sampledata, + design = ~ sample_type +) + +# Filter: keep genes with >= 10 counts in at least 10 samples +keep <- rowSums(counts(dds) >= 10) >= 10 +dds <- dds[keep, ] +cat("Genes after filtering:", nrow(dds), "\n") + +# Run DESeq2 +dds <- DESeq(dds) + +# 1. deseq2 results +res <- results( + dds, + contrast = c("sample_type", "tumor", "normal"), + alpha = 0.05 +) + +deseq2_results <- as.data.frame(res) %>% + rownames_to_column("gene_id") %>% + filter(!is.na(padj), !is.na(log2FoldChange)) %>% + arrange(padj) + +cat("\ndeseq2_results:\n") +cat("Total genes with results:", nrow(deseq2_results), "\n") +cat("Significant (FDR < 0.05):", sum(deseq2_results$padj < 0.05), "\n") +cat(" Columns :", paste(colnames(deseq2_results), collapse = ", "), "\n") + +# 2. norm_counts +# Normalized counts: suitable for nice_VSB, detect_filter, add_annotations +# Counts are divided by DESeq2 size factors to correct for library size. +# Still in counts scale (not log-transformed). +norm_counts <- counts(dds, normalized = TRUE) + +cat("\nnorm_counts:\n") +cat(" Dimensions :", nrow(norm_counts), "genes x", ncol(norm_counts), "samples\n") +cat(" Value range : [", round(min(norm_counts), 1), ",", + round(max(norm_counts), 1), "]\n") + +# 3. vst_counts +# Variance Stabilizing Transformation: suitable for nice_PCA, nice_UMAP, nice_tSNE. +# VST removes the mean-variance dependence of RNA-seq counts, +# placing all genes on a comparable log2-like scale for dimensionality +# reduction and sample-level clustering. +vst_counts <- assay(vst(dds, blind = TRUE)) + +cat("\nvst_counts:\n") +cat(" Dimensions :", nrow(vst_counts), "genes x", ncol(vst_counts), "samples\n") +cat(" Value range : [", round(min(vst_counts), 2), ",", + round(max(vst_counts), 2), "]\n") + +# Save +usethis::use_data(deseq2_results, compress = "xz", overwrite = TRUE) +usethis::use_data(norm_counts, compress = "xz", overwrite = TRUE) +usethis::use_data(vst_counts, compress = "xz", overwrite = TRUE) + +message("\nDone. Saved:") +message(" data/deseq2_results.rda") +message(" data/norm_counts.rda") +message(" data/vst_counts.rda") diff --git a/data-raw/example_PA.R b/data-raw/example_PA.R new file mode 100644 index 0000000..e8c0ace --- /dev/null +++ b/data-raw/example_PA.R @@ -0,0 +1,240 @@ +## data-raw/example_PA.R +## Run this script once to regenerate the example datasets. +## Requires: usethis + +# ============================================================================= +# geneset_list +# A named list of 40 curated gene sets with realistic KEGG / HALLMARK / GO +# naming conventions and real human gene symbols, grouped into four biological +# themes so that geneset_similarity() + do_clust() + get_network_communities() +# produce meaningful clustering results. +# +# Themes: +# 1. Apoptosis & cell death (8 sets) +# 2. Cell cycle & DNA damage (8 sets) +# 3. Immune response & inflammation (12 sets) +# 4. Metabolism (12 sets) +# ============================================================================= + +geneset_list <- list( + + # ── 1. Apoptosis & cell death ───────────────────────────────────────────── + KEGG_APOPTOSIS = c( + "TP53", "BCL2", "BCL2L1", "BAX", "BAD", "BID", "CASP3", "CASP8", + "CASP9", "CYCS", "APAF1", "FADD", "FAS", "TNFRSF10A", "TNFRSF10B", + "MCL1", "XIAP", "DIABLO" + ), + HALLMARK_APOPTOSIS = c( + "TP53", "BCL2", "BAX", "CASP3", "CASP9", "CYCS", "APAF1", "MCL1", + "BCL2L1", "BID", "PMAIP1", "BBC3", "CASP7", "DFFA", "DFFB" + ), + GO_INTRINSIC_APOPTOSIS = c( + "BAX", "BCL2", "BCL2L1", "BID", "CASP9", "CYCS", "APAF1", "DIABLO", + "SMAC", "HtrA2", "MCL1", "PMAIP1", "BBC3", "BOK" + ), + GO_EXTRINSIC_APOPTOSIS = c( + "FAS", "FADD", "CASP8", "CASP3", "TNFRSF10A", "TNFRSF10B", "TRADD", + "RIPK1", "CFLAR", "BID", "CASP10" + ), + HALLMARK_P53_PATHWAY = c( + "TP53", "MDM2", "CDKN1A", "BAX", "PUMA", "NOXA", "GADD45A", "BBC3", + "PMAIP1", "SIAH1", "BTG2", "SESN1", "SESN2", "TIGAR" + ), + KEGG_P53_SIGNALING = c( + "TP53", "MDM2", "MDM4", "CDKN1A", "GADD45A", "BAX", "BBC3", "PMAIP1", + "SIAH1", "ATM", "CHEK2", "CDKN2A", "RB1", "CCND1" + ), + GO_REGULATION_OF_APOPTOSIS = c( + "BCL2", "BCL2L1", "MCL1", "XIAP", "BIRC2", "BIRC3", "CFLAR", "TP53", + "MDM2", "BAX", "BAD", "BID", "CASP3", "CASP8", "DIABLO" + ), + GO_MITOCHONDRIAL_OUTER_MEMBRANE_PERMEABILIZATION = c( + "BAX", "BAK1", "BCL2", "BCL2L1", "MCL1", "BID", "BIM", "PUMA", + "NOXA", "VDAC1", "VDAC2", "CYCS" + ), + + # ── 2. Cell cycle & DNA damage ──────────────────────────────────────────── + KEGG_CELL_CYCLE = c( + "CDK1", "CDK2", "CDK4", "CDK6", "CCNA1", "CCNA2", "CCNB1", "CCND1", + "CCNE1", "RB1", "E2F1", "TP53", "CDKN1A", "CDKN2A", "ATM", "CHEK1", + "CHEK2", "WEE1", "CDC25A", "CDC25C" + ), + HALLMARK_E2F_TARGETS = c( + "E2F1", "E2F2", "E2F3", "CCNA2", "CCNE1", "CDK2", "PCNA", "MCM2", + "MCM3", "MCM4", "MCM5", "MCM6", "MCM7", "RRM1", "RRM2", "TYMS" + ), + HALLMARK_G2M_CHECKPOINT = c( + "CDK1", "CCNB1", "CCNB2", "CDC20", "BUB1", "BUB1B", "MAD2L1", + "AURKA", "AURKB", "PLK1", "WEE1", "CHEK1", "CDC25C", "BRCA1" + ), + GO_DNA_DAMAGE_RESPONSE = c( + "ATM", "ATR", "CHEK1", "CHEK2", "BRCA1", "BRCA2", "RAD51", "H2AFX", + "MDC1", "RNF8", "RNF168", "PALB2", "FANCD2", "TP53", "CDKN1A" + ), + KEGG_DNA_REPLICATION = c( + "PCNA", "MCM2", "MCM3", "MCM4", "MCM5", "MCM6", "MCM7", "RFC1", + "RFC2", "RFC3", "RFC4", "RFC5", "POLA1", "POLD1", "POLE", "LIG1" + ), + GO_MITOTIC_CELL_CYCLE = c( + "CDK1", "CCNB1", "CDC20", "APC", "MAD2L1", "BUB1", "BUB1B", + "AURKA", "AURKB", "PLK1", "ESPL1", "SECURIN", "SMC1A", "SMC3" + ), + HALLMARK_MYC_TARGETS_V1 = c( + "MYC", "CDK4", "CCND1", "E2F1", "PCNA", "MCM2", "NPM1", "RCC1", + "POLD1", "RFC2", "TERT", "LDHA", "ENO1", "PKM", "TKT" + ), + KEGG_HOMOLOGOUS_RECOMBINATION = c( + "BRCA1", "BRCA2", "RAD51", "RAD52", "PALB2", "RBBP8", "MRE11", + "RAD50", "NBN", "ATM", "FANCD2", "XRCC2", "XRCC3" + ), + + # ── 3. Immune response & inflammation ───────────────────────────────────── + HALLMARK_INFLAMMATORY_RESPONSE = c( + "IL1B", "IL6", "TNF", "CXCL8", "CCL2", "PTGS2", "NF-kB1", "RELA", + "NFKBIA", "ICAM1", "VCAM1", "SELE", "IL1A", "CXCL1", "CXCL2" + ), + HALLMARK_TNFA_SIGNALING_VIA_NFKB = c( + "TNF", "TNFRSF1A", "TRADD", "TRAF2", "RIPK1", "NFKB1", "RELA", + "NFKBIA", "NFKBIB", "IKBKA", "IKBKB", "IKBKG", "IL6", "IL8", "ICAM1" + ), + KEGG_NF_KAPPA_B_SIGNALING = c( + "NFKB1", "NFKB2", "RELA", "RELB", "REL", "NFKBIA", "NFKBIB", + "IKBKA", "IKBKB", "IKBKG", "TNF", "IL1B", "LTA", "CD40LG" + ), + GO_CYTOKINE_MEDIATED_SIGNALING = c( + "IL6", "IL6R", "JAK1", "JAK2", "STAT1", "STAT3", "IL2", "IL4", + "IL10", "IL12A", "IFNG", "IFNA1", "IRF1", "IRF3", "SOCS1", "SOCS3" + ), + HALLMARK_INTERFERON_GAMMA_RESPONSE = c( + "IFNG", "IFNGR1", "IFNGR2", "JAK1", "JAK2", "STAT1", "IRF1", + "CIITA", "HLA-DRA", "HLA-A", "TAP1", "TAP2", "B2M", "PSMB9", "CXCL10" + ), + HALLMARK_INTERFERON_ALPHA_RESPONSE = c( + "IFNA1", "IFNAR1", "IFNAR2", "JAK1", "TYK2", "STAT1", "STAT2", + "IRF3", "IRF7", "ISG15", "MX1", "OAS1", "OAS2", "IFIT1", "IFIT3" + ), + KEGG_JAK_STAT_SIGNALING = c( + "JAK1", "JAK2", "JAK3", "TYK2", "STAT1", "STAT2", "STAT3", "STAT5A", + "STAT5B", "IL2", "IL6", "IFNG", "IFNA1", "SOCS1", "SOCS3", "PTPN11" + ), + GO_T_CELL_ACTIVATION = c( + "CD3E", "CD3G", "CD247", "ZAP70", "LCK", "FYN", "LAT", "PLCG1", + "NFATC1", "NFATC2", "IL2", "CD28", "CD80", "CD86", "CTLA4", "PDCD1" + ), + KEGG_TOLL_LIKE_RECEPTOR_SIGNALING = c( + "TLR1", "TLR2", "TLR4", "TLR9", "MYD88", "TRIF", "IRAK1", "IRAK4", + "TRAF6", "NFKB1", "IRF3", "IRF7", "TNF", "IL6", "IL12A" + ), + GO_COMPLEMENT_ACTIVATION = c( + "C1QA", "C1QB", "C1QC", "C2", "C3", "C4A", "C4B", "C5", "CFB", + "CFD", "CFH", "CFI", "CR1", "CD55", "CD59" + ), + HALLMARK_IL6_JAK_STAT3_SIGNALING = c( + "IL6", "IL6R", "IL6ST", "JAK1", "JAK2", "STAT3", "SOCS1", "SOCS3", + "IL10", "IL11", "IL22", "IL31RA", "OSMR", "LIFR" + ), + KEGG_CHEMOKINE_SIGNALING = c( + "CCL2", "CCL5", "CXCL8", "CXCL10", "CCR2", "CCR5", "CXCR3", "CXCR4", + "GNB1", "GNG2", "PLCB1", "PIK3CA", "AKT1", "MAPK1", "STAT3" + ), + + # ── 4. Metabolism ───────────────────────────────────────────────────────── + HALLMARK_GLYCOLYSIS = c( + "HK1", "HK2", "GPI", "PFKL", "PFKM", "ALDOA", "TPI1", "GAPDH", + "PGK1", "PGAM1", "ENO1", "PKM", "LDHA", "SLC2A1", "SLC2A3" + ), + KEGG_GLYCOLYSIS_GLUCONEOGENESIS = c( + "HK1", "HK2", "GPI", "PFKL", "ALDOA", "TPI1", "GAPDH", "PGK1", + "ENO1", "PKM", "LDHA", "PCK1", "PCK2", "G6PC", "FBP1", "FBP2" + ), + GO_GLUCONEOGENESIS = c( + "PCK1", "PCK2", "FBP1", "FBP2", "G6PC", "ALDOB", "ENO1", "GAPDH", + "PGK1", "PGAM1", "MDH1", "MDH2", "GOT1", "GOT2" + ), + HALLMARK_OXIDATIVE_PHOSPHORYLATION = c( + "NDUFA1", "NDUFA2", "NDUFB1", "SDHA", "SDHB", "UQCRC1", "UQCRC2", + "COX4I1", "COX5A", "ATP5A1", "ATP5B", "ATP5C1", "CYCS", "SCO1" + ), + KEGG_CITRATE_CYCLE_TCA = c( + "CS", "ACO1", "ACO2", "IDH1", "IDH2", "OGDH", "SUCLA2", "SUCLG1", + "SDHA", "SDHB", "FH", "MDH1", "MDH2", "DLST", "DLD" + ), + GO_FATTY_ACID_BETA_OXIDATION = c( + "HADHA", "HADHB", "ACADL", "ACADM", "ACADS", "ACADVL", "ECHS1", + "HADH", "CPT1A", "CPT1B", "CPT2", "SLC25A20", "ACSL1", "ACSL3" + ), + KEGG_FATTY_ACID_METABOLISM = c( + "FASN", "ACACA", "ACACB", "ACSL1", "ACSL3", "ACSL4", "HADHA", + "HADHB", "ACADL", "ACADM", "CPT1A", "CPT2", "ELOVL1", "ELOVL6" + ), + HALLMARK_FATTY_ACID_METABOLISM = c( + "FASN", "ACACA", "ACSL1", "HADHA", "ACADL", "CPT1A", "PPARA", + "PPARG", "RXRA", "FABP1", "FABP4", "ACOX1", "EHHADH", "HSD17B4" + ), + KEGG_PENTOSE_PHOSPHATE_PATHWAY = c( + "G6PD", "PGD", "RPIA", "TALDO1", "TKT", "RPE", "RBKS", "PGLS", + "H6PD", "DERA", "PRPS1", "PRPS2" + ), + KEGG_PURINE_METABOLISM = c( + "ADSS", "ADSL", "ATIC", "PFAS", "GART", "PPAT", "PAICS", "ADA", + "AMPD1", "DGUOK", "HPRT1", "IMPDH1", "IMPDH2", "GMPS" + ), + KEGG_AMINO_SUGAR_NUCLEOTIDE_SUGAR_METABOLISM = c( + "GFPT1", "GFPT2", "UAP1", "GNPNAT1", "PGM3", "NAGK", "CMAS", + "SLC35A1", "UGP2", "UGDH", "B4GALT1", "ST6GAL1" + ), + HALLMARK_MTORC1_SIGNALING = c( + "MTOR", "RPTOR", "RPS6KB1", "RPS6KB2", "EIF4EBP1", "EIF4E", "AKT1", + "TSC1", "TSC2", "RHEB", "DEPTOR", "MLST8", "PRAS40", "HIF1A", "MYC" + ) +) + +# ============================================================================= +# camera_results +# A data.frame simulating CAMERA output for a differential expression analysis. +# Columns: GeneSet, Direction, PValue, FDR. +# ~60 % of gene sets are significant (FDR < 0.05) so clustering has enough +# sets to work with. +# ============================================================================= + +set.seed(174) + +n_sets <- length(geneset_list) +set_names <- names(geneset_list) + +# Simulate realistic p-value distribution +raw_p <- c( + # Apoptosis theme — mostly significant + runif(8, min = 1e-6, max = 0.02), + # Cell cycle theme — mostly significant + runif(8, min = 1e-5, max = 0.03), + # Immune theme — mix + c(runif(7, min = 1e-4, max = 0.04), runif(5, min = 0.06, max = 0.50)), + # Metabolism theme — mix + c(runif(7, min = 1e-3, max = 0.04), runif(5, min = 0.10, max = 0.80)) +) + +fdr_vals <- p.adjust(raw_p, method = "BH") +direction <- ifelse( + runif(n_sets) > 0.4, "Up", "Down" +) + +camera_results <- data.frame( + GeneSet = set_names, + Direction = direction, + PValue = raw_p, + FDR = fdr_vals, + stringsAsFactors = FALSE +) + +# ============================================================================= +# Save to data/ +# ============================================================================= + +usethis::use_data(geneset_list, overwrite = TRUE) +usethis::use_data(camera_results, overwrite = TRUE) + +message("Done. Objects saved:") +message(" data/geneset_list.rda — ", length(geneset_list), " gene sets") +message(" data/camera_results.rda — ", nrow(camera_results), " rows, ", + sum(camera_results$FDR < 0.05), " sets with FDR < 0.05") diff --git a/data-raw/gsea_results.R b/data-raw/gsea_results.R new file mode 100644 index 0000000..ca8f730 --- /dev/null +++ b/data-raw/gsea_results.R @@ -0,0 +1,121 @@ +## data-raw/gsea_results.R +## Generates gsea_results a simulated merge_PA() output for three comparisons +## (TumorVsNormal, MetastasisVsNormal, and MetastasisVsTumor) across three +## MSigDB collections (HALLMARK, KEGG, GO). Uses geneset_list and deseq2_results +## already in the package. +## Run once to regenerate data/gsea_results.rda +## Requires: OmicsKit data objects already loaded + + +# Setup +# ============================================================================= +set.seed(174) + +data(geneset_list) +data(deseq2_results) + +# ranked_genes: genes ordered by DESeq2 Wald stat (most positive = most upregulated) +ranked_genes <- deseq2_results$gene_id[ + order(deseq2_results$stat, decreasing = TRUE) +] + +# Define gene sets per collection +# We use names from geneset_list split by prefix +# ============================================================================= +gs_names <- names(geneset_list) +hallmark <- gs_names[grepl("^HALLMARK_", gs_names)] +kegg <- gs_names[grepl("^KEGG_", gs_names)] +go <- gs_names[grepl("^GO_", gs_names)] + +# Use all available from each collection +collection_map <- c( + setNames(rep("HALLMARK", length(hallmark)), hallmark), + setNames(rep("KEGG", length(kegg)), kegg), + setNames(rep("GO", length(go)), go) +) + +all_sets <- names(collection_map) +n_sets <- length(all_sets) + +# Helper: simulate one comparison +# ============================================================================= +simulate_comparison <- function(comparison_name, seed_offset = 0) { + + set.seed(174 + seed_offset) + + sizes <- vapply(all_sets, function(gs) length(geneset_list[[gs]]), integer(1)) + + # Simulate NES: mix of up/down enrichment + nes <- rnorm(n_sets, mean = 0, sd = 1.5) + + # Simulate FDR directly: ~60% significant (FDR < 0.05), rest non-significant + n_sig <- round(n_sets * 0.6) + n_nonsig <- n_sets - n_sig + fdr_sig <- sort(runif(n_sig, min = 0.001, max = 0.049)) + fdr_nonsig <- runif(n_nonsig, min = 0.06, max = 0.95) + rank_by_abs <- order(abs(nes), decreasing = TRUE) + fdr <- numeric(n_sets) + fdr[rank_by_abs[seq_len(n_sig)]] <- fdr_sig + fdr[rank_by_abs[seq(n_sig + 1L, n_sets)]] <- fdr_nonsig + nom_pval <- pmin(1, fdr * runif(n_sets, 0.3, 0.9)) + fwer <- pmin(1, fdr * 1.5) + + # Simulate rank at max + rank_at_max <- sample(seq_len(length(ranked_genes)), n_sets, replace = TRUE) + + # Simulate leading edge: tags% ~ |NES| / 3, capped at 60% + tags_pct <- pmin(0.60, abs(nes) / 3 + runif(n_sets, 0, 0.1)) + list_pct <- runif(n_sets, 0.20, 0.80) + signal_pct <- tags_pct * (1 - list_pct) / (1 - tags_pct * list_pct + 1e-6) + signal_pct <- pmin(signal_pct, 1) + + leading_edge <- sprintf( + "tags=%.0f%%, list=%.0f%%, signal=%.0f%%", + tags_pct * 100, + list_pct * 100, + signal_pct * 100 + ) + + data.frame( + NAME = all_sets, + SIZE = sizes, + ES = nes * 0.7, + NES = nes, + `NOM p-val` = nom_pval, + FDR = fdr, + `FWER p-val` = fwer, + `RANK AT MAX` = rank_at_max, + Log10FDR = -log10(fdr), + tags = tags_pct, + list = list_pct, + signal = signal_pct, + `LEADING EDGE` = leading_edge, + COLLECTION = unname(collection_map[all_sets]), + COMPARISON = comparison_name, + stringsAsFactors = FALSE, + check.names = FALSE + ) +} + +# Generate three comparisons +# ============================================================================= +comp1 <- simulate_comparison("TumorVsNormal", seed_offset = 0) +comp2 <- simulate_comparison("MetastasisVsNormal", seed_offset = 7) +comp3 <- simulate_comparison("MetastasisVsTumor", seed_offset = 14) + +gsea_results <- rbind(comp1, comp2, comp3) + +cat("gsea_results:\n") +cat(" Rows :", nrow(gsea_results), "\n") +cat(" Comparisons :", paste(unique(gsea_results$COMPARISON), collapse = ", "), "\n") +cat(" Collections :", paste(unique(gsea_results$COLLECTION), collapse = ", "), "\n") +cat(" Gene sets :", n_sets, "\n") +cat(" FDR < 0.05 :", + sum(gsea_results$FDR < 0.05), "out of", nrow(gsea_results), "\n") +cat(" Columns :", paste(colnames(gsea_results), collapse = ", "), "\n") + +# Save +# ============================================================================= +usethis::use_data(gsea_results, compress = "xz", overwrite = TRUE) + +message("Done. Saved: data/gsea_results.rda") diff --git a/data/camera_results.rda b/data/camera_results.rda new file mode 100644 index 0000000..b00268e Binary files /dev/null and b/data/camera_results.rda differ diff --git a/data/deseq2_results.rda b/data/deseq2_results.rda new file mode 100644 index 0000000..6e5edbe Binary files /dev/null and b/data/deseq2_results.rda differ diff --git a/data/geneset_list.rda b/data/geneset_list.rda new file mode 100644 index 0000000..a0cbbcd Binary files /dev/null and b/data/geneset_list.rda differ diff --git a/data/gsea_results.rda b/data/gsea_results.rda new file mode 100644 index 0000000..8e6888b Binary files /dev/null and b/data/gsea_results.rda differ diff --git a/data/norm_counts.rda b/data/norm_counts.rda new file mode 100644 index 0000000..0d835f2 Binary files /dev/null and b/data/norm_counts.rda differ diff --git a/data/vst_counts.rda b/data/vst_counts.rda new file mode 100644 index 0000000..e1af716 Binary files /dev/null and b/data/vst_counts.rda differ diff --git a/man/add_annotations.Rd b/man/add_annotations.Rd index 4bfae0e..6a67022 100644 --- a/man/add_annotations.Rd +++ b/man/add_annotations.Rd @@ -15,6 +15,44 @@ add_annotations(object, reference, variables = NULL, data_frame = FALSE) \item{data_frame}{Logical; if TRUE, coerce \code{object} to a data.frame first. Default: FALSE.} } +\value{ +The input \code{object} as a data frame with additional columns from +\code{reference} joined by Ensembl gene ID. A \code{geneID} column is added +containing the row names of the original object. +} \description{ A function to add annotations to a table of gene counts. } +\examples{ +\dontrun{ +data(norm_counts) + +# Requires a reference table with a "geneID" column. +# Use get_annotations() to generate it: +annotations <- get_annotations( + ensembl_ids = rownames(norm_counts), + mode = "genes" +) + +# Add gene symbol and biotype columns to the counts matrix +norm_counts_annot <- add_annotations( + object = norm_counts, + reference = annotations, + variables = c("symbol", "biotype") +) + +# Inspect result +head(norm_counts_annot[, c("geneID", "symbol", "biotype")]) + +# Add all annotation columns (variables = NULL uses everything) +norm_counts_full <- add_annotations( + object = norm_counts, + reference = annotations +) +} + +} +\seealso{ +\code{\link[=get_annotations]{get_annotations()}} to generate the \code{reference} table; +\link{norm_counts} for an example input matrix. +} diff --git a/man/addgenesPA.Rd b/man/addgenesPA.Rd new file mode 100644 index 0000000..89b123f --- /dev/null +++ b/man/addgenesPA.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_genes_PA.R +\name{addgenesPA} +\alias{addgenesPA} +\title{Add gene columns to pathway analysis results} +\usage{ +addgenesPA(pa_data, gene_lists) +} +\arguments{ +\item{pa_data}{A data frame of pathway analysis results containing a \code{NAME} +column. Typically the output of \code{\link[=merge_PA]{merge_PA()}}.} + +\item{gene_lists}{Output of \code{\link[=getgenesPA]{getgenesPA()}}. Can be: +\itemize{ +\item A list with \verb{$all}, \verb{$le}, and/or \verb{$top} slots: when multiple modes +are requested (e.g., \code{getgenesPA(..., genes = c("all", "le", "top"))}). +Adds the corresponding columns. +\item A flat named list with attribute \code{genes_type}: when a single mode is +requested. Adds the corresponding column (\code{all_genes}, \code{le_genes}, or +\code{top_genes}). +}} +} +\value{ +The input \code{pa_data} data frame with one or more additional columns: +\itemize{ +\item \code{all_genes}: comma-separated string of all gene set members ordered by +rank. +\item \code{le_genes}: comma-separated string of leading edge genes (GSEA only), +ordered by rank. +\item \code{top_genes}: comma-separated string of top-ranked genes based on the +user-defined \code{top} fraction. +} + +Gene sets not found in \code{gene_lists} receive \code{NA}. +} +\description{ +Appends \code{all_genes}, \code{le_genes}, and/or \code{top_genes} columns to a pathway +analysis results data frame based on the output of \code{\link[=getgenesPA]{getgenesPA()}}. +Gene symbols within each cell are comma-separated. Automatically detects +which column(s) to add based on the structure of the input. +} +\examples{ +\dontrun{ +data(gsea_results) +data(geneset_list) +data(deseq2_results) + +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +pa_single$top <- 0.30 + +# Add all three columns +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le", "top")) +pa_annot <- addgenesPA(pa_single, gene_lists) +head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) + +# Add only leading edge genes +le_only <- getgenesPA(pa_single, geneset_list, ranked, genes = "le") +pa_annot <- addgenesPA(pa_single, le_only) +head(pa_annot[, c("NAME", "le_genes")]) + +# CAMERA: add only top and all (no leading edge) +data(camera_results) +camera_pa <- camera_results +colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +camera_pa$SIZE <- sapply(camera_pa$NAME, + function(x) length(geneset_list[[x]])) +camera_pa$top <- 0.25 +gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, + genes = c("all", "top")) +pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +} + +} +\seealso{ +\code{\link[=getgenesPA]{getgenesPA()}} to generate \code{gene_lists}; +\code{\link[=heatmap_PA]{heatmap_PA()}} for heatmap visualization; +\code{\link[=save_results]{save_results()}} to export the annotated results. +} diff --git a/man/barplot_GSEA.Rd b/man/barplot_GSEA.Rd deleted file mode 100644 index ba47ad0..0000000 --- a/man/barplot_GSEA.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/barplot_GSEA.R -\name{barplot_GSEA} -\alias{barplot_GSEA} -\title{Create and save a customized barplot for GSEA results} -\usage{ -barplot_GSEA(data, output_path, custom_labels, axis_y = "NES") -} -\arguments{ -\item{data}{A data frame containing GSEA results with columns such as \code{datatype}, \code{NES}, \code{-Log10FDR}, and \code{New_name}.} - -\item{output_path}{The file path where the barplot will be saved (SVG format).} - -\item{custom_labels}{A named vector of custom expressions for x-axis labels.} - -\item{axis_y}{Name of the column to use for the y-axis aesthetic, as a string. Default: "NES".} -} -\description{ -This function generates a customized barplot with: -\itemize{ -\item Grouped bars. -\item Adjusted aesthetics. -\item Personalized axis labels. -\item Optionally save the result in SVG format. -} -} diff --git a/man/camera_results.Rd b/man/camera_results.Rd new file mode 100644 index 0000000..c1635c2 --- /dev/null +++ b/man/camera_results.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataclust_PA.R +\docType{data} +\name{camera_results} +\alias{camera_results} +\title{Example CAMERA enrichment results for pathway analysis clustering} +\format{ +A data frame with 40 rows and 4 columns: +\describe{ +\item{GeneSet}{Character. Gene set name, matching the names in +\link{geneset_list}.} +\item{Direction}{Character. Enrichment direction: \code{"Up"} or \code{"Down"}.} +\item{PValue}{Numeric. Raw p-value from the simulated CAMERA test.} +\item{FDR}{Numeric. Benjamini-Hochberg adjusted p-value.} +} +} +\source{ +Simulated with \code{set.seed(1905)} in \code{data-raw/example_PA.R} for +OmicsKit examples. +} +\usage{ +camera_results +} +\description{ +A data frame simulating the output of a CAMERA differential expression +analysis, containing significance values for the 40 gene sets in +\link{geneset_list}. Approximately 60\% of gene sets have FDR < 0.05, providing +enough significant sets for meaningful clustering. Designed to be used +alongside \link{geneset_list} as input to \code{\link[=geneset_similarity]{geneset_similarity()}}. +} +\examples{ +data(camera_results) + +# Overview +head(camera_results) + +# How many gene sets are significant? +sum(camera_results$FDR < 0.05) + +# Use with geneset_similarity() +data(geneset_list) +jac <- geneset_similarity(geneset_list, camera_results, fdr_th = 0.05) + +} +\seealso{ +\code{\link[=geneset_similarity]{geneset_similarity()}}, \link{geneset_list} +} +\keyword{datasets} diff --git a/man/deseq2_results.Rd b/man/deseq2_results.Rd new file mode 100644 index 0000000..003b423 --- /dev/null +++ b/man/deseq2_results.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2_results.R +\docType{data} +\name{deseq2_results} +\alias{deseq2_results} +\title{DESeq2 differential expression results for TCGA-LUAD} +\format{ +A data frame with 21,330 rows and 7 columns: +\describe{ +\item{gene_id}{Character. Ensembl gene ID (e.g., \code{"ENSG00000141510"}).} +\item{baseMean}{Numeric. Mean of normalized counts across all samples.} +\item{log2FoldChange}{Numeric. Shrunken log2 fold change +(tumor vs. normal).} +\item{lfcSE}{Numeric. Standard error of the log2 fold change estimate.} +\item{stat}{Numeric. Wald test statistic.} +\item{pvalue}{Numeric. Raw p-value.} +\item{padj}{Numeric. Benjamini-Hochberg adjusted p-value (FDR).} +} +} +\source{ +TCGA-LUAD STAR counts downloaded from the GDC Data Portal +(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +DESeq2 analysis performed with default settings; results generated by +\code{data-raw/deseq2_results.R}. +} +\usage{ +deseq2_results +} +\description{ +DESeq2 results from a differential expression analysis +comparing primary lung adenocarcinoma tumors versus normal tissue using +TCGA-LUAD RNA-seq data. Contains 21330 genes to produce informative +visualizations with \code{\link[=nice_Volcano]{nice_Volcano()}}, and also suitable as input for +\code{\link[=detect_filter]{detect_filter()}}, and \code{\link[=add_annotations]{add_annotations()}} +and related plotting functions. +} +\examples{ +data(deseq2_results) + +# Overview +head(deseq2_results) + +# Significant genes +sum(deseq2_results$padj < 0.05, na.rm = TRUE) + +# Volcano plot +nice_Volcano( + results = deseq2_results, + x_var = "log2FoldChange", + y_var = "padj", + label_var = "gene_id", + title = "TCGA-LUAD: Tumor vs Normal" +) +\dontrun{ +# detect_filter (required: "ensembl" column in results) +deseq2_res <- deseq2_results +colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +rownames(deseq2_res) <- deseq2_res$ensembl + +# Get sample IDs per group from sampledata +samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] + +detected <- detect_filter( + norm.counts = as.data.frame(norm_counts), + df.BvsA = deseq2_res, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor, + cutoffs = c(50, 50, 0) +) + +# Number of detectable genes +length(detected$DetectGenes) + +# Subset results to detectable genes +head(detected$Comparison1) +} +} +\seealso{ +\code{\link[=nice_Volcano]{nice_Volcano()}}, \link{raw_counts}, \link{sampledata} +} +\keyword{datasets} diff --git a/man/detect_filter.Rd b/man/detect_filter.Rd index e7cce47..a83d36b 100644 --- a/man/detect_filter.Rd +++ b/man/detect_filter.Rd @@ -35,8 +35,54 @@ detect_filter( \item{samples.condition3}{Vector of Sample IDs or indexes corresponding to the third condition (optional).} } +\value{ +A named list. Always contains: +\itemize{ +\item \verb{$Comparison1}: Data frame of detectable genes from \code{df.BvsA}. +\item \verb{$DetectGenes}: Character vector of unique detectable gene IDs across +all comparisons. +} + +If \code{df.CvsA} is provided, also contains \verb{$Comparison2}. If \code{df.DvsA} is +provided, also contains \verb{$Comparison3}. +} \description{ This function identifies genes with measurable expression levels across samples. Detectable genes must meet two conditions: the baseMean and their mean normalized counts in the phenotypes of interest must be greater than a set threshold. It returns a list of detectable genes and the comparisons in which they can be found. } +\examples{ +\dontrun{ +data(norm_counts) +data(deseq2_results) +data(sampledata) + +# detect_filter requires an "ensembl" column in the results data frame +res <- deseq2_results +colnames(res)[colnames(res) == "gene_id"] <- "ensembl" +rownames(res) <- res$ensembl + +# Get sample IDs per group +samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] + +detected <- detect_filter( + norm.counts = as.data.frame(norm_counts), + df.BvsA = res, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor, + cutoffs = c(50, 50, 0) +) + +# Number of detectable genes +length(detected$DetectGenes) + +# Subset results +head(detected$Comparison1) +} + +} +\seealso{ +\code{\link[=nice_VSB]{nice_VSB()}} to plot expression of detected genes; +\link{norm_counts} for an example normalized counts matrix. +} diff --git a/man/do_clust.Rd b/man/do_clust.Rd new file mode 100644 index 0000000..fb97bb8 --- /dev/null +++ b/man/do_clust.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doclust_PA.R +\name{do_clust} +\alias{do_clust} +\title{Hierarchical clustering of gene sets with silhouette-based optimization} +\usage{ +do_clust(x, method = "ward.D2", max_k = NULL) +} +\arguments{ +\item{x}{A \code{JaccardResult} object (output of \code{\link[=geneset_similarity]{geneset_similarity()}}) or an +object of class \code{dist}.} + +\item{method}{Agglomeration method passed to \code{\link[stats:hclust]{stats::hclust()}}. +Default: \code{"ward.D2"}.} + +\item{max_k}{Maximum number of clusters to evaluate in silhouette analysis. +Default: \code{NULL}, which sets it automatically to \code{max(1, floor(n / 2))}.} +} +\value{ +A named list with five elements: +\itemize{ +\item \verb{$hclust}: The \code{\link[stats:hclust]{stats::hclust()}} object. +\item \verb{$cluster_assignments}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{NAME} and +\code{cluster}. +\item \verb{$optimal_k}: Integer. The optimal number of clusters. +\item \verb{$silhouette_plot}: A ggplot2 object of average silhouette width vs. k. +\item \verb{$heatmap}: A \code{ComplexHeatmap} object. Display with +\code{ComplexHeatmap::draw(result$heatmap)}. +} +} +\description{ +Performs hierarchical clustering on a Jaccard distance matrix, selects the +optimal number of clusters by maximizing average silhouette width, and +returns cluster assignments, a silhouette ggplot2 object, and a +ComplexHeatmap with dendrogram. +} +\examples{ +\dontrun{ +# Requires ComplexHeatmap and cluster packages +geneset_list <- list( + KEGG_APOPTOSIS = c("TP53", "BCL2", "CASP3", "BAX"), + KEGG_CELL_CYCLE = c("CDK2", "CCND1", "TP53", "RB1"), + HALLMARK_HYPOXIA = c("HIF1A", "VEGFA", "LDHA", "BNIP3"), + HALLMARK_GLYCOLYSIS = c("LDHA", "ENO1", "PKM", "HIF1A"), + KEGG_P53_PATHWAY = c("TP53", "MDM2", "CDKN1A", "BAX") +) + +results <- data.frame( + GeneSet = names(geneset_list), + FDR = c(0.01, 0.02, 0.03, 0.04, 0.01) +) + +jac <- geneset_similarity(geneset_list, results) +clust <- do_clust(jac) + +clust$silhouette_plot # ggplot2 silhouette curve +ComplexHeatmap::draw(clust$heatmap) # Jaccard heatmap with dendrogram +clust$optimal_k # selected number of clusters +clust$cluster_assignments # tibble: NAME | cluster +} + +} +\seealso{ +\code{\link[=geneset_similarity]{geneset_similarity()}}, \code{\link[=get_network_communities]{get_network_communities()}}, +\code{\link[=network_clust]{network_clust()}}, \code{\link[=network_clust_gg]{network_clust_gg()}} +} diff --git a/man/geneset_list.Rd b/man/geneset_list.Rd new file mode 100644 index 0000000..00ae84d --- /dev/null +++ b/man/geneset_list.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataclust_PA.R +\docType{data} +\name{geneset_list} +\alias{geneset_list} +\title{Example gene set list for pathway analysis clustering} +\format{ +A named list of 40 elements. Each element is a character vector of +human gene symbols (HGNC) belonging to that gene set. Gene set sizes range +from 11 to 20 genes. +} +\source{ +Curated manually for OmicsKit examples, based on KEGG, MSigDB +Hallmark, and Gene Ontology gene set collections. +} +\usage{ +geneset_list +} +\description{ +A named list of 40 curated gene sets spanning four biological themes: +apoptosis & cell death, cell cycle & DNA damage, immune response & +inflammation, and metabolism. Gene set names follow standard database +conventions (\code{KEGG_}, \code{HALLMARK_}, \code{GO_}) and gene symbols are real human +genes. Designed to be used as input to \code{\link[=geneset_similarity]{geneset_similarity()}}. +} +\examples{ +data(geneset_list) + +# How many gene sets? +length(geneset_list) + +# Inspect one gene set +geneset_list[["KEGG_APOPTOSIS"]] + +# Use with geneset_similarity() +data(camera_results) +jac <- geneset_similarity(geneset_list, camera_results, fdr_th = 0.05) + +} +\seealso{ +\code{\link[=geneset_similarity]{geneset_similarity()}}, \link{camera_results} +} +\keyword{datasets} diff --git a/man/geneset_similarity.Rd b/man/geneset_similarity.Rd new file mode 100644 index 0000000..573aabd --- /dev/null +++ b/man/geneset_similarity.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doclust_PA.R +\name{geneset_similarity} +\alias{geneset_similarity} +\title{Compute Jaccard similarity and distance matrices for gene sets} +\usage{ +geneset_similarity(geneset_list, results, fdr_th = 0.05) +} +\arguments{ +\item{geneset_list}{A named list where each element is a character vector of +gene symbols belonging to that gene set. Typically the output of +\code{\link[=list_gmts]{list_gmts()}}.} + +\item{results}{A data frame with at least two columns: \code{GeneSet} (gene set +names) and \code{FDR} (adjusted p-values).} + +\item{fdr_th}{Numeric. FDR cutoff to retain significant gene sets. +Default: \code{0.05}.} +} +\value{ +An object of class \code{JaccardResult}, a named list with three slots: +\itemize{ +\item \verb{$jaccard_sim}: Numeric matrix of pairwise Jaccard similarities. +\item \verb{$dist_mat}: A \code{dist} object of 1 - Jaccard similarity, suitable for +clustering or UMAP. +\item \verb{$geneset_list}: Named list of gene sets retained after FDR filtering. +} +} +\description{ +Filters a named list of gene sets by a significance threshold and computes +pairwise Jaccard similarity and distance matrices for the retained sets. +The output object can be passed directly to \code{\link[=do_clust]{do_clust()}}, +\code{\link[=get_network_communities]{get_network_communities()}}, \code{\link[=network_clust]{network_clust()}}, or \code{\link[=network_clust_gg]{network_clust_gg()}}, +or its individual slots can be used independently (e.g., \verb{$dist_mat} for +UMAP, \verb{$jaccard_sim} for custom visualizations). +} +\examples{ +geneset_list <- list( + KEGG_APOPTOSIS = c("TP53", "BCL2", "CASP3", "BAX"), + KEGG_CELL_CYCLE = c("CDK2", "CCND1", "TP53", "RB1"), + HALLMARK_HYPOXIA = c("HIF1A", "VEGFA", "LDHA", "BNIP3"), + HALLMARK_GLYCOLYSIS = c("LDHA", "ENO1", "PKM", "HIF1A") +) + +results <- data.frame( + GeneSet = names(geneset_list), + FDR = c(0.01, 0.03, 0.04, 0.20) +) + +# Only the first three gene sets pass the FDR threshold +jac <- geneset_similarity(geneset_list, results, fdr_th = 0.05) + +jac$jaccard_sim # similarity matrix +jac$dist_mat # distance object (usable in UMAP, clustering, etc.) +jac$geneset_list # filtered gene sets + +} +\seealso{ +\code{\link[=list_gmts]{list_gmts()}}, \code{\link[=do_clust]{do_clust()}}, \code{\link[=get_network_communities]{get_network_communities()}}, +\code{\link[=network_clust]{network_clust()}}, \code{\link[=network_clust_gg]{network_clust_gg()}} +} diff --git a/man/get_annotations.Rd b/man/get_annotations.Rd index 9909682..f741973 100644 --- a/man/get_annotations.Rd +++ b/man/get_annotations.Rd @@ -23,6 +23,13 @@ get_annotations( \item{format}{The output is saved in .csv or .xlsx formats. Default = csv.} } +\value{ +A data frame with one row per input ID and the following columns: +\code{geneID}, \code{symbol}, \code{biotype}, \code{chromosome}, \code{gene_start}, \code{gene_end}, +\code{gene_length}, \code{description}. For \code{mode = "transcripts"}, an additional +\code{transcriptID} column is included. The data frame is also saved to disk +as a \code{.csv} or \code{.xlsx} file (see \code{filename} and \code{format}). +} \description{ This function annotates a column of transcripts or gene IDs (ENSEMBL) with information of the Biomart. If transcript IDs are provided, they are also annotated with information of the genes to which they belong. @@ -34,3 +41,36 @@ The Gene information added include: \item Gene start, end and length } } +\note{ +Requires an active internet connection to query the Ensembl BioMart. +\code{gene_length} is computed as \code{gene_end - gene_start + 1} (genomic length). +For TPM calculation with \code{\link[=tpm]{tpm()}}, this is an approximation, +use transcript-level lengths for higher accuracy. +} +\examples{ +\dontrun{ +# Annotate genes from Normalized counts (requires internet connection) +data(norm_counts) + +# Requires a reference table with a "geneID" column. +# Use get_annotations() to generate it: +annotations <- get_annotations( + ensembl_ids = rownames(norm_counts), + mode = "genes" +) + +head(annotations) + +# Use with add_annotations() +norm_counts_annot <- add_annotations( + object = norm_counts, + reference = annotations, + variables = c("symbol", "biotype") +) +} + +} +\seealso{ +\code{\link[=add_annotations]{add_annotations()}} to join annotations to a counts matrix; +\code{\link[=tpm]{tpm()}} which requires gene lengths from this function. +} diff --git a/man/get_network_communities.Rd b/man/get_network_communities.Rd new file mode 100644 index 0000000..effc3eb --- /dev/null +++ b/man/get_network_communities.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doclust_PA.R +\name{get_network_communities} +\alias{get_network_communities} +\title{Detect gene set communities and generate super-term labels} +\usage{ +get_network_communities( + x, + threshold = 0.3, + method = "louvain", + superterms = TRUE, + n_terms = 3, + remove_prefix = TRUE, + seed = 174 +) +} +\arguments{ +\item{x}{A \code{JaccardResult} object (output of \code{\link[=geneset_similarity]{geneset_similarity()}}).} + +\item{threshold}{Numeric between 0 and 1. Gene set pairs with a Jaccard +similarity above this value are connected in the network. Default: \code{0.3}.} + +\item{method}{Character. Community detection algorithm to use. One of: +\itemize{ +\item \code{"louvain"} — \code{\link[igraph:cluster_louvain]{igraph::cluster_louvain()}}: fast, recommended for most +use cases. Default. +\item \code{"fast_greedy"} — \code{\link[igraph:cluster_fast_greedy]{igraph::cluster_fast_greedy()}}: optimizes modularity +greedily, works well on mid-size networks. +\item \code{"walktrap"} — \code{\link[igraph:cluster_walktrap]{igraph::cluster_walktrap()}}: random-walk approach, +tends to find smaller, tighter communities. +}} + +\item{superterms}{Logical. If \code{TRUE}, calls \code{\link[=get_superterm]{get_superterm()}} and includes +its output in \verb{$superterms}. Default: \code{TRUE}.} + +\item{n_terms}{Integer. Number of top TF-IDF terms per super-term label. +Passed to \code{\link[=get_superterm]{get_superterm()}}. Default: \code{3}.} + +\item{remove_prefix}{Logical. Remove database prefix before the first +underscore (e.g., \code{"KEGG_"}). Passed to \code{\link[=get_superterm]{get_superterm()}}. Default: \code{TRUE}.} + +\item{seed}{Integer. Random seed for reproducible community detection. +Default: \code{174}.} +} +\value{ +A named list with four elements: +\itemize{ +\item \verb{$communities}: The igraph communities object. +\item \verb{$membership}: Named integer vector of community IDs, one per gene set. +\item \verb{$adjacency_matrix}: Binary matrix (\code{1} if Jaccard > \code{threshold}). +\item \verb{$superterms}: Output of \code{\link[=get_superterm]{get_superterm()}} with \verb{$mapping} and +\verb{$summary}. \code{NULL} if \code{superterms = FALSE}. +} +} +\description{ +Convenience wrapper that builds a binary adjacency network from a Jaccard +similarity matrix, runs a community-detection algorithm, and optionally +generates super-term labels for each community via \code{\link[=get_superterm]{get_superterm()}}. +Designed to be the single step between \code{\link[=geneset_similarity]{geneset_similarity()}} and the network +plotting functions \code{\link[=network_clust]{network_clust()}} / \code{\link[=network_clust_gg]{network_clust_gg()}}. +} +\examples{ +\dontrun{ +gsl <- list_gmts("path/to/gmt_folder/") +res <- read.csv("path/to/results.csv") + +# Full workflow +jac <- geneset_similarity(gsl, res, fdr_th = 0.05) +clust <- do_clust(jac) +net <- get_network_communities(jac, threshold = 0.3, method = "louvain") + +net$membership # community ID per gene set +net$superterms$mapping # gene set -> superterm +net$superterms$summary # community sizes and labels + +# Pass results to network plots +plots <- network_clust_gg( + jac, + clust_result = clust, + superterms = TRUE, + superterm_data = net$superterms +) +plots$combined +} + +} +\seealso{ +\code{\link[=geneset_similarity]{geneset_similarity()}}, \code{\link[=do_clust]{do_clust()}}, \code{\link[=get_superterm]{get_superterm()}}, +\code{\link[=network_clust]{network_clust()}}, \code{\link[=network_clust_gg]{network_clust_gg()}} +} diff --git a/man/get_stars.Rd b/man/get_stars.Rd index a116fbb..863dc8b 100644 --- a/man/get_stars.Rd +++ b/man/get_stars.Rd @@ -13,6 +13,43 @@ get_stars(geneID, object, thresholds = c(0.001, 0.01, 0.1, 0.25)) \item{thresholds}{Vector with 4 values of significance. Default c(0.001, 0.01, 0.1, 0.25).} } +\value{ +A single character string: \code{"****"}, \code{"***"}, \code{"**"}, \code{"*"}, +\code{"ns"} (not significant), or \code{"Gene ID not found"} if the gene is absent +from \code{object}. +} \description{ This function will create asteriscs (*) from DESeq2 results objects to represent the significance of comparisons. } +\examples{ +data(deseq2_results) + +# get_stars expects a column named "ensembl" +res <- deseq2_results +colnames(res)[colnames(res) == "gene_id"] <- "ensembl" + +# Get significance stars for the most significant gene +get_stars( + geneID = res$ensembl[1], + object = res +) + +# Custom thresholds +get_stars( + geneID = res$ensembl[1], + object = res, + thresholds = c(0.001, 0.01, 0.05, 0.10) +) + +# Non-significant gene +get_stars( + geneID = res$ensembl[nrow(res)], + object = res +) + +} +\seealso{ +\code{\link[=detect_filter]{detect_filter()}} to identify detectable genes before annotating; +\code{\link[=nice_VSB]{nice_VSB()}} where significance stars can be added to plots; +\link{deseq2_results} for an example input. +} diff --git a/man/get_superterm.Rd b/man/get_superterm.Rd new file mode 100644 index 0000000..3df8568 --- /dev/null +++ b/man/get_superterm.Rd @@ -0,0 +1,98 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doclust_PA.R +\name{get_superterm} +\alias{get_superterm} +\title{Generate representative super-term labels for gene set communities} +\usage{ +get_superterm( + geneset_names, + community_membership, + n_terms = 3, + remove_prefix = TRUE +) +} +\arguments{ +\item{geneset_names}{Character vector of gene set names (nodes in the +network).} + +\item{community_membership}{A named numeric or integer vector mapping each +gene set to its community ID. Typically the output of +\code{\link[igraph:communities]{igraph::membership()}} applied to a community detection result (e.g., +\code{\link[igraph:cluster_louvain]{igraph::cluster_louvain()}}). Must have the same length as +\code{geneset_names}. See \code{\link[=get_network_communities]{get_network_communities()}} for a simpler workflow.} + +\item{n_terms}{Integer. Number of top TF-IDF terms to include in each label. +Default: \code{3}.} + +\item{remove_prefix}{Logical. If \code{TRUE}, removes the text before the first +underscore in gene set names (e.g., strips the \code{"KEGG_"} prefix from +\code{"KEGG_GLYCOLYSIS"}). Default: \code{TRUE}.} +} +\value{ +A named list with two elements: +\itemize{ +\item \verb{$mapping}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{geneset}, \code{community}, +and \code{superterm} — one row per gene set, sorted by community. +\item \verb{$summary}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{community}, \code{superterm}, +and \code{n_genesets} — one row per community, sorted by decreasing size. +} +} +\description{ +For each community in a gene set network, applies \strong{TF-IDF} (Term +Frequency-Inverse Document Frequency) weighting to the words present in gene +set names to produce a short, representative label called a \emph{super-term}. +} +\details{ +\strong{How TF-IDF works here:} each gene set name is treated as a document and +each word as a term. TF-IDF upweights words that are frequent within a +community but rare across all communities, making the resulting label +specific to that cluster rather than generic. A frequency-based fallback is +used when TF-IDF returns no terms (e.g., very small communities). + +Common pathway words (\code{"pathway"}, \code{"signaling"}, \code{"regulation"}, etc.) and +standard English stopwords are removed before scoring. + +\strong{Note:} this function is most easily used through +\code{\link[=get_network_communities]{get_network_communities()}}, which handles community detection and calls +\code{get_superterm()} internally. If you prefer to call it directly, you need a +community membership vector: + +\if{html}{\out{
}}\preformatted{adj <- (jac$jaccard_sim > 0.3) * 1 +g <- igraph::graph_from_adjacency_matrix(adj, mode = "undirected") +comm <- igraph::cluster_louvain(g) +membership <- igraph::membership(comm) + +st <- get_superterm( + geneset_names = names(membership), + community_membership = membership +) +}\if{html}{\out{
}} +} +\examples{ +\dontrun{ +# Recommended: use get_network_communities() which calls this internally +net <- get_network_communities(jac, threshold = 0.3) +net$superterms$mapping +net$superterms$summary + +# Direct usage with a pre-computed membership vector +adj <- (jac$jaccard_sim > 0.3) * 1 +g <- igraph::graph_from_adjacency_matrix(adj, mode = "undirected") +comm <- igraph::cluster_louvain(g) +membership <- igraph::membership(comm) + +st <- get_superterm( + geneset_names = names(membership), + community_membership = membership, + n_terms = 3, + remove_prefix = TRUE +) + +st$mapping # per-gene-set labels +st$summary # per-community summary +} + +} +\seealso{ +\code{\link[=get_network_communities]{get_network_communities()}}, \code{\link[=network_clust]{network_clust()}}, \code{\link[=network_clust_gg]{network_clust_gg()}} +} diff --git a/man/getgenesPA.Rd b/man/getgenesPA.Rd new file mode 100644 index 0000000..49c0f93 --- /dev/null +++ b/man/getgenesPA.Rd @@ -0,0 +1,137 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_genes_PA.R +\name{getgenesPA} +\alias{getgenesPA} +\title{Extract gene members from pathway analysis results} +\usage{ +getgenesPA(pa_data, geneset_list, ranked_genes, genes = c("all", "le")) +} +\arguments{ +\item{pa_data}{A data frame of pathway analysis results. Must always +contain: +\itemize{ +\item \code{NAME}: gene set name. +} + +Additionally required depending on \code{genes}: +\itemize{ +\item \code{SIZE}: number of genes in the gene set. Required for \code{"le"} and +\code{"top"}. +\item \code{tags}: numeric fraction (0-1) of genes contributing to the +enrichment score (GSEA leading edge). Produced automatically by +\code{\link[=merge_PA]{merge_PA()}}. Required for \code{genes = "le"}. +\item \code{top}: numeric fraction (0-1) defining the proportion of top-ranked +genes to extract. Set manually by the user (e.g., +\code{pa_data$top <- 0.25} for the top 25\%). Required for \code{genes = "top"}. +} + +Typically the output of \code{\link[=merge_PA]{merge_PA()}}.} + +\item{geneset_list}{A named list of gene sets, where each element is a +character vector of gene symbols. Typically the output of \code{\link[=list_gmts]{list_gmts()}}, +or use the built-in \link{geneset_list} for quick testing.} + +\item{ranked_genes}{A character vector of gene symbols ordered by their +ranking metric (e.g., DESeq2 \code{stat}, log2FC, or signal-to-noise ratio), +from most positive to most negative. Non-significant genes fall in the +middle of the list. Used to order genes within each extracted set.} + +\item{genes}{Character vector specifying which extraction mode(s) to use. +Any combination of \code{"all"}, \code{"le"}, and \code{"top"}. Default: +\code{c("all", "le")}.} +} +\value{ +Depends on \code{genes}: +\itemize{ +\item Single mode (e.g., \code{genes = "le"}): a named list where each element +is a character vector of gene symbols. The list has an attribute +\code{genes_type} used by \code{\link[=addgenesPA]{addgenesPA()}} to name the output column. +\item Multiple modes (e.g., \code{genes = c("all", "le", "top")}): a named list +with one element per requested mode: +\itemize{ +\item \verb{$all}: named list of all gene set members. +\item \verb{$le}: named list of leading edge genes (GSEA only). +\item \verb{$top}: named list of top-ranked genes. +} +} +} +\description{ +For each gene set in a pathway analysis results table, retrieves leading +edge genes, a user-defined top fraction of genes, all genes in the gene +set, or any combination. All gene lists are ordered by their rank in the +provided ranked gene list. +} +\details{ +\strong{Three extraction modes:} +\itemize{ +\item \code{"le"}: \strong{GSEA output only.} Leading edge genes: the subset of genes +that drives the enrichment signal. Size is computed as +\code{round(SIZE * tags)}, where \code{tags} is the fraction of gene hits before +(positive ES) or after (negative ES) the peak in the running enrichment +score. Definition from the GSEA User Guide: \emph{"The percentage of gene hits +before (for positive ES) or after (for negative ES) the peak in the +running enrichment score. This gives an indication of the percentage of +genes contributing to the enrichment score."} +(\url{https://docs.gsea-msigdb.org/#GSEA/GSEA_User_Guide/}). +Requires columns \code{SIZE} and \code{tags} in \code{pa_data}, produced automatically +by \code{\link[=merge_PA]{merge_PA()}}. +\item \code{"top"}: \strong{Any enrichment result (GSEA, CAMERA, PADOG, etc.).} +A user-defined top fraction of genes ordered by rank. Size is computed as +\code{round(SIZE * top)}, where \code{top} is a numeric value between 0 and 1 +provided in a \code{top} column of \code{pa_data}. This does \strong{not} represent true +leading edge genes: it is a flexible, rank-based selection suitable for +exploratory visualization with any pathway analysis method. +Requires columns \code{SIZE} and \code{top} in \code{pa_data}. +\item \code{"all"}: All genes in the gene set, ordered by rank. +} +} +\examples{ +\dontrun{ +data(gsea_results) +data(geneset_list) +data(deseq2_results) + +#or +gsl <- list_gmts("path/to/gmt_folder/") + +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] + +# ── GSEA results: all three modes available +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le", "top")) + +# But first add the top column (e.g. top 30\% of genes by rank) +pa_single$top <- 0.30 +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le", "top")) + +gene_lists$le[["KEGG_APOPTOSIS"]] # leading edge genes +gene_lists$top[["KEGG_APOPTOSIS"]] # top 30\% by rank +gene_lists$all[["KEGG_APOPTOSIS"]] # all genes + +pa_annot <- addgenesPA(pa_single, gene_lists) +head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) + +# ── CAMERA results: use "top" (no leading edge available) ─── +data(camera_results) +camera_pa <- camera_results +colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +camera_pa$SIZE <- sapply(camera_pa$NAME, + function(x) length(geneset_list[[x]])) +camera_pa$top <- 0.25 # top 25\% by rank + +gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, + genes = c("all", "top")) +pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +} + +} +\seealso{ +\code{\link[=addgenesPA]{addgenesPA()}} to append gene columns to pa_data; +\code{\link[=heatmap_PA]{heatmap_PA()}} for heatmap visualization; +\code{\link[=list_gmts]{list_gmts()}} to generate \code{geneset_list}; +\code{\link[=merge_PA]{merge_PA()}} to generate \code{pa_data} with the required \code{tags} column. +} diff --git a/man/gsea_results.Rd b/man/gsea_results.Rd new file mode 100644 index 0000000..92ecede --- /dev/null +++ b/man/gsea_results.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gsea_results.R +\docType{data} +\name{gsea_results} +\alias{gsea_results} +\title{Simulated GSEA pathway analysis results for TCGA-LUAD} +\format{ +A data frame with 120 rows (40 gene sets x 3 comparisons) and 15 +columns: +\describe{ +\item{NAME}{Character. Gene set name, matching the names in +\link{geneset_list}.} +\item{SIZE}{Integer. Number of genes in the gene set.} +\item{ES}{Numeric. Enrichment score.} +\item{NES}{Numeric. Normalized enrichment score.} +\item{NOM p-val}{Numeric. Nominal p-value.} +\item{FDR}{Numeric. False discovery rate. Approximately 60\% of gene +sets per comparison have FDR < 0.05.} +\item{FWER p-val}{Numeric. Family-wise error rate.} +\item{RANK AT MAX}{Integer. Gene rank at maximum enrichment score.} +\item{Log10FDR}{Numeric. \code{-log10(FDR)}.} +\item{tags}{Numeric. Fraction of gene set in the leading edge (0-1).} +\item{list}{Numeric. Fraction of the ranked list used (0-1).} +\item{signal}{Numeric. Enrichment signal strength (0-1).} +\item{LEADING EDGE}{Character. Leading edge string in GSEA format +(e.g., \code{"tags=20\%, list=35\%, signal=15\%"}).} +\item{COLLECTION}{Character. MSigDB collection name: \code{"HALLMARK"}, +\code{"KEGG"}, or \code{"GO"}.} +\item{COMPARISON}{Character. Comparison name: \code{"TumorVsNormal"}, +\code{"MetastasisVsNormal"}, or \code{"MetastasisVsTumor"}.} +} +} +\source{ +Simulated with \code{set.seed(174)} in \code{data-raw/gsea_results.R}. +Gene set names and memberships derived from \link{geneset_list}. NES values +and significance are simulated to reflect realistic GSEA output patterns. +} +\usage{ +gsea_results +} +\description{ +A simulated data frame representing the output of \code{\link[=merge_PA]{merge_PA()}} for three +pairwise comparisons of TCGA-LUAD samples across 40 gene sets from three +MSigDB collections (HALLMARK, KEGG, GO). Gene sets and gene memberships are +derived from \link{geneset_list}. NES values and FDR are simulated with +\code{set.seed(174)} to produce realistic enrichment patterns, where ~60\% of +gene sets per comparison are significant (FDR < 0.05). +} +\details{ +This dataset is designed to demonstrate \code{\link[=splot_PA]{splot_PA()}}, \code{\link[=multiplot_PA]{multiplot_PA()}}, +\code{\link[=getgenesPA]{getgenesPA()}}, \code{\link[=addgenesPA]{addgenesPA()}}, and \code{\link[=heatmap_PA]{heatmap_PA()}} without requiring +external GSEA output files. +} +\examples{ +data(gsea_results) + +# Overview +dim(gsea_results) +table(gsea_results$COMPARISON) +table(gsea_results$COLLECTION) + +# How many gene sets are significant per comparison? +tapply(gsea_results$FDR < 0.05, gsea_results$COMPARISON, sum) + +# Single comparison plot +single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +\dontrun{ +splot_PA( + data = single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR" +) +} + +# Multi-comparison plot +\dontrun{ +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + comparison_order = c("TumorVsNormal", "MetastasisVsNormal", + "MetastasisVsTumor") +) +} + +} +\seealso{ +\code{\link[=merge_PA]{merge_PA()}} which produces this format from real GSEA output; +\code{\link[=splot_PA]{splot_PA()}}, \code{\link[=multiplot_PA]{multiplot_PA()}} for visualization; +\code{\link[=getgenesPA]{getgenesPA()}}, \code{\link[=addgenesPA]{addgenesPA()}} for gene-level annotation; +\link{geneset_list} for the gene set memberships used here. +} +\keyword{datasets} diff --git a/man/heatmap_GSEA.Rd b/man/heatmap_GSEA.Rd deleted file mode 100644 index 788d4b0..0000000 --- a/man/heatmap_GSEA.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/heatmap_GSEA.R -\name{heatmap_GSEA} -\alias{heatmap_GSEA} -\title{Plot leading edge heatmaps from GSEA results.} -\usage{ -heatmap_GSEA( - main_dir = NULL, - expression_file, - metadata_file, - gmt_file, - ranked_genes_file, - gsea_file, - output_dir = "leading_edge_heatmaps", - sample_col = "Sample", - group_col = "group", - save_dataframe = FALSE -) -} -\arguments{ -\item{main_dir}{Optional base directory. If supplied, it will be prepended to all relative file paths.} - -\item{expression_file}{Path to the expression data file (tab-delimited) or relative to main_dir.} - -\item{metadata_file}{Path to the metadata file (Excel) or relative to main_dir.} - -\item{gmt_file}{Path to the GMT file defining gene sets or relative to main_dir.} - -\item{ranked_genes_file}{Path to the ranked genes list file or relative to main_dir.} - -\item{gsea_file}{Path to the GSEA results file with leading edge genes or relative to main_dir.} - -\item{output_dir}{Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps".} - -\item{sample_col}{Name of the sample ID column in metadata; default "Sample".} - -\item{group_col}{Name of the group column in metadata; default "group".} - -\item{save_dataframe}{Logical; if TRUE, saves the merged data frame as TSV before plotting.} -} -\value{ -Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. -} -\description{ -Generates heatmaps of leading edge genes for each gene set from GSEA output. -} diff --git a/man/heatmap_PA.Rd b/man/heatmap_PA.Rd new file mode 100644 index 0000000..ebfd113 --- /dev/null +++ b/man/heatmap_PA.Rd @@ -0,0 +1,165 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_PA.R +\name{heatmap_PA} +\alias{heatmap_PA} +\title{Plot leading edge heatmaps from pathway analysis results} +\usage{ +heatmap_PA( + expression_data, + metadata, + pa_data_annot, + ranked_genes, + plot_genes = c("all_genes", "le_genes"), + sample_col = "Sample", + group_col = "group", + out_dir = "heatmaps_PA", + pdf = TRUE, + jpg = TRUE +) +} +\arguments{ +\item{expression_data}{A numeric matrix or data frame of expression values +with gene symbols or Ensembl IDs as row names and sample IDs as column +names. Recommended input: VST-transformed counts from \link{vst_counts} or +normalized coutns \link{norm_counts}.} + +\item{metadata}{A data frame of sample annotations. Must contain a column +matching \code{sample_col} (sample IDs) and a column matching \code{group_col} +(condition labels, e.g., \code{"Control"}, \code{"Treatment"}).} + +\item{pa_data_annot}{A data frame of pathway analysis results enriched with +gene columns. Must contain the column \code{NAME} and at least one of +\code{all_genes}, \code{le_genes}, or \code{top_genes} (comma-separated gene symbols per +gene set). Typically the output of \code{\link[=addgenesPA]{addgenesPA()}}.} + +\item{ranked_genes}{A character vector of gene symbols ordered by their +ranking metric (e.g., stat, log2FC or signal-to-noise ratio), used to sort +genes within each heatmap row.} + +\item{plot_genes}{Character vector specifying which gene columns to plot. +One or both of \code{"all_genes"} and \code{"le_genes"}, and \code{"top_genes"}. Each +selection produces its own set of output files in a dedicated subfolder. +Default: \code{c("all_genes", "le_genes")}.} + +\item{sample_col}{Name of the sample ID column in \code{metadata}. +Default: \code{"Sample"}.} + +\item{group_col}{Name of the condition/group column in \code{metadata} +(e.g., \code{"Control"} vs \code{"Treatment"}). Used for heatmap column +annotations. Default: \code{"group"}.} + +\item{out_dir}{Character. Path to the output directory. Subdirectories are +created automatically based on \code{pdf}, \code{jpg}, and \code{plot_genes}: +\itemize{ +\item \verb{/pdf/all_genes/} +\item \verb{/pdf/le_genes/} +\item \verb{/pdf/top_genes/} +\item \verb{/jpg/all_genes/} +\item \verb{/jpg/le_genes/} +\item \verb{/jpg/top_genes/} +Default: \code{"heatmaps_PA"}. +}} + +\item{pdf}{Logical. If \code{TRUE}, saves PDF heatmaps. Default: \code{TRUE}.} + +\item{jpg}{Logical. If \code{TRUE}, saves JPG heatmaps. Default: \code{TRUE}.} +} +\value{ +Invisibly returns \code{TRUE} upon completion. Saves heatmap files to +the corresponding subdirectories under \code{out_dir}. +} +\description{ +Generates heatmaps of gene expression for each gene set in \code{pa_data_annot}, +using the \code{all_genes}, \code{le_genes} (GSEA output only), and/or \code{top_genes} +columns produced by \code{\link[=addgenesPA]{addgenesPA()}}. Genes within each heatmap are ordered +by their position in \code{ranked_genes}. +} +\details{ +The recommended workflow before calling this function is: + +\if{html}{\out{
}}\preformatted{gsl <- list_gmts("path/to/gmt/") +pa_data <- merge_PA("path/to/pa_data/") +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +gene_lists <- getgenesPA(pa_data, gsl, ranked, genes = c("all", "le")) +pa_annot <- addgenesPA(pa_data, gene_lists) + +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked, + plot_genes = c("all_genes", "le_genes") +) +}\if{html}{\out{
}} +} +\examples{ +\dontrun{ +data(vst_counts) +data(sampledata) +data(deseq2_results) +data(gsea_results) +data(geneset_list) + +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] + +# ── Example 1: GSEA results (all_genes + le_genes) ──── +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le")) +pa_annot <- addgenesPA(pa_single, gene_lists) + +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked, + plot_genes = c("all_genes", "le_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_gsea", + pdf = TRUE, + jpg = TRUE +) +# Creates: +# heatmaps_gsea/pdf/all_genes/_heatmap.pdf +# heatmaps_gsea/pdf/le_genes/_heatmap.pdf +# heatmaps_gsea/jpg/all_genes/_heatmap.jpg +# heatmaps_gsea/jpg/le_genes/_heatmap.jpg + +# ── Example 2: CAMERA results (all_genes + top_genes) +# camera_results does not contain leading edge information. +# Use genes = "top" with a manually set top fraction instead. +# Note: top_genes are rank-based and do NOT represent true leading edge genes. +data(camera_results) +camera_pa <- camera_results +colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +camera_pa$SIZE <- sapply(camera_pa$NAME, + function(x) length(geneset_list[[x]])) +camera_pa$top <- 0.25 + +gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, + genes = c("all", "top")) +pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) + +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot_cam, + ranked_genes = ranked, + plot_genes = c("all_genes", "top_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_camera" +) +} + +} +\seealso{ +\code{\link[=getgenesPA]{getgenesPA()}} for gene extraction; +\code{\link[=addgenesPA]{addgenesPA()}} to generate \code{pa_data_annot}; +\code{\link[=list_gmts]{list_gmts()}} to generate the geneset list; +\code{\link[=merge_PA]{merge_PA()}} to generate \code{pa_data}; +\link{vst_counts} for an example expression matrix. +} diff --git a/man/heatmap_path_PA.Rd b/man/heatmap_path_PA.Rd new file mode 100644 index 0000000..5d72ced --- /dev/null +++ b/man/heatmap_path_PA.Rd @@ -0,0 +1,126 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_PA.R +\name{heatmap_path_PA} +\alias{heatmap_path_PA} +\title{Plot leading edge heatmaps from GSEA analysis results using file paths} +\usage{ +heatmap_path_PA( + main_dir = NULL, + expression_file, + metadata_file, + gmt_file, + ranked_genes_file, + gsea_file, + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = FALSE +) +} +\arguments{ +\item{main_dir}{Character or \code{NULL}. Optional base directory prepended to +all relative file paths. If \code{NULL} (default), all paths are used as-is.} + +\item{expression_file}{Character. Path to a tab-delimited expression data +file. Rows are genes (first column or a column named \code{NAME} used as row +names), columns are sample IDs. Recommended input: VST-transformed counts.} + +\item{metadata_file}{Character. Path to an Excel (\code{.xlsx}) metadata file. +Must contain a column matching \code{sample_col} (sample IDs) and a column +matching \code{group_col} (condition labels, e.g., \code{"Control"}, +\code{"Treatment"}).} + +\item{gmt_file}{Character. Path to a \code{.gmt} file defining gene sets. Each +row contains: gene set name (column 1), description (column 2, ignored), +and gene symbols (columns 3+).} + +\item{ranked_genes_file}{Character. Path to a tab-delimited file where the +first column contains gene symbols ordered by their ranking metric (e.g., +log2FC or signal-to-noise ratio), from most positive to most negative. +Used to order leading edge genes within each heatmap.} + +\item{gsea_file}{Character. Path to a GSEA results \code{.tsv} file containing +at least the columns \code{NAME}, \code{SIZE}, and \code{tags} (from the \verb{LEADING EDGE} +column parsed by \code{\link[=merge_PA]{merge_PA()}}).} + +\item{output_dir}{Character. Directory where heatmap files are saved. +Created automatically if it does not exist. Default: +\code{"leading_edge_heatmaps"}.} + +\item{sample_col}{Name of the sample ID column in the metadata file. +Default: \code{"Sample"}.} + +\item{group_col}{Name of the condition/group column in the metadata file +(e.g., \code{"Control"} vs \code{"Treatment"}). Used for heatmap column +annotations. Default: \code{"group"}.} + +\item{save_dataframe}{Logical. If \code{TRUE}, saves the intermediate data frame +(gene sets with computed leading edge genes) as a \code{.tsv} file in +\code{output_dir} before plotting. Useful for inspection or reuse. +Default: \code{FALSE}.} +} +\value{ +Invisibly returns \code{TRUE} upon completion. Saves two files per gene +set in \code{output_dir}: +\itemize{ +\item \verb{_heatmap.pdf} +\item \verb{_heatmap.jpg} +} + +If \code{save_dataframe = TRUE}, also saves +\verb{/leading_edge_genes_df.tsv}. +} +\description{ +Generates one heatmap per gene set from GSEA/CAMERA/PADOG output by reading +all required inputs from file paths. For each gene set, the leading edge +genes are extracted, ordered by their rank in the ranked gene list, and +plotted as a scaled row heatmap against the expression matrix. +} +\details{ +This function is the file-path-based alternative to \code{\link[=heatmap_PA]{heatmap_PA()}}, which +accepts R objects directly. Use this version when working from raw output +files on disk (e.g., directly after running \code{GSEA_merge.sh}). +} +\note{ +For a more flexible workflow that accepts R objects directly (avoiding +repeated file reads), use \code{\link[=heatmap_PA]{heatmap_PA()}} instead, which takes +\code{expression_data}, \code{metadata}, and \code{pa_data_annot} as R objects and +integrates with \code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}}. +} +\examples{ +\dontrun{ +# Run with all files in a single base directory +heatmap_path_PA( + main_dir = "path/to/analysis/", + expression_file = "vst_expression.tsv", + metadata_file = "metadata.xlsx", + gmt_file = "genesets.gmt", + ranked_genes_file = "ranked_genes.tsv", + gsea_file = "gsea_results.tsv", + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = TRUE +) +# Saves: +# leading_edge_heatmaps/_heatmap.pdf +# leading_edge_heatmaps/_heatmap.jpg +# leading_edge_heatmaps/leading_edge_genes_df.tsv (if save_dataframe = TRUE) + +# Run with absolute paths (no main_dir) +heatmap_path_PA( + expression_file = "/data/vst_counts.tsv", + metadata_file = "/data/metadata.xlsx", + gmt_file = "/data/h.all.v2023.gmt", + ranked_genes_file = "/data/ranked_genes.tsv", + gsea_file = "/data/gsea_results.tsv" +) +} + +} +\seealso{ +\code{\link[=heatmap_PA]{heatmap_PA()}} for the R-object-based alternative; +\code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}} for extracting leading edge genes +from R objects; \code{\link[=merge_PA]{merge_PA()}} to generate the GSEA results input; +\code{\link[=list_gmts]{list_gmts()}} to load GMT files as R objects. +} diff --git a/man/list_gmts.Rd b/man/list_gmts.Rd new file mode 100644 index 0000000..77ddc64 --- /dev/null +++ b/man/list_gmts.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/list_gmts.R +\name{list_gmts} +\alias{list_gmts} +\title{Read GMT files from a directory into a named gene set list} +\usage{ +list_gmts(dir) +} +\arguments{ +\item{dir}{Character. Path to the directory containing one or more \code{.gmt} +files. The function searches the directory non-recursively.} +} +\value{ +A named list where each element is a character vector of gene symbols +for one gene set. Names correspond to gene set names as defined in column 1 +of the GMT files. If the same gene set name appears in multiple files, the +last occurrence overwrites the earlier one. +} +\description{ +Scans a directory for \code{.gmt} files, parses them, and returns a single named +list where each element is a character vector of gene symbols for one gene +set. The output is ready to be passed directly to \code{\link[=geneset_similarity]{geneset_similarity()}}. +} +\details{ +\strong{GMT format:} each row contains the gene set name in column 1, an optional +description in column 2, and gene symbols from column 3 onward. Empty fields +are automatically removed. This is the standard format used by MSigDB +(Molecular Signatures Database) and other gene set annotation resources. +} +\examples{ +\dontrun{ +# Read all GMT files from a directory +geneset_list <- list_gmts("path/to/gmt_files/") + +# Inspect output +length(geneset_list) # number of gene sets +names(geneset_list)[1:5] # first five gene set names +geneset_list[["KEGG_APOPTOSIS"]] # genes in a specific set + +# Pass directly to geneset_similarity +jac <- geneset_similarity(geneset_list, results_df, fdr_th = 0.05) +} + +} +\seealso{ +\code{\link[=geneset_similarity]{geneset_similarity()}} +} diff --git a/man/merge_GSEA.Rd b/man/merge_GSEA.Rd deleted file mode 100644 index d845bc8..0000000 --- a/man/merge_GSEA.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/merge_GSEA.R -\name{merge_GSEA} -\alias{merge_GSEA} -\title{Merge GSEA results data frames.} -\usage{ -merge_GSEA(input_directory, output_file = "collections_merged_gsea_data.tsv") -} -\arguments{ -\item{input_directory}{The directory containing the GSEA collection results in TSV format.} - -\item{output_file}{The output file to save the merged data. If not provided, the file will be saved in the input directory.} -} -\description{ -After running GSEA_all.sh from GSEA.sh, merge_GSEA function joins .tsv files to a single file -} diff --git a/man/merge_PA.Rd b/man/merge_PA.Rd new file mode 100644 index 0000000..b5bc037 --- /dev/null +++ b/man/merge_PA.Rd @@ -0,0 +1,105 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_PA.R +\name{merge_PA} +\alias{merge_PA} +\title{Merge GSEA result files into a single data frame} +\usage{ +merge_PA(input_directory, fdr_replace = 0.001) +} +\arguments{ +\item{input_directory}{Character. Path to the directory containing one or +more GSEA collection result files in \code{.tsv} format (e.g., output of +\code{GSEA_merge.sh}). Each file must end in \code{.tsv}.} + +\item{fdr_replace}{Numeric. Value used to replace \verb{FDR q-val = 0}. This +occurs when no permutation produced an NES as extreme as the observed +one, meaning the true FDR is below \code{1 / n_permutations}. With the +standard 1,000 permutations, the recommended value is \code{0.001}. Adjust +to \code{1 / n_permutations} if a different number of permutations was used. +Default: \code{0.001}.} +} +\value{ +A data frame (\code{gsea_data}) containing all merged and processed GSEA +results with standardized column names. +} +\description{ +Reads all \code{.tsv} files produced by \code{GSEA_merge.sh} (from the GSEA.sh +pipeline) from a directory, standardizes numeric columns, parses the +leading edge string, computes \code{-log10(FDR)}, and returns a single merged +data frame ready for downstream use with \code{\link[=splot_PA]{splot_PA()}}, \code{\link[=multiplot_PA]{multiplot_PA()}} , \code{\link[=getgenesPA]{getgenesPA()}}, and +\code{\link[=heatmap_PA]{heatmap_PA()}}. +} +\details{ +\strong{Input file format:} Each \code{.tsv} file corresponds to one MSigDB collection +(e.g., \code{H.tsv}, \code{C2.tsv}) and must follow the standard GSEA output +format with the following columns: +\itemize{ +\item \code{NAME}: gene set name. +\item \code{SIZE}: number of genes in the gene set. +\item \code{ES}: enrichment score. +\item \code{NES}: normalized enrichment score. +\item \verb{NOM p-val}: nominal p-value. +\item \verb{FDR q-val}: false discovery rate. Values of exactly \code{0} indicate +that no permutation produced an equally extreme NES (i.e., the true +FDR is below the permutation resolution \code{1 / n_permutations}). These +are replaced by \code{fdr_replace} to avoid \code{-Inf} in log-transforms. +\item \verb{FWER p-val}: family-wise error rate. +\item \verb{RANK AT MAX}: gene rank at maximum enrichment score. +\item \verb{LEADING EDGE}: string encoding the leading edge statistics in the +format \code{"tags=XX\%, list=XX\%, signal=XX\%"}. Parsed into three numeric +columns: \code{tags} (fraction of gene set in leading edge), \code{list} +(fraction of ranked list used), and \code{signal} (enrichment signal +strength). +\item \code{Comparison}: name of the comparison (e.g., \code{"TumorVsNormal"}). +Renamed to \code{COMPARISON} in the output. Required for visualization +with \code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} . +\item \verb{GS
follow link to MSigDB} and \verb{GS DETAILS}: removed automatically. +} + +\strong{Output columns:} All input columns (minus the two removed above) plus: +\itemize{ +\item \code{COLLECTION}: name of the MSigDB collection, derived from the file name +by removing the \code{.tsv} suffix. +\item \code{tags}, \code{list}, \code{signal}: numeric leading edge components (0-1 scale). +\item \code{Log10FDR}: \code{-log10(FDR)} computed after applying \code{fdr_replace}. +\item \code{FDR}: renamed from \verb{FDR q-val}. +\item \code{COMPARISON}: renamed from \code{Comparison}. +} +} +\note{ +The input \code{.tsv} files must contain a \code{Comparison} column identifying +each comparison (e.g., \code{"TumorVsNormal"}). This column is renamed to +\code{COMPARISON} in the output and is required by \code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} to operate in +multi-comparison mode. If your files come from a single comparison and +do not have this column, add it manually to each file before merging: +\code{your_data$Comparison <- "YourComparisonName"}. +} +\examples{ +\dontrun{ +# Merge all GSEA collection TSV files from a directory +gsea_data <- merge_PA( + input_directory = "path/to/gsea_results/", + fdr_replace = 0.001 # standard for 1000 permutations +) + +# Inspect result +head(gsea_data) +colnames(gsea_data) + +# Use directly in downstream functions +gsl <- list_gmts("path/to/gmt_folder/") +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +gene_lists <- getgenesPA(gsea_data, gsl, ranked) +pa_annot <- addgenesPA(gsea_data, gene_lists) + +plot_PA(gsea_data, comparison_col = "COMPARISON") +} + +} +\seealso{ +\code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} for visualization of merged results; +\code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}} for gene-level annotation; +\code{\link[=heatmap_PA]{heatmap_PA()}} for leading edge heatmaps; +\code{\link[=list_gmts]{list_gmts()}} to load gene sets. +} diff --git a/man/multiplot_PA.Rd b/man/multiplot_PA.Rd new file mode 100644 index 0000000..678a152 --- /dev/null +++ b/man/multiplot_PA.Rd @@ -0,0 +1,140 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_PA.R +\name{multiplot_PA} +\alias{multiplot_PA} +\title{Pathway analysis visualization across multiple comparisons} +\usage{ +multiplot_PA( + data, + comparison_col = "COMPARISON", + facet_col = "NAME", + axis_y = "NES", + fdr_col = "FDR", + comparison_order = NULL, + custom_labels = NULL, + ncol_wrap = 2, + free_y = TRUE, + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list() +) +} +\arguments{ +\item{data}{A data frame of pathway analysis results containing two or more +comparisons. Typically the output of \code{\link[=merge_PA]{merge_PA()}}.} + +\item{comparison_col}{Name of the column identifying each comparison. +Appears on the x-axis of each facet. Default: \code{"COMPARISON"}.} + +\item{facet_col}{Name of the column used to define facets one facet per +unique value. Can be the original gene set name column (e.g., \code{"NAME"}) +or a manually curated column with cleaner or shorter labels +(e.g., \code{"clean_name"}). Default: \code{"NAME"}.} + +\item{axis_y}{Name of the column to use for the y-axis. Default: \code{"NES"}.} + +\item{fdr_col}{Name of the column containing FDR values. \code{-log10(FDR)} is +computed internally and used as the fill color. Default: \code{"FDR"}.} + +\item{comparison_order}{Character vector specifying the left-to-right order +of comparisons on the x-axis of each facet. For example, +\code{comparison_order = c("BvsA", "CvsA")} places \code{BvsA} on the left and +\code{CvsA} on the right. If \code{NULL} (default), the order follows the factor +levels of \code{comparison_col} as they appear in \code{data}.} + +\item{custom_labels}{Named character vector of x-axis tick labels. Useful +for shortening comparison names on the axis. For example, +\code{custom_labels = c(TumorVsNormal = "Tumor", MetastasisVsNormal = "Mets")}. +Default: \code{NULL}.} + +\item{ncol_wrap}{Integer. Number of columns in \code{facet_wrap}. Default: \code{2}.} + +\item{free_y}{Logical. If \code{TRUE}, each facet uses its own y-axis scale. +Default: \code{TRUE}.} + +\item{fill_limits}{Numeric vector of length 2 setting the color scale range +for \code{-log10(FDR)}. Values outside this range are clamped to the nearest +limit. For example, \code{fill_limits = c(0, 5)} maps all gene sets with +\code{-log10(FDR) >= 5} (FDR <= 0.00001) to maximum red, and any value below +0 to white. Useful when one gene set has extreme significance that makes +the rest appear uniform. Default: \code{NULL} (auto).} + +\item{fill_palette}{Character vector of two colors for the fill gradient +(low to high -log10(FDR)). Default: \code{c("white", "red")}.} + +\item{theme_params}{Named list to override default theme parameters. +See Details.} +} +\value{ +A ggplot2 object. +} +\description{ +Generates a faceted barplot showing NES values across multiple comparisons +for a set of gene sets. Each facet represents one gene set and bars +represent the NES per comparison, colored by -log10(FDR). This layout makes +it easy to compare how enrichment of gene sets changes across conditions +(e.g., TumorVsNormal, MetastasisVsNormal). +} +\details{ +All comparisons must be combined in a single data frame with a column +identifying each comparison as produced by \code{\link[=merge_PA]{merge_PA()}}. + +For visualizing a single comparison with full collection grouping, use +\code{\link[=splot_PA]{splot_PA()}} instead. + +\code{theme_params} accepts any of the following named elements: +\describe{ +\item{\code{bar_col}}{Bar border color. Default: \code{"black"}.} +\item{\code{bar_size}}{Bar border linewidth. Default: \code{0.5}.} +\item{\code{bar_width}}{Bar width. Default: \code{0.6}.} +\item{\code{hline_size}}{Linewidth for horizontal line at y = 0. Default: \code{2}.} +\item{\code{axis_title_size}}{Font size for axis titles. Default: \code{45}.} +\item{\code{axis_text_size_x}}{Font size for x-axis labels. Default: \code{30}.} +\item{\code{axis_text_size_y}}{Font size for y-axis labels. Default: \code{50}.} +\item{\code{tick_size}}{Linewidth for axis ticks. Default: \code{1.5}.} +\item{\code{tick_length}}{Length of axis ticks in cm. Default: \code{0.3}.} +\item{\code{strip_text_size}}{Font size for facet strip labels. Default: \code{50}.} +\item{\code{panel_spacing_multi}}{Spacing between facets. Default: \code{0.6}.} +} +} +\examples{ +\dontrun{ +gsea_results <- merge_PA("path/to/gsea_results/") + +# Basic multi-comparison plot +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + ncol_wrap = 3 +) + +# Control left-to-right order of comparisons on the x-axis +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + comparison_order = c("BvsA", "CvsA") # BvsA on the left, CvsA on the right +) + +# Use cleaner facet labels and shorten x-axis tick names +gsea_results$clean_name <- gsub("_", " ", gsea_results$NAME) + +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "clean_name", + fdr_col = "FDR", + comparison_order = c("BvsA", "CvsA"), + custom_labels = c(BvsA = "Tumor", CvsA = "Metastasis") +) +} + +} +\seealso{ +\code{\link[=splot_PA]{splot_PA()}} for single-comparison patchwork plots; +\code{\link[=merge_PA]{merge_PA()}} to generate the input data frame; +\link{camera_results} for a minimal example dataset. +} diff --git a/man/network_clust.Rd b/man/network_clust.Rd new file mode 100644 index 0000000..8931d94 --- /dev/null +++ b/man/network_clust.Rd @@ -0,0 +1,113 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotclust_PA.R +\name{network_clust} +\alias{network_clust} +\title{Gene set network clustering with igraph base R graphics} +\usage{ +network_clust( + x, + clust_result, + jaccard_threshold = 0.5, + min_degree = 2, + superterms = FALSE, + superterm_data = NULL, + type = "all", + seed = 174 +) +} +\arguments{ +\item{x}{A \code{JaccardResult} object (output of \code{\link[=geneset_similarity]{geneset_similarity()}}).} + +\item{clust_result}{A list returned by \code{\link[=do_clust]{do_clust()}}, used to color nodes by +hierarchical cluster assignment.} + +\item{jaccard_threshold}{Numeric. Minimum Jaccard similarity required for an +edge to be included in the network. Default: \code{0.50}.} + +\item{min_degree}{Integer. Minimum node degree for a node to be retained in +the network. Default: \code{2}.} + +\item{superterms}{Logical. If \code{TRUE}, overlays super-term labels at community +centroids on the relevant plots. Requires \code{superterm_data}. Default: +\code{FALSE}.} + +\item{superterm_data}{A list returned by \code{\link[=get_superterm]{get_superterm()}}. Required when +\code{superterms = TRUE}.} + +\item{type}{Character. Which plot(s) to draw. One of: +\itemize{ +\item \code{"clean"} : colored nodes, no labels. +\item \code{"superterms"} : colored nodes with community super-term labels. +\item \code{"combined"} : colored nodes with super-term labels and individual node +labels. +\item \code{"individual"} : colored nodes with individual node labels only. +\item \code{"all"} : all four plots drawn sequentially. Default. +}} + +\item{seed}{Integer. Random seed for the Fruchterman-Reingold layout and +Louvain community detection. Default: \code{174}.} +} +\value{ +Invisibly, a named list with: +\itemize{ +\item \verb{$graph}: The filtered \code{igraph} graph object (\code{g_clean}). +\item \verb{$layout}: Numeric matrix with the normalized Fruchterman-Reingold node +coordinates. +\item \verb{$node_attributes}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{NAME}, \code{cluster}, +\code{community}, \code{degree}, \code{betweenness}, and \code{closeness} (plus \code{superterm} +if \code{superterms = TRUE}). +\item \verb{$superterm_report}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{community}, +\code{superterm}, \code{n_genesets}, and \code{geneset_members}. Only present if +\code{superterms = TRUE}. +} +} +\description{ +Builds a weighted gene set network from a Jaccard similarity matrix, applies +Louvain community detection, and draws up to four network visualizations +using base R igraph graphics directly to the active graphics device. +Optionally overlays super-term community labels generated by +\code{\link[=get_superterm]{get_superterm()}}. +} +\details{ +For a ggplot2-based version that returns plot objects instead of drawing +them, see \code{\link[=network_clust_gg]{network_clust_gg()}}. +} +\examples{ +\dontrun{ +# Requires igraph +gsl <- list_gmts("path/to/gmt_folder/") +res <- read.csv("path/to/results.csv") + +jac <- geneset_similarity(gsl, res, fdr_th = 0.05) +clust <- do_clust(jac) +net <- get_network_communities(jac, threshold = 0.3) + +# Draw all four plots directly to the active graphics device +result <- network_clust( + jac, + clust_result = clust, + superterms = TRUE, + superterm_data = net$superterms, + type = "all", + seed = 174 +) + +# Draw only the clean version (no labels) +network_clust(jac, clust, type = "clean") + +# Access node-level metrics from the invisible return +result$node_attributes +result$superterm_report + +# Save a specific plot to PDF +pdf("network_superterms.pdf", width = 14, height = 14) +network_clust(jac, clust, superterms = TRUE, + superterm_data = net$superterms, type = "superterms") +dev.off() +} + +} +\seealso{ +\code{\link[=geneset_similarity]{geneset_similarity()}}, \code{\link[=do_clust]{do_clust()}}, \code{\link[=get_network_communities]{get_network_communities()}}, +\code{\link[=get_superterm]{get_superterm()}}, \code{\link[=network_clust_gg]{network_clust_gg()}} +} diff --git a/man/network_clust_gg.Rd b/man/network_clust_gg.Rd new file mode 100644 index 0000000..165e8de --- /dev/null +++ b/man/network_clust_gg.Rd @@ -0,0 +1,121 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotclust_PA.R +\name{network_clust_gg} +\alias{network_clust_gg} +\title{Gene set network clustering with ggraph graphics} +\usage{ +network_clust_gg( + x, + clust_result, + jaccard_threshold = 0.5, + min_degree = 2, + superterms = FALSE, + superterm_data = NULL, + type = "all", + seed = 174 +) +} +\arguments{ +\item{x}{A \code{JaccardResult} object (output of \code{\link[=geneset_similarity]{geneset_similarity()}}).} + +\item{clust_result}{A list returned by \code{\link[=do_clust]{do_clust()}}, used to color nodes by +hierarchical cluster assignment.} + +\item{jaccard_threshold}{Numeric. Minimum Jaccard similarity required for an +edge to be included in the network. Default: \code{0.50}.} + +\item{min_degree}{Integer. Minimum node degree for a node to be retained in +the network. Default: \code{2}.} + +\item{superterms}{Logical. If \code{TRUE}, overlays super-term labels at community +centroids on the relevant plots. Requires \code{superterm_data}. Default: +\code{FALSE}.} + +\item{superterm_data}{A list returned by \code{\link[=get_network_communities]{get_network_communities()}} (slot +\verb{$superterms}) or directly by \code{\link[=get_superterm]{get_superterm()}}. Required when +\code{superterms = TRUE}.} + +\item{type}{Character. Which plot(s) to return. One of: +\itemize{ +\item \code{"clean"} : colored nodes, no labels. +\item \code{"superterms"} : colored nodes with community super-term labels. +\item \code{"combined"} : colored nodes with super-term labels and individual node +labels. +\item \code{"individual"} : colored nodes with individual node labels only. +\item \code{"all"} : all four plots. Default. +}} + +\item{seed}{Integer. Random seed for the Fruchterman-Reingold layout and +Louvain community detection. Default: \code{174}.} +} +\value{ +A named list. Depending on \code{type}, the list may contain any +combination of the ggplot2 elements \verb{$clean}, \verb{$superterms}, \verb{$combined}, +and \verb{$individual}. The list always contains: +\itemize{ +\item \verb{$node_attributes}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{NAME}, \code{cluster}, +\code{community}, \code{degree}, \code{betweenness}, and \code{closeness} (plus \code{superterm} +if \code{superterms = TRUE}). +\item \verb{$superterm_report}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{community}, +\code{superterm}, \code{n_genesets}, and \code{geneset_members}. Only present if +\code{superterms = TRUE}. +} +} +\description{ +Builds a weighted gene set network from a Jaccard similarity matrix, applies +Louvain community detection, and returns up to four ggplot2 network +visualizations using ggraph. Each plot is a standard ggplot2 object that +can be further customized, saved with \code{\link[ggplot2:ggsave]{ggplot2::ggsave()}}, or combined +with patchwork. +} +\details{ +Optionally overlays super-term community labels generated by +\code{\link[=get_superterm]{get_superterm()}}. + +For a base R igraph version that draws directly to the active graphics +device, see \code{\link[=network_clust]{network_clust()}}. +} +\examples{ +\dontrun{ +# Requires igraph, ggraph, tidygraph +gsl <- list_gmts("path/to/gmt_folder/") +res <- read.csv("path/to/results.csv") + +jac <- geneset_similarity(gsl, res, fdr_th = 0.05) +clust <- do_clust(jac) +net <- get_network_communities(jac, threshold = 0.3) + +# Return all four plots as ggplot2 objects +plots <- network_clust_gg( + jac, + clust_result = clust, + superterms = TRUE, + superterm_data = net$superterms, + type = "all", + seed = 174 +) + +# Display individual plots +plots$clean # no labels +plots$superterms # community labels only +plots$combined # community + node labels +plots$individual # node labels only + +# Save with ggsave (full ggplot2 objects) +ggplot2::ggsave("network_combined.pdf", plots$combined, + width = 14, height = 14) + +# Access node-level metrics +plots$node_attributes +plots$superterm_report + +# Combine two plots with patchwork +library(patchwork) +plots$clean + plots$superterms +} + +} +\seealso{ +\code{\link[=geneset_similarity]{geneset_similarity()}}, \code{\link[=do_clust]{do_clust()}}, \code{\link[=get_network_communities]{get_network_communities()}}, +\code{\link[=get_superterm]{get_superterm()}}, \code{\link[=network_clust]{network_clust()}} +} diff --git a/man/nice_KM.Rd b/man/nice_KM.Rd index 76d24b3..18306c7 100644 --- a/man/nice_KM.Rd +++ b/man/nice_KM.Rd @@ -58,6 +58,13 @@ nice_KM( } \value{ A \code{ggplot} object (or a list with \code{km_fit} and \code{plot} if \code{returnData = TRUE}). + +A ggplot2 object if \code{returnData = FALSE} (default). If +\code{returnData = TRUE}, a named list with two elements: +\itemize{ +\item \verb{$km_fit}: The \code{\link[survival:survfit]{survival::survfit()}} object. +\item \verb{$plot}: The ggplot2 survival curve. +} } \description{ This function fits Kaplan Meier survival curves stratified by status (e.g., 'No' vs 'Yes') for a specified gene column in a data frame. It allows: @@ -66,3 +73,9 @@ This function fits Kaplan Meier survival curves stratified by status (e.g., 'No' \item Automatic handling of cases where only one category ('No' or 'Yes') is present, returning an 'empty' placeholder plot with a warning message. } } +\references{ +Kaplan, E. L., & Meier, P. (1958). Nonparametric estimation from +incomplete observations. \emph{Journal of the American Statistical +Association}, 53(282), 457–481. +\doi{10.1080/01621459.1958.10501452} +} diff --git a/man/nice_PCA.Rd b/man/nice_PCA.Rd index a4beb55..b29f42b 100644 --- a/man/nice_PCA.Rd +++ b/man/nice_PCA.Rd @@ -74,6 +74,12 @@ nice_PCA( \item{returnData}{Indicates if the function should return the data (TRUE) or the plot (FALSE). Default: FALSE.} } +\value{ +A ggplot2 object if \code{returnData = FALSE} (default). If +\code{returnData = TRUE}, a numeric matrix of PCA coordinates with dimensions +samples × \code{outPCs}, with a \code{percentVar} attribute containing the +proportion of variance explained per component. +} \description{ This was inspired on the plotPCA function from DESeq2, made by Wolfgang Huber But including some improvements made by David Requena. Now it allows: @@ -83,3 +89,38 @@ But including some improvements made by David Requena. Now it allows: \item To provide the colors, shapes and fonts. } } +\examples{ +data(vst_counts) +data(sampledata) + +# nice_PCA joins by a column named "id" in annotations +sampledata_pca <- sampledata +colnames(sampledata_pca)[colnames(sampledata_pca) == "patient_id"] <- "id" + +nice_PCA( + object = vst_counts, + annotations = sampledata_pca, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD PCA" +) + +# Return PCA coordinates instead of plot +pca_data <- nice_PCA( + object = vst_counts, + annotations = sampledata_pca, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + returnData = TRUE +) +head(pca_data) + +} +\seealso{ +\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}} for other alternatives; +\link{vst_counts} for the recommended input matrix. +} diff --git a/man/nice_UMAP.Rd b/man/nice_UMAP.Rd index 3f01428..a99f896 100644 --- a/man/nice_UMAP.Rd +++ b/man/nice_UMAP.Rd @@ -74,6 +74,44 @@ nice_UMAP( \item{returnData}{Indicates if the function should return the data (TRUE) or the plot (FALSE). Default: FALSE.} } +\value{ +A ggplot2 object if \code{returnData = FALSE} (default). If +\code{returnData = TRUE}, a data frame with UMAP coordinates and sample +annotations. +} \description{ Function to make UMAP plots. } +\examples{ +\dontrun{ +data(vst_counts) +data(sampledata) + +sampledata_u <- sampledata +colnames(sampledata_u)[colnames(sampledata_u) == "patient_id"] <- "id" + +nice_UMAP( + object = vst_counts, + annotations = sampledata_u, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD UMAP", + neighbors = 5, + epochs = 1000, + seed = 1905 +) +} + +} +\references{ +McInnes, L., Healy, J., & Melville, J. (2018). Umap: Uniform Manifold +Approximation and Projection for Dimension Reduction. +\emph{arXiv preprint arXiv:1802.03426}. +\url{https://arxiv.org/abs/1802.03426} +} +\seealso{ +\code{\link[=nice_PCA]{nice_PCA()}}, \code{\link[=nice_tSNE]{nice_tSNE()}} for alternative dimensionality +reduction methods; \link{vst_counts} for the recommended input matrix. +} diff --git a/man/nice_VSB.Rd b/man/nice_VSB.Rd index c7ad3c4..fe13bc4 100644 --- a/man/nice_VSB.Rd +++ b/man/nice_VSB.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/nice_VSB.R \name{nice_VSB} \alias{nice_VSB} -\title{Function to make Violin-Scatter-Box plots.} +\title{Function to make Violin-Scatter-Box plots from data frames.} \usage{ nice_VSB( object = NULL, @@ -23,7 +23,7 @@ nice_VSB( ) } \arguments{ -\item{object}{A DEseq object already transformed with the variance stabilizing or rlog transformations.} +\item{object}{A data frame object with normalized counts genes(in rows) across samples(in columns).} \item{annotations}{Data frame with annotations.} @@ -31,7 +31,8 @@ nice_VSB( \item{genename}{The gene name to be used for the plot.} -\item{symbol}{The gene symbol to be used for the plot.} +\item{symbol}{The gene symbol to display in the plot title. To obtain +gene symbols from Ensembl IDs, use \code{\link[=get_annotations]{get_annotations()}}.} \item{labels}{A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M").} @@ -53,7 +54,32 @@ nice_VSB( \item{legend_size}{Font of the title and elements of the legend. Default: c(title = 14, elements = 12).} } +\value{ +A ggplot2 object. +} \description{ This function will make a Boxplot, using a DEseq object. It will show the data points on top with a small deviation (jitter) for a better visualization. } +\examples{ +data(norm_counts) +data(sampledata) + +nice_VSB( + object = norm_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + genename = rownames(norm_counts)[1], + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick"), + shapes = 21, + markersize = 3 +) + +} +\seealso{ +\code{\link[=nice_Volcano]{nice_Volcano()}} for genome-wide visualization; \code{\link[=detect_filter]{detect_filter()}} +to identify reliably expressed genes; \code{\link[=get_stars]{get_stars()}} to add significance +annotations; \link{norm_counts} for an example input matrix. +} diff --git a/man/nice_VSB_DEseq2.Rd b/man/nice_VSB_DEseq2.Rd new file mode 100644 index 0000000..5c0a10a --- /dev/null +++ b/man/nice_VSB_DEseq2.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nice_VSB_DEseq2.R +\name{nice_VSB_DEseq2} +\alias{nice_VSB_DEseq2} +\title{Function to make Box-Scatter-Violin plots from DEseq2 output directly.} +\usage{ +nice_VSB_DEseq2( + object = NULL, + variables = c(fill = "VarFill", shape = "VarShape"), + genename = NULL, + symbol = NULL, + labels = c("N", "P", "R", "M"), + categories = c("normal", "primary", "recurrence", "metastasis"), + colors = NULL, + shapes = NULL, + markersize = NULL, + alpha = 0.8, + width = NULL, + height = NULL, + jitter = 0.2, + dpi = 150, + save = FALSE, + title_size = c(axis = 20, fig = 24), + label_size = c(x = 20, y = 16), + legend_size = c(title = 14, elements = 12) +) +} +\arguments{ +\item{object}{A DEseq object already transformed with the variance stabilizing or rlog transformations.} + +\item{variables}{To indicate the variables to be used as Shape and Fill of the markers.} + +\item{genename}{The gene name to be used for the plot.} + +\item{symbol}{The gene symbol to display in the plot title. To obtain +gene symbols from Ensembl IDs, use \code{\link[=get_annotations]{get_annotations()}}.} + +\item{labels}{A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M").} + +\item{categories}{A vector containing the labels for the legend. Default: c("normal", "primary", "recurrence", "metastasis").} + +\item{colors}{Vector of colors to be used for the categories of the variable assigned as Marker Fill.} + +\item{shapes}{Vector of shapes to be used for the categories of the variable assigned as Marker Shape.} + +\item{markersize}{Size of the marker.} + +\item{alpha}{Transparency of the marker, which goes from 0 (transparent) to 1 (no transparent). Default: 0.8.} + +\item{width}{Width of the plot.} + +\item{height}{Height of the plot.} + +\item{jitter}{Random deviation added to the dots. Default: 0.2.} + +\item{dpi}{DPI of the plot. Default: 150.} + +\item{save}{To save the plot. Default: FALSE.} + +\item{title_size}{Font of the title and axis names. Default: c(axis = 20, fig = 24).} + +\item{label_size}{Font of the labels (x-axis) and numbers (y-axis). Default: c(x = 20, y = 16).} + +\item{legend_size}{Font of the title and elements of the legend. Default: c(title = 14, elements = 12).} +} +\description{ +This function will make a Boxplot, using a DEseq object. +It will show the data points on top with a small deviation (jitter) for a better visualization. +} +\examples{ +\dontrun{ +# requires a DESeq2 object + +data(sampledata) + +nice_VSB_DEseq2( + object = vst, + annotations = sampledata, + variables = c(fill = "sample_type"), + genename = rownames(norm_counts)[1], + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick"), + shapes = 21, + markersize = 3 +) +} +} diff --git a/man/nice_Volcano.Rd b/man/nice_Volcano.Rd index b40a3db..73b8852 100644 --- a/man/nice_Volcano.Rd +++ b/man/nice_Volcano.Rd @@ -40,7 +40,9 @@ nice_Volcano( \item{x_var}{Name of the column in \code{results} to plot on the x-axis (e.g. log₂FC).} -\item{label_var}{to be defined.} +\item{label_var}{Name of the column in \code{results} to use as point labels +(e.g. gene IDs or HGNC symbols). To use gene symbols, first run +\code{\link[=get_annotations]{get_annotations()}} and join the \code{symbol} column to your results table.} \item{legend}{Logical. Control legend display. Default: TRUE.} @@ -50,6 +52,9 @@ nice_Volcano( \item{genes}{Vector of genes to label in the plot. Default: NULL.} } +\value{ +A ggplot2 object +} \description{ Volcano plot with configurable point shapes and threshold annotations: \itemize{ @@ -58,3 +63,34 @@ Volcano plot with configurable point shapes and threshold annotations: \item Vertical dashed lines at log-fold-change cutoffs, shown as custom x-axis ticks. } } +\examples{ +data(deseq2_results) + +nice_Volcano( + results = deseq2_results, + x_var = "log2FoldChange", + y_var = "padj", + label_var = "gene_id", + title = "TCGA-LUAD: Tumor vs Normal", + cutoff_y = 0.05, + cutoff_x = 1, + x_range = 8, + y_max = 10 +) + +# Highlight specific genes +nice_Volcano( + results = deseq2_results, + x_var = "log2FoldChange", + y_var = "padj", + label_var = "gene_id", + title = "TCGA-LUAD: Tumor vs Normal", + genes = deseq2_results$gene_id[1:5] +) + +} +\seealso{ +\code{\link[=nice_VSB]{nice_VSB()}} for gene-level expression visualization; +\code{\link[=detect_filter]{detect_filter()}} to filter detectable genes before plotting; +\link{deseq2_results} for an example input dataset. +} diff --git a/man/nice_tSNE.Rd b/man/nice_tSNE.Rd index 653913e..4937df1 100644 --- a/man/nice_tSNE.Rd +++ b/man/nice_tSNE.Rd @@ -71,6 +71,43 @@ nice_tSNE( \item{returnData}{Indicates if the function should return the data (TRUE) or the plot (FALSE). Default: FALSE.} } +\value{ +A ggplot2 object if \code{returnData = FALSE} (default). If +\code{returnData = TRUE}, a data frame with tSNE coordinates and sample +annotations. +} \description{ Function to make tSNE plots. } +\examples{ +\dontrun{ +data(vst_counts) +data(sampledata) + +sampledata_t <- sampledata +colnames(sampledata_t)[colnames(sampledata_t) == "patient_id"] <- "id" + +# perplexity must be < n_samples / 3; with 32 samples use perplexity = 5 +nice_tSNE( + object = vst_counts, + annotations = sampledata_t, + perplexity = 5, + max_iterations = 1000, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD tSNE", + seed = 1905 +) +} +} +\references{ +van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. +\emph{Journal of Machine Learning Research}, 9, 2579–2605. +\url{https://jmlr.org/papers/v9/vandermaaten08a.html} +} +\seealso{ +\code{\link[=nice_PCA]{nice_PCA()}}, \code{\link[=nice_UMAP]{nice_UMAP()}} for alternative dimensionality +reduction methods; \link{vst_counts} for the recommended input matrix. +} diff --git a/man/norm_counts.Rd b/man/norm_counts.Rd new file mode 100644 index 0000000..be06c00 --- /dev/null +++ b/man/norm_counts.Rd @@ -0,0 +1,113 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2_results.R +\docType{data} +\name{norm_counts} +\alias{norm_counts} +\title{Normalized counts matrix for TCGA-LUAD} +\format{ +A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +\describe{ +\item{rows}{Ensembl gene IDs (e.g., \code{"ENSG00000141510"}).} +\item{columns}{Sample IDs matching the \code{patient_id} column of +\link{sampledata}.} +\item{values}{Non-negative numeric. Size-factor normalized counts. +Range: [0, 1,889,573].} +} +} +\source{ +TCGA-LUAD STAR counts downloaded from the GDC Data Portal +(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +Normalized with DESeq2::counts() (\code{normalized = TRUE}); generated by +\code{data-raw/deseq2_results.R}. +} +\usage{ +norm_counts +} +\description{ +DESeq2 size-factor normalized counts derived from the TCGA-LUAD RNA-seq +dataset (16 tumor samples, 16 normal samples). Counts are divided by +DESeq2 size factors to correct for differences in library size across +samples, but remain in counts scale (not log-transformed). +} +\details{ +Suitable as input for \code{\link[=nice_VSB]{nice_VSB()}}, \code{\link[=detect_filter]{detect_filter()}}, and +\code{\link[=add_annotations]{add_annotations()}}. For dimensionality reduction methods (\code{\link[=nice_PCA]{nice_PCA()}}, +\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}}) use \link{vst_counts} instead, which removes the +mean-variance dependence of RNA-seq data. +} +\examples{ +data(norm_counts) +data(sampledata) + +# Dimensions +dim(norm_counts) + +# Value range +range(norm_counts) + +# Expression of a specific gene across samples +norm_counts["ENSG00000141510", ] + +# Violin-Scatter-Box plot for one gene +nice_VSB( + object = norm_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + genename = "ENSG00000141510", + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick") +) + +\dontrun{ +# detect_filter: (required: "ensembl" column in results) +deseq2_res <- deseq2_results +colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +rownames(deseq2_res) <- deseq2_res$ensembl + +# Get sample IDs per group from sampledata +samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] + +detected <- detect_filter( + norm.counts = as.data.frame(norm_counts), + df.BvsA = deseq2_res, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor, + cutoffs = c(50, 50, 0) +) + +# Number of detectable genes +length(detected$DetectGenes) + +# Subset results to detectable genes +head(detected$Comparison1) + +# add_annotations: add gene symbols +# Required: reference df with geneID + annotation columns +# Example using biomaRt to fetch gene symbols +library(biomaRt) +mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl") +ref <- getBM( + attributes = c("ensembl_gene_id", "hgnc_symbol", "gene_biotype"), + filters = "ensembl_gene_id", + values = rownames(norm_counts), + mart = mart +) +colnames(ref)[1] <- "geneID" + +norm_counts_annot <- add_annotations( + object = norm_counts, + reference = ref, + variables = c("hgnc_symbol", "gene_biotype") +) + +head(norm_counts_annot[, c("geneID", "hgnc_symbol", "gene_biotype")]) +} + +} +\seealso{ +\link{vst_counts}, \link{deseq2_results}, \link{sampledata}, \code{\link[=nice_VSB]{nice_VSB()}}, +\code{\link[=detect_filter]{detect_filter()}}, \code{\link[=add_annotations]{add_annotations()}} +} +\keyword{datasets} diff --git a/man/plot_GSEA.Rd b/man/plot_GSEA.Rd deleted file mode 100644 index 9e39ad3..0000000 --- a/man/plot_GSEA.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_GSEA.R -\name{plot_GSEA} -\alias{plot_GSEA} -\title{Plot global GSEA results} -\usage{ -plot_GSEA( - data, - geneset_col, - collection_col, - nes_col, - logfdr_col, - text_size_genesets = 5, - text_size_collection = 5 -) -} -\arguments{ -\item{data}{Data frame containing the GSEA results.} - -\item{geneset_col}{Name of the column containing the genesets.} - -\item{collection_col}{Name of the column containing the collections.} - -\item{nes_col}{Name of the column containing the NES values.} - -\item{logfdr_col}{Name of the column containing \eqn{-\log_{10}(FDR)} values.} - -\item{text_size_genesets}{Text size for the geneset labels.} - -\item{text_size_collection}{Text size for the collection labels.} -} -\value{ -GSEA barplots arranged in a grid. -} -\description{ -Generates a composite plot displaying NES values, pathway labels, -and a \emph{logFDR} legend, organized by MSigDB collections. -} diff --git a/man/power_analysis.Rd b/man/power_analysis.Rd index 3d054d0..e5657c8 100644 --- a/man/power_analysis.Rd +++ b/man/power_analysis.Rd @@ -30,8 +30,44 @@ power_analysis( \item{max_n}{Integer. Maximum sample size per group to explore.} -\item{plot}{Logical. If TRUE, draws the power curve; if FALSE, skips plotting.} +\item{plot}{Logical. If TRUE, draws the power curve; if FALSE, skips plotting.} +} +\value{ +A named list. If \code{plot = TRUE}, contains three elements: +\itemize{ +\item \verb{$min_sample_size}: Integer. Minimum sample size per group to reach +\code{power_target}. +\item \verb{$power_table}: A data frame with columns \code{SampleSize} and \code{Power}. +\item \verb{$plot}: A ggplot2 object of the power curve. +} + +If \code{plot = FALSE}, returns only \verb{$min_sample_size} and \verb{$power_table}. } \description{ Power analysis for RNA-seq differential expression with optional plotting } +\examples{ +# Basic power analysis with default parameters +result <- power_analysis( + effect_size = 1, + dispersion = 0.1, + n_genes = 20000, + prop_de = 0.05, + alpha = 0.05, + power_target = 0.8, + max_n = 20 +) + +# Minimum sample size to reach 80\% power +result$min_sample_size + +# Full power table +head(result$power_table) + +# Higher effect size requires fewer samples +power_analysis(effect_size = 2, dispersion = 0.1, plot = FALSE)$min_sample_size + +# See plot +#power_analysis$plot + +} diff --git a/man/save_results.Rd b/man/save_results.Rd index 6ea21ed..12176bd 100644 --- a/man/save_results.Rd +++ b/man/save_results.Rd @@ -15,6 +15,15 @@ save_results(df, name, l2fc = 0, cutoff_alpha = 0.25) \item{cutoff_alpha}{The cut-off of the False Discovery Rate (FDR o padj). Default = 0.25.} } +\value{ +Invisibly returns \code{NULL}. Saves three \code{.xlsx} files to the working +directory: +\itemize{ +\item \verb{_full.xlsx} : all genes. +\item \verb{_up_log2FC>_FDR<.xlsx} : upregulated genes. +\item \verb{_down_log2FC<_FDR<.xlsx} : downregulated genes. +} +} \description{ This function takes as input the output of the function "results()" of DEseq2. And will save 3 tables: @@ -24,3 +33,26 @@ And will save 3 tables: \item A table including only the under-expressed genes } } +\examples{ +\dontrun{ +data(deseq2_results) + +# Save full results + over/under-expressed tables as .xlsx files +save_results( + df = deseq2_results, + name = "TCGA_LUAD_TumorVsNormal", + l2fc = 1, + cutoff_alpha = 0.05 +) + +# Creates: +# TCGA_LUAD_TumorVsNormal_full.xlsx +# TCGA_LUAD_TumorVsNormal_up_log2FC>1_FDR<0.05.xlsx +# TCGA_LUAD_TumorVsNormal_down_log2FC<1_FDR<0.05.xlsx +} + +} +\seealso{ +\code{\link[=detect_filter]{detect_filter()}} to further filter saved results; +\link{deseq2_results} for an example input. +} diff --git a/man/split_cases.Rd b/man/split_cases.Rd index 450803d..ec82263 100644 --- a/man/split_cases.Rd +++ b/man/split_cases.Rd @@ -32,9 +32,73 @@ split_cases( \item{change_cutoff}{The values of the change variable will be filtered by |change_var| > change_cutoff. Default: 0.} } +\value{ +A named list of 10 data frames (\verb{$Case1} through \verb{$Case10}), each +containing the genes belonging to that mutually exclusive expression +pattern. Cases 1–6 contain a \code{trend} column (\code{"up"} or \code{"dn"}). Case 10 +contains genes not significant in any comparison. +} \description{ When performing differential expression analysis of a study with 3 phenotypes, including the baseline, there are 10 mutually exclusive cases where genes can fall into. This function allows us to obtain these 10 cases and saves them into a list. } +\details{ +The 10 cases are: +\itemize{ +\item \strong{Case 1} : Ladder: significant in all 3, same direction. +\item \strong{Case 2} : Stronger in condition 1: significant in all 3, direction +reverses between conditions. +\item \strong{Case 3} : Stronger in condition 2. +\item \strong{Case 4} : Marker of condition 2: significant in CvsA and BvsC only. +\item \strong{Case 5} : Marker of condition 1: significant in BvsA and BvsC only. +\item \strong{Case 6} : Shared: significant in BvsA and CvsA only. +\item \strong{Cases 7-9} : Significant in only one comparison. +\item \strong{Case 10} : Not significant in any comparison. +} +} +\examples{ +\dontrun{ +# split_cases requires three DESeq2 comparisons. +# Simulate a 3-phenotype study: Normal (A), Primary (B), Metastasis (C) +set.seed(174) +n_genes <- 500 + +make_res <- function(seed) { + set.seed(seed) + data.frame( + ensembl = paste0("ENSG", sprintf("\%011d", seq_len(n_genes))), + log2FoldChange = rnorm(n_genes, 0, 1.5), + padj = runif(n_genes, 0, 0.5), + stringsAsFactors = FALSE + ) +} + +df_BvsA <- make_res(1) +df_CvsA <- make_res(2) +df_BvsC <- make_res(3) + +cases <- split_cases( + df.BvsA = df_BvsA, + df.CvsA = df_CvsA, + df.BvsC = df_BvsC, + unique_id = "ensembl", + significance_var = "padj", + significance_cutoff = 0.25, + change_var = "log2FoldChange", + change_cutoff = 0 +) + +# Number of genes per case +sapply(cases, nrow) + +# Inspect Case 1 (ladder genes: significant in all 3 comparisons) +head(cases$Case1) +} + +} +\seealso{ +\code{\link[=detect_filter]{detect_filter()}} to pre-filter genes before splitting; +\code{\link[=nice_Volcano]{nice_Volcano()}} to visualize individual comparison results. +} diff --git a/man/splot_PA.Rd b/man/splot_PA.Rd new file mode 100644 index 0000000..aa4053f --- /dev/null +++ b/man/splot_PA.Rd @@ -0,0 +1,117 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_PA.R +\name{splot_PA} +\alias{splot_PA} +\title{Pathway analysis visualization for a single comparison} +\usage{ +splot_PA( + data, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + order = "desc", + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list() +) +} +\arguments{ +\item{data}{A data frame of pathway analysis results for a single +comparison. Typically the output of \code{\link[=merge_PA]{merge_PA()}} filtered to one value +of the \code{COMPARISON} column, or results from a single CAMERA/GSEA run. +Must contain the columns specified by \code{geneset_col}, \code{collection_col}, +\code{nes_col}, and \code{fdr_col}.} + +\item{geneset_col}{Name of the column containing gene set labels shown on +the y-axis. Default: \code{"NAME"}.} + +\item{collection_col}{Name of the column containing MSigDB collection +labels used to group gene sets (e.g., \code{"KEGG"}, \code{"HALLMARK"}, \code{"GO"}). +Default: \code{"COLLECTION"}.} + +\item{nes_col}{Name of the column containing NES values (x-axis). +Default: \code{"NES"}.} + +\item{fdr_col}{Name of the column containing FDR values. \code{-log10(FDR)} is +computed internally and used as the fill color. Default: \code{"FDR"}.} + +\item{order}{One of \code{"desc"} or \code{"asc"}. Sort order for NES values on the +y-axis. Default: \code{"desc"}.} + +\item{fill_limits}{Numeric vector of length 2 setting the color scale range +for \code{-log10(FDR)}. Values outside this range are clamped to the nearest +limit. For example, \code{fill_limits = c(0, 5)} maps all gene sets with +\code{-log10(FDR) >= 5} (i.e., FDR <= 0.00001) to the maximum color (red), +and any value below 0 to the minimum color (white). Useful when a few +gene sets have extreme significance that washes out color variation in the +rest. Default: \code{NULL} (auto uses the actual data range).} + +\item{fill_palette}{Character vector of two colors for the fill gradient +(low to high -log10(FDR)). Default: \code{c("white", "red")}.} + +\item{theme_params}{Named list to override default theme parameters. +See Details.} +} +\value{ +A \code{patchwork} object combining six ggplot2 panels. +} +\description{ +Generates a publication-quality multi-panel pathway enrichment plot for a +single comparison using patchwork. Gene sets appear on the y-axis grouped +by MSigDB collection, NES on the x-axis, and -log10(FDR) as fill color. +Six panels are assembled side by side: a "Pathways" label, gene set names, +the NES bar chart, collection labels, a "MSigDB" label, and the color legend. +} +\details{ +For visualizing enrichment across multiple comparisons, use +\code{\link[=multiplot_PA]{multiplot_PA()}} instead. + +\code{theme_params} accepts any of the following named elements: +\describe{ +\item{\code{side_label_size}}{Size for "Pathways" and "MSigDB" labels. +Default: \code{35}.} +\item{\code{geneset_text_size}}{Text size for gene set labels. Default: \code{5}.} +\item{\code{collection_text_size}}{Text size for collection labels. +Default: \code{5}.} +\item{\code{panel_widths}}{Patchwork relative widths for the 6 panels. +Default: \code{c(4, 25, 15, 3, 10, 3)}.} +\item{\code{col_size}}{Border linewidth for \code{geom_col}. Default: \code{1}.} +\item{\code{axis_title_size}}{Font size for axis titles. Default: \code{45}.} +\item{\code{axis_text_size_x}}{Font size for x-axis labels. Default: \code{30}.} +\item{\code{tick_size}}{Linewidth for axis ticks. Default: \code{1.5}.} +\item{\code{tick_length}}{Length of axis ticks in cm. Default: \code{0.3}.} +\item{\code{panel_spacing_single}}{Spacing between facets. Default: \code{4}.} +} +} +\examples{ +\dontrun{ +gsea_results <- merge_PA("path/to/gsea_results/") + +# Filter to one comparison +single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] + +splot_PA( + data = single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR" +) + +# Cap color scale at -log10(FDR) = 5 so subtle differences are visible +# (gene sets with FDR <= 0.00001 all get the same max red color) +splot_PA( + data = single, + geneset_col = "NAME", collection_col = "COLLECTION", + nes_col = "NES", fdr_col = "FDR", + fill_limits = c(0, 5) +) +} + +} +\seealso{ +\code{\link[=multiplot_PA]{multiplot_PA()}} for multi-comparison faceted barplots; +\code{\link[=merge_PA]{merge_PA()}} to generate the input data frame; +\link{camera_results} for a minimal example dataset. +} diff --git a/man/tpm.Rd b/man/tpm.Rd index f648a45..195c640 100644 --- a/man/tpm.Rd +++ b/man/tpm.Rd @@ -11,6 +11,10 @@ tpm(raw_counts, gene_lengths) \item{gene_lengths}{A column with the gene lengths.} } +\value{ +A numeric matrix of the same dimensions as \code{raw_counts} with TPM +values. Column sums equal 1,000,000 by definition. +} \description{ TPM: Transcript per million. See https://www.biostars.org/p/273537/ The input table is numeric: @@ -20,3 +24,40 @@ The input table is numeric: The gene lengths are in a column of a dataframe with the same row order. } } +\note{ +TPM normalizes for both sequencing depth and gene length, making +values comparable between genes within a sample. It is not appropriate +for differential expression analysis, use DESeq2 normalized counts +(\link{norm_counts}) for that purpose. Gene lengths from \code{\link[=get_annotations]{get_annotations()}} +are genomic lengths (including introns); for higher accuracy use +transcript-level lengths. +} +\examples{ +\dontrun{ +data(raw_counts) +data(deseq2_results) + +# Gene lengths are needed, retrieve from get_annotations() or use +# pre-fetched lengths. Here we use the gene_length column if available. +annotations <- get_annotations( + ensembl_ids = rownames(raw_counts), + mode = "genes" +) + +# Match gene lengths to raw_counts row order +gene_lengths <- annotations$gene_length[ + match(rownames(raw_counts), annotations$geneID) +] + +# Calculate TPM +tpm_matrix <- tpm(raw_counts, gene_lengths) + +# Check: column sums should all be 1,000,000 +round(colSums(tpm_matrix)[1:3]) +} + +} +\seealso{ +\code{\link[=get_annotations]{get_annotations()}} to obtain \code{gene_lengths}; +\link{norm_counts} for DESeq2 size-factor normalized counts. +} diff --git a/man/vst_counts.Rd b/man/vst_counts.Rd new file mode 100644 index 0000000..8811dc6 --- /dev/null +++ b/man/vst_counts.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2_results.R +\docType{data} +\name{vst_counts} +\alias{vst_counts} +\title{Variance-stabilized counts matrix for TCGA-LUAD} +\format{ +A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +\describe{ +\item{rows}{Ensembl gene IDs (e.g., \code{"ENSG00000141510"}).} +\item{columns}{Sample IDs matching the \code{patient_id} column of +\link{sampledata}.} +\item{values}{Numeric. VST-transformed expression values on a log2-like +scale. Range: [1.78, 20.85].} +} +} +\source{ +TCGA-LUAD STAR counts downloaded from the GDC Data Portal +(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +Transformed with \code{\link[DESeq2:vst]{DESeq2::vst()}} (\code{blind = TRUE}); generated by +\code{data-raw/deseq2_results.R}. +} +\usage{ +vst_counts +} +\description{ +Variance Stabilizing Transformation (VST) applied to the TCGA-LUAD RNA-seq +dataset (16 tumor samples, 16 normal samples) using \code{\link[DESeq2:vst]{DESeq2::vst()}} with +\code{blind = TRUE}. VST removes the mean-variance dependence characteristic of +RNA-seq count data, placing all genes on a comparable log2-like scale. This +makes it the appropriate input for sample-level dimensionality reduction and +clustering methods. +} +\details{ +Suitable as input for \code{\link[=nice_PCA]{nice_PCA()}}, \code{\link[=nice_UMAP]{nice_UMAP()}}, and \code{\link[=nice_tSNE]{nice_tSNE()}}. For +gene-level expression plots (\code{\link[=nice_VSB]{nice_VSB()}}) or filtering (\code{\link[=detect_filter]{detect_filter()}}) +use \link{norm_counts} instead. +} +\examples{ +data(vst_counts) +data(sampledata) + +# Dimensions +dim(vst_counts) + +# Value range (log2-like scale) +range(vst_counts) + +# PCA plot colored by sample type +colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +nice_PCA( + object = vst_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD PCA" +) + +\dontrun{ +# UMAP plot +colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" + +nice_UMAP( + object = vst_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD UMAP" +) + +# tSNE plot +# perplexity must be lower than the number of samples divided by 3 + +colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +nice_tSNE( + object = vst_counts, + annotations = sampledata, + perplexity = 5, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD tSNE" +) +} + +} +\seealso{ +\link{norm_counts}, \link{deseq2_results}, \link{sampledata}, \code{\link[=nice_PCA]{nice_PCA()}}, +\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}} +} +\keyword{datasets} diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/DEA_workflow.Rmd b/vignettes/DEA_workflow.Rmd new file mode 100644 index 0000000..b8388b7 --- /dev/null +++ b/vignettes/DEA_workflow.Rmd @@ -0,0 +1,478 @@ +--- +title: "Differential Expression Analysis Workflow with OmicsKit" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Differential Expression Analysis Workflow with OmicsKit} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.align = "center", + warning = FALSE, + message = FALSE +) +``` + +## Overview + +This vignette demonstrates a complete RNA-seq differential expression analysis +(DEA) workflow using OmicsKit, from study design through visualization, using +real TCGA-LUAD data (16 tumor vs. 16 normal lung samples). + +The workflow covers: + +1. **Study design** : statistical power estimation +2. **Quality control** : unsupervised dimensionality reduction +3. **DEA results** : annotation, normalization, and export +4. **Visualization** : volcano plots and gene-level expression plots +5. **Multi-comparison** : splitting genes into exclusive expression cases + +## Required packages + +```{r packages} +library(OmicsKit) +library(ggplot2) +``` + +--- + +## Section 1 : Study design: power analysis + +Before collecting data or running any analysis, it is good practice to estimate +the minimum sample size needed to reliably detect differentially expressed genes +at a given effect size and false discovery rate. + +`power_analysis()` computes statistical power across a range of sample sizes +using an analytical approximation that accounts for multiple testing: + +```{r power_analysis} +result <- power_analysis( + effect_size = 1, # minimum log2 fold-change to detect + dispersion = 0.1, # biological coefficient of variation squared + n_genes = 20000, # total genes tested + prop_de = 0.05, # expected proportion of DE genes + alpha = 0.05, # desired FDR + power_target = 0.8, # desired power (80%) + max_n = 30, + plot = TRUE +) + +# Minimum samples per group needed +result$min_sample_size + +# Full power curve +head(result$power_table) + +# See plot +result$plot +``` + +```{r power_plot, echo = FALSE, fig.cap = "Statistical power vs. sample size per group. The red dashed line marks 80% power; the blue dashed line marks the minimum required sample size.", fig.width = 7, fig.height = 5} +knitr::include_graphics("figures/power_analysis.png") +``` + +The TCGA-LUAD dataset used in this vignette has 16 samples per group, well +above the minimum required for detecting log2FC ≥ 1 at 80% power. + +--- + +## Section 2 : Quality control: unsupervised clustering + +Before running DEA, it is essential to verify that samples cluster by their +biological group and to detect potential batch effects or outliers. OmicsKit +provides three complementary dimensionality reduction methods that all accept +the same VST-transformed count matrix. + +**Why VST?** Variance Stabilizing Transformation removes the mean-variance +dependence of RNA-seq counts, placing all genes on a comparable log2-like +scale. This prevents highly expressed genes from dominating the sample-level +distances. + +```{r load_data} +data(vst_counts) +data(sampledata) + +# nice_PCA, nice_UMAP, and nice_tSNE require a column named "id" +sampledata_dim <- sampledata +colnames(sampledata_dim)[colnames(sampledata_dim) == "patient_id"] <- "id" + +dim(vst_counts) +table(sampledata$sample_type) +``` + +### PCA + +PCA is the fastest method and should always be the first step. The first two +principal components often capture the main sources of variation. + +```{r nice_PCA, eval = FALSE} +nice_PCA( + object = vst_counts, + annotations = sampledata_dim, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD PCA: Tumor vs Normal" +) +``` + +```{r pca_plot, echo = FALSE, fig.cap = "PCA of VST-transformed TCGA-LUAD counts. Tumor (red) and normal (blue) samples separate cleanly along PC1.", fig.width = 7, fig.height = 6} +knitr::include_graphics("figures/nice_PCA.png") +``` + +### UMAP + +UMAP captures non-linear structure and is particularly useful for larger +datasets or when PCA does not show clear separation. + +```{r nice_UMAP, eval = FALSE} +nice_UMAP( + object = vst_counts, + annotations = sampledata_dim, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD UMAP: Tumor vs Normal", + neighbors = 5, + epochs = 1000, + seed = 174 +) +``` + +```{r umap_plot, echo = FALSE, fig.cap = "UMAP of VST-transformed TCGA-LUAD counts.", fig.width = 7, fig.height = 6} +knitr::include_graphics("figures/nice_UMAP.png") +``` + +### tSNE + +tSNE is useful for visualizing tight local clusters. Note that `perplexity` +must be less than one-third of the number of samples. + +```{r nice_tSNE, eval = FALSE} +# With 32 samples, perplexity must be < 10 +nice_tSNE( + object = vst_counts, + annotations = sampledata_dim, + perplexity = 5, + max_iterations = 1000, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD tSNE: Tumor vs Normal", + seed = 174 +) +``` + +```{r tsne_plot, echo = FALSE, fig.cap = "tSNE of VST-transformed TCGA-LUAD counts.", fig.width = 7, fig.height = 6} +knitr::include_graphics("figures/nice_tSNE.png") +``` + +--- + +## Section 3 : DEA results: annotation, normalization, and export + +Once QC confirms clean group separation, we work with the DESeq2 results. The +`deseq2_results` object is already provided as an example dataset. + +```{r load_results} +data(deseq2_results) +data(norm_counts) + +# Overview +dim(deseq2_results) +sum(deseq2_results$padj < 0.05, na.rm = TRUE) +``` + +### Gene annotations + +`get_annotations()` queries Ensembl via biomaRt to retrieve gene symbols, +biotype, chromosomal location, and length. Requires an internet connection. + +```{r get_annotations, eval = FALSE} +annotations <- get_annotations( + ensembl_ids = deseq2_results$gene_id, + mode = "genes", + version = "Current", + filename = "luad_gene_annotations", + format = "csv" +) + +head(annotations) +``` + +### Adding annotations to results and counts + +`add_annotations()` joins annotation columns to any matrix or data frame using +Ensembl IDs as the key: + +```{r add_annotations, eval = FALSE} +# Add gene symbol and biotype to normalized counts +norm_counts_annot <- add_annotations( + object = norm_counts, + reference = annotations, + variables = c("symbol", "biotype") +) + +head(norm_counts_annot[, c("geneID", "symbol", "biotype")]) +``` + +### TPM normalization + +While DESeq2 normalized counts are appropriate for DEA, TPM is useful for +comparing expression levels between genes within a sample. Note that TPM +requires accurate gene lengths, `get_annotations()` provides these via +`$gene_length` (computed as `end - start + 1`). + +```{r tpm, eval = FALSE} +gene_lengths <- annotations$gene_length[ + match(rownames(raw_counts), annotations$geneID) +] + +tpm_matrix <- tpm(raw_counts, gene_lengths) + +# Column sums should all equal 1,000,000 +round(colSums(tpm_matrix)[1:3]) +``` + +### Saving results + +`save_results()` exports three Excel files: all genes, over-expressed only, +and under-expressed only: + +```{r save_results, eval = FALSE} +save_results( + df = deseq2_results, + name = "TCGA_LUAD_TumorVsNormal", + l2fc = 1, + cutoff_alpha = 0.05 +) +# Creates: +# TCGA_LUAD_TumorVsNormal_full.xlsx +# TCGA_LUAD_TumorVsNormal_up_log2FC>1_FDR<0.05.xlsx +# TCGA_LUAD_TumorVsNormal_down_log2FC<1_FDR<0.05.xlsx +``` + +--- + +## Section 4 : Visualization + +### Volcano plot + +The volcano plot is the standard way to visualize DEA results, showing the +relationship between effect size (log2FC) and significance (FDR). + +> **Tip:** To display gene symbols instead of Ensembl IDs, run +> `get_annotations()` and join the `symbol` column to `deseq2_results` +> with `add_annotations()` before calling `nice_Volcano()`. + +```{r nice_Volcano, eval = FALSE} +nice_Volcano( + results = deseq2_results, + x_var = "log2FoldChange", + y_var = "padj", + label_var = "gene_id", + title = "TCGA-LUAD: Tumor vs Normal", + cutoff_y = 0.05, + cutoff_x = 1, + x_range = 8, + y_max = 10 +) +``` + +```{r volcano_plot, echo = FALSE, fig.cap = "Volcano plot of TCGA-LUAD DEA results. Red: upregulated in tumor; blue: downregulated; grey: not significant.", fig.width = 8, fig.height = 7} +knitr::include_graphics("figures/nice_Volcano.png") +``` + +### Detectable genes + +`detect_filter()` identifies genes with reliable expression levels by applying +thresholds on baseMean and mean normalized counts per group. + +```{r detect_filter, eval = FALSE} +# detect_filter requires a column named "ensembl" +res <- deseq2_results +colnames(res)[colnames(res) == "gene_id"] <- "ensembl" +rownames(res) <- res$ensembl + +samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] + +detected <- detect_filter( + norm.counts = as.data.frame(norm_counts), + df.BvsA = res, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor, + cutoffs = c(50, 50, 0) +) + +length(detected$DetectGenes) +``` + +### Significance stars + +`get_stars()` converts FDR values to asterisk notation for annotation in plots: + +```{r get_stars} +data(deseq2_results) + +res_stars <- deseq2_results +colnames(res_stars)[colnames(res_stars) == "gene_id"] <- "ensembl" + +# Most significant gene +get_stars( + geneID = res_stars$ensembl[1], + object = res_stars +) + +# Least significant gene +get_stars( + geneID = res_stars$ensembl[nrow(res_stars)], + object = res_stars +) +``` + +### Gene-level expression: Violin-Scatter-Box plot + +`nice_VSB()` shows the distribution of normalized expression for a single gene +across sample groups. The input is the normalized counts matrix. + +```{r nice_VSB, eval = FALSE} +top_gene <- deseq2_results$gene_id[1] + +# Get symbol +#symbol <- deseq2_results$symbol[1] + +nice_VSB( + object = norm_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + genename = top_gene, + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick"), + shapes = 21, + markersize = 3 +) +``` + +> **Note: alternative using a DESeq2 object directly:** If you still have +> your `DESeqDataSet` object in your session, you can use `nice_VSB_DESeq2()` +> instead. It extracts the normalized counts internally via +> `DESeq2::counts(dds, normalized = TRUE)`, so you do not need to provide +> a separate `norm_counts` matrix: +> +> ```r +> nice_VSB_DESeq2( +> object = dds, +> variables = c(fill = "sample_type"), +> genename = top_gene, +> symbol = "TP53", # optional: displayed in plot title +> categories = c("normal", "tumor"), +> labels = c("Normal", "Tumor"), +> colors = c("steelblue", "firebrick"), +> shapes = 21, +> markersize = 3 +> ) +> ``` +> +> Use `nice_VSB()` when working from a pre-computed normalized counts matrix, +> and `nice_VSB_DESeq2()` when the DESeq2 object is still available in your +> environment. + + +```{r vsb_plot, echo = FALSE, fig.cap = "Violin-Scatter-Box plot of the most significant DEG across tumor and normal samples.", fig.width = 6, fig.height = 6} +knitr::include_graphics("figures/nice_VSB.png") +``` + + + +--- + +## Section 5 : Multi-comparison analysis (optional) + +When a study includes three phenotypes (e.g., normal, primary tumor, +metastasis), `split_cases()` classifies genes into 10 mutually exclusive +expression patterns based on significance and direction across all three +pairwise comparisons. + +```{r split_cases, eval = FALSE} +# Requires three DESeq2 result data frames: +# df.BvsA : condition 1 vs baseline +# df.CvsA : condition 2 vs baseline +# df.BvsC : condition 1 vs condition 2 + +cases <- split_cases( + df.BvsA = df_tumor_vs_normal, + df.CvsA = df_meta_vs_normal, + df.BvsC = df_tumor_vs_meta, + unique_id = "ensembl", + significance_var = "padj", + significance_cutoff = 0.05, + change_var = "log2FoldChange", + change_cutoff = 1 +) + +# Number of genes per case +sapply(cases, nrow) + +# Case 1: ladder genes : progressive up or down across all comparisons +head(cases$Case1) +``` + +--- + +## Full workflow - summary + +```{r full_workflow, eval = FALSE} +library(OmicsKit) + +# 1. Study design +power_analysis(effect_size = 1, dispersion = 0.1, n_genes = 20000) + +# 2. QC: dimensionality reduction (use VST counts) +data(vst_counts); data(sampledata) +sampledata$id <- sampledata$patient_id + +nice_PCA(vst_counts, sampledata, variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), shapes = c(21, 21)) + +# 3. Annotate and export results +annotations <- get_annotations(deseq2_results$gene_id) +norm_counts_annot <- add_annotations(norm_counts, annotations) +save_results(deseq2_results, name = "TumorVsNormal", l2fc = 1) + +# 4. Visualize and filter non-detectable genes +nice_Volcano(deseq2_results, x_var = "log2FoldChange", + y_var = "padj", label_var = "gene_id", + title = "Tumor vs Normal") + +detected <- detect_filter(as.data.frame(norm_counts), deseq2_results, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor) + +nice_VSB(norm_counts, sampledata, + variables = c(fill = "sample_type"), + genename = detected$DetectGenes[1], + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick")) + +# 5. Multi-comparison (if 3 phenotypes) +cases <- split_cases(df.BvsA, df.CvsA, df.BvsC) +``` + +--- + +## Session info + +```{r session_info} +sessionInfo() +``` diff --git a/vignettes/PA_clustering.Rmd b/vignettes/PA_clustering.Rmd new file mode 100644 index 0000000..cb48f83 --- /dev/null +++ b/vignettes/PA_clustering.Rmd @@ -0,0 +1,327 @@ +--- +title: "Pathway Analysis Clustering with OmicsKit" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Pathway Analysis Clustering with OmicsKit} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height = 5, + warning = FALSE, + message = FALSE +) +``` + +## Overview + +After a gene set enrichment analysis (e.g., CAMERA, GSEA, fgsea), a common +challenge is that hundreds of significant gene sets are returned — many of which +are redundant because they share large overlapping gene memberships. This +vignette demonstrates how to use OmicsKit's pathway clustering functions to: + +1. **Load gene sets** from GMT files with `list_gmts()` +2. **Quantify redundancy** between gene sets using Jaccard similarity with + `geneset_similarity()` +3. **Cluster** redundant gene sets hierarchically with `do_clust()` +4. **Detect communities** in the gene set network and generate interpretable + labels with `get_network_communities()` +5. **Visualize** the network with `network_clust()` (base R) or + `network_clust_gg()` (ggplot2) + +## Required packages + +```{r packages} +library(OmicsKit) + +# Suggested packages used in this vignette +library(igraph) +library(ComplexHeatmap) +``` + +--- + +## Step 1 — Load gene sets + +In a real analysis, gene sets are stored as `.gmt` files. `list_gmts()` reads +all `.gmt` files in a directory and returns a single named list. + +```{r list_gmts, eval = FALSE} +geneset_list <- list_gmts("path/to/your/gmt_folder/") +``` + +For this vignette we use the built-in example data, which contains 40 curated +gene sets spanning four biological themes: apoptosis, cell cycle, immune +response, and metabolism. + +```{r load_data} +data(geneset_list) +data(camera_results) + +# How many gene sets? +length(geneset_list) + +# What does the enrichment results table look like? +head(camera_results) + +# How many are significant at FDR < 0.05? +sum(camera_results$FDR < 0.05) +``` + +--- + +## Step 2 — Jaccard similarity matrix + +`geneset_similarity()` filters the gene sets by FDR threshold and computes all +pairwise Jaccard similarity coefficients: + +$$J(A, B) = \frac{|A \cap B|}{|A \cup B|}$$ + +A value of 1 means the two gene sets share identical gene membership; 0 means +no overlap at all. + +```{r geneset_similarity} +jac <- geneset_similarity( + geneset_list = geneset_list, + results = camera_results, + fdr_th = 0.05 +) + +# The JaccardResult object contains three slots +names(jac) + +# Dimensions of the similarity matrix +dim(jac$jaccard_sim) +``` + +The `$dist_mat` slot (1 − Jaccard similarity) can be reused independently — +for example as input to `nice_UMAP()` or any other distance-based method. + +```{r jaccard_preview} +# Preview the top-left corner of the similarity matrix +jac$jaccard_sim[1:5, 1:5] +``` + +--- + +## Step 3 — Hierarchical clustering + +`do_clust()` performs Ward-D2 hierarchical clustering on the distance matrix +and automatically selects the optimal number of clusters *k* by maximizing the +average silhouette width. + +```{r do_clust} +clust <- do_clust(jac) + +# Optimal k selected automatically +clust$optimal_k + +# Cluster assignments (one row per gene set) +head(clust$cluster_assignments) +``` + +### Silhouette curve + +The red dot marks the selected *k*: + +```{r silhouette_plot, fig.cap = "Average silhouette width vs. number of clusters. The optimal k is highlighted in red."} +clust$silhouette_plot +``` + +### Jaccard heatmap with dendrogram + +Gene sets in the same hierarchical cluster appear as blocks of high similarity +(darker blue) along the diagonal: + +```{r heatmap, fig.width = 6, fig.height = 6, fig.cap = "Jaccard similarity heatmap with Ward-D2 dendrogram."} +ComplexHeatmap::draw(clust$heatmap) +``` + +--- + +## Step 4 — Community detection and super-terms + +Hierarchical clustering groups gene sets by overall similarity, but the network +community detection captures a different structure: densely connected +sub-networks within the Jaccard graph. + +`get_network_communities()` does three things in a single call: + +- Builds a binary adjacency matrix (edges where Jaccard > `threshold`) +- Runs the Louvain algorithm to detect communities +- Calls `get_superterm()` internally to generate TF-IDF labels for each + community + +```{r get_network_communities} +net <- get_network_communities( + x = jac, + threshold = 0.3, + method = "louvain", + superterms = TRUE, + n_terms = 3, + remove_prefix = TRUE, + seed = 1905 +) + +# How many communities were detected? +length(unique(net$membership)) + +# Community summary: label + size +net$superterms$summary +``` + +The `$superterms$mapping` tibble links every gene set to its community and +its super-term label: + +```{r superterm_mapping} +head(net$superterms$mapping) +``` + +--- + +## Step 5 — Network visualization + +### Option A: base R igraph (`network_clust`) + +`network_clust()` draws plots directly to the active graphics device and +returns node attributes invisibly. It is faster and requires only `igraph`. + +```{r network_clust_clean, fig.width = 7, fig.height = 7, fig.cap = "Gene set network — clean view. Node color reflects hierarchical cluster."} +result <- network_clust( + x = jac, + clust_result = clust, + jaccard_threshold = 0.10, + min_degree = 1, + superterms = TRUE, + superterm_data = net$superterms, + type = "clean", + seed = 1905 +) +``` + +```{r network_clust_superterms, fig.width = 7, fig.height = 7, fig.cap = "Gene set network with community super-term labels."} +network_clust( + x = jac, + clust_result = clust, + jaccard_threshold = 0.10, + min_degree = 1, + superterms = TRUE, + superterm_data = net$superterms, + type = "superterms", + seed = 1905 +) +``` + +### Option B: ggraph / ggplot2 (`network_clust_gg`) + +`network_clust_gg()` returns a named list of ggplot2 objects that can be +further customized, saved with `ggsave()`, or composed with `patchwork`. +Requires `ggraph` and `tidygraph`. + +```{r network_clust_gg, fig.width = 7, fig.height = 7, fig.cap = "Gene set network (ggraph) with super-term labels."} +plots <- network_clust_gg( + x = jac, + clust_result = clust, + jaccard_threshold = 0.10, + min_degree = 1, + superterms = TRUE, + superterm_data = net$superterms, + type = "all", + seed = 1905 +) + +# Display the combined view (super-terms + individual node labels) +plots$superterms +``` + +```{r patchwork, fig.width = 12, fig.height = 6, fig.cap = "Side-by-side comparison: clean view (left) vs. super-term labels (right)."} +library(patchwork) +plots$clean + plots$superterms +``` + +### Saving plots + +```{r save, eval = FALSE} +# Save with ggsave +ggplot2::ggsave( + filename = "network_combined.pdf", + plot = plots$combined, + width = 14, + height = 14 +) + +# Save base R igraph plot to PDF +pdf("network_superterms_igraph.pdf", width = 14, height = 14) +network_clust(jac, clust, + superterms = TRUE, + superterm_data = net$superterms, + type = "superterms") +dev.off() +``` + +--- + +## Step 6 — Downstream tables + +Both network functions return node-level attributes for downstream analysis: + +```{r node_attributes} +# From the igraph version (invisible return) +head(result$node_attributes) +``` + +```{r superterm_report} +# Superterm report: community membership breakdown +result$superterm_report +``` + +--- + +## Full workflow — summary + +```{r full_workflow, eval = FALSE} +library(OmicsKit) + +# 1. Load gene sets +gsl <- list_gmts("path/to/gmt_folder/") + +# 2. Jaccard similarity (filter by FDR < 0.05) +jac <- geneset_similarity(gsl, camera_results, fdr_th = 0.05) + +# 3. Hierarchical clustering +clust <- do_clust(jac) +clust$silhouette_plot +ComplexHeatmap::draw(clust$heatmap) + +# 4. Community detection + super-terms +net <- get_network_communities(jac, threshold = 0.3) +net$superterms$summary + +# 5a. Network plot — base R (draws to device) +network_clust(jac, clust, + superterms = TRUE, + superterm_data = net$superterms) + +# 5b. Network plot — ggplot2 (returns objects) +plots <- network_clust_gg(jac, clust, + superterms = TRUE, + superterm_data = net$superterms) +plots$combined +ggplot2::ggsave("network.pdf", plots$combined, width = 14, height = 14) + +# 6. Reuse the distance matrix for UMAP +nice_UMAP(as.matrix(jac$dist_mat)) +``` + +--- + +## Session info + +```{r session_info} +sessionInfo() +``` diff --git a/vignettes/PA_workflow.Rmd b/vignettes/PA_workflow.Rmd new file mode 100644 index 0000000..97a42c9 --- /dev/null +++ b/vignettes/PA_workflow.Rmd @@ -0,0 +1,381 @@ +--- +title: "Pathway Analysis Workflow with OmicsKit" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Pathway Analysis Workflow with OmicsKit} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} + +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.align = "center", + warning = FALSE, + message = FALSE +) +``` + +## Overview +This vignette demonstrates a complete gene set enrichment analysis (GSEA) +downstream workflow using OmicsKit, from loading gene sets through +publication-quality visualization. + +The workflow covers: + +1. **Loading gene sets** : reading GMT files with `list_gmts()` +2. **Merging results** : combining multi-collection GSEA TSV files with + `merge_PA()` +3. **Gene extraction** : retrieving leading edge and top-ranked genes with + `getgenesPA()` and annotating results with `addgenesPA()` +4. **Heatmaps** : visualizing gene expression per pathway with `heatmap_PA()` + (file-path alternative: `heatmap_path_PA()`) +5. **Single-comparison plot** : `splot_PA()` : a publication-quality + multi-panel barplot for one comparison +6. **Multi-comparison plot** : `multiplot_PA()` : faceted barplots comparing + enrichment across conditions + +## Required packages +```{r packages} +library(OmicsKit) +library(ggplot2) +``` + +--- + +## Section 1 : Load gene sets +### `list_gmts()` +Gene sets used for enrichment analysis are stored as `.gmt` files, the +standard format used by MSigDB. `list_gmts()` reads all `.gmt` files from a +directory and returns a single named list, ready for downstream use. + +```{r list_gmts, eval = FALSE} +geneset_list <- list_gmts("path/to/your/gmt_folder/") + +# Number of gene sets loaded +length(geneset_list) +# Names of the first five gene sets +names(geneset_list)[1:5] +# Genes in one specific set +geneset_list[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] +``` + +In this vignette, we use a built-in example `geneset_list`, which contains +40 curated gene sets across four biological themes (apoptosis, cell cycle, +immune response, and metabolism), and the built-in `gsea_results` dataset, +which mimics the output of `merge_PA()` for three comparisons across HALLMARK, +KEGG, and GO collections. +```{r load_data} +data(geneset_list) +data(gsea_results) +data(deseq2_results) +data(vst_counts) +data(sampledata) +# Ranked gene list: DESeq2 Wald stat from most positive to most negative +ranked_genes <- deseq2_results$gene_id[ + order(deseq2_results$stat, decreasing = TRUE) +] +# Overview of the enrichment results +dim(gsea_results) +unique(gsea_results$COMPARISON) +unique(gsea_results$COLLECTION) +``` + +--- + +## Section 2 : Merge GSEA results +### `merge_PA()` +In a real analysis, GSEA produces one results file per MSigDB collection +(e.g., `H.tsv` for HALLMARK, `C2.tsv` for KEGG). `merge_PA()` reads all +`.tsv` files from a directory, standardizes column names, parses the leading +edge string into numeric components (`tags`, `list`, `signal`), computes +`-log10(FDR)`, and returns a single combined data frame. +```{r merge_PA, eval = FALSE} +gsea_data <- merge_PA( + input_directory = "path/to/gsea_results/", + fdr_replace = 0.001 # replace FDR = 0 (below permutation resolution) +) +# Inspect result +head(gsea_data[, c("NAME", "NES", "FDR", "COLLECTION", "COMPARISON", "tags")]) +``` + +> **Note:** Each `.tsv` file must contain a `Comparison` column identifying +> the comparison name (e.g., `"TumorVsNormal"`). Then `merge_PA()` renames it to +> `COMPARISON`. If your files come from a single run without that column, +> add it manually: +> ```r +> your_data$Comparison <- "TumorVsNormal" +> ``` + +The built-in `gsea_results` already has this structure: +```{r inspect_data} +# Key columns produced by merge_PA() +head(gsea_results[, c("NES", "FDR", "Log10FDR", + "COLLECTION", "COMPARISON", "tags", "SIZE")], n = 2) +``` + +--- + +## Section 3 : Extract leading edge genes +### `getgenesPA()` +### `addgenesPA()` +After obtaining pathway results, `getgenesPA()` retrieves the gene members +relevant to each enrichment signal. Three extraction modes are available: +- **`"le"`** (GSEA only): leading edge genes : the subset that drives the + enrichment score. +- **`"top"`**: top-ranked *n* % of genes by rank metric : applicable to any + enrichment method (GSEA, CAMERA, PADOG, etc.). +- **`"all"`**: all members of the gene set, ordered by rank. +`addgenesPA()` then appends the gene lists as comma-separated columns +(`le_genes`, `top_genes`, `all_genes`) to the pathway results table. + +```{r getgenesPA_addgenesPA, eval = FALSE} +# Filter to one comparison +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +# Optional: define the top fraction for mode "top" +pa_single$top <- 0.30 # top 30% of gene set members by rank +# Extract genes using all three modes +gene_lists <- getgenesPA( + pa_data = pa_single, + geneset_list = geneset_list, + ranked_genes = ranked_genes, + genes = c("all", "le", "top") +) +# Inspect results for one pathway + ## leading edge + gene_lists$le[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] + ## top 30% by rank + gene_lists$top[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] + ## all members + gene_lists$all[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] +# Append gene columns to the pathway table +pa_annot <- addgenesPA(pa_single, gene_lists) +# Number of gene sets annotated +nrow(pa_annot) + +head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) +``` + +> **Tip:** For CAMERA or other enrichment methods that do not produce leading +> edge information, use `genes = c("all", "top")` and set `pa_data$top` to +> your desired fraction (e.g., `0.25` for the top 25 % by rank). Do not +> request `genes = "le"` without a `tags` column. + +--- + +## Section 4 : Heatmaps +### `heatmap_PA()` +`heatmap_PA()` generates one heatmap per gene set based on `pa_data_annot`. +Genes are ordered within each heatmap by their position in `ranked_genes`. Output +files are organized into subdirectories by format (PDF / JPG) and gene +selection mode (`all_genes`, `le_genes`, `top_genes`). + +```{r heatmap_PA, eval = FALSE} +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked_genes, + plot_genes = c("all_genes", "le_genes", "top_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_PA", + pdf = TRUE, + jpg = TRUE +) +# Creates, for example: +# heatmaps_PA/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg +# heatmaps_PA/pdf/le_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.pdf +# ... (one file per gene set per format per mode) +``` + +The heatmap below shows the top-ranked genes for +`HALLMARK_INTERFERON_GAMMA_RESPONSE` across tumor and normal TCGA-LUAD +samples: + +```{r heatmap_plot, echo = FALSE, fig.cap = "Expression heatmap of top-ranked genes in HALLMARK_INTERFERON_GAMMA_RESPONSE across TCGA-LUAD tumor and normal samples. Genes are ordered by DESeq2 Wald statistic.", fig.width = 7, fig.height = 6} + +knitr::include_graphics( + "figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg" +) +``` + +### `heatmap_path_PA()` : file-path alternative +`heatmap_path_PA()` is a convenience wrapper that reads all inputs from disk +(expression TSV, metadata XLSX, GMT file, ranked-genes TSV, GSEA TSV) and +calls the same heatmap engine internally. It is useful when running the +analysis immediately after GSEA without loading data into R. + +```{r heatmap_path_PA, eval = FALSE} + +heatmap_path_PA( + main_dir = "path/to/analysis/", + expression_file = "vst_expression.tsv", + metadata_file = "metadata.xlsx", + gmt_file = "h.all.v2023.symbols.gmt", + ranked_genes_file = "ranked_genes.tsv", + gsea_file = "H.tsv", + output_dir = "leading_edge_heatmaps", + sample_col = "patient_id", + group_col = "sample_type", + save_dataframe = TRUE # also saves intermediate gene table as .tsv +) +# Produces the same output as heatmap_PA() for a single GSEA file +``` + +> **Which function to use?** Prefer `heatmap_PA()` when your data are already +> in R (e.g., after calling `merge_PA()`, `getgenesPA()`, and `addgenesPA()`). +> Use `heatmap_path_PA()` when you want a quick one-call solution that reads +> directly from files on disk. Both functions produce identical heatmaps. + +--- + +## Section 5 : General plot | Single-comparison +`splot_PA()` generates a publication-quality multi-panel enrichment plot for +**one comparison**. Gene sets appear on the y-axis (grouped by MSigDB +collection), NES on the x-axis, and −log10(FDR) as fill color. Six panels +are assembled side-by-side using `patchwork`. +```{r splot_PA, eval = FALSE} + +single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +splot_PA( + data = single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + order = "desc", + fill_limits = c(0, 2), # cap at -log10(FDR) = 5 + fill_palette = c("white", "red") +) +``` + +```{r splot_plot, echo = FALSE, fig.cap = "Single-comparison pathway enrichment plot (TumorVsNormal). Gene sets are sorted by NES; fill color encodes -log10(FDR), capped at 2. Collections are annotated to the right.", fig.width = 6, fig.height = 5} +knitr::include_graphics("figures_PA/splot_PA.jpg") +``` +> **Tip:** Use `fill_limits = c(0, 2)` to prevent a handful of extremely +> significant gene sets from washing out the color contrast for the rest. Any +> pathway with FDR ≤ 0.01 will be shown in the maximum color (red). + +--- + +## Section 6 : General plot | Multi-comparison +`multiplot_PA()` generates a **faceted barplot** showing NES across multiple +comparisons for a selected set of gene sets. Each facet represents one gene +set; bars represent the NES per comparison, colored by −log10(FDR). This +layout makes it straightforward to assess how enrichment changes across +conditions. +```{r multiplot_PA, eval = FALSE} + +# Subset to pathways of interest across all comparisons +pathways_of_interest <- c( + "HALLMARK_INTERFERON_GAMMA_RESPONSE", + "HALLMARK_INFLAMMATORY_RESPONSE", + "HALLMARK_G2M_CHECKPOINT", + "HALLMARK_E2F_TARGETS", + "HALLMARK_GLYCOLYSIS", + "HALLMARK_OXIDATIVE_PHOSPHORYLATION" +) +multi_data <- gsea_results[gsea_results$NAME %in% pathways_of_interest, ] +multiplot_PA( + data = multi_data, + comparison_col = "COMPARISON", + facet_col = "NAME", + axis_y = "NES", + fdr_col = "FDR", + comparison_order = c("TumorVsNormal", + "MetastasisVsNormal", + "MetastasisVsTumor"), + custom_labels = c( + TumorVsNormal = "Tumor", + MetastasisVsNormal = "Meta", + MetastasisVsTumor = "Meta/Tumor" + ), + ncol_wrap = 3, + free_y = TRUE, + fill_limits = c(0, 5), + fill_palette = c("white", "red") +) +``` + +```{r multiplot_plot, echo = FALSE, fig.cap = "Multi-comparison pathway enrichment plot. Each facet shows NES for one HALLMARK gene set across three pairwise comparisons. Fill color encodes -log10(FDR), capped at 5.", fig.width = 6, fig.height = 5} +knitr::include_graphics("figures_PA/multiplot_PA.jpg") +``` + +> **Tip:** Use `comparison_order` to control the left-to-right arrangement of +> comparisons on the x-axis of each facet, and `custom_labels` to replace +> long comparison names with shorter axis labels. + +## Full workflow : summary +```{r full_workflow, eval = FALSE} +library(OmicsKit) + +# 1. Load gene sets +gsl <- list_gmts("path/to/gmt_folder/") + +# 2. Merge GSEA output TSV files +gsea_data <- merge_PA( + input_directory = "path/to/gsea_results/", + fdr_replace = 0.001 +) +# 3. Build ranked gene list (from DESeq2 stat or log2FC) +ranked <- deseq2_results$gene_id[ + order(deseq2_results$stat, decreasing = TRUE) +] +# 4. Extract leading edge and top-ranked genes +pa_single <- gsea_data[gsea_data$COMPARISON == "TumorVsNormal", ] +pa_single$top <- 0.30 +gene_lists <- getgenesPA(pa_single, gsl, ranked, genes = c("all", "le", "top")) +pa_annot <- addgenesPA(pa_single, gene_lists) +# 5. Heatmaps +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked, + plot_genes = c("all_genes", "le_genes", "top_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_PA" +) +# Alternative: file-path version (reads directly from disk) +heatmap_path_PA( + main_dir = "path/to/analysis/", + expression_file = "vst_expression.tsv", + metadata_file = "metadata.xlsx", + gmt_file = "h.all.v2023.symbols.gmt", + ranked_genes_file = "ranked_genes.tsv", + gsea_file = "H.tsv", + output_dir = "leading_edge_heatmaps" +) +# 6. Single-comparison enrichment plot +splot_PA( + data = pa_single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + fill_limits = c(0, 5) +) +# 7. Multi-comparison enrichment plot +pathways_oi <- c( + "HALLMARK_INTERFERON_GAMMA_RESPONSE", + "HALLMARK_INFLAMMATORY_RESPONSE", + "HALLMARK_G2M_CHECKPOINT" +) +multiplot_PA( + data = gsea_data[gsea_data$NAME %in% pathways_oi, ], + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + comparison_order = c("TumorVsNormal", "MetastasisVsNormal", "MetastasisVsTumor"), + ncol_wrap = 3 +) +``` + +## Session info +```{r session_info} +sessionInfo() +``` diff --git a/vignettes/figures/nice_PCA.png b/vignettes/figures/nice_PCA.png new file mode 100644 index 0000000..348d6da Binary files /dev/null and b/vignettes/figures/nice_PCA.png differ diff --git a/vignettes/figures/nice_UMAP.png b/vignettes/figures/nice_UMAP.png new file mode 100644 index 0000000..a6aa562 Binary files /dev/null and b/vignettes/figures/nice_UMAP.png differ diff --git a/vignettes/figures/nice_VSB.png b/vignettes/figures/nice_VSB.png new file mode 100644 index 0000000..caf0369 Binary files /dev/null and b/vignettes/figures/nice_VSB.png differ diff --git a/vignettes/figures/nice_Volcano.png b/vignettes/figures/nice_Volcano.png new file mode 100644 index 0000000..a84432b Binary files /dev/null and b/vignettes/figures/nice_Volcano.png differ diff --git a/vignettes/figures/nice_tSNE.png b/vignettes/figures/nice_tSNE.png new file mode 100644 index 0000000..2736fe6 Binary files /dev/null and b/vignettes/figures/nice_tSNE.png differ diff --git a/vignettes/figures/power_analysis.png b/vignettes/figures/power_analysis.png new file mode 100644 index 0000000..5d1a639 Binary files /dev/null and b/vignettes/figures/power_analysis.png differ diff --git a/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg b/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg new file mode 100644 index 0000000..12a390a Binary files /dev/null and b/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg differ diff --git a/vignettes/figures_PA/multiplot_PA.jpg b/vignettes/figures_PA/multiplot_PA.jpg new file mode 100644 index 0000000..1b0cc70 Binary files /dev/null and b/vignettes/figures_PA/multiplot_PA.jpg differ diff --git a/vignettes/figures_PA/splot_PA.jpg b/vignettes/figures_PA/splot_PA.jpg new file mode 100644 index 0000000..ef4b363 Binary files /dev/null and b/vignettes/figures_PA/splot_PA.jpg differ