diff --git a/NAMESPACE b/NAMESPACE index 9cd61e0..d7bc4fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(add_annotations) +export(addgenesPA) export(calc_jaccard) export(detect_filter) export(do_clust) @@ -8,9 +9,12 @@ export(get_annotations) export(get_network_communities) export(get_stars) export(get_superterm) +export(getgenesPA) export(heatmap_PA) +export(heatmap_path_PA) export(list_gmts) export(merge_PA) +export(multiplot_PA) export(network_clust) export(network_clust_gg) export(nice_KM) @@ -20,13 +24,12 @@ export(nice_VSB) export(nice_VSB_DEseq2) export(nice_Volcano) export(nice_tSNE) -export(plot_PA) export(power_analysis) export(save_results) export(split_cases) +export(splot_PA) export(tpm) import(ggplot2) -importFrom(cowplot,get_legend) importFrom(magrittr,"%>%") importFrom(patchwork,plot_layout) importFrom(rlang,.data) diff --git a/R/get_genes_PA.R b/R/get_genes_PA.R new file mode 100644 index 0000000..92ca2c3 --- /dev/null +++ b/R/get_genes_PA.R @@ -0,0 +1,344 @@ +###################### +# Function getgenesPA # +###################### + +#' Extract gene members from pathway analysis results +#' +#' For each gene set in a pathway analysis results table, retrieves leading +#' edge genes, a user-defined top fraction of genes, all genes in the gene +#' set, or any combination. All gene lists are ordered by their rank in the +#' provided ranked gene list. +#' +#' **Three extraction modes:** +#' +#' * `"le"`: **GSEA output only.** Leading edge genes: the subset of genes +#' that drives the enrichment signal. Size is computed as +#' `round(SIZE * tags)`, where `tags` is the fraction of gene hits before +#' (positive ES) or after (negative ES) the peak in the running enrichment +#' score. Definition from the GSEA User Guide: *"The percentage of gene hits +#' before (for positive ES) or after (for negative ES) the peak in the +#' running enrichment score. This gives an indication of the percentage of +#' genes contributing to the enrichment score."* +#' (\url{https://docs.gsea-msigdb.org/#GSEA/GSEA_User_Guide/}). +#' Requires columns `SIZE` and `tags` in `pa_data`, produced automatically +#' by [merge_PA()]. +#' +#' * `"top"`: **Any enrichment result (GSEA, CAMERA, PADOG, etc.).** +#' A user-defined top fraction of genes ordered by rank. Size is computed as +#' `round(SIZE * top)`, where `top` is a numeric value between 0 and 1 +#' provided in a `top` column of `pa_data`. This does **not** represent true +#' leading edge genes: it is a flexible, rank-based selection suitable for +#' exploratory visualization with any pathway analysis method. +#' Requires columns `SIZE` and `top` in `pa_data`. +#' +#' * `"all"`: All genes in the gene set, ordered by rank. +#' +#' @param pa_data A data frame of pathway analysis results. Must always +#' contain: +#' * `NAME`: gene set name. +#' +#' Additionally required depending on `genes`: +#' * `SIZE`: number of genes in the gene set. Required for `"le"` and +#' `"top"`. +#' * `tags`: numeric fraction (0-1) of genes contributing to the +#' enrichment score (GSEA leading edge). Produced automatically by +#' [merge_PA()]. Required for `genes = "le"`. +#' * `top`: numeric fraction (0-1) defining the proportion of top-ranked +#' genes to extract. Set manually by the user (e.g., +#' `pa_data$top <- 0.25` for the top 25%). Required for `genes = "top"`. +#' +#' Typically the output of [merge_PA()]. +#' @param geneset_list A named list of gene sets, where each element is a +#' character vector of gene symbols. Typically the output of [list_gmts()], +#' or use the built-in [geneset_list] for quick testing. +#' @param ranked_genes A character vector of gene symbols ordered by their +#' ranking metric (e.g., DESeq2 `stat`, log2FC, or signal-to-noise ratio), +#' from most positive to most negative. Non-significant genes fall in the +#' middle of the list. Used to order genes within each extracted set. +#' @param genes Character vector specifying which extraction mode(s) to use. +#' Any combination of `"all"`, `"le"`, and `"top"`. Default: +#' `c("all", "le")`. +#' +#' @return Depends on `genes`: +#' * Single mode (e.g., `genes = "le"`): a named list where each element +#' is a character vector of gene symbols. The list has an attribute +#' `genes_type` used by [addgenesPA()] to name the output column. +#' * Multiple modes (e.g., `genes = c("all", "le", "top")`): a named list +#' with one element per requested mode: +#' * `$all`: named list of all gene set members. +#' * `$le`: named list of leading edge genes (GSEA only). +#' * `$top`: named list of top-ranked genes. +#' +#' @examples +#' \dontrun{ +#' data(gsea_results) +#' data(geneset_list) +#' data(deseq2_results) +#' +#' #or +#' gsl <- list_gmts("path/to/gmt_folder/") +#' +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' +#' # ── GSEA results: all three modes available +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le", "top")) +#' +#' # But first add the top column (e.g. top 30% of genes by rank) +#' pa_single$top <- 0.30 +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le", "top")) +#' +#' gene_lists$le[["KEGG_APOPTOSIS"]] # leading edge genes +#' gene_lists$top[["KEGG_APOPTOSIS"]] # top 30% by rank +#' gene_lists$all[["KEGG_APOPTOSIS"]] # all genes +#' +#' pa_annot <- addgenesPA(pa_single, gene_lists) +#' head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) +#' +#' # ── CAMERA results: use "top" (no leading edge available) ─── +#' data(camera_results) +#' camera_pa <- camera_results +#' colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +#' camera_pa$SIZE <- sapply(camera_pa$NAME, +#' function(x) length(geneset_list[[x]])) +#' camera_pa$top <- 0.25 # top 25% by rank +#' +#' gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, +#' genes = c("all", "top")) +#' pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +#' head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +#' } +#' +#' @seealso [addgenesPA()] to append gene columns to pa_data; +#' [heatmap_PA()] for heatmap visualization; +#' [list_gmts()] to generate `geneset_list`; +#' [merge_PA()] to generate `pa_data` with the required `tags` column. +#' +#' @export + +getgenesPA <- function(pa_data, geneset_list, ranked_genes, + genes = c("all", "le")) { + + genes <- match.arg(genes, choices = c("all", "le", "top"), several.ok = TRUE) + + # --- Input validation ---- + if (!is.data.frame(pa_data)) { + stop("`pa_data` must be a data frame.", call. = FALSE) + } + if (!"NAME" %in% colnames(pa_data)) { + stop("`pa_data` must contain a column named 'NAME'.", call. = FALSE) + } + if ("le" %in% genes && !all(c("SIZE", "tags") %in% colnames(pa_data))) { + stop( + "`pa_data` must contain columns 'SIZE' and 'tags' when genes = 'le'. ", + "These are produced by merge_PA() from GSEA output. ", + "For non-GSEA results, use genes = 'top' and add: ", + "pa_data$SIZE <- ...; pa_data$top <- 0.25", + call. = FALSE + ) + } + if ("top" %in% genes && !all(c("SIZE", "top") %in% colnames(pa_data))) { + stop( + "`pa_data` must contain columns 'SIZE' and 'top' when genes = 'top'. ", + "Add them manually: pa_data$SIZE <- ...; pa_data$top <- 0.25", + call. = FALSE + ) + } + if (!is.list(geneset_list) || is.null(names(geneset_list))) { + stop("`geneset_list` must be a named list. Use list_gmts() to generate it.", + call. = FALSE) + } + if (!is.character(ranked_genes) || length(ranked_genes) == 0) { + stop("`ranked_genes` must be a non-empty character vector.", call. = FALSE) + } + + common_sets <- intersect(pa_data$NAME, names(geneset_list)) + if (length(common_sets) == 0) { + stop( + "No gene set names in `pa_data$NAME` match `geneset_list`. ", + "Check that both use the same naming convention.", + call. = FALSE + ) + } + + # --- Internal helper: order genes by rank and take top n - + .extract_top_n <- function(all_g, n, ranked_genes) { + if (is.na(n) || n <= 0) return(character(0)) + gene_ranks <- match(all_g, ranked_genes) + ordered_g <- all_g[order(gene_ranks, na.last = TRUE)] + utils::head(ordered_g, n) + } + + result <- list() + + # --- all: every gene in the gene set ordered by rank + if ("all" %in% genes) { + result$all <- stats::setNames( + lapply(common_sets, function(gs) { + all_g <- geneset_list[[gs]] + gene_ranks <- match(all_g, ranked_genes) + all_g[order(gene_ranks, na.last = TRUE)] + }), + common_sets + ) + } + + # --- le: leading edge genes (GSEA only, uses tags column) + if ("le" %in% genes) { + result$le <- stats::setNames( + lapply(common_sets, function(gs) { + row <- pa_data[pa_data$NAME == gs, ][1, ] + all_g <- geneset_list[[gs]] + # Leading edge size from tags (fraction contributing to ES peak) + le_size <- round(as.numeric(row$SIZE) * as.numeric(row$tags)) + .extract_top_n(all_g, le_size, ranked_genes) + }), + common_sets + ) + } + + # --- top: user-defined top fraction by rank (any enrichment method) - + if ("top" %in% genes) { + result$top <- stats::setNames( + lapply(common_sets, function(gs) { + row <- pa_data[pa_data$NAME == gs, ][1, ] + all_g <- geneset_list[[gs]] + top_size <- round(as.numeric(row$SIZE) * as.numeric(row$top)) + .extract_top_n(all_g, top_size, ranked_genes) + }), + common_sets + ) + } + + # --- Simplify output if only one mode requested ----- + if (length(genes) == 1) { + out <- result[[genes]] + attr(out, "genes_type") <- genes + return(out) + } + + return(result) +} + + +###################### +# Function addgenesPA # +###################### + +#' Add gene columns to pathway analysis results +#' +#' Appends `all_genes`, `le_genes`, and/or `top_genes` columns to a pathway +#' analysis results data frame based on the output of [getgenesPA()]. +#' Gene symbols within each cell are comma-separated. Automatically detects +#' which column(s) to add based on the structure of the input. +#' +#' @param pa_data A data frame of pathway analysis results containing a `NAME` +#' column. Typically the output of [merge_PA()]. +#' @param gene_lists Output of [getgenesPA()]. Can be: +#' * A list with `$all`, `$le`, and/or `$top` slots: when multiple modes +#' are requested (e.g., `getgenesPA(..., genes = c("all", "le", "top"))`). +#' Adds the corresponding columns. +#' * A flat named list with attribute `genes_type`: when a single mode is +#' requested. Adds the corresponding column (`all_genes`, `le_genes`, or +#' `top_genes`). +#' +#' @return The input `pa_data` data frame with one or more additional columns: +#' * `all_genes`: comma-separated string of all gene set members ordered by +#' rank. +#' * `le_genes`: comma-separated string of leading edge genes (GSEA only), +#' ordered by rank. +#' * `top_genes`: comma-separated string of top-ranked genes based on the +#' user-defined `top` fraction. +#' +#' Gene sets not found in `gene_lists` receive `NA`. +#' +#' @examples +#' \dontrun{ +#' data(gsea_results) +#' data(geneset_list) +#' data(deseq2_results) +#' +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' pa_single$top <- 0.30 +#' +#' # Add all three columns +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le", "top")) +#' pa_annot <- addgenesPA(pa_single, gene_lists) +#' head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) +#' +#' # Add only leading edge genes +#' le_only <- getgenesPA(pa_single, geneset_list, ranked, genes = "le") +#' pa_annot <- addgenesPA(pa_single, le_only) +#' head(pa_annot[, c("NAME", "le_genes")]) +#' +#' # CAMERA: add only top and all (no leading edge) +#' data(camera_results) +#' camera_pa <- camera_results +#' colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +#' camera_pa$SIZE <- sapply(camera_pa$NAME, +#' function(x) length(geneset_list[[x]])) +#' camera_pa$top <- 0.25 +#' gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, +#' genes = c("all", "top")) +#' pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +#' head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +#' } +#' +#' @seealso [getgenesPA()] to generate `gene_lists`; +#' [heatmap_PA()] for heatmap visualization; +#' [save_results()] to export the annotated results. +#' +#' @export + +addgenesPA <- function(pa_data, gene_lists) { + + if (!is.data.frame(pa_data)) { + stop("`pa_data` must be a data frame.", call. = FALSE) + } + if (!"NAME" %in% colnames(pa_data)) { + stop("`pa_data` must contain a column named 'NAME'.", call. = FALSE) + } + if (!is.list(gene_lists)) { + stop("`gene_lists` must be a list, output of getgenesPA().", call. = FALSE) + } + + # Helper: collapse gene vector to comma-separated string per gene set + .collapse <- function(named_list, set_names) { + vapply(set_names, function(gs) { + if (!gs %in% names(named_list)) return(NA_character_) + g <- named_list[[gs]] + if (length(g) == 0) return(NA_character_) + paste(g, collapse = ",") + }, character(1)) + } + + # Column name mapping per genes_type + col_map <- c(all = "all_genes", le = "le_genes", top = "top_genes") + + has_all_slot <- "all" %in% names(gene_lists) && is.list(gene_lists$all) + has_le_slot <- "le" %in% names(gene_lists) && is.list(gene_lists$le) + has_top_slot <- "top" %in% names(gene_lists) && is.list(gene_lists$top) + genes_type <- attr(gene_lists, "genes_type") + + if (has_all_slot || has_le_slot || has_top_slot) { + if (has_all_slot) pa_data$all_genes <- .collapse(gene_lists$all, pa_data$NAME) + if (has_le_slot) pa_data$le_genes <- .collapse(gene_lists$le, pa_data$NAME) + if (has_top_slot) pa_data$top_genes <- .collapse(gene_lists$top, pa_data$NAME) + } else if (!is.null(genes_type) && genes_type %in% names(col_map)) { + pa_data[[col_map[[genes_type]]]] <- .collapse(gene_lists, pa_data$NAME) + } else { + stop( + "`gene_lists` structure not recognized. ", + "Pass the direct output of getgenesPA().", + call. = FALSE + ) + } + + return(pa_data) +} diff --git a/R/gsea_results.R b/R/gsea_results.R new file mode 100644 index 0000000..0959e3e --- /dev/null +++ b/R/gsea_results.R @@ -0,0 +1,86 @@ +###################### +# gsea_results data # +###################### + +#' Simulated GSEA pathway analysis results for TCGA-LUAD +#' +#' A simulated data frame representing the output of [merge_PA()] for three +#' pairwise comparisons of TCGA-LUAD samples across 40 gene sets from three +#' MSigDB collections (HALLMARK, KEGG, GO). Gene sets and gene memberships are +#' derived from [geneset_list]. NES values and FDR are simulated with +#' `set.seed(174)` to produce realistic enrichment patterns, where ~60% of +#' gene sets per comparison are significant (FDR < 0.05). +#' +#' This dataset is designed to demonstrate [splot_PA()], [multiplot_PA()], +#' [getgenesPA()], [addgenesPA()], and [heatmap_PA()] without requiring +#' external GSEA output files. +#' +#' @format A data frame with 120 rows (40 gene sets x 3 comparisons) and 15 +#' columns: +#' \describe{ +#' \item{NAME}{Character. Gene set name, matching the names in +#' [geneset_list].} +#' \item{SIZE}{Integer. Number of genes in the gene set.} +#' \item{ES}{Numeric. Enrichment score.} +#' \item{NES}{Numeric. Normalized enrichment score.} +#' \item{NOM p-val}{Numeric. Nominal p-value.} +#' \item{FDR}{Numeric. False discovery rate. Approximately 60% of gene +#' sets per comparison have FDR < 0.05.} +#' \item{FWER p-val}{Numeric. Family-wise error rate.} +#' \item{RANK AT MAX}{Integer. Gene rank at maximum enrichment score.} +#' \item{Log10FDR}{Numeric. `-log10(FDR)`.} +#' \item{tags}{Numeric. Fraction of gene set in the leading edge (0-1).} +#' \item{list}{Numeric. Fraction of the ranked list used (0-1).} +#' \item{signal}{Numeric. Enrichment signal strength (0-1).} +#' \item{LEADING EDGE}{Character. Leading edge string in GSEA format +#' (e.g., `"tags=20%, list=35%, signal=15%"`).} +#' \item{COLLECTION}{Character. MSigDB collection name: `"HALLMARK"`, +#' `"KEGG"`, or `"GO"`.} +#' \item{COMPARISON}{Character. Comparison name: `"TumorVsNormal"`, +#' `"MetastasisVsNormal"`, or `"MetastasisVsTumor"`.} +#' } +#' +#' @source Simulated with `set.seed(174)` in `data-raw/gsea_results.R`. +#' Gene set names and memberships derived from [geneset_list]. NES values +#' and significance are simulated to reflect realistic GSEA output patterns. +#' +#' @examples +#' data(gsea_results) +#' +#' # Overview +#' dim(gsea_results) +#' table(gsea_results$COMPARISON) +#' table(gsea_results$COLLECTION) +#' +#' # How many gene sets are significant per comparison? +#' tapply(gsea_results$FDR < 0.05, gsea_results$COMPARISON, sum) +#' +#' # Single comparison plot +#' single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' \dontrun{ +#' splot_PA( +#' data = single, +#' geneset_col = "NAME", +#' collection_col = "COLLECTION", +#' nes_col = "NES", +#' fdr_col = "FDR" +#' ) +#' } +#' +#' # Multi-comparison plot +#' \dontrun{ +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "NAME", +#' fdr_col = "FDR", +#' comparison_order = c("TumorVsNormal", "MetastasisVsNormal", +#' "MetastasisVsTumor") +#' ) +#' } +#' +#' @seealso [merge_PA()] which produces this format from real GSEA output; +#' [splot_PA()], [multiplot_PA()] for visualization; +#' [getgenesPA()], [addgenesPA()] for gene-level annotation; +#' [geneset_list] for the gene set memberships used here. +"gsea_results" diff --git a/R/heatmap_PA.R b/R/heatmap_PA.R deleted file mode 100644 index 56f6e90..0000000 --- a/R/heatmap_PA.R +++ /dev/null @@ -1,170 +0,0 @@ -######################### -# Function heatmap_PA # -######################### - -#' Plot leading edge heatmaps from GSEA/CAMERA/PADOG results. -#' -#' Generates heatmaps based on normalized data of genes for each gene set from GSEA/CAMERA/PADOG output. -#' -#' @param main_dir Optional base directory. If supplied, it will be prepended to all relative file paths. -#' @param expression_file Path to the expression data file (tab-delimited) or relative to main_dir. -#' @param metadata_file Path to the metadata file (Excel) or relative to main_dir. -#' @param gmt_file Path to the GMT file defining gene sets or relative to main_dir. -#' @param ranked_genes_file Path to the ranked genes list file or relative to main_dir. -#' @param gsea_file Path to the GSEA results file with leading edge genes or relative to main_dir. -#' @param output_dir Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps". -#' @param sample_col Name of the sample ID column in metadata; default "Sample". -#' @param group_col Name of the group column in metadata; default "group". -#' @param save_dataframe Logical; if TRUE, saves the merged data frame as TSV before plotting. -#' @return Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. -#' @export - -heatmap_PA <- function(main_dir = NULL, expression_file, metadata_file, gmt_file, - ranked_genes_file, gsea_file, output_dir = "leading_edge_heatmaps", - sample_col = "Sample", group_col = "group", save_dataframe = FALSE) -{ - # Avoid check NOTES for global variables across multiple functions - utils::globalVariables(c( - "NAME", "GENES", "SIZE", "tags", "L.EDGE_size" - )) - - # Ensure required packages are installed - if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("grDevices", quietly = TRUE)) stop("Package \"grDevices\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("tidyselect", quietly = TRUE)) stop("Package \"tidyselect\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("openxlsx", quietly = TRUE)) stop("Package \"openxlsx\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package \"pheatmap\" must be installed to use this function.", call. = FALSE) - - # Prepend base directory if provided - if (!is.null(main_dir)) { - expression_file <- file.path(main_dir, expression_file) - metadata_file <- file.path(main_dir, metadata_file) - gmt_file <- file.path(main_dir, gmt_file) - ranked_genes_file <- file.path(main_dir, ranked_genes_file) - gsea_file <- file.path(main_dir, gsea_file) - output_dir <- file.path(main_dir, output_dir) - } - - # 1) Read and process GMT - gmt_data <- readLines(gmt_file) %>% - strsplit("\t") %>% - lapply(function(x) data.frame(NAME = x[1], DESCRIPTION = x[2], GENES = paste(x[-c(1,2)], collapse = ","), stringsAsFactors = FALSE)) %>% - dplyr::bind_rows() - - # 2) Read GSEA results and join genes - gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>% - dplyr::left_join(gmt_data %>% dplyr::select(NAME, GENES), by = "NAME") - - # 3) Read ranked genes list - ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE) - ranked_vector <- ranked_df[[1]] - - # 4) Internal helper: extract top-n genes from leading edge - extract_top_n <- function(genes_str, n) { - if (is.na(genes_str) || n <= 0) return(NA_character_) - glist <- unlist(strsplit(genes_str, ",")) - glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)] - paste(utils::head(glist, n), collapse = ",") - } - - # 5) Compute leading edge size and genes - gsea_df <- gsea_df %>% - dplyr::mutate(L.EDGE_size = ifelse(is.na(SIZE * tags), NA, ifelse((SIZE * tags) %% 1 <= 0.5, floor(SIZE * tags), ceiling(SIZE * tags)))) %>% - dplyr::rowwise() %>% dplyr::mutate(LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size)) %>% - dplyr::ungroup() - - # Save intermediate dataframe if requested - if (save_dataframe) { - if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) - intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv") - readr::write_tsv(gsea_df, intermediate_file) - message("Saved data frame to: ", intermediate_file) - } - - # 6) Read metadata and prepare annotation - meta <- openxlsx::read.xlsx(metadata_file) %>% - dplyr::select(tidyselect::all_of(c(sample_col, group_col))) %>% - dplyr::rename(Sample = tidyselect::all_of(sample_col), Group = tidyselect::all_of(group_col)) %>% - as.data.frame() - rownames(meta) <- meta$Sample - - # 7) Read expression data - expr_raw <- utils::read.table(expression_file, header = TRUE, sep = "\t", - stringsAsFactors = FALSE, check.names = FALSE) - # Determine gene-name column - if ("NAME" %in% colnames(expr_raw)) { - rownames(expr_raw) <- expr_raw$NAME - expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE] - } else { - gene_col <- colnames(expr_raw)[1] - rownames(expr_raw) <- expr_raw[[gene_col]] - expr_mat <- expr_raw[, -1, drop = FALSE] - } - # Clean sample names - colnames(expr_mat) <- sub("^X", "", colnames(expr_mat)) - - # Ensure output directory exists - if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) - - # 8) Loop through each gene set and plot heatmap - for (i in seq_len(nrow(gsea_df))) { - geneset_name <- gsea_df$NAME[i] - leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ",")) - genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)] - if (length(genes_present) == 0) next - - heatmap_mat <- expr_mat[genes_present, , drop = FALSE] - common_samps <- intersect(colnames(heatmap_mat), rownames(meta)) - if (length(common_samps) == 0) next - - heatmap_mat <- heatmap_mat[, common_samps, drop = FALSE] - annot_col <- data.frame(Group = meta[common_samps, "Group"]) - rownames(annot_col) <- common_samps - - # Dynamic sizing - w <- 10 - h <- max(5, nrow(heatmap_mat) * 0.1 + 2) - - # PDF output - grDevices::pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")), width = w, height = h) - pheatmap::pheatmap( - heatmap_mat, - main = geneset_name, - color = grDevices::colorRampPalette(c("blue","white","red"))(30), - scale = "row", - clustering_distance_rows = "euclidean", - cluster_cols = FALSE, - clustering_method = "complete", - fontsize_row = 6, - fontsize_col = 7, - annotation_col = annot_col, - border_color = NA, - cellheight = 5, - cellwidth = 8 - ) - grDevices::dev.off() - - # JPG output - grDevices::jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")), - width = w * 100, height = h * 100, res = 150) - pheatmap::pheatmap( - heatmap_mat, - main = geneset_name, - color = grDevices::colorRampPalette(c("blue","white","red"))(30), - scale = "row", - clustering_distance_rows = "euclidean", - cluster_cols = FALSE, - clustering_method = "complete", - fontsize_row = 6, - fontsize_col = 7, - annotation_col = annot_col, - border_color = NA, - cellheight = 5, - cellwidth = 8 - ) - grDevices::dev.off() - } - - message("Heatmaps saved in: ", normalizePath(output_dir)) - return(TRUE) -} diff --git a/R/merge_PA.R b/R/merge_PA.R index 5f40792..b5cb9ca 100644 --- a/R/merge_PA.R +++ b/R/merge_PA.R @@ -1,74 +1,205 @@ ####################### -# Function merge_PA # +# Function merge_PA # ####################### -#' Merge GSEA results data frames. +utils::globalVariables(c( + "LEADING EDGE", "tags", "signal", + "FDR q-val", "Log10FDR", "FWER p-val", "Comparison", "...12" +)) + +#' Merge GSEA result files into a single data frame +#' +#' Reads all `.tsv` files produced by `GSEA_merge.sh` (from the GSEA.sh +#' pipeline) from a directory, standardizes numeric columns, parses the +#' leading edge string, computes `-log10(FDR)`, and returns a single merged +#' data frame ready for downstream use with [splot_PA()], [multiplot_PA()] , [getgenesPA()], and +#' [heatmap_PA()]. +#' +#' **Input file format:** Each `.tsv` file corresponds to one MSigDB collection +#' (e.g., `H.tsv`, `C2.tsv`) and must follow the standard GSEA output +#' format with the following columns: +#' * `NAME`: gene set name. +#' * `SIZE`: number of genes in the gene set. +#' * `ES`: enrichment score. +#' * `NES`: normalized enrichment score. +#' * `NOM p-val`: nominal p-value. +#' * `FDR q-val`: false discovery rate. Values of exactly `0` indicate +#' that no permutation produced an equally extreme NES (i.e., the true +#' FDR is below the permutation resolution `1 / n_permutations`). These +#' are replaced by `fdr_replace` to avoid `-Inf` in log-transforms. +#' * `FWER p-val`: family-wise error rate. +#' * `RANK AT MAX`: gene rank at maximum enrichment score. +#' * `LEADING EDGE`: string encoding the leading edge statistics in the +#' format `"tags=XX%, list=XX%, signal=XX%"`. Parsed into three numeric +#' columns: `tags` (fraction of gene set in leading edge), `list` +#' (fraction of ranked list used), and `signal` (enrichment signal +#' strength). +#' * `Comparison`: name of the comparison (e.g., `"TumorVsNormal"`). +#' Renamed to `COMPARISON` in the output. Required for visualization +#' with [splot_PA()] or [multiplot_PA()] . +#' * `GS
follow link to MSigDB` and `GS DETAILS`: removed automatically. +#' +#' **Output columns:** All input columns (minus the two removed above) plus: +#' * `COLLECTION`: name of the MSigDB collection, derived from the file name +#' by removing the `.tsv` suffix. +#' * `tags`, `list`, `signal`: numeric leading edge components (0-1 scale). +#' * `Log10FDR`: `-log10(FDR)` computed after applying `fdr_replace`. +#' * `FDR`: renamed from `FDR q-val`. +#' * `COMPARISON`: renamed from `Comparison`. +#' +#' @param input_directory Character. Path to the directory containing one or +#' more GSEA collection result files in `.tsv` format (e.g., output of +#' `GSEA_merge.sh`). Each file must end in `.tsv`. +#' @param fdr_replace Numeric. Value used to replace `FDR q-val = 0`. This +#' occurs when no permutation produced an NES as extreme as the observed +#' one, meaning the true FDR is below `1 / n_permutations`. With the +#' standard 1,000 permutations, the recommended value is `0.001`. Adjust +#' to `1 / n_permutations` if a different number of permutations was used. +#' Default: `0.001`. +#' +#' @note The input `.tsv` files must contain a `Comparison` column identifying +#' each comparison (e.g., `"TumorVsNormal"`). This column is renamed to +#' `COMPARISON` in the output and is required by [splot_PA()] or [multiplot_PA()] to operate in +#' multi-comparison mode. If your files come from a single comparison and +#' do not have this column, add it manually to each file before merging: +#' `your_data$Comparison <- "YourComparisonName"`. +#' +#' @return A data frame (`gsea_data`) containing all merged and processed GSEA +#' results with standardized column names. #' -#' After running GSEA_all.sh from GSEA.sh, merge_PA function joins .tsv files to a single file +#' @examples +#' \dontrun{ +#' # Merge all GSEA collection TSV files from a directory +#' gsea_data <- merge_PA( +#' input_directory = "path/to/gsea_results/", +#' fdr_replace = 0.001 # standard for 1000 permutations +#' ) +#' +#' # Inspect result +#' head(gsea_data) +#' colnames(gsea_data) +#' +#' # Use directly in downstream functions +#' gsl <- list_gmts("path/to/gmt_folder/") +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' gene_lists <- getgenesPA(gsea_data, gsl, ranked) +#' pa_annot <- addgenesPA(gsea_data, gene_lists) +#' +#' plot_PA(gsea_data, comparison_col = "COMPARISON") +#' } +#' +#' @seealso [splot_PA()] or [multiplot_PA()] for visualization of merged results; +#' [getgenesPA()] and [addgenesPA()] for gene-level annotation; +#' [heatmap_PA()] for leading edge heatmaps; +#' [list_gmts()] to load gene sets. #' -#' @param input_directory The directory containing the GSEA collection results in TSV format. -#' @param output_file The output file to save the merged data. If not provided, the file will be saved in the input directory. #' @importFrom magrittr %>% #' @export -merge_PA <- function(input_directory, output_file = "collections_merged_gsea_data.tsv") { - - # Avoid check NOTES for global variables across multiple functions - utils::globalVariables(c( - "...12", "numeric_cols", "LEADING EDGE", "tags", "signal", - "FDR q-val", "Log10FDR", "FWER p-val", "Comparison" - )) - - # Ensure required packages are installed - if (!requireNamespace("dplyr", quietly = TRUE)) stop("Package \"dplyr\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) - if (!requireNamespace("tidyr", quietly = TRUE)) stop("Package \"tidyr\" must be installed to use this function.", call. = FALSE) - - # Validate input directory and check for TSV files + +merge_PA <- function(input_directory, + fdr_replace = 0.001) { + + if (!requireNamespace("readr", quietly = TRUE)) { + stop("Package \"readr\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("tidyr", quietly = TRUE)) { + stop("Package \"tidyr\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("tidyselect", quietly = TRUE)) { + stop("Package \"tidyselect\" must be installed to use this function.", call. = FALSE) + } + + # --- Input validation -- if (!dir.exists(input_directory)) { - stop("The specified directory does not exist: ", input_directory) + stop("The specified directory does not exist: ", input_directory, call. = FALSE) } - files <- list.files(path = input_directory, pattern = "\\.tsv$", full.names = TRUE) + + files <- list.files(path = input_directory, pattern = "\\.tsv$", + full.names = TRUE) if (length(files) == 0) { - stop("No TSV files found in ", input_directory) + stop("No TSV files found in: ", input_directory, call. = FALSE) } - # Function to read each file and add a column with the modified file name - read_file <- function(file) { - data <- readr::read_tsv(file) - file_name <- basename(file) - file_name <- sub("_all.tsv$", "", file_name) # Change the pattern if necessary - numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") + if (!is.numeric(fdr_replace) || fdr_replace <= 0 || fdr_replace >= 1) { + stop("`fdr_replace` must be a numeric value between 0 and 1.", call. = FALSE) + } + + # --- Numeric columns present in every GSEA output file + numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", + "FDR q-val", "FWER p-val", "RANK AT MAX") + + # --- Read and combine all TSV files + read_one <- function(file) { + data <- readr::read_tsv(file, show_col_types = FALSE) + coll_name <- sub("\\.tsv$", "", basename(file)) data <- data %>% - dplyr::mutate(dplyr::across(tidyselect::all_of(numeric_cols), as.numeric)) - data$COLLECTION <- file_name + dplyr::mutate( + dplyr::across(tidyselect::all_of(numeric_cols), as.numeric) + ) + data$COLLECTION <- coll_name return(data) } - # Read and combine all TSV files into a single data frame, remove empty columns - gsea_data <- lapply(files, read_file) %>% dplyr::bind_rows() %>% dplyr::select(-`...12`) + gsea_data <- lapply(files, read_one) %>% + dplyr::bind_rows() - # Find problematic values in numeric columns - gsea_data %>% - dplyr::filter(dplyr::if_any(tidyselect::all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% - print() + # Remove empty trailing column if present + if ("...12" %in% colnames(gsea_data)) { + gsea_data <- dplyr::select(gsea_data, -`...12`) + } + + # --- Report any non-numeric values in numeric columns -- + problematic <- gsea_data %>% + dplyr::filter( + dplyr::if_any(tidyselect::all_of(numeric_cols), + ~ !grepl("^-?[0-9.]+([eE][+-]?[0-9]+)?$", + as.character(.))) + ) + if (nrow(problematic) > 0) { + message("Warning: ", nrow(problematic), + " row(s) with non-numeric values in numeric columns:") + print(problematic) + } - # Data processing: selection, separation, mutation, and renaming of columns + # --- Check Comparison column --- + if (!"Comparison" %in% colnames(gsea_data)) { + stop( + "No 'Comparison' column found in the TSV files. ", + "Add it manually to each file before merging: ", + "your_data$Comparison <- 'YourComparisonName'.", + call. = FALSE + ) + } + + # --- Process and standardize --- gsea_data <- gsea_data %>% - dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% - tidyr::separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% + dplyr::select(-dplyr::any_of(c("GS
follow link to MSigDB", + "GS DETAILS"))) %>% + tidyr::separate( + col = `LEADING EDGE`, + into = c("tags", "list", "signal"), + sep = ",", + remove = FALSE + ) %>% dplyr::mutate( - tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), - list = 0.01 * as.numeric(sub("%", "", sub("list=", "", list))), - signal = 0.01 * as.numeric(sub("%", "", sub("signal=", "", signal))), - `FDR q-val` = ifelse(`FDR q-val` == 0, 0.001, `FDR q-val`), - `Log10FDR` = -log10(`FDR q-val`) + tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), + list = 0.01 * as.numeric(sub("%", "", sub("list=", "", list))), + signal = 0.01 * as.numeric(sub("%", "", sub("signal=", "", signal))), + # Replace FDR = 0 to avoid -Inf in log-transforms. + # FDR = 0 means no permutation matched the observed NES, + # so the true FDR < 1/n_permutations (typically 0.001 for 1000 perms). + `FDR q-val` = ifelse(`FDR q-val` == 0, fdr_replace, `FDR q-val`), + Log10FDR = -log10(`FDR q-val`) ) %>% - dplyr::relocate(`Log10FDR`, .after = `FWER p-val`) %>% + dplyr::relocate(Log10FDR, .after = `FWER p-val`) %>% dplyr::rename(COMPARISON = Comparison, FDR = `FDR q-val`) - # Save the processed data to a TSV file - readr::write_tsv(gsea_data, output_file) - message("GSEA data saved to:", output_file, "\n") + # --- Message and return + message("Merged GSEA data processed: ", + nrow(gsea_data), " gene sets from ", + length(unique(gsea_data$COLLECTION)), " collection(s) and ", + length(unique(gsea_data$COMPARISON)), " comparison(s).") - return(TRUE) + return(gsea_data) } diff --git a/R/plot_PA.R b/R/plot_PA.R index 1205d48..0499106 100644 --- a/R/plot_PA.R +++ b/R/plot_PA.R @@ -1,171 +1,1000 @@ +#################### +# Function splot_PA # +#################### + utils::globalVariables(c( - "NAME","GENES","SIZE","tags","L.EDGE_size", - "...12","numeric_cols","LEADING EDGE","signal", - "FDR q-val","Log10FDR","FWER p-val","Comparison" + "NAME", "GENES", "SIZE", "tags", "L.EDGE_size", + "numeric_cols", "LEADING EDGE", "signal", + "FDR q-val", "Log10FDR", "FWER p-val", "Comparison" )) -#' Unified Pathway analysis results plotting function with theme configuration -#' -#' Creates either a global GSEA plot or a faceted barplot depending on the number of unique -#' comparisons in the `Comparison` column. Allows customizing all previously hard-coded theme -#' parameters via a single `theme_params` list. -#' -#' @param data A data frame containing GSEA results. -#' @param Comparison Name of the column defining different comparisons. Default: "Comparison". -#' @param custom_labels Named vector of labels for the x-axis discrete scale (barplot mode only). Default: NULL. -#' @param axis_y Name of the column to use for the y-axis aesthetic. Default: "NES". -#' @param fdr_col Name of the column containing FDR values. Default: "FDR". -#' @param logFDR Logical; if TRUE, compute -log10(FDR) from `fdr_col`, otherwise use `fdr_col` directly. Default: TRUE. -#' @param geneset_col Name of the column containing the geneset labels (single comparison mode). -#' @param collection_col Name of the column containing the MSigDB collections (single comparison mode). -#' @param nes_col Name of the column containing NES values (single comparison mode). -#' @param logfdr_col Name of the column containing -log10(FDR) or similar (single comparison mode). -#' @param order One of "desc" or "asc"; order of `axis_y` values. Default: c("desc","asc"). -#' @param ncol_wrap Number of columns for `facet_wrap` in barplot mode. Default: 2. -#' @param free_y Logical; if TRUE, allow free y scales in facets. Default: TRUE. -#' @param fill_limits Numeric vector of length 2 to set fill gradient limits (barplot mode). Default: NULL. -#' @param fill_palette Character vector of two colors for fill gradient. Default: c("white","red"). -#' @param theme_params Named list to override default theme parameters (see details). -#' @details theme_params may include: +#' Pathway analysis visualization for a single comparison +#' +#' Generates a publication-quality multi-panel pathway enrichment plot for a +#' single comparison using patchwork. Gene sets appear on the y-axis grouped +#' by MSigDB collection, NES on the x-axis, and -log10(FDR) as fill color. +#' Six panels are assembled side by side: a "Pathways" label, gene set names, +#' the NES bar chart, collection labels, a "MSigDB" label, and the color legend. +#' +#' For visualizing enrichment across multiple comparisons, use +#' [multiplot_PA()] instead. +#' +#' @param data A data frame of pathway analysis results for a single +#' comparison. Typically the output of [merge_PA()] filtered to one value +#' of the `COMPARISON` column, or results from a single CAMERA/GSEA run. +#' Must contain the columns specified by `geneset_col`, `collection_col`, +#' `nes_col`, and `fdr_col`. +#' @param geneset_col Name of the column containing gene set labels shown on +#' the y-axis. Default: `"NAME"`. +#' @param collection_col Name of the column containing MSigDB collection +#' labels used to group gene sets (e.g., `"KEGG"`, `"HALLMARK"`, `"GO"`). +#' Default: `"COLLECTION"`. +#' @param nes_col Name of the column containing NES values (x-axis). +#' Default: `"NES"`. +#' @param fdr_col Name of the column containing FDR values. `-log10(FDR)` is +#' computed internally and used as the fill color. Default: `"FDR"`. +#' @param order One of `"desc"` or `"asc"`. Sort order for NES values on the +#' y-axis. Default: `"desc"`. +#' @param fill_limits Numeric vector of length 2 setting the color scale range +#' for `-log10(FDR)`. Values outside this range are clamped to the nearest +#' limit. For example, `fill_limits = c(0, 5)` maps all gene sets with +#' `-log10(FDR) >= 5` (i.e., FDR <= 0.00001) to the maximum color (red), +#' and any value below 0 to the minimum color (white). Useful when a few +#' gene sets have extreme significance that washes out color variation in the +#' rest. Default: `NULL` (auto uses the actual data range). +#' @param fill_palette Character vector of two colors for the fill gradient +#' (low to high -log10(FDR)). Default: `c("white", "red")`. +#' @param theme_params Named list to override default theme parameters. +#' See Details. +#' +#' @details +#' `theme_params` accepts any of the following named elements: #' \describe{ -#' \item{side_label_size}{Size for side panel labels (default 35)} -#' \item{geneset_text_size}{Text size for geneset labels (default 5)} -#' \item{collection_text_size}{Text size for collection labels (default 5)} -#' \item{panel_widths}{Patchwork widths vector (default c(4,25,15,3,10,3))} -#' \item{bar_col}{Bar/col border color (default "black")} -#' \item{bar_size}{Border size for bars (default 0.5)} -#' \item{bar_width}{Width for bars (default 0.6)} -#' \item{col_size}{Border size for geom_col (default 1)} -#' \item{hline_size}{Size for horizontal line at y=0 (default 2)} -#' \item{axis_title_size}{Font size for axis titles (default 45)} -#' \item{axis_text_size_x}{Font size for x-axis text (default 30)} -#' \item{axis_text_size_y}{Font size for y-axis text (default 50)} -#' \item{tick_size}{Size for axis ticks (default 1.5)} -#' \item{tick_length}{Length for axis ticks in cm (default 0.3)} -#' \item{strip_text_size}{Font size for strip text (default 50)} -#' \item{panel_spacing_single}{Facet spacing single mode (default 4)} -#' \item{panel_spacing_multi}{Facet spacing multi mode (default 0.6)} +#' \item{`side_label_size`}{Size for "Pathways" and "MSigDB" labels. +#' Default: `35`.} +#' \item{`geneset_text_size`}{Text size for gene set labels. Default: `5`.} +#' \item{`collection_text_size`}{Text size for collection labels. +#' Default: `5`.} +#' \item{`panel_widths`}{Patchwork relative widths for the 6 panels. +#' Default: `c(4, 25, 15, 3, 10, 3)`.} +#' \item{`col_size`}{Border linewidth for `geom_col`. Default: `1`.} +#' \item{`axis_title_size`}{Font size for axis titles. Default: `45`.} +#' \item{`axis_text_size_x`}{Font size for x-axis labels. Default: `30`.} +#' \item{`tick_size`}{Linewidth for axis ticks. Default: `1.5`.} +#' \item{`tick_length`}{Length of axis ticks in cm. Default: `0.3`.} +#' \item{`panel_spacing_single`}{Spacing between facets. Default: `4`.} #' } -#' @return A ggplot or patchwork object for the GSEA plot. +#' +#' @return A `patchwork` object combining six ggplot2 panels. +#' +#' @examples +#' \dontrun{ +#' gsea_results <- merge_PA("path/to/gsea_results/") +#' +#' # Filter to one comparison +#' single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' +#' splot_PA( +#' data = single, +#' geneset_col = "NAME", +#' collection_col = "COLLECTION", +#' nes_col = "NES", +#' fdr_col = "FDR" +#' ) +#' +#' # Cap color scale at -log10(FDR) = 5 so subtle differences are visible +#' # (gene sets with FDR <= 0.00001 all get the same max red color) +#' splot_PA( +#' data = single, +#' geneset_col = "NAME", collection_col = "COLLECTION", +#' nes_col = "NES", fdr_col = "FDR", +#' fill_limits = c(0, 5) +#' ) +#' } +#' +#' @seealso [multiplot_PA()] for multi-comparison faceted barplots; +#' [merge_PA()] to generate the input data frame; +#' [camera_results] for a minimal example dataset. +#' #' @import ggplot2 #' @importFrom rlang .data #' @importFrom patchwork plot_layout -#' @importFrom cowplot get_legend #' @importFrom utils modifyList #' @export -plot_PA <- function( - data, - Comparison = "Comparison", - custom_labels = NULL, - axis_y = "NES", - fdr_col = "FDR", - logFDR = TRUE, - geneset_col, - collection_col, - nes_col, - logfdr_col, - order = c("desc", "asc"), - ncol_wrap = 2, - free_y = TRUE, - fill_limits = NULL, - fill_palette = c("white", "red"), - theme_params = list() -) { + +splot_PA <- function(data, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + order = "desc", + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list()) { + + if (!requireNamespace("patchwork", quietly = TRUE)) { + stop("Package \"patchwork\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("cowplot", quietly = TRUE)) { + stop("Package \"cowplot\" must be installed to use this function.", call. = FALSE) + } + + if (!is.data.frame(data)) stop("`data` must be a data frame.", call. = FALSE) + + for (col in c(geneset_col, collection_col, nes_col, fdr_col)) { + if (!col %in% colnames(data)) { + stop("Column '", col, "' not found in `data`.", call. = FALSE) + } + } + + order <- match.arg(order, c("desc", "asc")) + defaults <- list( side_label_size = 35, geneset_text_size = 5, collection_text_size = 5, - panel_widths = c(4,25,15,3,10,3), - bar_col = "black", - bar_size = 0.5, - bar_width = 0.6, + panel_widths = c(4, 25, 15, 3, 10, 3), col_size = 1, - hline_size = 2, axis_title_size = 45, axis_text_size_x = 30, - axis_text_size_y = 50, tick_size = 1.5, tick_length = 0.3, - strip_text_size = 50, - panel_spacing_single = 4, - panel_spacing_multi = 0.6 + panel_spacing_single = 4 ) params <- utils::modifyList(defaults, theme_params) - if (logFDR) data$logFDR <- -log10(data[[fdr_col]]) else data$logFDR <- data[[fdr_col]] - order <- match.arg(order) - data <- data[order(data[[axis_y]], decreasing = (order == "desc")), ] - if (length(unique(data[[Comparison]])) == 1) { - if (!requireNamespace("patchwork", quietly = TRUE)) stop("patchwork required", call. = FALSE) - if (!requireNamespace("cowplot", quietly = TRUE)) stop("cowplot required", call. = FALSE) - df <- data[, c(geneset_col, collection_col, nes_col, logfdr_col)] - colnames(df) <- c("Geneset", "Collection", "NES", "logFDR") - df$Geneset <- factor(df$Geneset, levels = rev(unique(df$Geneset))) - df$Collection <- factor(df$Collection, levels = unique(df$Collection)) - plot_text_pathways <- ggplot() + - annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, - size = params$side_label_size, x = 0, y = 0.5) + theme_void() - plot_left <- ggplot(df, aes(y = .data$Geneset, x = 0, label = .data$Geneset)) + - geom_text(hjust = 1, size = params$geneset_text_size) + theme_void() + - theme(axis.text.y = element_blank(), plot.margin = margin(0, 0, 0, -50)) - plot_center <- ggplot(df, aes(x = .data$NES, y = .data$Geneset, fill = .data$logFDR)) + - geom_col(color = params$bar_col, size = params$col_size) + - scale_fill_gradient(low = fill_palette[1], high = fill_palette[2], - limits = fill_limits, breaks = scales::pretty_breaks()) + - scale_y_discrete(position = "right") + facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + - theme_bw() + labs(x = "NES", y = "") + - theme(axis.text.y = element_blank(), strip.background = element_rect(fill = "white", color = "black", linewidth = 1), - axis.ticks.y = element_line(size = params$tick_size), - axis.ticks.length = grid::unit(params$tick_length, "cm"), - strip.text.y = element_text(size = 1), legend.position = "none", - axis.title.x = element_text(size = params$axis_title_size), - axis.text.x = element_text(size = params$axis_text_size_x), - panel.spacing = grid::unit(params$panel_spacing_single, "lines")) - plot_text_msigdb <- ggplot() + - annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, - size = params$side_label_size, x = 0, y = 0.5) + theme_void() - plot_right <- ggplot(df, aes(y = .data$Geneset, x = 1.5, label = .data$Collection)) + - geom_text(aes(label = ifelse(duplicated(.data$Collection), "", .data$Collection)), - hjust = 0.5, size = params$collection_text_size, fontface = "bold") + - facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + theme_void() + - theme(strip.text.y = element_text(size = params$collection_text_size), - panel.spacing = grid::unit(1, "lines")) - plot_legend <- ggplot(df, aes(x = .data$NES, y = .data$Geneset, fill = .data$logFDR)) + - geom_tile() + scale_fill_gradient(low = fill_palette[1], high = fill_palette[2], - name = expression(-log[10] ~ FDR), limits = fill_limits, - guide = guide_colorbar(ticks.colour = "black", ticks.linewidth = 1.5, - draw.ulim = TRUE, draw.llim = TRUE)) + theme_bw() + - theme(legend.title = element_text(size = 44, face = "bold"), - legend.text = element_text(size = 30), - legend.key.size = grid::unit(1.5, "cm"), - legend.key.height = grid::unit(2, "cm"), - legend.spacing = grid::unit(3.5, "cm"), - legend.box.margin = margin(10, 20, 10, 10)) - plot_right_legend <- cowplot::get_legend(plot_legend) - final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + - plot_text_msigdb + plot_right_legend + - patchwork::plot_layout(ncol = 6, widths = params$panel_widths) + + # Always compute -log10(FDR) internally + data$tmp_log10FDR <- -log10(data[[fdr_col]]) + + data <- data[order(data[[nes_col]], decreasing = (order == "desc")), ] + + df <- data[, c(geneset_col, collection_col, nes_col, "tmp_log10FDR")] + colnames(df) <- c("Geneset", "Collection", "NES", "tmp_log10FDR") + df$Geneset <- factor(df$Geneset, levels = rev(unique(df$Geneset))) + df$Collection <- factor(df$Collection, levels = unique(df$Collection)) + + plot_text_pathways <- ggplot() + + annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, + size = params$side_label_size, x = 0, y = 0.5) + + theme_void() + + plot_left <- ggplot(df, aes(y = .data$Geneset, x = 0)) + + geom_text(aes(label = .data$Geneset), hjust = 1, + size = params$geneset_text_size) + + theme_void() + + theme(axis.text.y = element_blank(), plot.margin = margin(0, 0, 0, -50)) + + plot_center <- ggplot(df, aes(x = .data$NES, y = .data$Geneset, + fill = .data$tmp_log10FDR)) + + geom_col(color = "black", linewidth = params$col_size) + + scale_fill_gradient(low = fill_palette[1], high = fill_palette[2], + limits = fill_limits, + breaks = scales::pretty_breaks()) + + scale_y_discrete(position = "right") + + facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + + theme_bw() + labs(x = "NES", y = "") + + theme( + axis.text.y = element_blank(), + strip.background = element_rect(fill = "white", color = "black", linewidth = 1), + axis.ticks.y = element_line(linewidth = params$tick_size), + axis.ticks.length = grid::unit(params$tick_length, "cm"), + strip.text.y = element_text(size = 1), + legend.position = "none", + axis.title.x = element_text(size = params$axis_title_size), + axis.text.x = element_text(size = params$axis_text_size_x), + panel.spacing = grid::unit(params$panel_spacing_single, "lines") + ) + + plot_text_msigdb <- ggplot() + + annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, + size = params$side_label_size, x = 0, y = 0.5) + + theme_void() + + plot_right <- ggplot(df, aes(y = .data$Geneset, x = 1.5)) + + geom_text( + aes(label = ifelse(duplicated(.data$Collection), "", + as.character(.data$Collection))), + hjust = 0.5, size = params$collection_text_size, fontface = "bold" + ) + + facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + + theme_void() + + theme(strip.text.y = element_text(size = params$collection_text_size), + panel.spacing = grid::unit(1, "lines")) + + plot_legend <- ggplot(df, aes(x = .data$NES, y = .data$Geneset, + fill = .data$tmp_log10FDR)) + + geom_tile() + + scale_fill_gradient( + low = fill_palette[1], high = fill_palette[2], + name = expression(-log[10] ~ FDR), limits = fill_limits, + guide = guide_colorbar(ticks.colour = "black", ticks.linewidth = 1.5, + draw.ulim = TRUE, draw.llim = TRUE) + ) + + theme_bw() + + theme( + legend.title = element_text(size = 44, face = "bold"), + legend.text = element_text(size = 30), + legend.key.size = grid::unit(1.5, "cm"), + legend.key.height = grid::unit(2, "cm"), + legend.spacing = grid::unit(3.5, "cm"), + legend.box.margin = margin(10, 20, 10, 10) + ) + + plot_right_legend <- cowplot::get_legend(plot_legend) + + final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + + plot_text_msigdb + plot_right_legend + + patchwork::plot_layout(ncol = 6, widths = params$panel_widths) + + return(final_plot) +} + + +######################## +# Function multiplot_PA # +######################## + +#' Pathway analysis visualization across multiple comparisons +#' +#' Generates a faceted barplot showing NES values across multiple comparisons +#' for a set of gene sets. Each facet represents one gene set and bars +#' represent the NES per comparison, colored by -log10(FDR). This layout makes +#' it easy to compare how enrichment of gene sets changes across conditions +#' (e.g., TumorVsNormal, MetastasisVsNormal). +#' +#' All comparisons must be combined in a single data frame with a column +#' identifying each comparison as produced by [merge_PA()]. +#' +#' For visualizing a single comparison with full collection grouping, use +#' [splot_PA()] instead. +#' +#' @param data A data frame of pathway analysis results containing two or more +#' comparisons. Typically the output of [merge_PA()]. +#' @param comparison_col Name of the column identifying each comparison. +#' Appears on the x-axis of each facet. Default: `"COMPARISON"`. +#' @param facet_col Name of the column used to define facets one facet per +#' unique value. Can be the original gene set name column (e.g., `"NAME"`) +#' or a manually curated column with cleaner or shorter labels +#' (e.g., `"clean_name"`). Default: `"NAME"`. +#' @param axis_y Name of the column to use for the y-axis. Default: `"NES"`. +#' @param fdr_col Name of the column containing FDR values. `-log10(FDR)` is +#' computed internally and used as the fill color. Default: `"FDR"`. +#' @param comparison_order Character vector specifying the left-to-right order +#' of comparisons on the x-axis of each facet. For example, +#' `comparison_order = c("BvsA", "CvsA")` places `BvsA` on the left and +#' `CvsA` on the right. If `NULL` (default), the order follows the factor +#' levels of `comparison_col` as they appear in `data`. +#' @param custom_labels Named character vector of x-axis tick labels. Useful +#' for shortening comparison names on the axis. For example, +#' `custom_labels = c(TumorVsNormal = "Tumor", MetastasisVsNormal = "Mets")`. +#' Default: `NULL`. +#' @param ncol_wrap Integer. Number of columns in `facet_wrap`. Default: `2`. +#' @param free_y Logical. If `TRUE`, each facet uses its own y-axis scale. +#' Default: `TRUE`. +#' @param fill_limits Numeric vector of length 2 setting the color scale range +#' for `-log10(FDR)`. Values outside this range are clamped to the nearest +#' limit. For example, `fill_limits = c(0, 5)` maps all gene sets with +#' `-log10(FDR) >= 5` (FDR <= 0.00001) to maximum red, and any value below +#' 0 to white. Useful when one gene set has extreme significance that makes +#' the rest appear uniform. Default: `NULL` (auto). +#' @param fill_palette Character vector of two colors for the fill gradient +#' (low to high -log10(FDR)). Default: `c("white", "red")`. +#' @param theme_params Named list to override default theme parameters. +#' See Details. +#' +#' @details +#' `theme_params` accepts any of the following named elements: +#' \describe{ +#' \item{`bar_col`}{Bar border color. Default: `"black"`.} +#' \item{`bar_size`}{Bar border linewidth. Default: `0.5`.} +#' \item{`bar_width`}{Bar width. Default: `0.6`.} +#' \item{`hline_size`}{Linewidth for horizontal line at y = 0. Default: `2`.} +#' \item{`axis_title_size`}{Font size for axis titles. Default: `45`.} +#' \item{`axis_text_size_x`}{Font size for x-axis labels. Default: `30`.} +#' \item{`axis_text_size_y`}{Font size for y-axis labels. Default: `50`.} +#' \item{`tick_size`}{Linewidth for axis ticks. Default: `1.5`.} +#' \item{`tick_length`}{Length of axis ticks in cm. Default: `0.3`.} +#' \item{`strip_text_size`}{Font size for facet strip labels. Default: `50`.} +#' \item{`panel_spacing_multi`}{Spacing between facets. Default: `0.6`.} +#' } +#' +#' @return A ggplot2 object. +#' +#' @examples +#' \dontrun{ +#' gsea_results <- merge_PA("path/to/gsea_results/") +#' +#' # Basic multi-comparison plot +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "NAME", +#' fdr_col = "FDR", +#' ncol_wrap = 3 +#' ) +#' +#' # Control left-to-right order of comparisons on the x-axis +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "NAME", +#' fdr_col = "FDR", +#' comparison_order = c("BvsA", "CvsA") # BvsA on the left, CvsA on the right +#' ) +#' +#' # Use cleaner facet labels and shorten x-axis tick names +#' gsea_results$clean_name <- gsub("_", " ", gsea_results$NAME) +#' +#' multiplot_PA( +#' data = gsea_results, +#' comparison_col = "COMPARISON", +#' facet_col = "clean_name", +#' fdr_col = "FDR", +#' comparison_order = c("BvsA", "CvsA"), +#' custom_labels = c(BvsA = "Tumor", CvsA = "Metastasis") +#' ) +#' } +#' +#' @seealso [splot_PA()] for single-comparison patchwork plots; +#' [merge_PA()] to generate the input data frame; +#' [camera_results] for a minimal example dataset. +#' +#' @import ggplot2 +#' @importFrom rlang .data +#' @importFrom utils modifyList +#' @export + +multiplot_PA <- function(data, + comparison_col = "COMPARISON", + facet_col = "NAME", + axis_y = "NES", + fdr_col = "FDR", + comparison_order = NULL, + custom_labels = NULL, + ncol_wrap = 2, + free_y = TRUE, + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list()) { + + if (!is.data.frame(data)) stop("`data` must be a data frame.", call. = FALSE) + + for (col in c(comparison_col, axis_y, fdr_col)) { + if (!col %in% colnames(data)) { + stop("Column '", col, "' not found in `data`.", call. = FALSE) + } + } + + if (!facet_col %in% colnames(data)) { + stop( + "Column '", facet_col, "' not found in `data`. ", + "`facet_col` can be the original gene set name column (e.g., 'NAME') ", + "or a manually curated column with cleaner labels.", + call. = FALSE + ) + } + + defaults <- list( + bar_col = "black", + bar_size = 0.5, + bar_width = 0.6, + hline_size = 2, + axis_title_size = 45, + axis_text_size_x = 30, + axis_text_size_y = 50, + tick_size = 1.5, + tick_length = 0.3, + strip_text_size = 50, + panel_spacing_multi = 0.6 + ) + params <- utils::modifyList(defaults, theme_params) + + # Always compute -log10(FDR) internally + data$tmp_log10FDR <- -log10(data[[fdr_col]]) + + # Apply comparison order if specified + if (!is.null(comparison_order)) { + missing_comps <- setdiff(comparison_order, unique(data[[comparison_col]])) + if (length(missing_comps) > 0) { + warning( + "The following values in `comparison_order` were not found in `", + comparison_col, "`: ", + paste(missing_comps, collapse = ", "), + call. = FALSE + ) + } + data[[comparison_col]] <- factor(data[[comparison_col]], + levels = comparison_order) + } + + p <- ggplot(data, aes(x = .data[[comparison_col]], + y = .data[[axis_y]], + fill = .data$tmp_log10FDR)) + + geom_bar(stat = "identity", + color = params$bar_col, + linewidth = params$bar_size, + width = params$bar_width) + + scale_fill_gradient( + low = fill_palette[1], high = fill_palette[2], + limits = fill_limits, oob = scales::squish, + name = expression(-log[10] ~ FDR), + guide = guide_colorbar(barwidth = 3, barheight = 18) + ) + + labs(x = "Comparisons", y = axis_y) + + theme_bw() + + theme( + axis.line.x = element_blank(), + axis.line = element_line(linewidth = 0.5), + axis.title.x = element_text(size = params$axis_title_size), + axis.title.y = element_text(size = params$axis_title_size), + axis.text.x = element_text(size = params$axis_text_size_x), + axis.text.y = element_text(size = params$axis_text_size_y), + axis.ticks = element_line(linewidth = params$tick_size), + axis.ticks.length = grid::unit(params$tick_length, "cm"), + strip.text = element_text(size = params$strip_text_size), + panel.spacing = grid::unit(params$panel_spacing_multi, "lines") + ) + + geom_hline(yintercept = 0, linewidth = params$hline_size) + + expand_limits(y = 0) + + facet_wrap(~ .data[[facet_col]], ncol = ncol_wrap, + scales = if (free_y) "free_y" else "fixed") + + if (!is.null(custom_labels)) { + p <- p + scale_x_discrete(labels = custom_labels) + } + + return(p) +} + + +######################## +# Function heatmap_PA # +######################## + +#' Plot leading edge heatmaps from pathway analysis results +#' +#' Generates heatmaps of gene expression for each gene set in `pa_data_annot`, +#' using the `all_genes`, `le_genes` (GSEA output only), and/or `top_genes` +#' columns produced by [addgenesPA()]. Genes within each heatmap are ordered +#' by their position in `ranked_genes`. +#' +#' The recommended workflow before calling this function is: +#' ```r +#' gsl <- list_gmts("path/to/gmt/") +#' pa_data <- merge_PA("path/to/pa_data/") +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' gene_lists <- getgenesPA(pa_data, gsl, ranked, genes = c("all", "le")) +#' pa_annot <- addgenesPA(pa_data, gene_lists) +#' +#' heatmap_PA( +#' expression_data = vst_counts, +#' metadata = sampledata, +#' pa_data_annot = pa_annot, +#' ranked_genes = ranked, +#' plot_genes = c("all_genes", "le_genes") +#' ) +#' ``` +#' +#' @param expression_data A numeric matrix or data frame of expression values +#' with gene symbols or Ensembl IDs as row names and sample IDs as column +#' names. Recommended input: VST-transformed counts from [vst_counts] or +#' normalized coutns [norm_counts]. +#' @param metadata A data frame of sample annotations. Must contain a column +#' matching `sample_col` (sample IDs) and a column matching `group_col` +#' (condition labels, e.g., `"Control"`, `"Treatment"`). +#' @param pa_data_annot A data frame of pathway analysis results enriched with +#' gene columns. Must contain the column `NAME` and at least one of +#' `all_genes`, `le_genes`, or `top_genes` (comma-separated gene symbols per +#' gene set). Typically the output of [addgenesPA()]. +#' @param ranked_genes A character vector of gene symbols ordered by their +#' ranking metric (e.g., stat, log2FC or signal-to-noise ratio), used to sort +#' genes within each heatmap row. +#' @param plot_genes Character vector specifying which gene columns to plot. +#' One or both of `"all_genes"` and `"le_genes"`, and `"top_genes"`. Each +#' selection produces its own set of output files in a dedicated subfolder. +#' Default: `c("all_genes", "le_genes")`. +#' @param sample_col Name of the sample ID column in `metadata`. +#' Default: `"Sample"`. +#' @param group_col Name of the condition/group column in `metadata` +#' (e.g., `"Control"` vs `"Treatment"`). Used for heatmap column +#' annotations. Default: `"group"`. +#' @param out_dir Character. Path to the output directory. Subdirectories are +#' created automatically based on `pdf`, `jpg`, and `plot_genes`: +#' * `/pdf/all_genes/` +#' * `/pdf/le_genes/` +#' * `/pdf/top_genes/` +#' * `/jpg/all_genes/` +#' * `/jpg/le_genes/` +#' * `/jpg/top_genes/` +#' Default: `"heatmaps_PA"`. +#' @param pdf Logical. If `TRUE`, saves PDF heatmaps. Default: `TRUE`. +#' @param jpg Logical. If `TRUE`, saves JPG heatmaps. Default: `TRUE`. +#' +#' @return Invisibly returns `TRUE` upon completion. Saves heatmap files to +#' the corresponding subdirectories under `out_dir`. +#' +#' @examples +#' \dontrun{ +#' data(vst_counts) +#' data(sampledata) +#' data(deseq2_results) +#' data(gsea_results) +#' data(geneset_list) +#' +#' ranked <- deseq2_results$gene_id[order(deseq2_results$stat, +#' decreasing = TRUE)] +#' +#' # ── Example 1: GSEA results (all_genes + le_genes) ──── +#' pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +#' gene_lists <- getgenesPA(pa_single, geneset_list, ranked, +#' genes = c("all", "le")) +#' pa_annot <- addgenesPA(pa_single, gene_lists) +#' +#' heatmap_PA( +#' expression_data = vst_counts, +#' metadata = sampledata, +#' pa_data_annot = pa_annot, +#' ranked_genes = ranked, +#' plot_genes = c("all_genes", "le_genes"), +#' sample_col = "patient_id", +#' group_col = "sample_type", +#' out_dir = "heatmaps_gsea", +#' pdf = TRUE, +#' jpg = TRUE +#' ) +#' # Creates: +#' # heatmaps_gsea/pdf/all_genes/_heatmap.pdf +#' # heatmaps_gsea/pdf/le_genes/_heatmap.pdf +#' # heatmaps_gsea/jpg/all_genes/_heatmap.jpg +#' # heatmaps_gsea/jpg/le_genes/_heatmap.jpg +#' +#' # ── Example 2: CAMERA results (all_genes + top_genes) +#' # camera_results does not contain leading edge information. +#' # Use genes = "top" with a manually set top fraction instead. +#' # Note: top_genes are rank-based and do NOT represent true leading edge genes. +#' data(camera_results) +#' camera_pa <- camera_results +#' colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +#' camera_pa$SIZE <- sapply(camera_pa$NAME, +#' function(x) length(geneset_list[[x]])) +#' camera_pa$top <- 0.25 +#' +#' gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, +#' genes = c("all", "top")) +#' pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +#' +#' heatmap_PA( +#' expression_data = vst_counts, +#' metadata = sampledata, +#' pa_data_annot = pa_annot_cam, +#' ranked_genes = ranked, +#' plot_genes = c("all_genes", "top_genes"), +#' sample_col = "patient_id", +#' group_col = "sample_type", +#' out_dir = "heatmaps_camera" +#' ) +#' } +#' +#' @seealso [getgenesPA()] for gene extraction; +#' [addgenesPA()] to generate `pa_data_annot`; +#' [list_gmts()] to generate the geneset list; +#' [merge_PA()] to generate `pa_data`; +#' [vst_counts] for an example expression matrix. +#' +#' @export + +heatmap_PA <- function(expression_data, + metadata, + pa_data_annot, + ranked_genes, + plot_genes = c("all_genes", "le_genes"), + sample_col = "Sample", + group_col = "group", + out_dir = "heatmaps_PA", + pdf = TRUE, + jpg = TRUE) { + + if (!requireNamespace("pheatmap", quietly = TRUE)) { + stop("Package \"pheatmap\" must be installed to use this function.", call. = FALSE) + } + if (!requireNamespace("grDevices", quietly = TRUE)) { + stop("Package \"grDevices\" must be installed to use this function.", call. = FALSE) + } + + # --- Input validation ---- + if (!is.matrix(expression_data) && !is.data.frame(expression_data)) { + stop("`expression_data` must be a matrix or data frame.", call. = FALSE) + } + if (!is.data.frame(metadata)) { + stop("`metadata` must be a data frame.", call. = FALSE) + } + if (!sample_col %in% colnames(metadata)) { + stop("`sample_col` ('", sample_col, "') not found in `metadata`.", call. = FALSE) + } + if (!group_col %in% colnames(metadata)) { + stop("`group_col` ('", group_col, "') not found in `metadata`.", call. = FALSE) + } + if (!is.data.frame(pa_data_annot)) { + stop("`pa_data_annot` must be a data frame.", call. = FALSE) + } + if (!"NAME" %in% colnames(pa_data_annot)) { + stop("`pa_data_annot` must contain a column named 'NAME'.", call. = FALSE) + } + + plot_genes <- match.arg(plot_genes, + choices = c("all_genes", "le_genes", "top_genes"), + several.ok = TRUE) + + missing_cols <- setdiff(plot_genes, colnames(pa_data_annot)) + if (length(missing_cols) > 0) { + stop( + "Column(s) not found in `pa_data_annot`: ", + paste(missing_cols, collapse = ", "), + ". Run addgenesPA() first.", + call. = FALSE + ) + } + + if (!is.character(ranked_genes) || length(ranked_genes) == 0) { + stop("`ranked_genes` must be a non-empty character vector.", call. = FALSE) + } + + # --- Prepare metadata ---- + meta <- metadata[, c(sample_col, group_col), drop = FALSE] + colnames(meta) <- c("Sample", "Group") + rownames(meta) <- meta$Sample + + expr_mat <- as.matrix(expression_data) + common_samps <- intersect(colnames(expr_mat), rownames(meta)) + + if (length(common_samps) == 0) { + stop( + "No common samples between `expression_data` columns and `metadata` rows. ", + "Check that `sample_col` matches the column names of `expression_data`.", + call. = FALSE + ) + } + + # --- Internal heatmap drawer -- + .draw_heatmap <- function(mat, title) { + annot_col <- data.frame(Group = meta[colnames(mat), "Group"]) + rownames(annot_col) <- colnames(mat) + pheatmap::pheatmap( + mat, + main = title, + color = grDevices::colorRampPalette( + c("blue", "white", "red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + } + + # --- Loop over requested gene column types --- + n_plotted <- 0L + + for (gene_col in plot_genes) { + + if (pdf) { + pdf_sub <- file.path(out_dir, "pdf", gene_col) + if (!dir.exists(pdf_sub)) dir.create(pdf_sub, recursive = TRUE) + } + if (jpg) { + jpg_sub <- file.path(out_dir, "jpg", gene_col) + if (!dir.exists(jpg_sub)) dir.create(jpg_sub, recursive = TRUE) + } + + for (i in seq_len(nrow(pa_data_annot))) { + + gs <- pa_data_annot$NAME[i] + genes_str <- pa_data_annot[[gene_col]][i] + + if (is.na(genes_str) || nchar(genes_str) == 0) next + + genes_vec <- strsplit(genes_str, ",")[[1]] + genes_in_mat <- genes_vec[genes_vec %in% rownames(expr_mat)] + + if (length(genes_in_mat) == 0) next + + gene_ranks <- match(genes_in_mat, ranked_genes) + genes_ordered <- genes_in_mat[order(gene_ranks, na.last = TRUE)] + # Order samples by group so conditions are grouped in the heatmap + sample_groups <- meta[common_samps, "Group"] + samps_ordered <- common_samps[order(sample_groups)] + heatmap_mat <- expr_mat[genes_ordered, samps_ordered, drop = FALSE] + + w <- 10 + h <- max(5, nrow(heatmap_mat) * 0.1 + 2) + + if (pdf) { + grDevices::pdf(file.path(pdf_sub, paste0(gs, "_heatmap.pdf")), + width = w, height = h) + .draw_heatmap(heatmap_mat, gs) + grDevices::dev.off() + } + + if (jpg) { + grDevices::jpeg(file.path(jpg_sub, paste0(gs, "_heatmap.jpg")), + width = w * 100, height = h * 100, res = 150) + .draw_heatmap(heatmap_mat, gs) + grDevices::dev.off() + } + + n_plotted <- n_plotted + 1L + } + } + + message("Done. ", n_plotted, " heatmap(s) saved in: ", + normalizePath(out_dir)) + invisible(TRUE) +} + +############################ +# Function heatmap_path_PA # +############################ + +utils::globalVariables(c( + "NAME", "GENES", "SIZE", "tags", "L.EDGE_size" +)) + +#' Plot leading edge heatmaps from GSEA analysis results using file paths +#' +#' Generates one heatmap per gene set from GSEA/CAMERA/PADOG output by reading +#' all required inputs from file paths. For each gene set, the leading edge +#' genes are extracted, ordered by their rank in the ranked gene list, and +#' plotted as a scaled row heatmap against the expression matrix. +#' +#' This function is the file-path-based alternative to [heatmap_PA()], which +#' accepts R objects directly. Use this version when working from raw output +#' files on disk (e.g., directly after running `GSEA_merge.sh`). +#' +#' @param main_dir Character or `NULL`. Optional base directory prepended to +#' all relative file paths. If `NULL` (default), all paths are used as-is. +#' @param expression_file Character. Path to a tab-delimited expression data +#' file. Rows are genes (first column or a column named `NAME` used as row +#' names), columns are sample IDs. Recommended input: VST-transformed counts. +#' @param metadata_file Character. Path to an Excel (`.xlsx`) metadata file. +#' Must contain a column matching `sample_col` (sample IDs) and a column +#' matching `group_col` (condition labels, e.g., `"Control"`, +#' `"Treatment"`). +#' @param gmt_file Character. Path to a `.gmt` file defining gene sets. Each +#' row contains: gene set name (column 1), description (column 2, ignored), +#' and gene symbols (columns 3+). +#' @param ranked_genes_file Character. Path to a tab-delimited file where the +#' first column contains gene symbols ordered by their ranking metric (e.g., +#' log2FC or signal-to-noise ratio), from most positive to most negative. +#' Used to order leading edge genes within each heatmap. +#' @param gsea_file Character. Path to a GSEA results `.tsv` file containing +#' at least the columns `NAME`, `SIZE`, and `tags` (from the `LEADING EDGE` +#' column parsed by [merge_PA()]). +#' @param output_dir Character. Directory where heatmap files are saved. +#' Created automatically if it does not exist. Default: +#' `"leading_edge_heatmaps"`. +#' @param sample_col Name of the sample ID column in the metadata file. +#' Default: `"Sample"`. +#' @param group_col Name of the condition/group column in the metadata file +#' (e.g., `"Control"` vs `"Treatment"`). Used for heatmap column +#' annotations. Default: `"group"`. +#' @param save_dataframe Logical. If `TRUE`, saves the intermediate data frame +#' (gene sets with computed leading edge genes) as a `.tsv` file in +#' `output_dir` before plotting. Useful for inspection or reuse. +#' Default: `FALSE`. +#' +#' @return Invisibly returns `TRUE` upon completion. Saves two files per gene +#' set in `output_dir`: +#' * `_heatmap.pdf` +#' * `_heatmap.jpg` +#' +#' If `save_dataframe = TRUE`, also saves +#' `/leading_edge_genes_df.tsv`. +#' +#' @note For a more flexible workflow that accepts R objects directly (avoiding +#' repeated file reads), use [heatmap_PA()] instead, which takes +#' `expression_data`, `metadata`, and `pa_data_annot` as R objects and +#' integrates with [getgenesPA()] and [addgenesPA()]. +#' +#' @examples +#' \dontrun{ +#' # Run with all files in a single base directory +#' heatmap_path_PA( +#' main_dir = "path/to/analysis/", +#' expression_file = "vst_expression.tsv", +#' metadata_file = "metadata.xlsx", +#' gmt_file = "genesets.gmt", +#' ranked_genes_file = "ranked_genes.tsv", +#' gsea_file = "gsea_results.tsv", +#' output_dir = "leading_edge_heatmaps", +#' sample_col = "Sample", +#' group_col = "group", +#' save_dataframe = TRUE +#' ) +#' # Saves: +#' # leading_edge_heatmaps/_heatmap.pdf +#' # leading_edge_heatmaps/_heatmap.jpg +#' # leading_edge_heatmaps/leading_edge_genes_df.tsv (if save_dataframe = TRUE) +#' +#' # Run with absolute paths (no main_dir) +#' heatmap_path_PA( +#' expression_file = "/data/vst_counts.tsv", +#' metadata_file = "/data/metadata.xlsx", +#' gmt_file = "/data/h.all.v2023.gmt", +#' ranked_genes_file = "/data/ranked_genes.tsv", +#' gsea_file = "/data/gsea_results.tsv" +#' ) +#' } +#' +#' @seealso [heatmap_PA()] for the R-object-based alternative; +#' [getgenesPA()] and [addgenesPA()] for extracting leading edge genes +#' from R objects; [merge_PA()] to generate the GSEA results input; +#' [list_gmts()] to load GMT files as R objects. +#' +#' @importFrom magrittr %>% +#' @export + +heatmap_path_PA <- function(main_dir = NULL, + expression_file, + metadata_file, + gmt_file, + ranked_genes_file, + gsea_file, + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = FALSE) { + + if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("grDevices", quietly = TRUE)) stop("Package \"grDevices\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("tidyselect", quietly = TRUE)) stop("Package \"tidyselect\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("openxlsx", quietly = TRUE)) stop("Package \"openxlsx\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package \"pheatmap\" must be installed to use this function.", call. = FALSE) + + # Prepend base directory if provided + if (!is.null(main_dir)) { + expression_file <- file.path(main_dir, expression_file) + metadata_file <- file.path(main_dir, metadata_file) + gmt_file <- file.path(main_dir, gmt_file) + ranked_genes_file <- file.path(main_dir, ranked_genes_file) + gsea_file <- file.path(main_dir, gsea_file) + output_dir <- file.path(main_dir, output_dir) + } + + # 1) Read and process GMT + gmt_data <- readLines(gmt_file) %>% + strsplit("\t") %>% + lapply(function(x) data.frame( + NAME = x[1], + GENES = paste(x[-c(1, 2)], collapse = ","), + stringsAsFactors = FALSE + )) %>% + dplyr::bind_rows() + + # 2) Read GSEA results and join genes + gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>% + dplyr::left_join( + dplyr::select(gmt_data, NAME, GENES), + by = "NAME" + ) + + # 3) Read ranked genes list + ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE) + ranked_vector <- ranked_df[[1]] + + # 4) Internal helper: extract top-n genes ordered by rank + extract_top_n <- function(genes_str, n) { + if (is.na(genes_str) || n <= 0) return(NA_character_) + glist <- unlist(strsplit(genes_str, ",")) + glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)] + paste(utils::head(glist, n), collapse = ",") + } + + # 5) Compute leading edge size and extract genes + gsea_df <- gsea_df %>% + dplyr::mutate( + L.EDGE_size = ifelse( + is.na(SIZE * tags), NA, + ifelse((SIZE * tags) %% 1 <= 0.5, + floor(SIZE * tags), + ceiling(SIZE * tags)) + ) + ) %>% + dplyr::rowwise() %>% + dplyr::mutate(LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size)) %>% + dplyr::ungroup() + + # Optionally save intermediate data frame + if (save_dataframe) { + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv") + readr::write_tsv(gsea_df, intermediate_file) + message("Saved intermediate data frame to: ", intermediate_file) + } + + # 6) Read metadata and prepare annotation + meta <- openxlsx::read.xlsx(metadata_file) %>% + dplyr::select(tidyselect::all_of(c(sample_col, group_col))) %>% + dplyr::rename( + Sample = tidyselect::all_of(sample_col), + Group = tidyselect::all_of(group_col) + ) %>% + as.data.frame() + rownames(meta) <- meta$Sample + + # 7) Read expression matrix + expr_raw <- utils::read.table(expression_file, header = TRUE, sep = "\t", + stringsAsFactors = FALSE, check.names = FALSE) + + if ("NAME" %in% colnames(expr_raw)) { + rownames(expr_raw) <- expr_raw$NAME + expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE] } else { - final_plot <- ggplot(data, aes(x = .data[[Comparison]], y = .data[[axis_y]], fill = .data$logFDR)) + - geom_bar(stat = "identity", color = params$bar_col, size = params$bar_size, - width = params$bar_width) + - scale_fill_gradient(low = fill_palette[1], high = fill_palette[2], - limits = fill_limits, oob = scales::squish, - guide = guide_colorbar(barwidth = 3, barheight = 18)) + - labs(x = "Comparisons", y = axis_y) + theme_bw() + - theme(axis.line.x = element_blank(), axis.line = element_line(size = 0.5), - axis.title.x = element_text(size = params$axis_title_size), - axis.title.y = element_text(size = params$axis_title_size), - axis.text.x = element_text(size = params$axis_text_size_x), - axis.text.y = element_text(size = params$axis_text_size_y), - axis.ticks = element_line(size = params$tick_size), - axis.ticks.length = grid::unit(params$tick_length, "cm"), - strip.text = element_text(size = params$strip_text_size), - panel.spacing = grid::unit(params$panel_spacing_multi, "lines")) + - geom_hline(yintercept = 0, size = params$hline_size) + expand_limits(y = 0) + - facet_wrap(~ .data$New_name, ncol = ncol_wrap, - scales = if (free_y) "free_y" else "fixed") - if (!is.null(custom_labels)) final_plot <- final_plot + - scale_x_discrete(labels = custom_labels) + gene_col <- colnames(expr_raw)[1] + rownames(expr_raw) <- expr_raw[[gene_col]] + expr_mat <- expr_raw[, -1, drop = FALSE] } - return(final_plot) + + # Clean sample names (remove leading "X" added by R) + colnames(expr_mat) <- sub("^X", "", colnames(expr_mat)) + + # Ensure output directory exists + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + + # 8) Loop through each gene set and plot heatmap + for (i in seq_len(nrow(gsea_df))) { + + geneset_name <- gsea_df$NAME[i] + leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ",")) + genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)] + + if (length(genes_present) == 0) next + + common_samps <- intersect(colnames(expr_mat), rownames(meta)) + if (length(common_samps) == 0) next + + heatmap_mat <- expr_mat[genes_present, common_samps, drop = FALSE] + annot_col <- data.frame(Group = meta[common_samps, "Group"]) + rownames(annot_col) <- common_samps + + w <- 10 + h <- max(5, nrow(heatmap_mat) * 0.1 + 2) + + .draw <- function() { + pheatmap::pheatmap( + heatmap_mat, + main = geneset_name, + color = grDevices::colorRampPalette(c("blue", "white", "red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + } + + grDevices::pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")), + width = w, height = h) + .draw() + grDevices::dev.off() + + grDevices::jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")), + width = w * 100, height = h * 100, res = 150) + .draw() + grDevices::dev.off() + } + + message("Heatmaps saved in: ", normalizePath(output_dir)) + invisible(TRUE) } + diff --git a/data-raw/gsea_results.R b/data-raw/gsea_results.R new file mode 100644 index 0000000..ca8f730 --- /dev/null +++ b/data-raw/gsea_results.R @@ -0,0 +1,121 @@ +## data-raw/gsea_results.R +## Generates gsea_results a simulated merge_PA() output for three comparisons +## (TumorVsNormal, MetastasisVsNormal, and MetastasisVsTumor) across three +## MSigDB collections (HALLMARK, KEGG, GO). Uses geneset_list and deseq2_results +## already in the package. +## Run once to regenerate data/gsea_results.rda +## Requires: OmicsKit data objects already loaded + + +# Setup +# ============================================================================= +set.seed(174) + +data(geneset_list) +data(deseq2_results) + +# ranked_genes: genes ordered by DESeq2 Wald stat (most positive = most upregulated) +ranked_genes <- deseq2_results$gene_id[ + order(deseq2_results$stat, decreasing = TRUE) +] + +# Define gene sets per collection +# We use names from geneset_list split by prefix +# ============================================================================= +gs_names <- names(geneset_list) +hallmark <- gs_names[grepl("^HALLMARK_", gs_names)] +kegg <- gs_names[grepl("^KEGG_", gs_names)] +go <- gs_names[grepl("^GO_", gs_names)] + +# Use all available from each collection +collection_map <- c( + setNames(rep("HALLMARK", length(hallmark)), hallmark), + setNames(rep("KEGG", length(kegg)), kegg), + setNames(rep("GO", length(go)), go) +) + +all_sets <- names(collection_map) +n_sets <- length(all_sets) + +# Helper: simulate one comparison +# ============================================================================= +simulate_comparison <- function(comparison_name, seed_offset = 0) { + + set.seed(174 + seed_offset) + + sizes <- vapply(all_sets, function(gs) length(geneset_list[[gs]]), integer(1)) + + # Simulate NES: mix of up/down enrichment + nes <- rnorm(n_sets, mean = 0, sd = 1.5) + + # Simulate FDR directly: ~60% significant (FDR < 0.05), rest non-significant + n_sig <- round(n_sets * 0.6) + n_nonsig <- n_sets - n_sig + fdr_sig <- sort(runif(n_sig, min = 0.001, max = 0.049)) + fdr_nonsig <- runif(n_nonsig, min = 0.06, max = 0.95) + rank_by_abs <- order(abs(nes), decreasing = TRUE) + fdr <- numeric(n_sets) + fdr[rank_by_abs[seq_len(n_sig)]] <- fdr_sig + fdr[rank_by_abs[seq(n_sig + 1L, n_sets)]] <- fdr_nonsig + nom_pval <- pmin(1, fdr * runif(n_sets, 0.3, 0.9)) + fwer <- pmin(1, fdr * 1.5) + + # Simulate rank at max + rank_at_max <- sample(seq_len(length(ranked_genes)), n_sets, replace = TRUE) + + # Simulate leading edge: tags% ~ |NES| / 3, capped at 60% + tags_pct <- pmin(0.60, abs(nes) / 3 + runif(n_sets, 0, 0.1)) + list_pct <- runif(n_sets, 0.20, 0.80) + signal_pct <- tags_pct * (1 - list_pct) / (1 - tags_pct * list_pct + 1e-6) + signal_pct <- pmin(signal_pct, 1) + + leading_edge <- sprintf( + "tags=%.0f%%, list=%.0f%%, signal=%.0f%%", + tags_pct * 100, + list_pct * 100, + signal_pct * 100 + ) + + data.frame( + NAME = all_sets, + SIZE = sizes, + ES = nes * 0.7, + NES = nes, + `NOM p-val` = nom_pval, + FDR = fdr, + `FWER p-val` = fwer, + `RANK AT MAX` = rank_at_max, + Log10FDR = -log10(fdr), + tags = tags_pct, + list = list_pct, + signal = signal_pct, + `LEADING EDGE` = leading_edge, + COLLECTION = unname(collection_map[all_sets]), + COMPARISON = comparison_name, + stringsAsFactors = FALSE, + check.names = FALSE + ) +} + +# Generate three comparisons +# ============================================================================= +comp1 <- simulate_comparison("TumorVsNormal", seed_offset = 0) +comp2 <- simulate_comparison("MetastasisVsNormal", seed_offset = 7) +comp3 <- simulate_comparison("MetastasisVsTumor", seed_offset = 14) + +gsea_results <- rbind(comp1, comp2, comp3) + +cat("gsea_results:\n") +cat(" Rows :", nrow(gsea_results), "\n") +cat(" Comparisons :", paste(unique(gsea_results$COMPARISON), collapse = ", "), "\n") +cat(" Collections :", paste(unique(gsea_results$COLLECTION), collapse = ", "), "\n") +cat(" Gene sets :", n_sets, "\n") +cat(" FDR < 0.05 :", + sum(gsea_results$FDR < 0.05), "out of", nrow(gsea_results), "\n") +cat(" Columns :", paste(colnames(gsea_results), collapse = ", "), "\n") + +# Save +# ============================================================================= +usethis::use_data(gsea_results, compress = "xz", overwrite = TRUE) + +message("Done. Saved: data/gsea_results.rda") diff --git a/data/gsea_results.rda b/data/gsea_results.rda new file mode 100644 index 0000000..8e6888b Binary files /dev/null and b/data/gsea_results.rda differ diff --git a/man/addgenesPA.Rd b/man/addgenesPA.Rd new file mode 100644 index 0000000..89b123f --- /dev/null +++ b/man/addgenesPA.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_genes_PA.R +\name{addgenesPA} +\alias{addgenesPA} +\title{Add gene columns to pathway analysis results} +\usage{ +addgenesPA(pa_data, gene_lists) +} +\arguments{ +\item{pa_data}{A data frame of pathway analysis results containing a \code{NAME} +column. Typically the output of \code{\link[=merge_PA]{merge_PA()}}.} + +\item{gene_lists}{Output of \code{\link[=getgenesPA]{getgenesPA()}}. Can be: +\itemize{ +\item A list with \verb{$all}, \verb{$le}, and/or \verb{$top} slots: when multiple modes +are requested (e.g., \code{getgenesPA(..., genes = c("all", "le", "top"))}). +Adds the corresponding columns. +\item A flat named list with attribute \code{genes_type}: when a single mode is +requested. Adds the corresponding column (\code{all_genes}, \code{le_genes}, or +\code{top_genes}). +}} +} +\value{ +The input \code{pa_data} data frame with one or more additional columns: +\itemize{ +\item \code{all_genes}: comma-separated string of all gene set members ordered by +rank. +\item \code{le_genes}: comma-separated string of leading edge genes (GSEA only), +ordered by rank. +\item \code{top_genes}: comma-separated string of top-ranked genes based on the +user-defined \code{top} fraction. +} + +Gene sets not found in \code{gene_lists} receive \code{NA}. +} +\description{ +Appends \code{all_genes}, \code{le_genes}, and/or \code{top_genes} columns to a pathway +analysis results data frame based on the output of \code{\link[=getgenesPA]{getgenesPA()}}. +Gene symbols within each cell are comma-separated. Automatically detects +which column(s) to add based on the structure of the input. +} +\examples{ +\dontrun{ +data(gsea_results) +data(geneset_list) +data(deseq2_results) + +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +pa_single$top <- 0.30 + +# Add all three columns +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le", "top")) +pa_annot <- addgenesPA(pa_single, gene_lists) +head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) + +# Add only leading edge genes +le_only <- getgenesPA(pa_single, geneset_list, ranked, genes = "le") +pa_annot <- addgenesPA(pa_single, le_only) +head(pa_annot[, c("NAME", "le_genes")]) + +# CAMERA: add only top and all (no leading edge) +data(camera_results) +camera_pa <- camera_results +colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +camera_pa$SIZE <- sapply(camera_pa$NAME, + function(x) length(geneset_list[[x]])) +camera_pa$top <- 0.25 +gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, + genes = c("all", "top")) +pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +} + +} +\seealso{ +\code{\link[=getgenesPA]{getgenesPA()}} to generate \code{gene_lists}; +\code{\link[=heatmap_PA]{heatmap_PA()}} for heatmap visualization; +\code{\link[=save_results]{save_results()}} to export the annotated results. +} diff --git a/man/getgenesPA.Rd b/man/getgenesPA.Rd new file mode 100644 index 0000000..49c0f93 --- /dev/null +++ b/man/getgenesPA.Rd @@ -0,0 +1,137 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_genes_PA.R +\name{getgenesPA} +\alias{getgenesPA} +\title{Extract gene members from pathway analysis results} +\usage{ +getgenesPA(pa_data, geneset_list, ranked_genes, genes = c("all", "le")) +} +\arguments{ +\item{pa_data}{A data frame of pathway analysis results. Must always +contain: +\itemize{ +\item \code{NAME}: gene set name. +} + +Additionally required depending on \code{genes}: +\itemize{ +\item \code{SIZE}: number of genes in the gene set. Required for \code{"le"} and +\code{"top"}. +\item \code{tags}: numeric fraction (0-1) of genes contributing to the +enrichment score (GSEA leading edge). Produced automatically by +\code{\link[=merge_PA]{merge_PA()}}. Required for \code{genes = "le"}. +\item \code{top}: numeric fraction (0-1) defining the proportion of top-ranked +genes to extract. Set manually by the user (e.g., +\code{pa_data$top <- 0.25} for the top 25\%). Required for \code{genes = "top"}. +} + +Typically the output of \code{\link[=merge_PA]{merge_PA()}}.} + +\item{geneset_list}{A named list of gene sets, where each element is a +character vector of gene symbols. Typically the output of \code{\link[=list_gmts]{list_gmts()}}, +or use the built-in \link{geneset_list} for quick testing.} + +\item{ranked_genes}{A character vector of gene symbols ordered by their +ranking metric (e.g., DESeq2 \code{stat}, log2FC, or signal-to-noise ratio), +from most positive to most negative. Non-significant genes fall in the +middle of the list. Used to order genes within each extracted set.} + +\item{genes}{Character vector specifying which extraction mode(s) to use. +Any combination of \code{"all"}, \code{"le"}, and \code{"top"}. Default: +\code{c("all", "le")}.} +} +\value{ +Depends on \code{genes}: +\itemize{ +\item Single mode (e.g., \code{genes = "le"}): a named list where each element +is a character vector of gene symbols. The list has an attribute +\code{genes_type} used by \code{\link[=addgenesPA]{addgenesPA()}} to name the output column. +\item Multiple modes (e.g., \code{genes = c("all", "le", "top")}): a named list +with one element per requested mode: +\itemize{ +\item \verb{$all}: named list of all gene set members. +\item \verb{$le}: named list of leading edge genes (GSEA only). +\item \verb{$top}: named list of top-ranked genes. +} +} +} +\description{ +For each gene set in a pathway analysis results table, retrieves leading +edge genes, a user-defined top fraction of genes, all genes in the gene +set, or any combination. All gene lists are ordered by their rank in the +provided ranked gene list. +} +\details{ +\strong{Three extraction modes:} +\itemize{ +\item \code{"le"}: \strong{GSEA output only.} Leading edge genes: the subset of genes +that drives the enrichment signal. Size is computed as +\code{round(SIZE * tags)}, where \code{tags} is the fraction of gene hits before +(positive ES) or after (negative ES) the peak in the running enrichment +score. Definition from the GSEA User Guide: \emph{"The percentage of gene hits +before (for positive ES) or after (for negative ES) the peak in the +running enrichment score. This gives an indication of the percentage of +genes contributing to the enrichment score."} +(\url{https://docs.gsea-msigdb.org/#GSEA/GSEA_User_Guide/}). +Requires columns \code{SIZE} and \code{tags} in \code{pa_data}, produced automatically +by \code{\link[=merge_PA]{merge_PA()}}. +\item \code{"top"}: \strong{Any enrichment result (GSEA, CAMERA, PADOG, etc.).} +A user-defined top fraction of genes ordered by rank. Size is computed as +\code{round(SIZE * top)}, where \code{top} is a numeric value between 0 and 1 +provided in a \code{top} column of \code{pa_data}. This does \strong{not} represent true +leading edge genes: it is a flexible, rank-based selection suitable for +exploratory visualization with any pathway analysis method. +Requires columns \code{SIZE} and \code{top} in \code{pa_data}. +\item \code{"all"}: All genes in the gene set, ordered by rank. +} +} +\examples{ +\dontrun{ +data(gsea_results) +data(geneset_list) +data(deseq2_results) + +#or +gsl <- list_gmts("path/to/gmt_folder/") + +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] + +# ── GSEA results: all three modes available +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le", "top")) + +# But first add the top column (e.g. top 30\% of genes by rank) +pa_single$top <- 0.30 +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le", "top")) + +gene_lists$le[["KEGG_APOPTOSIS"]] # leading edge genes +gene_lists$top[["KEGG_APOPTOSIS"]] # top 30\% by rank +gene_lists$all[["KEGG_APOPTOSIS"]] # all genes + +pa_annot <- addgenesPA(pa_single, gene_lists) +head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) + +# ── CAMERA results: use "top" (no leading edge available) ─── +data(camera_results) +camera_pa <- camera_results +colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +camera_pa$SIZE <- sapply(camera_pa$NAME, + function(x) length(geneset_list[[x]])) +camera_pa$top <- 0.25 # top 25\% by rank + +gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, + genes = c("all", "top")) +pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) +head(pa_annot_cam[, c("NAME", "all_genes", "top_genes")]) +} + +} +\seealso{ +\code{\link[=addgenesPA]{addgenesPA()}} to append gene columns to pa_data; +\code{\link[=heatmap_PA]{heatmap_PA()}} for heatmap visualization; +\code{\link[=list_gmts]{list_gmts()}} to generate \code{geneset_list}; +\code{\link[=merge_PA]{merge_PA()}} to generate \code{pa_data} with the required \code{tags} column. +} diff --git a/man/gsea_results.Rd b/man/gsea_results.Rd new file mode 100644 index 0000000..92ecede --- /dev/null +++ b/man/gsea_results.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gsea_results.R +\docType{data} +\name{gsea_results} +\alias{gsea_results} +\title{Simulated GSEA pathway analysis results for TCGA-LUAD} +\format{ +A data frame with 120 rows (40 gene sets x 3 comparisons) and 15 +columns: +\describe{ +\item{NAME}{Character. Gene set name, matching the names in +\link{geneset_list}.} +\item{SIZE}{Integer. Number of genes in the gene set.} +\item{ES}{Numeric. Enrichment score.} +\item{NES}{Numeric. Normalized enrichment score.} +\item{NOM p-val}{Numeric. Nominal p-value.} +\item{FDR}{Numeric. False discovery rate. Approximately 60\% of gene +sets per comparison have FDR < 0.05.} +\item{FWER p-val}{Numeric. Family-wise error rate.} +\item{RANK AT MAX}{Integer. Gene rank at maximum enrichment score.} +\item{Log10FDR}{Numeric. \code{-log10(FDR)}.} +\item{tags}{Numeric. Fraction of gene set in the leading edge (0-1).} +\item{list}{Numeric. Fraction of the ranked list used (0-1).} +\item{signal}{Numeric. Enrichment signal strength (0-1).} +\item{LEADING EDGE}{Character. Leading edge string in GSEA format +(e.g., \code{"tags=20\%, list=35\%, signal=15\%"}).} +\item{COLLECTION}{Character. MSigDB collection name: \code{"HALLMARK"}, +\code{"KEGG"}, or \code{"GO"}.} +\item{COMPARISON}{Character. Comparison name: \code{"TumorVsNormal"}, +\code{"MetastasisVsNormal"}, or \code{"MetastasisVsTumor"}.} +} +} +\source{ +Simulated with \code{set.seed(174)} in \code{data-raw/gsea_results.R}. +Gene set names and memberships derived from \link{geneset_list}. NES values +and significance are simulated to reflect realistic GSEA output patterns. +} +\usage{ +gsea_results +} +\description{ +A simulated data frame representing the output of \code{\link[=merge_PA]{merge_PA()}} for three +pairwise comparisons of TCGA-LUAD samples across 40 gene sets from three +MSigDB collections (HALLMARK, KEGG, GO). Gene sets and gene memberships are +derived from \link{geneset_list}. NES values and FDR are simulated with +\code{set.seed(174)} to produce realistic enrichment patterns, where ~60\% of +gene sets per comparison are significant (FDR < 0.05). +} +\details{ +This dataset is designed to demonstrate \code{\link[=splot_PA]{splot_PA()}}, \code{\link[=multiplot_PA]{multiplot_PA()}}, +\code{\link[=getgenesPA]{getgenesPA()}}, \code{\link[=addgenesPA]{addgenesPA()}}, and \code{\link[=heatmap_PA]{heatmap_PA()}} without requiring +external GSEA output files. +} +\examples{ +data(gsea_results) + +# Overview +dim(gsea_results) +table(gsea_results$COMPARISON) +table(gsea_results$COLLECTION) + +# How many gene sets are significant per comparison? +tapply(gsea_results$FDR < 0.05, gsea_results$COMPARISON, sum) + +# Single comparison plot +single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +\dontrun{ +splot_PA( + data = single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR" +) +} + +# Multi-comparison plot +\dontrun{ +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + comparison_order = c("TumorVsNormal", "MetastasisVsNormal", + "MetastasisVsTumor") +) +} + +} +\seealso{ +\code{\link[=merge_PA]{merge_PA()}} which produces this format from real GSEA output; +\code{\link[=splot_PA]{splot_PA()}}, \code{\link[=multiplot_PA]{multiplot_PA()}} for visualization; +\code{\link[=getgenesPA]{getgenesPA()}}, \code{\link[=addgenesPA]{addgenesPA()}} for gene-level annotation; +\link{geneset_list} for the gene set memberships used here. +} +\keyword{datasets} diff --git a/man/heatmap_PA.Rd b/man/heatmap_PA.Rd index 23a8f53..ebfd113 100644 --- a/man/heatmap_PA.Rd +++ b/man/heatmap_PA.Rd @@ -1,46 +1,165 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/heatmap_PA.R +% Please edit documentation in R/plot_PA.R \name{heatmap_PA} \alias{heatmap_PA} -\title{Plot leading edge heatmaps from GSEA/CAMERA/PADOG results.} +\title{Plot leading edge heatmaps from pathway analysis results} \usage{ heatmap_PA( - main_dir = NULL, - expression_file, - metadata_file, - gmt_file, - ranked_genes_file, - gsea_file, - output_dir = "leading_edge_heatmaps", + expression_data, + metadata, + pa_data_annot, + ranked_genes, + plot_genes = c("all_genes", "le_genes"), sample_col = "Sample", group_col = "group", - save_dataframe = FALSE + out_dir = "heatmaps_PA", + pdf = TRUE, + jpg = TRUE ) } \arguments{ -\item{main_dir}{Optional base directory. If supplied, it will be prepended to all relative file paths.} +\item{expression_data}{A numeric matrix or data frame of expression values +with gene symbols or Ensembl IDs as row names and sample IDs as column +names. Recommended input: VST-transformed counts from \link{vst_counts} or +normalized coutns \link{norm_counts}.} -\item{expression_file}{Path to the expression data file (tab-delimited) or relative to main_dir.} +\item{metadata}{A data frame of sample annotations. Must contain a column +matching \code{sample_col} (sample IDs) and a column matching \code{group_col} +(condition labels, e.g., \code{"Control"}, \code{"Treatment"}).} -\item{metadata_file}{Path to the metadata file (Excel) or relative to main_dir.} +\item{pa_data_annot}{A data frame of pathway analysis results enriched with +gene columns. Must contain the column \code{NAME} and at least one of +\code{all_genes}, \code{le_genes}, or \code{top_genes} (comma-separated gene symbols per +gene set). Typically the output of \code{\link[=addgenesPA]{addgenesPA()}}.} -\item{gmt_file}{Path to the GMT file defining gene sets or relative to main_dir.} +\item{ranked_genes}{A character vector of gene symbols ordered by their +ranking metric (e.g., stat, log2FC or signal-to-noise ratio), used to sort +genes within each heatmap row.} -\item{ranked_genes_file}{Path to the ranked genes list file or relative to main_dir.} +\item{plot_genes}{Character vector specifying which gene columns to plot. +One or both of \code{"all_genes"} and \code{"le_genes"}, and \code{"top_genes"}. Each +selection produces its own set of output files in a dedicated subfolder. +Default: \code{c("all_genes", "le_genes")}.} -\item{gsea_file}{Path to the GSEA results file with leading edge genes or relative to main_dir.} +\item{sample_col}{Name of the sample ID column in \code{metadata}. +Default: \code{"Sample"}.} -\item{output_dir}{Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps".} +\item{group_col}{Name of the condition/group column in \code{metadata} +(e.g., \code{"Control"} vs \code{"Treatment"}). Used for heatmap column +annotations. Default: \code{"group"}.} -\item{sample_col}{Name of the sample ID column in metadata; default "Sample".} +\item{out_dir}{Character. Path to the output directory. Subdirectories are +created automatically based on \code{pdf}, \code{jpg}, and \code{plot_genes}: +\itemize{ +\item \verb{/pdf/all_genes/} +\item \verb{/pdf/le_genes/} +\item \verb{/pdf/top_genes/} +\item \verb{/jpg/all_genes/} +\item \verb{/jpg/le_genes/} +\item \verb{/jpg/top_genes/} +Default: \code{"heatmaps_PA"}. +}} -\item{group_col}{Name of the group column in metadata; default "group".} +\item{pdf}{Logical. If \code{TRUE}, saves PDF heatmaps. Default: \code{TRUE}.} -\item{save_dataframe}{Logical; if TRUE, saves the merged data frame as TSV before plotting.} +\item{jpg}{Logical. If \code{TRUE}, saves JPG heatmaps. Default: \code{TRUE}.} } \value{ -Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. +Invisibly returns \code{TRUE} upon completion. Saves heatmap files to +the corresponding subdirectories under \code{out_dir}. } \description{ -Generates heatmaps based on normalized data of genes for each gene set from GSEA/CAMERA/PADOG output. +Generates heatmaps of gene expression for each gene set in \code{pa_data_annot}, +using the \code{all_genes}, \code{le_genes} (GSEA output only), and/or \code{top_genes} +columns produced by \code{\link[=addgenesPA]{addgenesPA()}}. Genes within each heatmap are ordered +by their position in \code{ranked_genes}. +} +\details{ +The recommended workflow before calling this function is: + +\if{html}{\out{
}}\preformatted{gsl <- list_gmts("path/to/gmt/") +pa_data <- merge_PA("path/to/pa_data/") +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +gene_lists <- getgenesPA(pa_data, gsl, ranked, genes = c("all", "le")) +pa_annot <- addgenesPA(pa_data, gene_lists) + +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked, + plot_genes = c("all_genes", "le_genes") +) +}\if{html}{\out{
}} +} +\examples{ +\dontrun{ +data(vst_counts) +data(sampledata) +data(deseq2_results) +data(gsea_results) +data(geneset_list) + +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] + +# ── Example 1: GSEA results (all_genes + le_genes) ──── +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +gene_lists <- getgenesPA(pa_single, geneset_list, ranked, + genes = c("all", "le")) +pa_annot <- addgenesPA(pa_single, gene_lists) + +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked, + plot_genes = c("all_genes", "le_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_gsea", + pdf = TRUE, + jpg = TRUE +) +# Creates: +# heatmaps_gsea/pdf/all_genes/_heatmap.pdf +# heatmaps_gsea/pdf/le_genes/_heatmap.pdf +# heatmaps_gsea/jpg/all_genes/_heatmap.jpg +# heatmaps_gsea/jpg/le_genes/_heatmap.jpg + +# ── Example 2: CAMERA results (all_genes + top_genes) +# camera_results does not contain leading edge information. +# Use genes = "top" with a manually set top fraction instead. +# Note: top_genes are rank-based and do NOT represent true leading edge genes. +data(camera_results) +camera_pa <- camera_results +colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME" +camera_pa$SIZE <- sapply(camera_pa$NAME, + function(x) length(geneset_list[[x]])) +camera_pa$top <- 0.25 + +gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked, + genes = c("all", "top")) +pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam) + +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot_cam, + ranked_genes = ranked, + plot_genes = c("all_genes", "top_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_camera" +) +} + +} +\seealso{ +\code{\link[=getgenesPA]{getgenesPA()}} for gene extraction; +\code{\link[=addgenesPA]{addgenesPA()}} to generate \code{pa_data_annot}; +\code{\link[=list_gmts]{list_gmts()}} to generate the geneset list; +\code{\link[=merge_PA]{merge_PA()}} to generate \code{pa_data}; +\link{vst_counts} for an example expression matrix. } diff --git a/man/heatmap_path_PA.Rd b/man/heatmap_path_PA.Rd new file mode 100644 index 0000000..5d72ced --- /dev/null +++ b/man/heatmap_path_PA.Rd @@ -0,0 +1,126 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_PA.R +\name{heatmap_path_PA} +\alias{heatmap_path_PA} +\title{Plot leading edge heatmaps from GSEA analysis results using file paths} +\usage{ +heatmap_path_PA( + main_dir = NULL, + expression_file, + metadata_file, + gmt_file, + ranked_genes_file, + gsea_file, + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = FALSE +) +} +\arguments{ +\item{main_dir}{Character or \code{NULL}. Optional base directory prepended to +all relative file paths. If \code{NULL} (default), all paths are used as-is.} + +\item{expression_file}{Character. Path to a tab-delimited expression data +file. Rows are genes (first column or a column named \code{NAME} used as row +names), columns are sample IDs. Recommended input: VST-transformed counts.} + +\item{metadata_file}{Character. Path to an Excel (\code{.xlsx}) metadata file. +Must contain a column matching \code{sample_col} (sample IDs) and a column +matching \code{group_col} (condition labels, e.g., \code{"Control"}, +\code{"Treatment"}).} + +\item{gmt_file}{Character. Path to a \code{.gmt} file defining gene sets. Each +row contains: gene set name (column 1), description (column 2, ignored), +and gene symbols (columns 3+).} + +\item{ranked_genes_file}{Character. Path to a tab-delimited file where the +first column contains gene symbols ordered by their ranking metric (e.g., +log2FC or signal-to-noise ratio), from most positive to most negative. +Used to order leading edge genes within each heatmap.} + +\item{gsea_file}{Character. Path to a GSEA results \code{.tsv} file containing +at least the columns \code{NAME}, \code{SIZE}, and \code{tags} (from the \verb{LEADING EDGE} +column parsed by \code{\link[=merge_PA]{merge_PA()}}).} + +\item{output_dir}{Character. Directory where heatmap files are saved. +Created automatically if it does not exist. Default: +\code{"leading_edge_heatmaps"}.} + +\item{sample_col}{Name of the sample ID column in the metadata file. +Default: \code{"Sample"}.} + +\item{group_col}{Name of the condition/group column in the metadata file +(e.g., \code{"Control"} vs \code{"Treatment"}). Used for heatmap column +annotations. Default: \code{"group"}.} + +\item{save_dataframe}{Logical. If \code{TRUE}, saves the intermediate data frame +(gene sets with computed leading edge genes) as a \code{.tsv} file in +\code{output_dir} before plotting. Useful for inspection or reuse. +Default: \code{FALSE}.} +} +\value{ +Invisibly returns \code{TRUE} upon completion. Saves two files per gene +set in \code{output_dir}: +\itemize{ +\item \verb{_heatmap.pdf} +\item \verb{_heatmap.jpg} +} + +If \code{save_dataframe = TRUE}, also saves +\verb{/leading_edge_genes_df.tsv}. +} +\description{ +Generates one heatmap per gene set from GSEA/CAMERA/PADOG output by reading +all required inputs from file paths. For each gene set, the leading edge +genes are extracted, ordered by their rank in the ranked gene list, and +plotted as a scaled row heatmap against the expression matrix. +} +\details{ +This function is the file-path-based alternative to \code{\link[=heatmap_PA]{heatmap_PA()}}, which +accepts R objects directly. Use this version when working from raw output +files on disk (e.g., directly after running \code{GSEA_merge.sh}). +} +\note{ +For a more flexible workflow that accepts R objects directly (avoiding +repeated file reads), use \code{\link[=heatmap_PA]{heatmap_PA()}} instead, which takes +\code{expression_data}, \code{metadata}, and \code{pa_data_annot} as R objects and +integrates with \code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}}. +} +\examples{ +\dontrun{ +# Run with all files in a single base directory +heatmap_path_PA( + main_dir = "path/to/analysis/", + expression_file = "vst_expression.tsv", + metadata_file = "metadata.xlsx", + gmt_file = "genesets.gmt", + ranked_genes_file = "ranked_genes.tsv", + gsea_file = "gsea_results.tsv", + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = TRUE +) +# Saves: +# leading_edge_heatmaps/_heatmap.pdf +# leading_edge_heatmaps/_heatmap.jpg +# leading_edge_heatmaps/leading_edge_genes_df.tsv (if save_dataframe = TRUE) + +# Run with absolute paths (no main_dir) +heatmap_path_PA( + expression_file = "/data/vst_counts.tsv", + metadata_file = "/data/metadata.xlsx", + gmt_file = "/data/h.all.v2023.gmt", + ranked_genes_file = "/data/ranked_genes.tsv", + gsea_file = "/data/gsea_results.tsv" +) +} + +} +\seealso{ +\code{\link[=heatmap_PA]{heatmap_PA()}} for the R-object-based alternative; +\code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}} for extracting leading edge genes +from R objects; \code{\link[=merge_PA]{merge_PA()}} to generate the GSEA results input; +\code{\link[=list_gmts]{list_gmts()}} to load GMT files as R objects. +} diff --git a/man/merge_PA.Rd b/man/merge_PA.Rd index c0a5bc9..b5bc037 100644 --- a/man/merge_PA.Rd +++ b/man/merge_PA.Rd @@ -2,15 +2,104 @@ % Please edit documentation in R/merge_PA.R \name{merge_PA} \alias{merge_PA} -\title{Merge GSEA results data frames.} +\title{Merge GSEA result files into a single data frame} \usage{ -merge_PA(input_directory, output_file = "collections_merged_gsea_data.tsv") +merge_PA(input_directory, fdr_replace = 0.001) } \arguments{ -\item{input_directory}{The directory containing the GSEA collection results in TSV format.} +\item{input_directory}{Character. Path to the directory containing one or +more GSEA collection result files in \code{.tsv} format (e.g., output of +\code{GSEA_merge.sh}). Each file must end in \code{.tsv}.} -\item{output_file}{The output file to save the merged data. If not provided, the file will be saved in the input directory.} +\item{fdr_replace}{Numeric. Value used to replace \verb{FDR q-val = 0}. This +occurs when no permutation produced an NES as extreme as the observed +one, meaning the true FDR is below \code{1 / n_permutations}. With the +standard 1,000 permutations, the recommended value is \code{0.001}. Adjust +to \code{1 / n_permutations} if a different number of permutations was used. +Default: \code{0.001}.} +} +\value{ +A data frame (\code{gsea_data}) containing all merged and processed GSEA +results with standardized column names. } \description{ -After running GSEA_all.sh from GSEA.sh, merge_PA function joins .tsv files to a single file +Reads all \code{.tsv} files produced by \code{GSEA_merge.sh} (from the GSEA.sh +pipeline) from a directory, standardizes numeric columns, parses the +leading edge string, computes \code{-log10(FDR)}, and returns a single merged +data frame ready for downstream use with \code{\link[=splot_PA]{splot_PA()}}, \code{\link[=multiplot_PA]{multiplot_PA()}} , \code{\link[=getgenesPA]{getgenesPA()}}, and +\code{\link[=heatmap_PA]{heatmap_PA()}}. +} +\details{ +\strong{Input file format:} Each \code{.tsv} file corresponds to one MSigDB collection +(e.g., \code{H.tsv}, \code{C2.tsv}) and must follow the standard GSEA output +format with the following columns: +\itemize{ +\item \code{NAME}: gene set name. +\item \code{SIZE}: number of genes in the gene set. +\item \code{ES}: enrichment score. +\item \code{NES}: normalized enrichment score. +\item \verb{NOM p-val}: nominal p-value. +\item \verb{FDR q-val}: false discovery rate. Values of exactly \code{0} indicate +that no permutation produced an equally extreme NES (i.e., the true +FDR is below the permutation resolution \code{1 / n_permutations}). These +are replaced by \code{fdr_replace} to avoid \code{-Inf} in log-transforms. +\item \verb{FWER p-val}: family-wise error rate. +\item \verb{RANK AT MAX}: gene rank at maximum enrichment score. +\item \verb{LEADING EDGE}: string encoding the leading edge statistics in the +format \code{"tags=XX\%, list=XX\%, signal=XX\%"}. Parsed into three numeric +columns: \code{tags} (fraction of gene set in leading edge), \code{list} +(fraction of ranked list used), and \code{signal} (enrichment signal +strength). +\item \code{Comparison}: name of the comparison (e.g., \code{"TumorVsNormal"}). +Renamed to \code{COMPARISON} in the output. Required for visualization +with \code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} . +\item \verb{GS
follow link to MSigDB} and \verb{GS DETAILS}: removed automatically. +} + +\strong{Output columns:} All input columns (minus the two removed above) plus: +\itemize{ +\item \code{COLLECTION}: name of the MSigDB collection, derived from the file name +by removing the \code{.tsv} suffix. +\item \code{tags}, \code{list}, \code{signal}: numeric leading edge components (0-1 scale). +\item \code{Log10FDR}: \code{-log10(FDR)} computed after applying \code{fdr_replace}. +\item \code{FDR}: renamed from \verb{FDR q-val}. +\item \code{COMPARISON}: renamed from \code{Comparison}. +} +} +\note{ +The input \code{.tsv} files must contain a \code{Comparison} column identifying +each comparison (e.g., \code{"TumorVsNormal"}). This column is renamed to +\code{COMPARISON} in the output and is required by \code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} to operate in +multi-comparison mode. If your files come from a single comparison and +do not have this column, add it manually to each file before merging: +\code{your_data$Comparison <- "YourComparisonName"}. +} +\examples{ +\dontrun{ +# Merge all GSEA collection TSV files from a directory +gsea_data <- merge_PA( + input_directory = "path/to/gsea_results/", + fdr_replace = 0.001 # standard for 1000 permutations +) + +# Inspect result +head(gsea_data) +colnames(gsea_data) + +# Use directly in downstream functions +gsl <- list_gmts("path/to/gmt_folder/") +ranked <- deseq2_results$gene_id[order(deseq2_results$stat, + decreasing = TRUE)] +gene_lists <- getgenesPA(gsea_data, gsl, ranked) +pa_annot <- addgenesPA(gsea_data, gene_lists) + +plot_PA(gsea_data, comparison_col = "COMPARISON") +} + +} +\seealso{ +\code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} for visualization of merged results; +\code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}} for gene-level annotation; +\code{\link[=heatmap_PA]{heatmap_PA()}} for leading edge heatmaps; +\code{\link[=list_gmts]{list_gmts()}} to load gene sets. } diff --git a/man/multiplot_PA.Rd b/man/multiplot_PA.Rd new file mode 100644 index 0000000..678a152 --- /dev/null +++ b/man/multiplot_PA.Rd @@ -0,0 +1,140 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_PA.R +\name{multiplot_PA} +\alias{multiplot_PA} +\title{Pathway analysis visualization across multiple comparisons} +\usage{ +multiplot_PA( + data, + comparison_col = "COMPARISON", + facet_col = "NAME", + axis_y = "NES", + fdr_col = "FDR", + comparison_order = NULL, + custom_labels = NULL, + ncol_wrap = 2, + free_y = TRUE, + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list() +) +} +\arguments{ +\item{data}{A data frame of pathway analysis results containing two or more +comparisons. Typically the output of \code{\link[=merge_PA]{merge_PA()}}.} + +\item{comparison_col}{Name of the column identifying each comparison. +Appears on the x-axis of each facet. Default: \code{"COMPARISON"}.} + +\item{facet_col}{Name of the column used to define facets one facet per +unique value. Can be the original gene set name column (e.g., \code{"NAME"}) +or a manually curated column with cleaner or shorter labels +(e.g., \code{"clean_name"}). Default: \code{"NAME"}.} + +\item{axis_y}{Name of the column to use for the y-axis. Default: \code{"NES"}.} + +\item{fdr_col}{Name of the column containing FDR values. \code{-log10(FDR)} is +computed internally and used as the fill color. Default: \code{"FDR"}.} + +\item{comparison_order}{Character vector specifying the left-to-right order +of comparisons on the x-axis of each facet. For example, +\code{comparison_order = c("BvsA", "CvsA")} places \code{BvsA} on the left and +\code{CvsA} on the right. If \code{NULL} (default), the order follows the factor +levels of \code{comparison_col} as they appear in \code{data}.} + +\item{custom_labels}{Named character vector of x-axis tick labels. Useful +for shortening comparison names on the axis. For example, +\code{custom_labels = c(TumorVsNormal = "Tumor", MetastasisVsNormal = "Mets")}. +Default: \code{NULL}.} + +\item{ncol_wrap}{Integer. Number of columns in \code{facet_wrap}. Default: \code{2}.} + +\item{free_y}{Logical. If \code{TRUE}, each facet uses its own y-axis scale. +Default: \code{TRUE}.} + +\item{fill_limits}{Numeric vector of length 2 setting the color scale range +for \code{-log10(FDR)}. Values outside this range are clamped to the nearest +limit. For example, \code{fill_limits = c(0, 5)} maps all gene sets with +\code{-log10(FDR) >= 5} (FDR <= 0.00001) to maximum red, and any value below +0 to white. Useful when one gene set has extreme significance that makes +the rest appear uniform. Default: \code{NULL} (auto).} + +\item{fill_palette}{Character vector of two colors for the fill gradient +(low to high -log10(FDR)). Default: \code{c("white", "red")}.} + +\item{theme_params}{Named list to override default theme parameters. +See Details.} +} +\value{ +A ggplot2 object. +} +\description{ +Generates a faceted barplot showing NES values across multiple comparisons +for a set of gene sets. Each facet represents one gene set and bars +represent the NES per comparison, colored by -log10(FDR). This layout makes +it easy to compare how enrichment of gene sets changes across conditions +(e.g., TumorVsNormal, MetastasisVsNormal). +} +\details{ +All comparisons must be combined in a single data frame with a column +identifying each comparison as produced by \code{\link[=merge_PA]{merge_PA()}}. + +For visualizing a single comparison with full collection grouping, use +\code{\link[=splot_PA]{splot_PA()}} instead. + +\code{theme_params} accepts any of the following named elements: +\describe{ +\item{\code{bar_col}}{Bar border color. Default: \code{"black"}.} +\item{\code{bar_size}}{Bar border linewidth. Default: \code{0.5}.} +\item{\code{bar_width}}{Bar width. Default: \code{0.6}.} +\item{\code{hline_size}}{Linewidth for horizontal line at y = 0. Default: \code{2}.} +\item{\code{axis_title_size}}{Font size for axis titles. Default: \code{45}.} +\item{\code{axis_text_size_x}}{Font size for x-axis labels. Default: \code{30}.} +\item{\code{axis_text_size_y}}{Font size for y-axis labels. Default: \code{50}.} +\item{\code{tick_size}}{Linewidth for axis ticks. Default: \code{1.5}.} +\item{\code{tick_length}}{Length of axis ticks in cm. Default: \code{0.3}.} +\item{\code{strip_text_size}}{Font size for facet strip labels. Default: \code{50}.} +\item{\code{panel_spacing_multi}}{Spacing between facets. Default: \code{0.6}.} +} +} +\examples{ +\dontrun{ +gsea_results <- merge_PA("path/to/gsea_results/") + +# Basic multi-comparison plot +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + ncol_wrap = 3 +) + +# Control left-to-right order of comparisons on the x-axis +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + comparison_order = c("BvsA", "CvsA") # BvsA on the left, CvsA on the right +) + +# Use cleaner facet labels and shorten x-axis tick names +gsea_results$clean_name <- gsub("_", " ", gsea_results$NAME) + +multiplot_PA( + data = gsea_results, + comparison_col = "COMPARISON", + facet_col = "clean_name", + fdr_col = "FDR", + comparison_order = c("BvsA", "CvsA"), + custom_labels = c(BvsA = "Tumor", CvsA = "Metastasis") +) +} + +} +\seealso{ +\code{\link[=splot_PA]{splot_PA()}} for single-comparison patchwork plots; +\code{\link[=merge_PA]{merge_PA()}} to generate the input data frame; +\link{camera_results} for a minimal example dataset. +} diff --git a/man/plot_PA.Rd b/man/plot_PA.Rd deleted file mode 100644 index 3fb9ced..0000000 --- a/man/plot_PA.Rd +++ /dev/null @@ -1,88 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_PA.R -\name{plot_PA} -\alias{plot_PA} -\title{Unified Pathway analysis results plotting function with theme configuration} -\usage{ -plot_PA( - data, - Comparison = "Comparison", - custom_labels = NULL, - axis_y = "NES", - fdr_col = "FDR", - logFDR = TRUE, - geneset_col, - collection_col, - nes_col, - logfdr_col, - order = c("desc", "asc"), - ncol_wrap = 2, - free_y = TRUE, - fill_limits = NULL, - fill_palette = c("white", "red"), - theme_params = list() -) -} -\arguments{ -\item{data}{A data frame containing GSEA results.} - -\item{Comparison}{Name of the column defining different comparisons. Default: "Comparison".} - -\item{custom_labels}{Named vector of labels for the x-axis discrete scale (barplot mode only). Default: NULL.} - -\item{axis_y}{Name of the column to use for the y-axis aesthetic. Default: "NES".} - -\item{fdr_col}{Name of the column containing FDR values. Default: "FDR".} - -\item{logFDR}{Logical; if TRUE, compute -log10(FDR) from \code{fdr_col}, otherwise use \code{fdr_col} directly. Default: TRUE.} - -\item{geneset_col}{Name of the column containing the geneset labels (single comparison mode).} - -\item{collection_col}{Name of the column containing the MSigDB collections (single comparison mode).} - -\item{nes_col}{Name of the column containing NES values (single comparison mode).} - -\item{logfdr_col}{Name of the column containing -log10(FDR) or similar (single comparison mode).} - -\item{order}{One of "desc" or "asc"; order of \code{axis_y} values. Default: c("desc","asc").} - -\item{ncol_wrap}{Number of columns for \code{facet_wrap} in barplot mode. Default: 2.} - -\item{free_y}{Logical; if TRUE, allow free y scales in facets. Default: TRUE.} - -\item{fill_limits}{Numeric vector of length 2 to set fill gradient limits (barplot mode). Default: NULL.} - -\item{fill_palette}{Character vector of two colors for fill gradient. Default: c("white","red").} - -\item{theme_params}{Named list to override default theme parameters (see details).} -} -\value{ -A ggplot or patchwork object for the GSEA plot. -} -\description{ -Creates either a global GSEA plot or a faceted barplot depending on the number of unique -comparisons in the \code{Comparison} column. Allows customizing all previously hard-coded theme -parameters via a single \code{theme_params} list. -} -\details{ -theme_params may include: -\describe{ -\item{side_label_size}{Size for side panel labels (default 35)} -\item{geneset_text_size}{Text size for geneset labels (default 5)} -\item{collection_text_size}{Text size for collection labels (default 5)} -\item{panel_widths}{Patchwork widths vector (default c(4,25,15,3,10,3))} -\item{bar_col}{Bar/col border color (default "black")} -\item{bar_size}{Border size for bars (default 0.5)} -\item{bar_width}{Width for bars (default 0.6)} -\item{col_size}{Border size for geom_col (default 1)} -\item{hline_size}{Size for horizontal line at y=0 (default 2)} -\item{axis_title_size}{Font size for axis titles (default 45)} -\item{axis_text_size_x}{Font size for x-axis text (default 30)} -\item{axis_text_size_y}{Font size for y-axis text (default 50)} -\item{tick_size}{Size for axis ticks (default 1.5)} -\item{tick_length}{Length for axis ticks in cm (default 0.3)} -\item{strip_text_size}{Font size for strip text (default 50)} -\item{panel_spacing_single}{Facet spacing single mode (default 4)} -\item{panel_spacing_multi}{Facet spacing multi mode (default 0.6)} -} -} diff --git a/man/splot_PA.Rd b/man/splot_PA.Rd new file mode 100644 index 0000000..aa4053f --- /dev/null +++ b/man/splot_PA.Rd @@ -0,0 +1,117 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_PA.R +\name{splot_PA} +\alias{splot_PA} +\title{Pathway analysis visualization for a single comparison} +\usage{ +splot_PA( + data, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + order = "desc", + fill_limits = NULL, + fill_palette = c("white", "red"), + theme_params = list() +) +} +\arguments{ +\item{data}{A data frame of pathway analysis results for a single +comparison. Typically the output of \code{\link[=merge_PA]{merge_PA()}} filtered to one value +of the \code{COMPARISON} column, or results from a single CAMERA/GSEA run. +Must contain the columns specified by \code{geneset_col}, \code{collection_col}, +\code{nes_col}, and \code{fdr_col}.} + +\item{geneset_col}{Name of the column containing gene set labels shown on +the y-axis. Default: \code{"NAME"}.} + +\item{collection_col}{Name of the column containing MSigDB collection +labels used to group gene sets (e.g., \code{"KEGG"}, \code{"HALLMARK"}, \code{"GO"}). +Default: \code{"COLLECTION"}.} + +\item{nes_col}{Name of the column containing NES values (x-axis). +Default: \code{"NES"}.} + +\item{fdr_col}{Name of the column containing FDR values. \code{-log10(FDR)} is +computed internally and used as the fill color. Default: \code{"FDR"}.} + +\item{order}{One of \code{"desc"} or \code{"asc"}. Sort order for NES values on the +y-axis. Default: \code{"desc"}.} + +\item{fill_limits}{Numeric vector of length 2 setting the color scale range +for \code{-log10(FDR)}. Values outside this range are clamped to the nearest +limit. For example, \code{fill_limits = c(0, 5)} maps all gene sets with +\code{-log10(FDR) >= 5} (i.e., FDR <= 0.00001) to the maximum color (red), +and any value below 0 to the minimum color (white). Useful when a few +gene sets have extreme significance that washes out color variation in the +rest. Default: \code{NULL} (auto uses the actual data range).} + +\item{fill_palette}{Character vector of two colors for the fill gradient +(low to high -log10(FDR)). Default: \code{c("white", "red")}.} + +\item{theme_params}{Named list to override default theme parameters. +See Details.} +} +\value{ +A \code{patchwork} object combining six ggplot2 panels. +} +\description{ +Generates a publication-quality multi-panel pathway enrichment plot for a +single comparison using patchwork. Gene sets appear on the y-axis grouped +by MSigDB collection, NES on the x-axis, and -log10(FDR) as fill color. +Six panels are assembled side by side: a "Pathways" label, gene set names, +the NES bar chart, collection labels, a "MSigDB" label, and the color legend. +} +\details{ +For visualizing enrichment across multiple comparisons, use +\code{\link[=multiplot_PA]{multiplot_PA()}} instead. + +\code{theme_params} accepts any of the following named elements: +\describe{ +\item{\code{side_label_size}}{Size for "Pathways" and "MSigDB" labels. +Default: \code{35}.} +\item{\code{geneset_text_size}}{Text size for gene set labels. Default: \code{5}.} +\item{\code{collection_text_size}}{Text size for collection labels. +Default: \code{5}.} +\item{\code{panel_widths}}{Patchwork relative widths for the 6 panels. +Default: \code{c(4, 25, 15, 3, 10, 3)}.} +\item{\code{col_size}}{Border linewidth for \code{geom_col}. Default: \code{1}.} +\item{\code{axis_title_size}}{Font size for axis titles. Default: \code{45}.} +\item{\code{axis_text_size_x}}{Font size for x-axis labels. Default: \code{30}.} +\item{\code{tick_size}}{Linewidth for axis ticks. Default: \code{1.5}.} +\item{\code{tick_length}}{Length of axis ticks in cm. Default: \code{0.3}.} +\item{\code{panel_spacing_single}}{Spacing between facets. Default: \code{4}.} +} +} +\examples{ +\dontrun{ +gsea_results <- merge_PA("path/to/gsea_results/") + +# Filter to one comparison +single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] + +splot_PA( + data = single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR" +) + +# Cap color scale at -log10(FDR) = 5 so subtle differences are visible +# (gene sets with FDR <= 0.00001 all get the same max red color) +splot_PA( + data = single, + geneset_col = "NAME", collection_col = "COLLECTION", + nes_col = "NES", fdr_col = "FDR", + fill_limits = c(0, 5) +) +} + +} +\seealso{ +\code{\link[=multiplot_PA]{multiplot_PA()}} for multi-comparison faceted barplots; +\code{\link[=merge_PA]{merge_PA()}} to generate the input data frame; +\link{camera_results} for a minimal example dataset. +} diff --git a/vignettes/PA_workflow.Rmd b/vignettes/PA_workflow.Rmd new file mode 100644 index 0000000..97a42c9 --- /dev/null +++ b/vignettes/PA_workflow.Rmd @@ -0,0 +1,381 @@ +--- +title: "Pathway Analysis Workflow with OmicsKit" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Pathway Analysis Workflow with OmicsKit} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} + +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.align = "center", + warning = FALSE, + message = FALSE +) +``` + +## Overview +This vignette demonstrates a complete gene set enrichment analysis (GSEA) +downstream workflow using OmicsKit, from loading gene sets through +publication-quality visualization. + +The workflow covers: + +1. **Loading gene sets** : reading GMT files with `list_gmts()` +2. **Merging results** : combining multi-collection GSEA TSV files with + `merge_PA()` +3. **Gene extraction** : retrieving leading edge and top-ranked genes with + `getgenesPA()` and annotating results with `addgenesPA()` +4. **Heatmaps** : visualizing gene expression per pathway with `heatmap_PA()` + (file-path alternative: `heatmap_path_PA()`) +5. **Single-comparison plot** : `splot_PA()` : a publication-quality + multi-panel barplot for one comparison +6. **Multi-comparison plot** : `multiplot_PA()` : faceted barplots comparing + enrichment across conditions + +## Required packages +```{r packages} +library(OmicsKit) +library(ggplot2) +``` + +--- + +## Section 1 : Load gene sets +### `list_gmts()` +Gene sets used for enrichment analysis are stored as `.gmt` files, the +standard format used by MSigDB. `list_gmts()` reads all `.gmt` files from a +directory and returns a single named list, ready for downstream use. + +```{r list_gmts, eval = FALSE} +geneset_list <- list_gmts("path/to/your/gmt_folder/") + +# Number of gene sets loaded +length(geneset_list) +# Names of the first five gene sets +names(geneset_list)[1:5] +# Genes in one specific set +geneset_list[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] +``` + +In this vignette, we use a built-in example `geneset_list`, which contains +40 curated gene sets across four biological themes (apoptosis, cell cycle, +immune response, and metabolism), and the built-in `gsea_results` dataset, +which mimics the output of `merge_PA()` for three comparisons across HALLMARK, +KEGG, and GO collections. +```{r load_data} +data(geneset_list) +data(gsea_results) +data(deseq2_results) +data(vst_counts) +data(sampledata) +# Ranked gene list: DESeq2 Wald stat from most positive to most negative +ranked_genes <- deseq2_results$gene_id[ + order(deseq2_results$stat, decreasing = TRUE) +] +# Overview of the enrichment results +dim(gsea_results) +unique(gsea_results$COMPARISON) +unique(gsea_results$COLLECTION) +``` + +--- + +## Section 2 : Merge GSEA results +### `merge_PA()` +In a real analysis, GSEA produces one results file per MSigDB collection +(e.g., `H.tsv` for HALLMARK, `C2.tsv` for KEGG). `merge_PA()` reads all +`.tsv` files from a directory, standardizes column names, parses the leading +edge string into numeric components (`tags`, `list`, `signal`), computes +`-log10(FDR)`, and returns a single combined data frame. +```{r merge_PA, eval = FALSE} +gsea_data <- merge_PA( + input_directory = "path/to/gsea_results/", + fdr_replace = 0.001 # replace FDR = 0 (below permutation resolution) +) +# Inspect result +head(gsea_data[, c("NAME", "NES", "FDR", "COLLECTION", "COMPARISON", "tags")]) +``` + +> **Note:** Each `.tsv` file must contain a `Comparison` column identifying +> the comparison name (e.g., `"TumorVsNormal"`). Then `merge_PA()` renames it to +> `COMPARISON`. If your files come from a single run without that column, +> add it manually: +> ```r +> your_data$Comparison <- "TumorVsNormal" +> ``` + +The built-in `gsea_results` already has this structure: +```{r inspect_data} +# Key columns produced by merge_PA() +head(gsea_results[, c("NES", "FDR", "Log10FDR", + "COLLECTION", "COMPARISON", "tags", "SIZE")], n = 2) +``` + +--- + +## Section 3 : Extract leading edge genes +### `getgenesPA()` +### `addgenesPA()` +After obtaining pathway results, `getgenesPA()` retrieves the gene members +relevant to each enrichment signal. Three extraction modes are available: +- **`"le"`** (GSEA only): leading edge genes : the subset that drives the + enrichment score. +- **`"top"`**: top-ranked *n* % of genes by rank metric : applicable to any + enrichment method (GSEA, CAMERA, PADOG, etc.). +- **`"all"`**: all members of the gene set, ordered by rank. +`addgenesPA()` then appends the gene lists as comma-separated columns +(`le_genes`, `top_genes`, `all_genes`) to the pathway results table. + +```{r getgenesPA_addgenesPA, eval = FALSE} +# Filter to one comparison +pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +# Optional: define the top fraction for mode "top" +pa_single$top <- 0.30 # top 30% of gene set members by rank +# Extract genes using all three modes +gene_lists <- getgenesPA( + pa_data = pa_single, + geneset_list = geneset_list, + ranked_genes = ranked_genes, + genes = c("all", "le", "top") +) +# Inspect results for one pathway + ## leading edge + gene_lists$le[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] + ## top 30% by rank + gene_lists$top[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] + ## all members + gene_lists$all[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]] +# Append gene columns to the pathway table +pa_annot <- addgenesPA(pa_single, gene_lists) +# Number of gene sets annotated +nrow(pa_annot) + +head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")]) +``` + +> **Tip:** For CAMERA or other enrichment methods that do not produce leading +> edge information, use `genes = c("all", "top")` and set `pa_data$top` to +> your desired fraction (e.g., `0.25` for the top 25 % by rank). Do not +> request `genes = "le"` without a `tags` column. + +--- + +## Section 4 : Heatmaps +### `heatmap_PA()` +`heatmap_PA()` generates one heatmap per gene set based on `pa_data_annot`. +Genes are ordered within each heatmap by their position in `ranked_genes`. Output +files are organized into subdirectories by format (PDF / JPG) and gene +selection mode (`all_genes`, `le_genes`, `top_genes`). + +```{r heatmap_PA, eval = FALSE} +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked_genes, + plot_genes = c("all_genes", "le_genes", "top_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_PA", + pdf = TRUE, + jpg = TRUE +) +# Creates, for example: +# heatmaps_PA/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg +# heatmaps_PA/pdf/le_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.pdf +# ... (one file per gene set per format per mode) +``` + +The heatmap below shows the top-ranked genes for +`HALLMARK_INTERFERON_GAMMA_RESPONSE` across tumor and normal TCGA-LUAD +samples: + +```{r heatmap_plot, echo = FALSE, fig.cap = "Expression heatmap of top-ranked genes in HALLMARK_INTERFERON_GAMMA_RESPONSE across TCGA-LUAD tumor and normal samples. Genes are ordered by DESeq2 Wald statistic.", fig.width = 7, fig.height = 6} + +knitr::include_graphics( + "figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg" +) +``` + +### `heatmap_path_PA()` : file-path alternative +`heatmap_path_PA()` is a convenience wrapper that reads all inputs from disk +(expression TSV, metadata XLSX, GMT file, ranked-genes TSV, GSEA TSV) and +calls the same heatmap engine internally. It is useful when running the +analysis immediately after GSEA without loading data into R. + +```{r heatmap_path_PA, eval = FALSE} + +heatmap_path_PA( + main_dir = "path/to/analysis/", + expression_file = "vst_expression.tsv", + metadata_file = "metadata.xlsx", + gmt_file = "h.all.v2023.symbols.gmt", + ranked_genes_file = "ranked_genes.tsv", + gsea_file = "H.tsv", + output_dir = "leading_edge_heatmaps", + sample_col = "patient_id", + group_col = "sample_type", + save_dataframe = TRUE # also saves intermediate gene table as .tsv +) +# Produces the same output as heatmap_PA() for a single GSEA file +``` + +> **Which function to use?** Prefer `heatmap_PA()` when your data are already +> in R (e.g., after calling `merge_PA()`, `getgenesPA()`, and `addgenesPA()`). +> Use `heatmap_path_PA()` when you want a quick one-call solution that reads +> directly from files on disk. Both functions produce identical heatmaps. + +--- + +## Section 5 : General plot | Single-comparison +`splot_PA()` generates a publication-quality multi-panel enrichment plot for +**one comparison**. Gene sets appear on the y-axis (grouped by MSigDB +collection), NES on the x-axis, and −log10(FDR) as fill color. Six panels +are assembled side-by-side using `patchwork`. +```{r splot_PA, eval = FALSE} + +single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ] +splot_PA( + data = single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + order = "desc", + fill_limits = c(0, 2), # cap at -log10(FDR) = 5 + fill_palette = c("white", "red") +) +``` + +```{r splot_plot, echo = FALSE, fig.cap = "Single-comparison pathway enrichment plot (TumorVsNormal). Gene sets are sorted by NES; fill color encodes -log10(FDR), capped at 2. Collections are annotated to the right.", fig.width = 6, fig.height = 5} +knitr::include_graphics("figures_PA/splot_PA.jpg") +``` +> **Tip:** Use `fill_limits = c(0, 2)` to prevent a handful of extremely +> significant gene sets from washing out the color contrast for the rest. Any +> pathway with FDR ≤ 0.01 will be shown in the maximum color (red). + +--- + +## Section 6 : General plot | Multi-comparison +`multiplot_PA()` generates a **faceted barplot** showing NES across multiple +comparisons for a selected set of gene sets. Each facet represents one gene +set; bars represent the NES per comparison, colored by −log10(FDR). This +layout makes it straightforward to assess how enrichment changes across +conditions. +```{r multiplot_PA, eval = FALSE} + +# Subset to pathways of interest across all comparisons +pathways_of_interest <- c( + "HALLMARK_INTERFERON_GAMMA_RESPONSE", + "HALLMARK_INFLAMMATORY_RESPONSE", + "HALLMARK_G2M_CHECKPOINT", + "HALLMARK_E2F_TARGETS", + "HALLMARK_GLYCOLYSIS", + "HALLMARK_OXIDATIVE_PHOSPHORYLATION" +) +multi_data <- gsea_results[gsea_results$NAME %in% pathways_of_interest, ] +multiplot_PA( + data = multi_data, + comparison_col = "COMPARISON", + facet_col = "NAME", + axis_y = "NES", + fdr_col = "FDR", + comparison_order = c("TumorVsNormal", + "MetastasisVsNormal", + "MetastasisVsTumor"), + custom_labels = c( + TumorVsNormal = "Tumor", + MetastasisVsNormal = "Meta", + MetastasisVsTumor = "Meta/Tumor" + ), + ncol_wrap = 3, + free_y = TRUE, + fill_limits = c(0, 5), + fill_palette = c("white", "red") +) +``` + +```{r multiplot_plot, echo = FALSE, fig.cap = "Multi-comparison pathway enrichment plot. Each facet shows NES for one HALLMARK gene set across three pairwise comparisons. Fill color encodes -log10(FDR), capped at 5.", fig.width = 6, fig.height = 5} +knitr::include_graphics("figures_PA/multiplot_PA.jpg") +``` + +> **Tip:** Use `comparison_order` to control the left-to-right arrangement of +> comparisons on the x-axis of each facet, and `custom_labels` to replace +> long comparison names with shorter axis labels. + +## Full workflow : summary +```{r full_workflow, eval = FALSE} +library(OmicsKit) + +# 1. Load gene sets +gsl <- list_gmts("path/to/gmt_folder/") + +# 2. Merge GSEA output TSV files +gsea_data <- merge_PA( + input_directory = "path/to/gsea_results/", + fdr_replace = 0.001 +) +# 3. Build ranked gene list (from DESeq2 stat or log2FC) +ranked <- deseq2_results$gene_id[ + order(deseq2_results$stat, decreasing = TRUE) +] +# 4. Extract leading edge and top-ranked genes +pa_single <- gsea_data[gsea_data$COMPARISON == "TumorVsNormal", ] +pa_single$top <- 0.30 +gene_lists <- getgenesPA(pa_single, gsl, ranked, genes = c("all", "le", "top")) +pa_annot <- addgenesPA(pa_single, gene_lists) +# 5. Heatmaps +heatmap_PA( + expression_data = vst_counts, + metadata = sampledata, + pa_data_annot = pa_annot, + ranked_genes = ranked, + plot_genes = c("all_genes", "le_genes", "top_genes"), + sample_col = "patient_id", + group_col = "sample_type", + out_dir = "heatmaps_PA" +) +# Alternative: file-path version (reads directly from disk) +heatmap_path_PA( + main_dir = "path/to/analysis/", + expression_file = "vst_expression.tsv", + metadata_file = "metadata.xlsx", + gmt_file = "h.all.v2023.symbols.gmt", + ranked_genes_file = "ranked_genes.tsv", + gsea_file = "H.tsv", + output_dir = "leading_edge_heatmaps" +) +# 6. Single-comparison enrichment plot +splot_PA( + data = pa_single, + geneset_col = "NAME", + collection_col = "COLLECTION", + nes_col = "NES", + fdr_col = "FDR", + fill_limits = c(0, 5) +) +# 7. Multi-comparison enrichment plot +pathways_oi <- c( + "HALLMARK_INTERFERON_GAMMA_RESPONSE", + "HALLMARK_INFLAMMATORY_RESPONSE", + "HALLMARK_G2M_CHECKPOINT" +) +multiplot_PA( + data = gsea_data[gsea_data$NAME %in% pathways_oi, ], + comparison_col = "COMPARISON", + facet_col = "NAME", + fdr_col = "FDR", + comparison_order = c("TumorVsNormal", "MetastasisVsNormal", "MetastasisVsTumor"), + ncol_wrap = 3 +) +``` + +## Session info +```{r session_info} +sessionInfo() +``` diff --git a/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg b/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg new file mode 100644 index 0000000..12a390a Binary files /dev/null and b/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg differ diff --git a/vignettes/figures_PA/multiplot_PA.jpg b/vignettes/figures_PA/multiplot_PA.jpg new file mode 100644 index 0000000..1b0cc70 Binary files /dev/null and b/vignettes/figures_PA/multiplot_PA.jpg differ diff --git a/vignettes/figures_PA/splot_PA.jpg b/vignettes/figures_PA/splot_PA.jpg new file mode 100644 index 0000000..ef4b363 Binary files /dev/null and b/vignettes/figures_PA/splot_PA.jpg differ