/jpg/top_genes/}
+Default: \code{"heatmaps_PA"}.
+}}
+
+\item{pdf}{Logical. If \code{TRUE}, saves PDF heatmaps. Default: \code{TRUE}.}
+
+\item{jpg}{Logical. If \code{TRUE}, saves JPG heatmaps. Default: \code{TRUE}.}
+}
+\value{
+Invisibly returns \code{TRUE} upon completion. Saves heatmap files to
+the corresponding subdirectories under \code{out_dir}.
+}
+\description{
+Generates heatmaps of gene expression for each gene set in \code{pa_data_annot},
+using the \code{all_genes}, \code{le_genes} (GSEA output only), and/or \code{top_genes}
+columns produced by \code{\link[=addgenesPA]{addgenesPA()}}. Genes within each heatmap are ordered
+by their position in \code{ranked_genes}.
+}
+\details{
+The recommended workflow before calling this function is:
+
+\if{html}{\out{}}\preformatted{gsl <- list_gmts("path/to/gmt/")
+pa_data <- merge_PA("path/to/pa_data/")
+ranked <- deseq2_results$gene_id[order(deseq2_results$stat,
+ decreasing = TRUE)]
+gene_lists <- getgenesPA(pa_data, gsl, ranked, genes = c("all", "le"))
+pa_annot <- addgenesPA(pa_data, gene_lists)
+
+heatmap_PA(
+ expression_data = vst_counts,
+ metadata = sampledata,
+ pa_data_annot = pa_annot,
+ ranked_genes = ranked,
+ plot_genes = c("all_genes", "le_genes")
+)
+}\if{html}{\out{
}}
+}
+\examples{
+\dontrun{
+data(vst_counts)
+data(sampledata)
+data(deseq2_results)
+data(gsea_results)
+data(geneset_list)
+
+ranked <- deseq2_results$gene_id[order(deseq2_results$stat,
+ decreasing = TRUE)]
+
+# ── Example 1: GSEA results (all_genes + le_genes) ────
+pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ]
+gene_lists <- getgenesPA(pa_single, geneset_list, ranked,
+ genes = c("all", "le"))
+pa_annot <- addgenesPA(pa_single, gene_lists)
+
+heatmap_PA(
+ expression_data = vst_counts,
+ metadata = sampledata,
+ pa_data_annot = pa_annot,
+ ranked_genes = ranked,
+ plot_genes = c("all_genes", "le_genes"),
+ sample_col = "patient_id",
+ group_col = "sample_type",
+ out_dir = "heatmaps_gsea",
+ pdf = TRUE,
+ jpg = TRUE
+)
+# Creates:
+# heatmaps_gsea/pdf/all_genes/_heatmap.pdf
+# heatmaps_gsea/pdf/le_genes/_heatmap.pdf
+# heatmaps_gsea/jpg/all_genes/_heatmap.jpg
+# heatmaps_gsea/jpg/le_genes/_heatmap.jpg
+
+# ── Example 2: CAMERA results (all_genes + top_genes)
+# camera_results does not contain leading edge information.
+# Use genes = "top" with a manually set top fraction instead.
+# Note: top_genes are rank-based and do NOT represent true leading edge genes.
+data(camera_results)
+camera_pa <- camera_results
+colnames(camera_pa)[colnames(camera_pa) == "GeneSet"] <- "NAME"
+camera_pa$SIZE <- sapply(camera_pa$NAME,
+ function(x) length(geneset_list[[x]]))
+camera_pa$top <- 0.25
+
+gene_lists_cam <- getgenesPA(camera_pa, geneset_list, ranked,
+ genes = c("all", "top"))
+pa_annot_cam <- addgenesPA(camera_pa, gene_lists_cam)
+
+heatmap_PA(
+ expression_data = vst_counts,
+ metadata = sampledata,
+ pa_data_annot = pa_annot_cam,
+ ranked_genes = ranked,
+ plot_genes = c("all_genes", "top_genes"),
+ sample_col = "patient_id",
+ group_col = "sample_type",
+ out_dir = "heatmaps_camera"
+)
+}
+
+}
+\seealso{
+\code{\link[=getgenesPA]{getgenesPA()}} for gene extraction;
+\code{\link[=addgenesPA]{addgenesPA()}} to generate \code{pa_data_annot};
+\code{\link[=list_gmts]{list_gmts()}} to generate the geneset list;
+\code{\link[=merge_PA]{merge_PA()}} to generate \code{pa_data};
+\link{vst_counts} for an example expression matrix.
+}
diff --git a/man/heatmap_path_PA.Rd b/man/heatmap_path_PA.Rd
new file mode 100644
index 0000000..5d72ced
--- /dev/null
+++ b/man/heatmap_path_PA.Rd
@@ -0,0 +1,126 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_PA.R
+\name{heatmap_path_PA}
+\alias{heatmap_path_PA}
+\title{Plot leading edge heatmaps from GSEA analysis results using file paths}
+\usage{
+heatmap_path_PA(
+ main_dir = NULL,
+ expression_file,
+ metadata_file,
+ gmt_file,
+ ranked_genes_file,
+ gsea_file,
+ output_dir = "leading_edge_heatmaps",
+ sample_col = "Sample",
+ group_col = "group",
+ save_dataframe = FALSE
+)
+}
+\arguments{
+\item{main_dir}{Character or \code{NULL}. Optional base directory prepended to
+all relative file paths. If \code{NULL} (default), all paths are used as-is.}
+
+\item{expression_file}{Character. Path to a tab-delimited expression data
+file. Rows are genes (first column or a column named \code{NAME} used as row
+names), columns are sample IDs. Recommended input: VST-transformed counts.}
+
+\item{metadata_file}{Character. Path to an Excel (\code{.xlsx}) metadata file.
+Must contain a column matching \code{sample_col} (sample IDs) and a column
+matching \code{group_col} (condition labels, e.g., \code{"Control"},
+\code{"Treatment"}).}
+
+\item{gmt_file}{Character. Path to a \code{.gmt} file defining gene sets. Each
+row contains: gene set name (column 1), description (column 2, ignored),
+and gene symbols (columns 3+).}
+
+\item{ranked_genes_file}{Character. Path to a tab-delimited file where the
+first column contains gene symbols ordered by their ranking metric (e.g.,
+log2FC or signal-to-noise ratio), from most positive to most negative.
+Used to order leading edge genes within each heatmap.}
+
+\item{gsea_file}{Character. Path to a GSEA results \code{.tsv} file containing
+at least the columns \code{NAME}, \code{SIZE}, and \code{tags} (from the \verb{LEADING EDGE}
+column parsed by \code{\link[=merge_PA]{merge_PA()}}).}
+
+\item{output_dir}{Character. Directory where heatmap files are saved.
+Created automatically if it does not exist. Default:
+\code{"leading_edge_heatmaps"}.}
+
+\item{sample_col}{Name of the sample ID column in the metadata file.
+Default: \code{"Sample"}.}
+
+\item{group_col}{Name of the condition/group column in the metadata file
+(e.g., \code{"Control"} vs \code{"Treatment"}). Used for heatmap column
+annotations. Default: \code{"group"}.}
+
+\item{save_dataframe}{Logical. If \code{TRUE}, saves the intermediate data frame
+(gene sets with computed leading edge genes) as a \code{.tsv} file in
+\code{output_dir} before plotting. Useful for inspection or reuse.
+Default: \code{FALSE}.}
+}
+\value{
+Invisibly returns \code{TRUE} upon completion. Saves two files per gene
+set in \code{output_dir}:
+\itemize{
+\item \verb{_heatmap.pdf}
+\item \verb{_heatmap.jpg}
+}
+
+If \code{save_dataframe = TRUE}, also saves
+\verb{/leading_edge_genes_df.tsv}.
+}
+\description{
+Generates one heatmap per gene set from GSEA/CAMERA/PADOG output by reading
+all required inputs from file paths. For each gene set, the leading edge
+genes are extracted, ordered by their rank in the ranked gene list, and
+plotted as a scaled row heatmap against the expression matrix.
+}
+\details{
+This function is the file-path-based alternative to \code{\link[=heatmap_PA]{heatmap_PA()}}, which
+accepts R objects directly. Use this version when working from raw output
+files on disk (e.g., directly after running \code{GSEA_merge.sh}).
+}
+\note{
+For a more flexible workflow that accepts R objects directly (avoiding
+repeated file reads), use \code{\link[=heatmap_PA]{heatmap_PA()}} instead, which takes
+\code{expression_data}, \code{metadata}, and \code{pa_data_annot} as R objects and
+integrates with \code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}}.
+}
+\examples{
+\dontrun{
+# Run with all files in a single base directory
+heatmap_path_PA(
+ main_dir = "path/to/analysis/",
+ expression_file = "vst_expression.tsv",
+ metadata_file = "metadata.xlsx",
+ gmt_file = "genesets.gmt",
+ ranked_genes_file = "ranked_genes.tsv",
+ gsea_file = "gsea_results.tsv",
+ output_dir = "leading_edge_heatmaps",
+ sample_col = "Sample",
+ group_col = "group",
+ save_dataframe = TRUE
+)
+# Saves:
+# leading_edge_heatmaps/_heatmap.pdf
+# leading_edge_heatmaps/_heatmap.jpg
+# leading_edge_heatmaps/leading_edge_genes_df.tsv (if save_dataframe = TRUE)
+
+# Run with absolute paths (no main_dir)
+heatmap_path_PA(
+ expression_file = "/data/vst_counts.tsv",
+ metadata_file = "/data/metadata.xlsx",
+ gmt_file = "/data/h.all.v2023.gmt",
+ ranked_genes_file = "/data/ranked_genes.tsv",
+ gsea_file = "/data/gsea_results.tsv"
+)
+}
+
+}
+\seealso{
+\code{\link[=heatmap_PA]{heatmap_PA()}} for the R-object-based alternative;
+\code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}} for extracting leading edge genes
+from R objects; \code{\link[=merge_PA]{merge_PA()}} to generate the GSEA results input;
+\code{\link[=list_gmts]{list_gmts()}} to load GMT files as R objects.
+}
diff --git a/man/list_gmts.Rd b/man/list_gmts.Rd
new file mode 100644
index 0000000..77ddc64
--- /dev/null
+++ b/man/list_gmts.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/list_gmts.R
+\name{list_gmts}
+\alias{list_gmts}
+\title{Read GMT files from a directory into a named gene set list}
+\usage{
+list_gmts(dir)
+}
+\arguments{
+\item{dir}{Character. Path to the directory containing one or more \code{.gmt}
+files. The function searches the directory non-recursively.}
+}
+\value{
+A named list where each element is a character vector of gene symbols
+for one gene set. Names correspond to gene set names as defined in column 1
+of the GMT files. If the same gene set name appears in multiple files, the
+last occurrence overwrites the earlier one.
+}
+\description{
+Scans a directory for \code{.gmt} files, parses them, and returns a single named
+list where each element is a character vector of gene symbols for one gene
+set. The output is ready to be passed directly to \code{\link[=geneset_similarity]{geneset_similarity()}}.
+}
+\details{
+\strong{GMT format:} each row contains the gene set name in column 1, an optional
+description in column 2, and gene symbols from column 3 onward. Empty fields
+are automatically removed. This is the standard format used by MSigDB
+(Molecular Signatures Database) and other gene set annotation resources.
+}
+\examples{
+\dontrun{
+# Read all GMT files from a directory
+geneset_list <- list_gmts("path/to/gmt_files/")
+
+# Inspect output
+length(geneset_list) # number of gene sets
+names(geneset_list)[1:5] # first five gene set names
+geneset_list[["KEGG_APOPTOSIS"]] # genes in a specific set
+
+# Pass directly to geneset_similarity
+jac <- geneset_similarity(geneset_list, results_df, fdr_th = 0.05)
+}
+
+}
+\seealso{
+\code{\link[=geneset_similarity]{geneset_similarity()}}
+}
diff --git a/man/merge_GSEA.Rd b/man/merge_GSEA.Rd
deleted file mode 100644
index d845bc8..0000000
--- a/man/merge_GSEA.Rd
+++ /dev/null
@@ -1,16 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/merge_GSEA.R
-\name{merge_GSEA}
-\alias{merge_GSEA}
-\title{Merge GSEA results data frames.}
-\usage{
-merge_GSEA(input_directory, output_file = "collections_merged_gsea_data.tsv")
-}
-\arguments{
-\item{input_directory}{The directory containing the GSEA collection results in TSV format.}
-
-\item{output_file}{The output file to save the merged data. If not provided, the file will be saved in the input directory.}
-}
-\description{
-After running GSEA_all.sh from GSEA.sh, merge_GSEA function joins .tsv files to a single file
-}
diff --git a/man/merge_PA.Rd b/man/merge_PA.Rd
new file mode 100644
index 0000000..b5bc037
--- /dev/null
+++ b/man/merge_PA.Rd
@@ -0,0 +1,105 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/merge_PA.R
+\name{merge_PA}
+\alias{merge_PA}
+\title{Merge GSEA result files into a single data frame}
+\usage{
+merge_PA(input_directory, fdr_replace = 0.001)
+}
+\arguments{
+\item{input_directory}{Character. Path to the directory containing one or
+more GSEA collection result files in \code{.tsv} format (e.g., output of
+\code{GSEA_merge.sh}). Each file must end in \code{.tsv}.}
+
+\item{fdr_replace}{Numeric. Value used to replace \verb{FDR q-val = 0}. This
+occurs when no permutation produced an NES as extreme as the observed
+one, meaning the true FDR is below \code{1 / n_permutations}. With the
+standard 1,000 permutations, the recommended value is \code{0.001}. Adjust
+to \code{1 / n_permutations} if a different number of permutations was used.
+Default: \code{0.001}.}
+}
+\value{
+A data frame (\code{gsea_data}) containing all merged and processed GSEA
+results with standardized column names.
+}
+\description{
+Reads all \code{.tsv} files produced by \code{GSEA_merge.sh} (from the GSEA.sh
+pipeline) from a directory, standardizes numeric columns, parses the
+leading edge string, computes \code{-log10(FDR)}, and returns a single merged
+data frame ready for downstream use with \code{\link[=splot_PA]{splot_PA()}}, \code{\link[=multiplot_PA]{multiplot_PA()}} , \code{\link[=getgenesPA]{getgenesPA()}}, and
+\code{\link[=heatmap_PA]{heatmap_PA()}}.
+}
+\details{
+\strong{Input file format:} Each \code{.tsv} file corresponds to one MSigDB collection
+(e.g., \code{H.tsv}, \code{C2.tsv}) and must follow the standard GSEA output
+format with the following columns:
+\itemize{
+\item \code{NAME}: gene set name.
+\item \code{SIZE}: number of genes in the gene set.
+\item \code{ES}: enrichment score.
+\item \code{NES}: normalized enrichment score.
+\item \verb{NOM p-val}: nominal p-value.
+\item \verb{FDR q-val}: false discovery rate. Values of exactly \code{0} indicate
+that no permutation produced an equally extreme NES (i.e., the true
+FDR is below the permutation resolution \code{1 / n_permutations}). These
+are replaced by \code{fdr_replace} to avoid \code{-Inf} in log-transforms.
+\item \verb{FWER p-val}: family-wise error rate.
+\item \verb{RANK AT MAX}: gene rank at maximum enrichment score.
+\item \verb{LEADING EDGE}: string encoding the leading edge statistics in the
+format \code{"tags=XX\%, list=XX\%, signal=XX\%"}. Parsed into three numeric
+columns: \code{tags} (fraction of gene set in leading edge), \code{list}
+(fraction of ranked list used), and \code{signal} (enrichment signal
+strength).
+\item \code{Comparison}: name of the comparison (e.g., \code{"TumorVsNormal"}).
+Renamed to \code{COMPARISON} in the output. Required for visualization
+with \code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} .
+\item \verb{GS
follow link to MSigDB} and \verb{GS DETAILS}: removed automatically.
+}
+
+\strong{Output columns:} All input columns (minus the two removed above) plus:
+\itemize{
+\item \code{COLLECTION}: name of the MSigDB collection, derived from the file name
+by removing the \code{.tsv} suffix.
+\item \code{tags}, \code{list}, \code{signal}: numeric leading edge components (0-1 scale).
+\item \code{Log10FDR}: \code{-log10(FDR)} computed after applying \code{fdr_replace}.
+\item \code{FDR}: renamed from \verb{FDR q-val}.
+\item \code{COMPARISON}: renamed from \code{Comparison}.
+}
+}
+\note{
+The input \code{.tsv} files must contain a \code{Comparison} column identifying
+each comparison (e.g., \code{"TumorVsNormal"}). This column is renamed to
+\code{COMPARISON} in the output and is required by \code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} to operate in
+multi-comparison mode. If your files come from a single comparison and
+do not have this column, add it manually to each file before merging:
+\code{your_data$Comparison <- "YourComparisonName"}.
+}
+\examples{
+\dontrun{
+# Merge all GSEA collection TSV files from a directory
+gsea_data <- merge_PA(
+ input_directory = "path/to/gsea_results/",
+ fdr_replace = 0.001 # standard for 1000 permutations
+)
+
+# Inspect result
+head(gsea_data)
+colnames(gsea_data)
+
+# Use directly in downstream functions
+gsl <- list_gmts("path/to/gmt_folder/")
+ranked <- deseq2_results$gene_id[order(deseq2_results$stat,
+ decreasing = TRUE)]
+gene_lists <- getgenesPA(gsea_data, gsl, ranked)
+pa_annot <- addgenesPA(gsea_data, gene_lists)
+
+plot_PA(gsea_data, comparison_col = "COMPARISON")
+}
+
+}
+\seealso{
+\code{\link[=splot_PA]{splot_PA()}} or \code{\link[=multiplot_PA]{multiplot_PA()}} for visualization of merged results;
+\code{\link[=getgenesPA]{getgenesPA()}} and \code{\link[=addgenesPA]{addgenesPA()}} for gene-level annotation;
+\code{\link[=heatmap_PA]{heatmap_PA()}} for leading edge heatmaps;
+\code{\link[=list_gmts]{list_gmts()}} to load gene sets.
+}
diff --git a/man/multiplot_PA.Rd b/man/multiplot_PA.Rd
new file mode 100644
index 0000000..678a152
--- /dev/null
+++ b/man/multiplot_PA.Rd
@@ -0,0 +1,140 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_PA.R
+\name{multiplot_PA}
+\alias{multiplot_PA}
+\title{Pathway analysis visualization across multiple comparisons}
+\usage{
+multiplot_PA(
+ data,
+ comparison_col = "COMPARISON",
+ facet_col = "NAME",
+ axis_y = "NES",
+ fdr_col = "FDR",
+ comparison_order = NULL,
+ custom_labels = NULL,
+ ncol_wrap = 2,
+ free_y = TRUE,
+ fill_limits = NULL,
+ fill_palette = c("white", "red"),
+ theme_params = list()
+)
+}
+\arguments{
+\item{data}{A data frame of pathway analysis results containing two or more
+comparisons. Typically the output of \code{\link[=merge_PA]{merge_PA()}}.}
+
+\item{comparison_col}{Name of the column identifying each comparison.
+Appears on the x-axis of each facet. Default: \code{"COMPARISON"}.}
+
+\item{facet_col}{Name of the column used to define facets one facet per
+unique value. Can be the original gene set name column (e.g., \code{"NAME"})
+or a manually curated column with cleaner or shorter labels
+(e.g., \code{"clean_name"}). Default: \code{"NAME"}.}
+
+\item{axis_y}{Name of the column to use for the y-axis. Default: \code{"NES"}.}
+
+\item{fdr_col}{Name of the column containing FDR values. \code{-log10(FDR)} is
+computed internally and used as the fill color. Default: \code{"FDR"}.}
+
+\item{comparison_order}{Character vector specifying the left-to-right order
+of comparisons on the x-axis of each facet. For example,
+\code{comparison_order = c("BvsA", "CvsA")} places \code{BvsA} on the left and
+\code{CvsA} on the right. If \code{NULL} (default), the order follows the factor
+levels of \code{comparison_col} as they appear in \code{data}.}
+
+\item{custom_labels}{Named character vector of x-axis tick labels. Useful
+for shortening comparison names on the axis. For example,
+\code{custom_labels = c(TumorVsNormal = "Tumor", MetastasisVsNormal = "Mets")}.
+Default: \code{NULL}.}
+
+\item{ncol_wrap}{Integer. Number of columns in \code{facet_wrap}. Default: \code{2}.}
+
+\item{free_y}{Logical. If \code{TRUE}, each facet uses its own y-axis scale.
+Default: \code{TRUE}.}
+
+\item{fill_limits}{Numeric vector of length 2 setting the color scale range
+for \code{-log10(FDR)}. Values outside this range are clamped to the nearest
+limit. For example, \code{fill_limits = c(0, 5)} maps all gene sets with
+\code{-log10(FDR) >= 5} (FDR <= 0.00001) to maximum red, and any value below
+0 to white. Useful when one gene set has extreme significance that makes
+the rest appear uniform. Default: \code{NULL} (auto).}
+
+\item{fill_palette}{Character vector of two colors for the fill gradient
+(low to high -log10(FDR)). Default: \code{c("white", "red")}.}
+
+\item{theme_params}{Named list to override default theme parameters.
+See Details.}
+}
+\value{
+A ggplot2 object.
+}
+\description{
+Generates a faceted barplot showing NES values across multiple comparisons
+for a set of gene sets. Each facet represents one gene set and bars
+represent the NES per comparison, colored by -log10(FDR). This layout makes
+it easy to compare how enrichment of gene sets changes across conditions
+(e.g., TumorVsNormal, MetastasisVsNormal).
+}
+\details{
+All comparisons must be combined in a single data frame with a column
+identifying each comparison as produced by \code{\link[=merge_PA]{merge_PA()}}.
+
+For visualizing a single comparison with full collection grouping, use
+\code{\link[=splot_PA]{splot_PA()}} instead.
+
+\code{theme_params} accepts any of the following named elements:
+\describe{
+\item{\code{bar_col}}{Bar border color. Default: \code{"black"}.}
+\item{\code{bar_size}}{Bar border linewidth. Default: \code{0.5}.}
+\item{\code{bar_width}}{Bar width. Default: \code{0.6}.}
+\item{\code{hline_size}}{Linewidth for horizontal line at y = 0. Default: \code{2}.}
+\item{\code{axis_title_size}}{Font size for axis titles. Default: \code{45}.}
+\item{\code{axis_text_size_x}}{Font size for x-axis labels. Default: \code{30}.}
+\item{\code{axis_text_size_y}}{Font size for y-axis labels. Default: \code{50}.}
+\item{\code{tick_size}}{Linewidth for axis ticks. Default: \code{1.5}.}
+\item{\code{tick_length}}{Length of axis ticks in cm. Default: \code{0.3}.}
+\item{\code{strip_text_size}}{Font size for facet strip labels. Default: \code{50}.}
+\item{\code{panel_spacing_multi}}{Spacing between facets. Default: \code{0.6}.}
+}
+}
+\examples{
+\dontrun{
+gsea_results <- merge_PA("path/to/gsea_results/")
+
+# Basic multi-comparison plot
+multiplot_PA(
+ data = gsea_results,
+ comparison_col = "COMPARISON",
+ facet_col = "NAME",
+ fdr_col = "FDR",
+ ncol_wrap = 3
+)
+
+# Control left-to-right order of comparisons on the x-axis
+multiplot_PA(
+ data = gsea_results,
+ comparison_col = "COMPARISON",
+ facet_col = "NAME",
+ fdr_col = "FDR",
+ comparison_order = c("BvsA", "CvsA") # BvsA on the left, CvsA on the right
+)
+
+# Use cleaner facet labels and shorten x-axis tick names
+gsea_results$clean_name <- gsub("_", " ", gsea_results$NAME)
+
+multiplot_PA(
+ data = gsea_results,
+ comparison_col = "COMPARISON",
+ facet_col = "clean_name",
+ fdr_col = "FDR",
+ comparison_order = c("BvsA", "CvsA"),
+ custom_labels = c(BvsA = "Tumor", CvsA = "Metastasis")
+)
+}
+
+}
+\seealso{
+\code{\link[=splot_PA]{splot_PA()}} for single-comparison patchwork plots;
+\code{\link[=merge_PA]{merge_PA()}} to generate the input data frame;
+\link{camera_results} for a minimal example dataset.
+}
diff --git a/man/network_clust.Rd b/man/network_clust.Rd
new file mode 100644
index 0000000..8931d94
--- /dev/null
+++ b/man/network_clust.Rd
@@ -0,0 +1,113 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plotclust_PA.R
+\name{network_clust}
+\alias{network_clust}
+\title{Gene set network clustering with igraph base R graphics}
+\usage{
+network_clust(
+ x,
+ clust_result,
+ jaccard_threshold = 0.5,
+ min_degree = 2,
+ superterms = FALSE,
+ superterm_data = NULL,
+ type = "all",
+ seed = 174
+)
+}
+\arguments{
+\item{x}{A \code{JaccardResult} object (output of \code{\link[=geneset_similarity]{geneset_similarity()}}).}
+
+\item{clust_result}{A list returned by \code{\link[=do_clust]{do_clust()}}, used to color nodes by
+hierarchical cluster assignment.}
+
+\item{jaccard_threshold}{Numeric. Minimum Jaccard similarity required for an
+edge to be included in the network. Default: \code{0.50}.}
+
+\item{min_degree}{Integer. Minimum node degree for a node to be retained in
+the network. Default: \code{2}.}
+
+\item{superterms}{Logical. If \code{TRUE}, overlays super-term labels at community
+centroids on the relevant plots. Requires \code{superterm_data}. Default:
+\code{FALSE}.}
+
+\item{superterm_data}{A list returned by \code{\link[=get_superterm]{get_superterm()}}. Required when
+\code{superterms = TRUE}.}
+
+\item{type}{Character. Which plot(s) to draw. One of:
+\itemize{
+\item \code{"clean"} : colored nodes, no labels.
+\item \code{"superterms"} : colored nodes with community super-term labels.
+\item \code{"combined"} : colored nodes with super-term labels and individual node
+labels.
+\item \code{"individual"} : colored nodes with individual node labels only.
+\item \code{"all"} : all four plots drawn sequentially. Default.
+}}
+
+\item{seed}{Integer. Random seed for the Fruchterman-Reingold layout and
+Louvain community detection. Default: \code{174}.}
+}
+\value{
+Invisibly, a named list with:
+\itemize{
+\item \verb{$graph}: The filtered \code{igraph} graph object (\code{g_clean}).
+\item \verb{$layout}: Numeric matrix with the normalized Fruchterman-Reingold node
+coordinates.
+\item \verb{$node_attributes}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{NAME}, \code{cluster},
+\code{community}, \code{degree}, \code{betweenness}, and \code{closeness} (plus \code{superterm}
+if \code{superterms = TRUE}).
+\item \verb{$superterm_report}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{community},
+\code{superterm}, \code{n_genesets}, and \code{geneset_members}. Only present if
+\code{superterms = TRUE}.
+}
+}
+\description{
+Builds a weighted gene set network from a Jaccard similarity matrix, applies
+Louvain community detection, and draws up to four network visualizations
+using base R igraph graphics directly to the active graphics device.
+Optionally overlays super-term community labels generated by
+\code{\link[=get_superterm]{get_superterm()}}.
+}
+\details{
+For a ggplot2-based version that returns plot objects instead of drawing
+them, see \code{\link[=network_clust_gg]{network_clust_gg()}}.
+}
+\examples{
+\dontrun{
+# Requires igraph
+gsl <- list_gmts("path/to/gmt_folder/")
+res <- read.csv("path/to/results.csv")
+
+jac <- geneset_similarity(gsl, res, fdr_th = 0.05)
+clust <- do_clust(jac)
+net <- get_network_communities(jac, threshold = 0.3)
+
+# Draw all four plots directly to the active graphics device
+result <- network_clust(
+ jac,
+ clust_result = clust,
+ superterms = TRUE,
+ superterm_data = net$superterms,
+ type = "all",
+ seed = 174
+)
+
+# Draw only the clean version (no labels)
+network_clust(jac, clust, type = "clean")
+
+# Access node-level metrics from the invisible return
+result$node_attributes
+result$superterm_report
+
+# Save a specific plot to PDF
+pdf("network_superterms.pdf", width = 14, height = 14)
+network_clust(jac, clust, superterms = TRUE,
+ superterm_data = net$superterms, type = "superterms")
+dev.off()
+}
+
+}
+\seealso{
+\code{\link[=geneset_similarity]{geneset_similarity()}}, \code{\link[=do_clust]{do_clust()}}, \code{\link[=get_network_communities]{get_network_communities()}},
+\code{\link[=get_superterm]{get_superterm()}}, \code{\link[=network_clust_gg]{network_clust_gg()}}
+}
diff --git a/man/network_clust_gg.Rd b/man/network_clust_gg.Rd
new file mode 100644
index 0000000..165e8de
--- /dev/null
+++ b/man/network_clust_gg.Rd
@@ -0,0 +1,121 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plotclust_PA.R
+\name{network_clust_gg}
+\alias{network_clust_gg}
+\title{Gene set network clustering with ggraph graphics}
+\usage{
+network_clust_gg(
+ x,
+ clust_result,
+ jaccard_threshold = 0.5,
+ min_degree = 2,
+ superterms = FALSE,
+ superterm_data = NULL,
+ type = "all",
+ seed = 174
+)
+}
+\arguments{
+\item{x}{A \code{JaccardResult} object (output of \code{\link[=geneset_similarity]{geneset_similarity()}}).}
+
+\item{clust_result}{A list returned by \code{\link[=do_clust]{do_clust()}}, used to color nodes by
+hierarchical cluster assignment.}
+
+\item{jaccard_threshold}{Numeric. Minimum Jaccard similarity required for an
+edge to be included in the network. Default: \code{0.50}.}
+
+\item{min_degree}{Integer. Minimum node degree for a node to be retained in
+the network. Default: \code{2}.}
+
+\item{superterms}{Logical. If \code{TRUE}, overlays super-term labels at community
+centroids on the relevant plots. Requires \code{superterm_data}. Default:
+\code{FALSE}.}
+
+\item{superterm_data}{A list returned by \code{\link[=get_network_communities]{get_network_communities()}} (slot
+\verb{$superterms}) or directly by \code{\link[=get_superterm]{get_superterm()}}. Required when
+\code{superterms = TRUE}.}
+
+\item{type}{Character. Which plot(s) to return. One of:
+\itemize{
+\item \code{"clean"} : colored nodes, no labels.
+\item \code{"superterms"} : colored nodes with community super-term labels.
+\item \code{"combined"} : colored nodes with super-term labels and individual node
+labels.
+\item \code{"individual"} : colored nodes with individual node labels only.
+\item \code{"all"} : all four plots. Default.
+}}
+
+\item{seed}{Integer. Random seed for the Fruchterman-Reingold layout and
+Louvain community detection. Default: \code{174}.}
+}
+\value{
+A named list. Depending on \code{type}, the list may contain any
+combination of the ggplot2 elements \verb{$clean}, \verb{$superterms}, \verb{$combined},
+and \verb{$individual}. The list always contains:
+\itemize{
+\item \verb{$node_attributes}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{NAME}, \code{cluster},
+\code{community}, \code{degree}, \code{betweenness}, and \code{closeness} (plus \code{superterm}
+if \code{superterms = TRUE}).
+\item \verb{$superterm_report}: A \code{\link[tibble:tibble]{tibble::tibble()}} with columns \code{community},
+\code{superterm}, \code{n_genesets}, and \code{geneset_members}. Only present if
+\code{superterms = TRUE}.
+}
+}
+\description{
+Builds a weighted gene set network from a Jaccard similarity matrix, applies
+Louvain community detection, and returns up to four ggplot2 network
+visualizations using ggraph. Each plot is a standard ggplot2 object that
+can be further customized, saved with \code{\link[ggplot2:ggsave]{ggplot2::ggsave()}}, or combined
+with patchwork.
+}
+\details{
+Optionally overlays super-term community labels generated by
+\code{\link[=get_superterm]{get_superterm()}}.
+
+For a base R igraph version that draws directly to the active graphics
+device, see \code{\link[=network_clust]{network_clust()}}.
+}
+\examples{
+\dontrun{
+# Requires igraph, ggraph, tidygraph
+gsl <- list_gmts("path/to/gmt_folder/")
+res <- read.csv("path/to/results.csv")
+
+jac <- geneset_similarity(gsl, res, fdr_th = 0.05)
+clust <- do_clust(jac)
+net <- get_network_communities(jac, threshold = 0.3)
+
+# Return all four plots as ggplot2 objects
+plots <- network_clust_gg(
+ jac,
+ clust_result = clust,
+ superterms = TRUE,
+ superterm_data = net$superterms,
+ type = "all",
+ seed = 174
+)
+
+# Display individual plots
+plots$clean # no labels
+plots$superterms # community labels only
+plots$combined # community + node labels
+plots$individual # node labels only
+
+# Save with ggsave (full ggplot2 objects)
+ggplot2::ggsave("network_combined.pdf", plots$combined,
+ width = 14, height = 14)
+
+# Access node-level metrics
+plots$node_attributes
+plots$superterm_report
+
+# Combine two plots with patchwork
+library(patchwork)
+plots$clean + plots$superterms
+}
+
+}
+\seealso{
+\code{\link[=geneset_similarity]{geneset_similarity()}}, \code{\link[=do_clust]{do_clust()}}, \code{\link[=get_network_communities]{get_network_communities()}},
+\code{\link[=get_superterm]{get_superterm()}}, \code{\link[=network_clust]{network_clust()}}
+}
diff --git a/man/nice_KM.Rd b/man/nice_KM.Rd
index 76d24b3..18306c7 100644
--- a/man/nice_KM.Rd
+++ b/man/nice_KM.Rd
@@ -58,6 +58,13 @@ nice_KM(
}
\value{
A \code{ggplot} object (or a list with \code{km_fit} and \code{plot} if \code{returnData = TRUE}).
+
+A ggplot2 object if \code{returnData = FALSE} (default). If
+\code{returnData = TRUE}, a named list with two elements:
+\itemize{
+\item \verb{$km_fit}: The \code{\link[survival:survfit]{survival::survfit()}} object.
+\item \verb{$plot}: The ggplot2 survival curve.
+}
}
\description{
This function fits Kaplan Meier survival curves stratified by status (e.g., 'No' vs 'Yes') for a specified gene column in a data frame. It allows:
@@ -66,3 +73,9 @@ This function fits Kaplan Meier survival curves stratified by status (e.g., 'No'
\item Automatic handling of cases where only one category ('No' or 'Yes') is present, returning an 'empty' placeholder plot with a warning message.
}
}
+\references{
+Kaplan, E. L., & Meier, P. (1958). Nonparametric estimation from
+incomplete observations. \emph{Journal of the American Statistical
+Association}, 53(282), 457–481.
+\doi{10.1080/01621459.1958.10501452}
+}
diff --git a/man/nice_PCA.Rd b/man/nice_PCA.Rd
index a4beb55..b29f42b 100644
--- a/man/nice_PCA.Rd
+++ b/man/nice_PCA.Rd
@@ -74,6 +74,12 @@ nice_PCA(
\item{returnData}{Indicates if the function should return the data (TRUE) or the plot (FALSE). Default: FALSE.}
}
+\value{
+A ggplot2 object if \code{returnData = FALSE} (default). If
+\code{returnData = TRUE}, a numeric matrix of PCA coordinates with dimensions
+samples × \code{outPCs}, with a \code{percentVar} attribute containing the
+proportion of variance explained per component.
+}
\description{
This was inspired on the plotPCA function from DESeq2, made by Wolfgang Huber
But including some improvements made by David Requena. Now it allows:
@@ -83,3 +89,38 @@ But including some improvements made by David Requena. Now it allows:
\item To provide the colors, shapes and fonts.
}
}
+\examples{
+data(vst_counts)
+data(sampledata)
+
+# nice_PCA joins by a column named "id" in annotations
+sampledata_pca <- sampledata
+colnames(sampledata_pca)[colnames(sampledata_pca) == "patient_id"] <- "id"
+
+nice_PCA(
+ object = vst_counts,
+ annotations = sampledata_pca,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD PCA"
+)
+
+# Return PCA coordinates instead of plot
+pca_data <- nice_PCA(
+ object = vst_counts,
+ annotations = sampledata_pca,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ returnData = TRUE
+)
+head(pca_data)
+
+}
+\seealso{
+\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}} for other alternatives;
+\link{vst_counts} for the recommended input matrix.
+}
diff --git a/man/nice_UMAP.Rd b/man/nice_UMAP.Rd
index 3f01428..a99f896 100644
--- a/man/nice_UMAP.Rd
+++ b/man/nice_UMAP.Rd
@@ -74,6 +74,44 @@ nice_UMAP(
\item{returnData}{Indicates if the function should return the data (TRUE) or the plot (FALSE). Default: FALSE.}
}
+\value{
+A ggplot2 object if \code{returnData = FALSE} (default). If
+\code{returnData = TRUE}, a data frame with UMAP coordinates and sample
+annotations.
+}
\description{
Function to make UMAP plots.
}
+\examples{
+\dontrun{
+data(vst_counts)
+data(sampledata)
+
+sampledata_u <- sampledata
+colnames(sampledata_u)[colnames(sampledata_u) == "patient_id"] <- "id"
+
+nice_UMAP(
+ object = vst_counts,
+ annotations = sampledata_u,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD UMAP",
+ neighbors = 5,
+ epochs = 1000,
+ seed = 1905
+)
+}
+
+}
+\references{
+McInnes, L., Healy, J., & Melville, J. (2018). Umap: Uniform Manifold
+Approximation and Projection for Dimension Reduction.
+\emph{arXiv preprint arXiv:1802.03426}.
+\url{https://arxiv.org/abs/1802.03426}
+}
+\seealso{
+\code{\link[=nice_PCA]{nice_PCA()}}, \code{\link[=nice_tSNE]{nice_tSNE()}} for alternative dimensionality
+reduction methods; \link{vst_counts} for the recommended input matrix.
+}
diff --git a/man/nice_VSB.Rd b/man/nice_VSB.Rd
index c7ad3c4..fe13bc4 100644
--- a/man/nice_VSB.Rd
+++ b/man/nice_VSB.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/nice_VSB.R
\name{nice_VSB}
\alias{nice_VSB}
-\title{Function to make Violin-Scatter-Box plots.}
+\title{Function to make Violin-Scatter-Box plots from data frames.}
\usage{
nice_VSB(
object = NULL,
@@ -23,7 +23,7 @@ nice_VSB(
)
}
\arguments{
-\item{object}{A DEseq object already transformed with the variance stabilizing or rlog transformations.}
+\item{object}{A data frame object with normalized counts genes(in rows) across samples(in columns).}
\item{annotations}{Data frame with annotations.}
@@ -31,7 +31,8 @@ nice_VSB(
\item{genename}{The gene name to be used for the plot.}
-\item{symbol}{The gene symbol to be used for the plot.}
+\item{symbol}{The gene symbol to display in the plot title. To obtain
+gene symbols from Ensembl IDs, use \code{\link[=get_annotations]{get_annotations()}}.}
\item{labels}{A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M").}
@@ -53,7 +54,32 @@ nice_VSB(
\item{legend_size}{Font of the title and elements of the legend. Default: c(title = 14, elements = 12).}
}
+\value{
+A ggplot2 object.
+}
\description{
This function will make a Boxplot, using a DEseq object.
It will show the data points on top with a small deviation (jitter) for a better visualization.
}
+\examples{
+data(norm_counts)
+data(sampledata)
+
+nice_VSB(
+ object = norm_counts,
+ annotations = sampledata,
+ variables = c(fill = "sample_type"),
+ genename = rownames(norm_counts)[1],
+ categories = c("normal", "tumor"),
+ labels = c("Normal", "Tumor"),
+ colors = c("steelblue", "firebrick"),
+ shapes = 21,
+ markersize = 3
+)
+
+}
+\seealso{
+\code{\link[=nice_Volcano]{nice_Volcano()}} for genome-wide visualization; \code{\link[=detect_filter]{detect_filter()}}
+to identify reliably expressed genes; \code{\link[=get_stars]{get_stars()}} to add significance
+annotations; \link{norm_counts} for an example input matrix.
+}
diff --git a/man/nice_VSB_DEseq2.Rd b/man/nice_VSB_DEseq2.Rd
new file mode 100644
index 0000000..5c0a10a
--- /dev/null
+++ b/man/nice_VSB_DEseq2.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nice_VSB_DEseq2.R
+\name{nice_VSB_DEseq2}
+\alias{nice_VSB_DEseq2}
+\title{Function to make Box-Scatter-Violin plots from DEseq2 output directly.}
+\usage{
+nice_VSB_DEseq2(
+ object = NULL,
+ variables = c(fill = "VarFill", shape = "VarShape"),
+ genename = NULL,
+ symbol = NULL,
+ labels = c("N", "P", "R", "M"),
+ categories = c("normal", "primary", "recurrence", "metastasis"),
+ colors = NULL,
+ shapes = NULL,
+ markersize = NULL,
+ alpha = 0.8,
+ width = NULL,
+ height = NULL,
+ jitter = 0.2,
+ dpi = 150,
+ save = FALSE,
+ title_size = c(axis = 20, fig = 24),
+ label_size = c(x = 20, y = 16),
+ legend_size = c(title = 14, elements = 12)
+)
+}
+\arguments{
+\item{object}{A DEseq object already transformed with the variance stabilizing or rlog transformations.}
+
+\item{variables}{To indicate the variables to be used as Shape and Fill of the markers.}
+
+\item{genename}{The gene name to be used for the plot.}
+
+\item{symbol}{The gene symbol to display in the plot title. To obtain
+gene symbols from Ensembl IDs, use \code{\link[=get_annotations]{get_annotations()}}.}
+
+\item{labels}{A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M").}
+
+\item{categories}{A vector containing the labels for the legend. Default: c("normal", "primary", "recurrence", "metastasis").}
+
+\item{colors}{Vector of colors to be used for the categories of the variable assigned as Marker Fill.}
+
+\item{shapes}{Vector of shapes to be used for the categories of the variable assigned as Marker Shape.}
+
+\item{markersize}{Size of the marker.}
+
+\item{alpha}{Transparency of the marker, which goes from 0 (transparent) to 1 (no transparent). Default: 0.8.}
+
+\item{width}{Width of the plot.}
+
+\item{height}{Height of the plot.}
+
+\item{jitter}{Random deviation added to the dots. Default: 0.2.}
+
+\item{dpi}{DPI of the plot. Default: 150.}
+
+\item{save}{To save the plot. Default: FALSE.}
+
+\item{title_size}{Font of the title and axis names. Default: c(axis = 20, fig = 24).}
+
+\item{label_size}{Font of the labels (x-axis) and numbers (y-axis). Default: c(x = 20, y = 16).}
+
+\item{legend_size}{Font of the title and elements of the legend. Default: c(title = 14, elements = 12).}
+}
+\description{
+This function will make a Boxplot, using a DEseq object.
+It will show the data points on top with a small deviation (jitter) for a better visualization.
+}
+\examples{
+\dontrun{
+# requires a DESeq2 object
+
+data(sampledata)
+
+nice_VSB_DEseq2(
+ object = vst,
+ annotations = sampledata,
+ variables = c(fill = "sample_type"),
+ genename = rownames(norm_counts)[1],
+ categories = c("normal", "tumor"),
+ labels = c("Normal", "Tumor"),
+ colors = c("steelblue", "firebrick"),
+ shapes = 21,
+ markersize = 3
+)
+}
+}
diff --git a/man/nice_Volcano.Rd b/man/nice_Volcano.Rd
index b40a3db..73b8852 100644
--- a/man/nice_Volcano.Rd
+++ b/man/nice_Volcano.Rd
@@ -40,7 +40,9 @@ nice_Volcano(
\item{x_var}{Name of the column in \code{results} to plot on the x-axis (e.g. log₂FC).}
-\item{label_var}{to be defined.}
+\item{label_var}{Name of the column in \code{results} to use as point labels
+(e.g. gene IDs or HGNC symbols). To use gene symbols, first run
+\code{\link[=get_annotations]{get_annotations()}} and join the \code{symbol} column to your results table.}
\item{legend}{Logical. Control legend display. Default: TRUE.}
@@ -50,6 +52,9 @@ nice_Volcano(
\item{genes}{Vector of genes to label in the plot. Default: NULL.}
}
+\value{
+A ggplot2 object
+}
\description{
Volcano plot with configurable point shapes and threshold annotations:
\itemize{
@@ -58,3 +63,34 @@ Volcano plot with configurable point shapes and threshold annotations:
\item Vertical dashed lines at log-fold-change cutoffs, shown as custom x-axis ticks.
}
}
+\examples{
+data(deseq2_results)
+
+nice_Volcano(
+ results = deseq2_results,
+ x_var = "log2FoldChange",
+ y_var = "padj",
+ label_var = "gene_id",
+ title = "TCGA-LUAD: Tumor vs Normal",
+ cutoff_y = 0.05,
+ cutoff_x = 1,
+ x_range = 8,
+ y_max = 10
+)
+
+# Highlight specific genes
+nice_Volcano(
+ results = deseq2_results,
+ x_var = "log2FoldChange",
+ y_var = "padj",
+ label_var = "gene_id",
+ title = "TCGA-LUAD: Tumor vs Normal",
+ genes = deseq2_results$gene_id[1:5]
+)
+
+}
+\seealso{
+\code{\link[=nice_VSB]{nice_VSB()}} for gene-level expression visualization;
+\code{\link[=detect_filter]{detect_filter()}} to filter detectable genes before plotting;
+\link{deseq2_results} for an example input dataset.
+}
diff --git a/man/nice_tSNE.Rd b/man/nice_tSNE.Rd
index 653913e..4937df1 100644
--- a/man/nice_tSNE.Rd
+++ b/man/nice_tSNE.Rd
@@ -71,6 +71,43 @@ nice_tSNE(
\item{returnData}{Indicates if the function should return the data (TRUE) or the plot (FALSE). Default: FALSE.}
}
+\value{
+A ggplot2 object if \code{returnData = FALSE} (default). If
+\code{returnData = TRUE}, a data frame with tSNE coordinates and sample
+annotations.
+}
\description{
Function to make tSNE plots.
}
+\examples{
+\dontrun{
+data(vst_counts)
+data(sampledata)
+
+sampledata_t <- sampledata
+colnames(sampledata_t)[colnames(sampledata_t) == "patient_id"] <- "id"
+
+# perplexity must be < n_samples / 3; with 32 samples use perplexity = 5
+nice_tSNE(
+ object = vst_counts,
+ annotations = sampledata_t,
+ perplexity = 5,
+ max_iterations = 1000,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD tSNE",
+ seed = 1905
+)
+}
+}
+\references{
+van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE.
+\emph{Journal of Machine Learning Research}, 9, 2579–2605.
+\url{https://jmlr.org/papers/v9/vandermaaten08a.html}
+}
+\seealso{
+\code{\link[=nice_PCA]{nice_PCA()}}, \code{\link[=nice_UMAP]{nice_UMAP()}} for alternative dimensionality
+reduction methods; \link{vst_counts} for the recommended input matrix.
+}
diff --git a/man/norm_counts.Rd b/man/norm_counts.Rd
new file mode 100644
index 0000000..be06c00
--- /dev/null
+++ b/man/norm_counts.Rd
@@ -0,0 +1,113 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/deseq2_results.R
+\docType{data}
+\name{norm_counts}
+\alias{norm_counts}
+\title{Normalized counts matrix for TCGA-LUAD}
+\format{
+A numeric matrix with 21,330 rows (genes) and 32 columns (samples):
+\describe{
+\item{rows}{Ensembl gene IDs (e.g., \code{"ENSG00000141510"}).}
+\item{columns}{Sample IDs matching the \code{patient_id} column of
+\link{sampledata}.}
+\item{values}{Non-negative numeric. Size-factor normalized counts.
+Range: [0, 1,889,573].}
+}
+}
+\source{
+TCGA-LUAD STAR counts downloaded from the GDC Data Portal
+(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
+Normalized with DESeq2::counts() (\code{normalized = TRUE}); generated by
+\code{data-raw/deseq2_results.R}.
+}
+\usage{
+norm_counts
+}
+\description{
+DESeq2 size-factor normalized counts derived from the TCGA-LUAD RNA-seq
+dataset (16 tumor samples, 16 normal samples). Counts are divided by
+DESeq2 size factors to correct for differences in library size across
+samples, but remain in counts scale (not log-transformed).
+}
+\details{
+Suitable as input for \code{\link[=nice_VSB]{nice_VSB()}}, \code{\link[=detect_filter]{detect_filter()}}, and
+\code{\link[=add_annotations]{add_annotations()}}. For dimensionality reduction methods (\code{\link[=nice_PCA]{nice_PCA()}},
+\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}}) use \link{vst_counts} instead, which removes the
+mean-variance dependence of RNA-seq data.
+}
+\examples{
+data(norm_counts)
+data(sampledata)
+
+# Dimensions
+dim(norm_counts)
+
+# Value range
+range(norm_counts)
+
+# Expression of a specific gene across samples
+norm_counts["ENSG00000141510", ]
+
+# Violin-Scatter-Box plot for one gene
+nice_VSB(
+ object = norm_counts,
+ annotations = sampledata,
+ variables = c(fill = "sample_type"),
+ genename = "ENSG00000141510",
+ categories = c("normal", "tumor"),
+ labels = c("Normal", "Tumor"),
+ colors = c("steelblue", "firebrick")
+)
+
+\dontrun{
+# detect_filter: (required: "ensembl" column in results)
+deseq2_res <- deseq2_results
+colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl"
+rownames(deseq2_res) <- deseq2_res$ensembl
+
+# Get sample IDs per group from sampledata
+samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"]
+samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"]
+
+detected <- detect_filter(
+ norm.counts = as.data.frame(norm_counts),
+ df.BvsA = deseq2_res,
+ samples.baseline = samples_normal,
+ samples.condition1 = samples_tumor,
+ cutoffs = c(50, 50, 0)
+)
+
+# Number of detectable genes
+length(detected$DetectGenes)
+
+# Subset results to detectable genes
+head(detected$Comparison1)
+
+# add_annotations: add gene symbols
+# Required: reference df with geneID + annotation columns
+# Example using biomaRt to fetch gene symbols
+library(biomaRt)
+mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl")
+ref <- getBM(
+ attributes = c("ensembl_gene_id", "hgnc_symbol", "gene_biotype"),
+ filters = "ensembl_gene_id",
+ values = rownames(norm_counts),
+ mart = mart
+)
+colnames(ref)[1] <- "geneID"
+
+norm_counts_annot <- add_annotations(
+ object = norm_counts,
+ reference = ref,
+ variables = c("hgnc_symbol", "gene_biotype")
+)
+
+head(norm_counts_annot[, c("geneID", "hgnc_symbol", "gene_biotype")])
+}
+
+}
+\seealso{
+\link{vst_counts}, \link{deseq2_results}, \link{sampledata}, \code{\link[=nice_VSB]{nice_VSB()}},
+\code{\link[=detect_filter]{detect_filter()}}, \code{\link[=add_annotations]{add_annotations()}}
+}
+\keyword{datasets}
diff --git a/man/plot_GSEA.Rd b/man/plot_GSEA.Rd
deleted file mode 100644
index 9e39ad3..0000000
--- a/man/plot_GSEA.Rd
+++ /dev/null
@@ -1,38 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/plot_GSEA.R
-\name{plot_GSEA}
-\alias{plot_GSEA}
-\title{Plot global GSEA results}
-\usage{
-plot_GSEA(
- data,
- geneset_col,
- collection_col,
- nes_col,
- logfdr_col,
- text_size_genesets = 5,
- text_size_collection = 5
-)
-}
-\arguments{
-\item{data}{Data frame containing the GSEA results.}
-
-\item{geneset_col}{Name of the column containing the genesets.}
-
-\item{collection_col}{Name of the column containing the collections.}
-
-\item{nes_col}{Name of the column containing the NES values.}
-
-\item{logfdr_col}{Name of the column containing \eqn{-\log_{10}(FDR)} values.}
-
-\item{text_size_genesets}{Text size for the geneset labels.}
-
-\item{text_size_collection}{Text size for the collection labels.}
-}
-\value{
-GSEA barplots arranged in a grid.
-}
-\description{
-Generates a composite plot displaying NES values, pathway labels,
-and a \emph{logFDR} legend, organized by MSigDB collections.
-}
diff --git a/man/power_analysis.Rd b/man/power_analysis.Rd
index 3d054d0..e5657c8 100644
--- a/man/power_analysis.Rd
+++ b/man/power_analysis.Rd
@@ -30,8 +30,44 @@ power_analysis(
\item{max_n}{Integer. Maximum sample size per group to explore.}
-\item{plot}{Logical. If TRUE, draws the power curve; if FALSE, skips plotting.}
+\item{plot}{Logical. If TRUE, draws the power curve; if FALSE, skips plotting.}
+}
+\value{
+A named list. If \code{plot = TRUE}, contains three elements:
+\itemize{
+\item \verb{$min_sample_size}: Integer. Minimum sample size per group to reach
+\code{power_target}.
+\item \verb{$power_table}: A data frame with columns \code{SampleSize} and \code{Power}.
+\item \verb{$plot}: A ggplot2 object of the power curve.
+}
+
+If \code{plot = FALSE}, returns only \verb{$min_sample_size} and \verb{$power_table}.
}
\description{
Power analysis for RNA-seq differential expression with optional plotting
}
+\examples{
+# Basic power analysis with default parameters
+result <- power_analysis(
+ effect_size = 1,
+ dispersion = 0.1,
+ n_genes = 20000,
+ prop_de = 0.05,
+ alpha = 0.05,
+ power_target = 0.8,
+ max_n = 20
+)
+
+# Minimum sample size to reach 80\% power
+result$min_sample_size
+
+# Full power table
+head(result$power_table)
+
+# Higher effect size requires fewer samples
+power_analysis(effect_size = 2, dispersion = 0.1, plot = FALSE)$min_sample_size
+
+# See plot
+#power_analysis$plot
+
+}
diff --git a/man/save_results.Rd b/man/save_results.Rd
index 6ea21ed..12176bd 100644
--- a/man/save_results.Rd
+++ b/man/save_results.Rd
@@ -15,6 +15,15 @@ save_results(df, name, l2fc = 0, cutoff_alpha = 0.25)
\item{cutoff_alpha}{The cut-off of the False Discovery Rate (FDR o padj). Default = 0.25.}
}
+\value{
+Invisibly returns \code{NULL}. Saves three \code{.xlsx} files to the working
+directory:
+\itemize{
+\item \verb{_full.xlsx} : all genes.
+\item \verb{_up_log2FC>_FDR<.xlsx} : upregulated genes.
+\item \verb{_down_log2FC<_FDR<.xlsx} : downregulated genes.
+}
+}
\description{
This function takes as input the output of the function "results()" of DEseq2.
And will save 3 tables:
@@ -24,3 +33,26 @@ And will save 3 tables:
\item A table including only the under-expressed genes
}
}
+\examples{
+\dontrun{
+data(deseq2_results)
+
+# Save full results + over/under-expressed tables as .xlsx files
+save_results(
+ df = deseq2_results,
+ name = "TCGA_LUAD_TumorVsNormal",
+ l2fc = 1,
+ cutoff_alpha = 0.05
+)
+
+# Creates:
+# TCGA_LUAD_TumorVsNormal_full.xlsx
+# TCGA_LUAD_TumorVsNormal_up_log2FC>1_FDR<0.05.xlsx
+# TCGA_LUAD_TumorVsNormal_down_log2FC<1_FDR<0.05.xlsx
+}
+
+}
+\seealso{
+\code{\link[=detect_filter]{detect_filter()}} to further filter saved results;
+\link{deseq2_results} for an example input.
+}
diff --git a/man/split_cases.Rd b/man/split_cases.Rd
index 450803d..ec82263 100644
--- a/man/split_cases.Rd
+++ b/man/split_cases.Rd
@@ -32,9 +32,73 @@ split_cases(
\item{change_cutoff}{The values of the change variable will be filtered by |change_var| > change_cutoff. Default: 0.}
}
+\value{
+A named list of 10 data frames (\verb{$Case1} through \verb{$Case10}), each
+containing the genes belonging to that mutually exclusive expression
+pattern. Cases 1–6 contain a \code{trend} column (\code{"up"} or \code{"dn"}). Case 10
+contains genes not significant in any comparison.
+}
\description{
When performing differential expression analysis of a study with 3 phenotypes,
including the baseline, there are 10 mutually exclusive cases where genes can
fall into. This function allows us to obtain these 10 cases and saves them
into a list.
}
+\details{
+The 10 cases are:
+\itemize{
+\item \strong{Case 1} : Ladder: significant in all 3, same direction.
+\item \strong{Case 2} : Stronger in condition 1: significant in all 3, direction
+reverses between conditions.
+\item \strong{Case 3} : Stronger in condition 2.
+\item \strong{Case 4} : Marker of condition 2: significant in CvsA and BvsC only.
+\item \strong{Case 5} : Marker of condition 1: significant in BvsA and BvsC only.
+\item \strong{Case 6} : Shared: significant in BvsA and CvsA only.
+\item \strong{Cases 7-9} : Significant in only one comparison.
+\item \strong{Case 10} : Not significant in any comparison.
+}
+}
+\examples{
+\dontrun{
+# split_cases requires three DESeq2 comparisons.
+# Simulate a 3-phenotype study: Normal (A), Primary (B), Metastasis (C)
+set.seed(174)
+n_genes <- 500
+
+make_res <- function(seed) {
+ set.seed(seed)
+ data.frame(
+ ensembl = paste0("ENSG", sprintf("\%011d", seq_len(n_genes))),
+ log2FoldChange = rnorm(n_genes, 0, 1.5),
+ padj = runif(n_genes, 0, 0.5),
+ stringsAsFactors = FALSE
+ )
+}
+
+df_BvsA <- make_res(1)
+df_CvsA <- make_res(2)
+df_BvsC <- make_res(3)
+
+cases <- split_cases(
+ df.BvsA = df_BvsA,
+ df.CvsA = df_CvsA,
+ df.BvsC = df_BvsC,
+ unique_id = "ensembl",
+ significance_var = "padj",
+ significance_cutoff = 0.25,
+ change_var = "log2FoldChange",
+ change_cutoff = 0
+)
+
+# Number of genes per case
+sapply(cases, nrow)
+
+# Inspect Case 1 (ladder genes: significant in all 3 comparisons)
+head(cases$Case1)
+}
+
+}
+\seealso{
+\code{\link[=detect_filter]{detect_filter()}} to pre-filter genes before splitting;
+\code{\link[=nice_Volcano]{nice_Volcano()}} to visualize individual comparison results.
+}
diff --git a/man/splot_PA.Rd b/man/splot_PA.Rd
new file mode 100644
index 0000000..aa4053f
--- /dev/null
+++ b/man/splot_PA.Rd
@@ -0,0 +1,117 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_PA.R
+\name{splot_PA}
+\alias{splot_PA}
+\title{Pathway analysis visualization for a single comparison}
+\usage{
+splot_PA(
+ data,
+ geneset_col = "NAME",
+ collection_col = "COLLECTION",
+ nes_col = "NES",
+ fdr_col = "FDR",
+ order = "desc",
+ fill_limits = NULL,
+ fill_palette = c("white", "red"),
+ theme_params = list()
+)
+}
+\arguments{
+\item{data}{A data frame of pathway analysis results for a single
+comparison. Typically the output of \code{\link[=merge_PA]{merge_PA()}} filtered to one value
+of the \code{COMPARISON} column, or results from a single CAMERA/GSEA run.
+Must contain the columns specified by \code{geneset_col}, \code{collection_col},
+\code{nes_col}, and \code{fdr_col}.}
+
+\item{geneset_col}{Name of the column containing gene set labels shown on
+the y-axis. Default: \code{"NAME"}.}
+
+\item{collection_col}{Name of the column containing MSigDB collection
+labels used to group gene sets (e.g., \code{"KEGG"}, \code{"HALLMARK"}, \code{"GO"}).
+Default: \code{"COLLECTION"}.}
+
+\item{nes_col}{Name of the column containing NES values (x-axis).
+Default: \code{"NES"}.}
+
+\item{fdr_col}{Name of the column containing FDR values. \code{-log10(FDR)} is
+computed internally and used as the fill color. Default: \code{"FDR"}.}
+
+\item{order}{One of \code{"desc"} or \code{"asc"}. Sort order for NES values on the
+y-axis. Default: \code{"desc"}.}
+
+\item{fill_limits}{Numeric vector of length 2 setting the color scale range
+for \code{-log10(FDR)}. Values outside this range are clamped to the nearest
+limit. For example, \code{fill_limits = c(0, 5)} maps all gene sets with
+\code{-log10(FDR) >= 5} (i.e., FDR <= 0.00001) to the maximum color (red),
+and any value below 0 to the minimum color (white). Useful when a few
+gene sets have extreme significance that washes out color variation in the
+rest. Default: \code{NULL} (auto uses the actual data range).}
+
+\item{fill_palette}{Character vector of two colors for the fill gradient
+(low to high -log10(FDR)). Default: \code{c("white", "red")}.}
+
+\item{theme_params}{Named list to override default theme parameters.
+See Details.}
+}
+\value{
+A \code{patchwork} object combining six ggplot2 panels.
+}
+\description{
+Generates a publication-quality multi-panel pathway enrichment plot for a
+single comparison using patchwork. Gene sets appear on the y-axis grouped
+by MSigDB collection, NES on the x-axis, and -log10(FDR) as fill color.
+Six panels are assembled side by side: a "Pathways" label, gene set names,
+the NES bar chart, collection labels, a "MSigDB" label, and the color legend.
+}
+\details{
+For visualizing enrichment across multiple comparisons, use
+\code{\link[=multiplot_PA]{multiplot_PA()}} instead.
+
+\code{theme_params} accepts any of the following named elements:
+\describe{
+\item{\code{side_label_size}}{Size for "Pathways" and "MSigDB" labels.
+Default: \code{35}.}
+\item{\code{geneset_text_size}}{Text size for gene set labels. Default: \code{5}.}
+\item{\code{collection_text_size}}{Text size for collection labels.
+Default: \code{5}.}
+\item{\code{panel_widths}}{Patchwork relative widths for the 6 panels.
+Default: \code{c(4, 25, 15, 3, 10, 3)}.}
+\item{\code{col_size}}{Border linewidth for \code{geom_col}. Default: \code{1}.}
+\item{\code{axis_title_size}}{Font size for axis titles. Default: \code{45}.}
+\item{\code{axis_text_size_x}}{Font size for x-axis labels. Default: \code{30}.}
+\item{\code{tick_size}}{Linewidth for axis ticks. Default: \code{1.5}.}
+\item{\code{tick_length}}{Length of axis ticks in cm. Default: \code{0.3}.}
+\item{\code{panel_spacing_single}}{Spacing between facets. Default: \code{4}.}
+}
+}
+\examples{
+\dontrun{
+gsea_results <- merge_PA("path/to/gsea_results/")
+
+# Filter to one comparison
+single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ]
+
+splot_PA(
+ data = single,
+ geneset_col = "NAME",
+ collection_col = "COLLECTION",
+ nes_col = "NES",
+ fdr_col = "FDR"
+)
+
+# Cap color scale at -log10(FDR) = 5 so subtle differences are visible
+# (gene sets with FDR <= 0.00001 all get the same max red color)
+splot_PA(
+ data = single,
+ geneset_col = "NAME", collection_col = "COLLECTION",
+ nes_col = "NES", fdr_col = "FDR",
+ fill_limits = c(0, 5)
+)
+}
+
+}
+\seealso{
+\code{\link[=multiplot_PA]{multiplot_PA()}} for multi-comparison faceted barplots;
+\code{\link[=merge_PA]{merge_PA()}} to generate the input data frame;
+\link{camera_results} for a minimal example dataset.
+}
diff --git a/man/tpm.Rd b/man/tpm.Rd
index f648a45..195c640 100644
--- a/man/tpm.Rd
+++ b/man/tpm.Rd
@@ -11,6 +11,10 @@ tpm(raw_counts, gene_lengths)
\item{gene_lengths}{A column with the gene lengths.}
}
+\value{
+A numeric matrix of the same dimensions as \code{raw_counts} with TPM
+values. Column sums equal 1,000,000 by definition.
+}
\description{
TPM: Transcript per million. See https://www.biostars.org/p/273537/
The input table is numeric:
@@ -20,3 +24,40 @@ The input table is numeric:
The gene lengths are in a column of a dataframe with the same row order.
}
}
+\note{
+TPM normalizes for both sequencing depth and gene length, making
+values comparable between genes within a sample. It is not appropriate
+for differential expression analysis, use DESeq2 normalized counts
+(\link{norm_counts}) for that purpose. Gene lengths from \code{\link[=get_annotations]{get_annotations()}}
+are genomic lengths (including introns); for higher accuracy use
+transcript-level lengths.
+}
+\examples{
+\dontrun{
+data(raw_counts)
+data(deseq2_results)
+
+# Gene lengths are needed, retrieve from get_annotations() or use
+# pre-fetched lengths. Here we use the gene_length column if available.
+annotations <- get_annotations(
+ ensembl_ids = rownames(raw_counts),
+ mode = "genes"
+)
+
+# Match gene lengths to raw_counts row order
+gene_lengths <- annotations$gene_length[
+ match(rownames(raw_counts), annotations$geneID)
+]
+
+# Calculate TPM
+tpm_matrix <- tpm(raw_counts, gene_lengths)
+
+# Check: column sums should all be 1,000,000
+round(colSums(tpm_matrix)[1:3])
+}
+
+}
+\seealso{
+\code{\link[=get_annotations]{get_annotations()}} to obtain \code{gene_lengths};
+\link{norm_counts} for DESeq2 size-factor normalized counts.
+}
diff --git a/man/vst_counts.Rd b/man/vst_counts.Rd
new file mode 100644
index 0000000..8811dc6
--- /dev/null
+++ b/man/vst_counts.Rd
@@ -0,0 +1,96 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/deseq2_results.R
+\docType{data}
+\name{vst_counts}
+\alias{vst_counts}
+\title{Variance-stabilized counts matrix for TCGA-LUAD}
+\format{
+A numeric matrix with 21,330 rows (genes) and 32 columns (samples):
+\describe{
+\item{rows}{Ensembl gene IDs (e.g., \code{"ENSG00000141510"}).}
+\item{columns}{Sample IDs matching the \code{patient_id} column of
+\link{sampledata}.}
+\item{values}{Numeric. VST-transformed expression values on a log2-like
+scale. Range: [1.78, 20.85].}
+}
+}
+\source{
+TCGA-LUAD STAR counts downloaded from the GDC Data Portal
+(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
+Transformed with \code{\link[DESeq2:vst]{DESeq2::vst()}} (\code{blind = TRUE}); generated by
+\code{data-raw/deseq2_results.R}.
+}
+\usage{
+vst_counts
+}
+\description{
+Variance Stabilizing Transformation (VST) applied to the TCGA-LUAD RNA-seq
+dataset (16 tumor samples, 16 normal samples) using \code{\link[DESeq2:vst]{DESeq2::vst()}} with
+\code{blind = TRUE}. VST removes the mean-variance dependence characteristic of
+RNA-seq count data, placing all genes on a comparable log2-like scale. This
+makes it the appropriate input for sample-level dimensionality reduction and
+clustering methods.
+}
+\details{
+Suitable as input for \code{\link[=nice_PCA]{nice_PCA()}}, \code{\link[=nice_UMAP]{nice_UMAP()}}, and \code{\link[=nice_tSNE]{nice_tSNE()}}. For
+gene-level expression plots (\code{\link[=nice_VSB]{nice_VSB()}}) or filtering (\code{\link[=detect_filter]{detect_filter()}})
+use \link{norm_counts} instead.
+}
+\examples{
+data(vst_counts)
+data(sampledata)
+
+# Dimensions
+dim(vst_counts)
+
+# Value range (log2-like scale)
+range(vst_counts)
+
+# PCA plot colored by sample type
+colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
+nice_PCA(
+ object = vst_counts,
+ annotations = sampledata,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD PCA"
+)
+
+\dontrun{
+# UMAP plot
+colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
+
+nice_UMAP(
+ object = vst_counts,
+ annotations = sampledata,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD UMAP"
+)
+
+# tSNE plot
+# perplexity must be lower than the number of samples divided by 3
+
+colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
+nice_tSNE(
+ object = vst_counts,
+ annotations = sampledata,
+ perplexity = 5,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD tSNE"
+)
+}
+
+}
+\seealso{
+\link{norm_counts}, \link{deseq2_results}, \link{sampledata}, \code{\link[=nice_PCA]{nice_PCA()}},
+\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}}
+}
+\keyword{datasets}
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 0000000..097b241
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/DEA_workflow.Rmd b/vignettes/DEA_workflow.Rmd
new file mode 100644
index 0000000..b8388b7
--- /dev/null
+++ b/vignettes/DEA_workflow.Rmd
@@ -0,0 +1,478 @@
+---
+title: "Differential Expression Analysis Workflow with OmicsKit"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Differential Expression Analysis Workflow with OmicsKit}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ fig.align = "center",
+ warning = FALSE,
+ message = FALSE
+)
+```
+
+## Overview
+
+This vignette demonstrates a complete RNA-seq differential expression analysis
+(DEA) workflow using OmicsKit, from study design through visualization, using
+real TCGA-LUAD data (16 tumor vs. 16 normal lung samples).
+
+The workflow covers:
+
+1. **Study design** : statistical power estimation
+2. **Quality control** : unsupervised dimensionality reduction
+3. **DEA results** : annotation, normalization, and export
+4. **Visualization** : volcano plots and gene-level expression plots
+5. **Multi-comparison** : splitting genes into exclusive expression cases
+
+## Required packages
+
+```{r packages}
+library(OmicsKit)
+library(ggplot2)
+```
+
+---
+
+## Section 1 : Study design: power analysis
+
+Before collecting data or running any analysis, it is good practice to estimate
+the minimum sample size needed to reliably detect differentially expressed genes
+at a given effect size and false discovery rate.
+
+`power_analysis()` computes statistical power across a range of sample sizes
+using an analytical approximation that accounts for multiple testing:
+
+```{r power_analysis}
+result <- power_analysis(
+ effect_size = 1, # minimum log2 fold-change to detect
+ dispersion = 0.1, # biological coefficient of variation squared
+ n_genes = 20000, # total genes tested
+ prop_de = 0.05, # expected proportion of DE genes
+ alpha = 0.05, # desired FDR
+ power_target = 0.8, # desired power (80%)
+ max_n = 30,
+ plot = TRUE
+)
+
+# Minimum samples per group needed
+result$min_sample_size
+
+# Full power curve
+head(result$power_table)
+
+# See plot
+result$plot
+```
+
+```{r power_plot, echo = FALSE, fig.cap = "Statistical power vs. sample size per group. The red dashed line marks 80% power; the blue dashed line marks the minimum required sample size.", fig.width = 7, fig.height = 5}
+knitr::include_graphics("figures/power_analysis.png")
+```
+
+The TCGA-LUAD dataset used in this vignette has 16 samples per group, well
+above the minimum required for detecting log2FC ≥ 1 at 80% power.
+
+---
+
+## Section 2 : Quality control: unsupervised clustering
+
+Before running DEA, it is essential to verify that samples cluster by their
+biological group and to detect potential batch effects or outliers. OmicsKit
+provides three complementary dimensionality reduction methods that all accept
+the same VST-transformed count matrix.
+
+**Why VST?** Variance Stabilizing Transformation removes the mean-variance
+dependence of RNA-seq counts, placing all genes on a comparable log2-like
+scale. This prevents highly expressed genes from dominating the sample-level
+distances.
+
+```{r load_data}
+data(vst_counts)
+data(sampledata)
+
+# nice_PCA, nice_UMAP, and nice_tSNE require a column named "id"
+sampledata_dim <- sampledata
+colnames(sampledata_dim)[colnames(sampledata_dim) == "patient_id"] <- "id"
+
+dim(vst_counts)
+table(sampledata$sample_type)
+```
+
+### PCA
+
+PCA is the fastest method and should always be the first step. The first two
+principal components often capture the main sources of variation.
+
+```{r nice_PCA, eval = FALSE}
+nice_PCA(
+ object = vst_counts,
+ annotations = sampledata_dim,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD PCA: Tumor vs Normal"
+)
+```
+
+```{r pca_plot, echo = FALSE, fig.cap = "PCA of VST-transformed TCGA-LUAD counts. Tumor (red) and normal (blue) samples separate cleanly along PC1.", fig.width = 7, fig.height = 6}
+knitr::include_graphics("figures/nice_PCA.png")
+```
+
+### UMAP
+
+UMAP captures non-linear structure and is particularly useful for larger
+datasets or when PCA does not show clear separation.
+
+```{r nice_UMAP, eval = FALSE}
+nice_UMAP(
+ object = vst_counts,
+ annotations = sampledata_dim,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD UMAP: Tumor vs Normal",
+ neighbors = 5,
+ epochs = 1000,
+ seed = 174
+)
+```
+
+```{r umap_plot, echo = FALSE, fig.cap = "UMAP of VST-transformed TCGA-LUAD counts.", fig.width = 7, fig.height = 6}
+knitr::include_graphics("figures/nice_UMAP.png")
+```
+
+### tSNE
+
+tSNE is useful for visualizing tight local clusters. Note that `perplexity`
+must be less than one-third of the number of samples.
+
+```{r nice_tSNE, eval = FALSE}
+# With 32 samples, perplexity must be < 10
+nice_tSNE(
+ object = vst_counts,
+ annotations = sampledata_dim,
+ perplexity = 5,
+ max_iterations = 1000,
+ variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"),
+ shapes = c(21, 21),
+ title = "TCGA-LUAD tSNE: Tumor vs Normal",
+ seed = 174
+)
+```
+
+```{r tsne_plot, echo = FALSE, fig.cap = "tSNE of VST-transformed TCGA-LUAD counts.", fig.width = 7, fig.height = 6}
+knitr::include_graphics("figures/nice_tSNE.png")
+```
+
+---
+
+## Section 3 : DEA results: annotation, normalization, and export
+
+Once QC confirms clean group separation, we work with the DESeq2 results. The
+`deseq2_results` object is already provided as an example dataset.
+
+```{r load_results}
+data(deseq2_results)
+data(norm_counts)
+
+# Overview
+dim(deseq2_results)
+sum(deseq2_results$padj < 0.05, na.rm = TRUE)
+```
+
+### Gene annotations
+
+`get_annotations()` queries Ensembl via biomaRt to retrieve gene symbols,
+biotype, chromosomal location, and length. Requires an internet connection.
+
+```{r get_annotations, eval = FALSE}
+annotations <- get_annotations(
+ ensembl_ids = deseq2_results$gene_id,
+ mode = "genes",
+ version = "Current",
+ filename = "luad_gene_annotations",
+ format = "csv"
+)
+
+head(annotations)
+```
+
+### Adding annotations to results and counts
+
+`add_annotations()` joins annotation columns to any matrix or data frame using
+Ensembl IDs as the key:
+
+```{r add_annotations, eval = FALSE}
+# Add gene symbol and biotype to normalized counts
+norm_counts_annot <- add_annotations(
+ object = norm_counts,
+ reference = annotations,
+ variables = c("symbol", "biotype")
+)
+
+head(norm_counts_annot[, c("geneID", "symbol", "biotype")])
+```
+
+### TPM normalization
+
+While DESeq2 normalized counts are appropriate for DEA, TPM is useful for
+comparing expression levels between genes within a sample. Note that TPM
+requires accurate gene lengths, `get_annotations()` provides these via
+`$gene_length` (computed as `end - start + 1`).
+
+```{r tpm, eval = FALSE}
+gene_lengths <- annotations$gene_length[
+ match(rownames(raw_counts), annotations$geneID)
+]
+
+tpm_matrix <- tpm(raw_counts, gene_lengths)
+
+# Column sums should all equal 1,000,000
+round(colSums(tpm_matrix)[1:3])
+```
+
+### Saving results
+
+`save_results()` exports three Excel files: all genes, over-expressed only,
+and under-expressed only:
+
+```{r save_results, eval = FALSE}
+save_results(
+ df = deseq2_results,
+ name = "TCGA_LUAD_TumorVsNormal",
+ l2fc = 1,
+ cutoff_alpha = 0.05
+)
+# Creates:
+# TCGA_LUAD_TumorVsNormal_full.xlsx
+# TCGA_LUAD_TumorVsNormal_up_log2FC>1_FDR<0.05.xlsx
+# TCGA_LUAD_TumorVsNormal_down_log2FC<1_FDR<0.05.xlsx
+```
+
+---
+
+## Section 4 : Visualization
+
+### Volcano plot
+
+The volcano plot is the standard way to visualize DEA results, showing the
+relationship between effect size (log2FC) and significance (FDR).
+
+> **Tip:** To display gene symbols instead of Ensembl IDs, run
+> `get_annotations()` and join the `symbol` column to `deseq2_results`
+> with `add_annotations()` before calling `nice_Volcano()`.
+
+```{r nice_Volcano, eval = FALSE}
+nice_Volcano(
+ results = deseq2_results,
+ x_var = "log2FoldChange",
+ y_var = "padj",
+ label_var = "gene_id",
+ title = "TCGA-LUAD: Tumor vs Normal",
+ cutoff_y = 0.05,
+ cutoff_x = 1,
+ x_range = 8,
+ y_max = 10
+)
+```
+
+```{r volcano_plot, echo = FALSE, fig.cap = "Volcano plot of TCGA-LUAD DEA results. Red: upregulated in tumor; blue: downregulated; grey: not significant.", fig.width = 8, fig.height = 7}
+knitr::include_graphics("figures/nice_Volcano.png")
+```
+
+### Detectable genes
+
+`detect_filter()` identifies genes with reliable expression levels by applying
+thresholds on baseMean and mean normalized counts per group.
+
+```{r detect_filter, eval = FALSE}
+# detect_filter requires a column named "ensembl"
+res <- deseq2_results
+colnames(res)[colnames(res) == "gene_id"] <- "ensembl"
+rownames(res) <- res$ensembl
+
+samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"]
+samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"]
+
+detected <- detect_filter(
+ norm.counts = as.data.frame(norm_counts),
+ df.BvsA = res,
+ samples.baseline = samples_normal,
+ samples.condition1 = samples_tumor,
+ cutoffs = c(50, 50, 0)
+)
+
+length(detected$DetectGenes)
+```
+
+### Significance stars
+
+`get_stars()` converts FDR values to asterisk notation for annotation in plots:
+
+```{r get_stars}
+data(deseq2_results)
+
+res_stars <- deseq2_results
+colnames(res_stars)[colnames(res_stars) == "gene_id"] <- "ensembl"
+
+# Most significant gene
+get_stars(
+ geneID = res_stars$ensembl[1],
+ object = res_stars
+)
+
+# Least significant gene
+get_stars(
+ geneID = res_stars$ensembl[nrow(res_stars)],
+ object = res_stars
+)
+```
+
+### Gene-level expression: Violin-Scatter-Box plot
+
+`nice_VSB()` shows the distribution of normalized expression for a single gene
+across sample groups. The input is the normalized counts matrix.
+
+```{r nice_VSB, eval = FALSE}
+top_gene <- deseq2_results$gene_id[1]
+
+# Get symbol
+#symbol <- deseq2_results$symbol[1]
+
+nice_VSB(
+ object = norm_counts,
+ annotations = sampledata,
+ variables = c(fill = "sample_type"),
+ genename = top_gene,
+ categories = c("normal", "tumor"),
+ labels = c("Normal", "Tumor"),
+ colors = c("steelblue", "firebrick"),
+ shapes = 21,
+ markersize = 3
+)
+```
+
+> **Note: alternative using a DESeq2 object directly:** If you still have
+> your `DESeqDataSet` object in your session, you can use `nice_VSB_DESeq2()`
+> instead. It extracts the normalized counts internally via
+> `DESeq2::counts(dds, normalized = TRUE)`, so you do not need to provide
+> a separate `norm_counts` matrix:
+>
+> ```r
+> nice_VSB_DESeq2(
+> object = dds,
+> variables = c(fill = "sample_type"),
+> genename = top_gene,
+> symbol = "TP53", # optional: displayed in plot title
+> categories = c("normal", "tumor"),
+> labels = c("Normal", "Tumor"),
+> colors = c("steelblue", "firebrick"),
+> shapes = 21,
+> markersize = 3
+> )
+> ```
+>
+> Use `nice_VSB()` when working from a pre-computed normalized counts matrix,
+> and `nice_VSB_DESeq2()` when the DESeq2 object is still available in your
+> environment.
+
+
+```{r vsb_plot, echo = FALSE, fig.cap = "Violin-Scatter-Box plot of the most significant DEG across tumor and normal samples.", fig.width = 6, fig.height = 6}
+knitr::include_graphics("figures/nice_VSB.png")
+```
+
+
+
+---
+
+## Section 5 : Multi-comparison analysis (optional)
+
+When a study includes three phenotypes (e.g., normal, primary tumor,
+metastasis), `split_cases()` classifies genes into 10 mutually exclusive
+expression patterns based on significance and direction across all three
+pairwise comparisons.
+
+```{r split_cases, eval = FALSE}
+# Requires three DESeq2 result data frames:
+# df.BvsA : condition 1 vs baseline
+# df.CvsA : condition 2 vs baseline
+# df.BvsC : condition 1 vs condition 2
+
+cases <- split_cases(
+ df.BvsA = df_tumor_vs_normal,
+ df.CvsA = df_meta_vs_normal,
+ df.BvsC = df_tumor_vs_meta,
+ unique_id = "ensembl",
+ significance_var = "padj",
+ significance_cutoff = 0.05,
+ change_var = "log2FoldChange",
+ change_cutoff = 1
+)
+
+# Number of genes per case
+sapply(cases, nrow)
+
+# Case 1: ladder genes : progressive up or down across all comparisons
+head(cases$Case1)
+```
+
+---
+
+## Full workflow - summary
+
+```{r full_workflow, eval = FALSE}
+library(OmicsKit)
+
+# 1. Study design
+power_analysis(effect_size = 1, dispersion = 0.1, n_genes = 20000)
+
+# 2. QC: dimensionality reduction (use VST counts)
+data(vst_counts); data(sampledata)
+sampledata$id <- sampledata$patient_id
+
+nice_PCA(vst_counts, sampledata, variables = c(fill = "sample_type"),
+ legend_names = c(fill = "Sample Type"),
+ colors = c("steelblue", "firebrick"), shapes = c(21, 21))
+
+# 3. Annotate and export results
+annotations <- get_annotations(deseq2_results$gene_id)
+norm_counts_annot <- add_annotations(norm_counts, annotations)
+save_results(deseq2_results, name = "TumorVsNormal", l2fc = 1)
+
+# 4. Visualize and filter non-detectable genes
+nice_Volcano(deseq2_results, x_var = "log2FoldChange",
+ y_var = "padj", label_var = "gene_id",
+ title = "Tumor vs Normal")
+
+detected <- detect_filter(as.data.frame(norm_counts), deseq2_results,
+ samples.baseline = samples_normal,
+ samples.condition1 = samples_tumor)
+
+nice_VSB(norm_counts, sampledata,
+ variables = c(fill = "sample_type"),
+ genename = detected$DetectGenes[1],
+ categories = c("normal", "tumor"),
+ labels = c("Normal", "Tumor"),
+ colors = c("steelblue", "firebrick"))
+
+# 5. Multi-comparison (if 3 phenotypes)
+cases <- split_cases(df.BvsA, df.CvsA, df.BvsC)
+```
+
+---
+
+## Session info
+
+```{r session_info}
+sessionInfo()
+```
diff --git a/vignettes/PA_clustering.Rmd b/vignettes/PA_clustering.Rmd
new file mode 100644
index 0000000..cb48f83
--- /dev/null
+++ b/vignettes/PA_clustering.Rmd
@@ -0,0 +1,327 @@
+---
+title: "Pathway Analysis Clustering with OmicsKit"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Pathway Analysis Clustering with OmicsKit}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ fig.width = 7,
+ fig.height = 5,
+ warning = FALSE,
+ message = FALSE
+)
+```
+
+## Overview
+
+After a gene set enrichment analysis (e.g., CAMERA, GSEA, fgsea), a common
+challenge is that hundreds of significant gene sets are returned — many of which
+are redundant because they share large overlapping gene memberships. This
+vignette demonstrates how to use OmicsKit's pathway clustering functions to:
+
+1. **Load gene sets** from GMT files with `list_gmts()`
+2. **Quantify redundancy** between gene sets using Jaccard similarity with
+ `geneset_similarity()`
+3. **Cluster** redundant gene sets hierarchically with `do_clust()`
+4. **Detect communities** in the gene set network and generate interpretable
+ labels with `get_network_communities()`
+5. **Visualize** the network with `network_clust()` (base R) or
+ `network_clust_gg()` (ggplot2)
+
+## Required packages
+
+```{r packages}
+library(OmicsKit)
+
+# Suggested packages used in this vignette
+library(igraph)
+library(ComplexHeatmap)
+```
+
+---
+
+## Step 1 — Load gene sets
+
+In a real analysis, gene sets are stored as `.gmt` files. `list_gmts()` reads
+all `.gmt` files in a directory and returns a single named list.
+
+```{r list_gmts, eval = FALSE}
+geneset_list <- list_gmts("path/to/your/gmt_folder/")
+```
+
+For this vignette we use the built-in example data, which contains 40 curated
+gene sets spanning four biological themes: apoptosis, cell cycle, immune
+response, and metabolism.
+
+```{r load_data}
+data(geneset_list)
+data(camera_results)
+
+# How many gene sets?
+length(geneset_list)
+
+# What does the enrichment results table look like?
+head(camera_results)
+
+# How many are significant at FDR < 0.05?
+sum(camera_results$FDR < 0.05)
+```
+
+---
+
+## Step 2 — Jaccard similarity matrix
+
+`geneset_similarity()` filters the gene sets by FDR threshold and computes all
+pairwise Jaccard similarity coefficients:
+
+$$J(A, B) = \frac{|A \cap B|}{|A \cup B|}$$
+
+A value of 1 means the two gene sets share identical gene membership; 0 means
+no overlap at all.
+
+```{r geneset_similarity}
+jac <- geneset_similarity(
+ geneset_list = geneset_list,
+ results = camera_results,
+ fdr_th = 0.05
+)
+
+# The JaccardResult object contains three slots
+names(jac)
+
+# Dimensions of the similarity matrix
+dim(jac$jaccard_sim)
+```
+
+The `$dist_mat` slot (1 − Jaccard similarity) can be reused independently —
+for example as input to `nice_UMAP()` or any other distance-based method.
+
+```{r jaccard_preview}
+# Preview the top-left corner of the similarity matrix
+jac$jaccard_sim[1:5, 1:5]
+```
+
+---
+
+## Step 3 — Hierarchical clustering
+
+`do_clust()` performs Ward-D2 hierarchical clustering on the distance matrix
+and automatically selects the optimal number of clusters *k* by maximizing the
+average silhouette width.
+
+```{r do_clust}
+clust <- do_clust(jac)
+
+# Optimal k selected automatically
+clust$optimal_k
+
+# Cluster assignments (one row per gene set)
+head(clust$cluster_assignments)
+```
+
+### Silhouette curve
+
+The red dot marks the selected *k*:
+
+```{r silhouette_plot, fig.cap = "Average silhouette width vs. number of clusters. The optimal k is highlighted in red."}
+clust$silhouette_plot
+```
+
+### Jaccard heatmap with dendrogram
+
+Gene sets in the same hierarchical cluster appear as blocks of high similarity
+(darker blue) along the diagonal:
+
+```{r heatmap, fig.width = 6, fig.height = 6, fig.cap = "Jaccard similarity heatmap with Ward-D2 dendrogram."}
+ComplexHeatmap::draw(clust$heatmap)
+```
+
+---
+
+## Step 4 — Community detection and super-terms
+
+Hierarchical clustering groups gene sets by overall similarity, but the network
+community detection captures a different structure: densely connected
+sub-networks within the Jaccard graph.
+
+`get_network_communities()` does three things in a single call:
+
+- Builds a binary adjacency matrix (edges where Jaccard > `threshold`)
+- Runs the Louvain algorithm to detect communities
+- Calls `get_superterm()` internally to generate TF-IDF labels for each
+ community
+
+```{r get_network_communities}
+net <- get_network_communities(
+ x = jac,
+ threshold = 0.3,
+ method = "louvain",
+ superterms = TRUE,
+ n_terms = 3,
+ remove_prefix = TRUE,
+ seed = 1905
+)
+
+# How many communities were detected?
+length(unique(net$membership))
+
+# Community summary: label + size
+net$superterms$summary
+```
+
+The `$superterms$mapping` tibble links every gene set to its community and
+its super-term label:
+
+```{r superterm_mapping}
+head(net$superterms$mapping)
+```
+
+---
+
+## Step 5 — Network visualization
+
+### Option A: base R igraph (`network_clust`)
+
+`network_clust()` draws plots directly to the active graphics device and
+returns node attributes invisibly. It is faster and requires only `igraph`.
+
+```{r network_clust_clean, fig.width = 7, fig.height = 7, fig.cap = "Gene set network — clean view. Node color reflects hierarchical cluster."}
+result <- network_clust(
+ x = jac,
+ clust_result = clust,
+ jaccard_threshold = 0.10,
+ min_degree = 1,
+ superterms = TRUE,
+ superterm_data = net$superterms,
+ type = "clean",
+ seed = 1905
+)
+```
+
+```{r network_clust_superterms, fig.width = 7, fig.height = 7, fig.cap = "Gene set network with community super-term labels."}
+network_clust(
+ x = jac,
+ clust_result = clust,
+ jaccard_threshold = 0.10,
+ min_degree = 1,
+ superterms = TRUE,
+ superterm_data = net$superterms,
+ type = "superterms",
+ seed = 1905
+)
+```
+
+### Option B: ggraph / ggplot2 (`network_clust_gg`)
+
+`network_clust_gg()` returns a named list of ggplot2 objects that can be
+further customized, saved with `ggsave()`, or composed with `patchwork`.
+Requires `ggraph` and `tidygraph`.
+
+```{r network_clust_gg, fig.width = 7, fig.height = 7, fig.cap = "Gene set network (ggraph) with super-term labels."}
+plots <- network_clust_gg(
+ x = jac,
+ clust_result = clust,
+ jaccard_threshold = 0.10,
+ min_degree = 1,
+ superterms = TRUE,
+ superterm_data = net$superterms,
+ type = "all",
+ seed = 1905
+)
+
+# Display the combined view (super-terms + individual node labels)
+plots$superterms
+```
+
+```{r patchwork, fig.width = 12, fig.height = 6, fig.cap = "Side-by-side comparison: clean view (left) vs. super-term labels (right)."}
+library(patchwork)
+plots$clean + plots$superterms
+```
+
+### Saving plots
+
+```{r save, eval = FALSE}
+# Save with ggsave
+ggplot2::ggsave(
+ filename = "network_combined.pdf",
+ plot = plots$combined,
+ width = 14,
+ height = 14
+)
+
+# Save base R igraph plot to PDF
+pdf("network_superterms_igraph.pdf", width = 14, height = 14)
+network_clust(jac, clust,
+ superterms = TRUE,
+ superterm_data = net$superterms,
+ type = "superterms")
+dev.off()
+```
+
+---
+
+## Step 6 — Downstream tables
+
+Both network functions return node-level attributes for downstream analysis:
+
+```{r node_attributes}
+# From the igraph version (invisible return)
+head(result$node_attributes)
+```
+
+```{r superterm_report}
+# Superterm report: community membership breakdown
+result$superterm_report
+```
+
+---
+
+## Full workflow — summary
+
+```{r full_workflow, eval = FALSE}
+library(OmicsKit)
+
+# 1. Load gene sets
+gsl <- list_gmts("path/to/gmt_folder/")
+
+# 2. Jaccard similarity (filter by FDR < 0.05)
+jac <- geneset_similarity(gsl, camera_results, fdr_th = 0.05)
+
+# 3. Hierarchical clustering
+clust <- do_clust(jac)
+clust$silhouette_plot
+ComplexHeatmap::draw(clust$heatmap)
+
+# 4. Community detection + super-terms
+net <- get_network_communities(jac, threshold = 0.3)
+net$superterms$summary
+
+# 5a. Network plot — base R (draws to device)
+network_clust(jac, clust,
+ superterms = TRUE,
+ superterm_data = net$superterms)
+
+# 5b. Network plot — ggplot2 (returns objects)
+plots <- network_clust_gg(jac, clust,
+ superterms = TRUE,
+ superterm_data = net$superterms)
+plots$combined
+ggplot2::ggsave("network.pdf", plots$combined, width = 14, height = 14)
+
+# 6. Reuse the distance matrix for UMAP
+nice_UMAP(as.matrix(jac$dist_mat))
+```
+
+---
+
+## Session info
+
+```{r session_info}
+sessionInfo()
+```
diff --git a/vignettes/PA_workflow.Rmd b/vignettes/PA_workflow.Rmd
new file mode 100644
index 0000000..97a42c9
--- /dev/null
+++ b/vignettes/PA_workflow.Rmd
@@ -0,0 +1,381 @@
+---
+title: "Pathway Analysis Workflow with OmicsKit"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Pathway Analysis Workflow with OmicsKit}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ fig.align = "center",
+ warning = FALSE,
+ message = FALSE
+)
+```
+
+## Overview
+This vignette demonstrates a complete gene set enrichment analysis (GSEA)
+downstream workflow using OmicsKit, from loading gene sets through
+publication-quality visualization.
+
+The workflow covers:
+
+1. **Loading gene sets** : reading GMT files with `list_gmts()`
+2. **Merging results** : combining multi-collection GSEA TSV files with
+ `merge_PA()`
+3. **Gene extraction** : retrieving leading edge and top-ranked genes with
+ `getgenesPA()` and annotating results with `addgenesPA()`
+4. **Heatmaps** : visualizing gene expression per pathway with `heatmap_PA()`
+ (file-path alternative: `heatmap_path_PA()`)
+5. **Single-comparison plot** : `splot_PA()` : a publication-quality
+ multi-panel barplot for one comparison
+6. **Multi-comparison plot** : `multiplot_PA()` : faceted barplots comparing
+ enrichment across conditions
+
+## Required packages
+```{r packages}
+library(OmicsKit)
+library(ggplot2)
+```
+
+---
+
+## Section 1 : Load gene sets
+### `list_gmts()`
+Gene sets used for enrichment analysis are stored as `.gmt` files, the
+standard format used by MSigDB. `list_gmts()` reads all `.gmt` files from a
+directory and returns a single named list, ready for downstream use.
+
+```{r list_gmts, eval = FALSE}
+geneset_list <- list_gmts("path/to/your/gmt_folder/")
+
+# Number of gene sets loaded
+length(geneset_list)
+# Names of the first five gene sets
+names(geneset_list)[1:5]
+# Genes in one specific set
+geneset_list[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]]
+```
+
+In this vignette, we use a built-in example `geneset_list`, which contains
+40 curated gene sets across four biological themes (apoptosis, cell cycle,
+immune response, and metabolism), and the built-in `gsea_results` dataset,
+which mimics the output of `merge_PA()` for three comparisons across HALLMARK,
+KEGG, and GO collections.
+```{r load_data}
+data(geneset_list)
+data(gsea_results)
+data(deseq2_results)
+data(vst_counts)
+data(sampledata)
+# Ranked gene list: DESeq2 Wald stat from most positive to most negative
+ranked_genes <- deseq2_results$gene_id[
+ order(deseq2_results$stat, decreasing = TRUE)
+]
+# Overview of the enrichment results
+dim(gsea_results)
+unique(gsea_results$COMPARISON)
+unique(gsea_results$COLLECTION)
+```
+
+---
+
+## Section 2 : Merge GSEA results
+### `merge_PA()`
+In a real analysis, GSEA produces one results file per MSigDB collection
+(e.g., `H.tsv` for HALLMARK, `C2.tsv` for KEGG). `merge_PA()` reads all
+`.tsv` files from a directory, standardizes column names, parses the leading
+edge string into numeric components (`tags`, `list`, `signal`), computes
+`-log10(FDR)`, and returns a single combined data frame.
+```{r merge_PA, eval = FALSE}
+gsea_data <- merge_PA(
+ input_directory = "path/to/gsea_results/",
+ fdr_replace = 0.001 # replace FDR = 0 (below permutation resolution)
+)
+# Inspect result
+head(gsea_data[, c("NAME", "NES", "FDR", "COLLECTION", "COMPARISON", "tags")])
+```
+
+> **Note:** Each `.tsv` file must contain a `Comparison` column identifying
+> the comparison name (e.g., `"TumorVsNormal"`). Then `merge_PA()` renames it to
+> `COMPARISON`. If your files come from a single run without that column,
+> add it manually:
+> ```r
+> your_data$Comparison <- "TumorVsNormal"
+> ```
+
+The built-in `gsea_results` already has this structure:
+```{r inspect_data}
+# Key columns produced by merge_PA()
+head(gsea_results[, c("NES", "FDR", "Log10FDR",
+ "COLLECTION", "COMPARISON", "tags", "SIZE")], n = 2)
+```
+
+---
+
+## Section 3 : Extract leading edge genes
+### `getgenesPA()`
+### `addgenesPA()`
+After obtaining pathway results, `getgenesPA()` retrieves the gene members
+relevant to each enrichment signal. Three extraction modes are available:
+- **`"le"`** (GSEA only): leading edge genes : the subset that drives the
+ enrichment score.
+- **`"top"`**: top-ranked *n* % of genes by rank metric : applicable to any
+ enrichment method (GSEA, CAMERA, PADOG, etc.).
+- **`"all"`**: all members of the gene set, ordered by rank.
+`addgenesPA()` then appends the gene lists as comma-separated columns
+(`le_genes`, `top_genes`, `all_genes`) to the pathway results table.
+
+```{r getgenesPA_addgenesPA, eval = FALSE}
+# Filter to one comparison
+pa_single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ]
+# Optional: define the top fraction for mode "top"
+pa_single$top <- 0.30 # top 30% of gene set members by rank
+# Extract genes using all three modes
+gene_lists <- getgenesPA(
+ pa_data = pa_single,
+ geneset_list = geneset_list,
+ ranked_genes = ranked_genes,
+ genes = c("all", "le", "top")
+)
+# Inspect results for one pathway
+ ## leading edge
+ gene_lists$le[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]]
+ ## top 30% by rank
+ gene_lists$top[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]]
+ ## all members
+ gene_lists$all[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]]
+# Append gene columns to the pathway table
+pa_annot <- addgenesPA(pa_single, gene_lists)
+# Number of gene sets annotated
+nrow(pa_annot)
+
+head(pa_annot[, c("NAME", "all_genes", "le_genes", "top_genes")])
+```
+
+> **Tip:** For CAMERA or other enrichment methods that do not produce leading
+> edge information, use `genes = c("all", "top")` and set `pa_data$top` to
+> your desired fraction (e.g., `0.25` for the top 25 % by rank). Do not
+> request `genes = "le"` without a `tags` column.
+
+---
+
+## Section 4 : Heatmaps
+### `heatmap_PA()`
+`heatmap_PA()` generates one heatmap per gene set based on `pa_data_annot`.
+Genes are ordered within each heatmap by their position in `ranked_genes`. Output
+files are organized into subdirectories by format (PDF / JPG) and gene
+selection mode (`all_genes`, `le_genes`, `top_genes`).
+
+```{r heatmap_PA, eval = FALSE}
+heatmap_PA(
+ expression_data = vst_counts,
+ metadata = sampledata,
+ pa_data_annot = pa_annot,
+ ranked_genes = ranked_genes,
+ plot_genes = c("all_genes", "le_genes", "top_genes"),
+ sample_col = "patient_id",
+ group_col = "sample_type",
+ out_dir = "heatmaps_PA",
+ pdf = TRUE,
+ jpg = TRUE
+)
+# Creates, for example:
+# heatmaps_PA/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg
+# heatmaps_PA/pdf/le_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.pdf
+# ... (one file per gene set per format per mode)
+```
+
+The heatmap below shows the top-ranked genes for
+`HALLMARK_INTERFERON_GAMMA_RESPONSE` across tumor and normal TCGA-LUAD
+samples:
+
+```{r heatmap_plot, echo = FALSE, fig.cap = "Expression heatmap of top-ranked genes in HALLMARK_INTERFERON_GAMMA_RESPONSE across TCGA-LUAD tumor and normal samples. Genes are ordered by DESeq2 Wald statistic.", fig.width = 7, fig.height = 6}
+
+knitr::include_graphics(
+ "figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg"
+)
+```
+
+### `heatmap_path_PA()` : file-path alternative
+`heatmap_path_PA()` is a convenience wrapper that reads all inputs from disk
+(expression TSV, metadata XLSX, GMT file, ranked-genes TSV, GSEA TSV) and
+calls the same heatmap engine internally. It is useful when running the
+analysis immediately after GSEA without loading data into R.
+
+```{r heatmap_path_PA, eval = FALSE}
+
+heatmap_path_PA(
+ main_dir = "path/to/analysis/",
+ expression_file = "vst_expression.tsv",
+ metadata_file = "metadata.xlsx",
+ gmt_file = "h.all.v2023.symbols.gmt",
+ ranked_genes_file = "ranked_genes.tsv",
+ gsea_file = "H.tsv",
+ output_dir = "leading_edge_heatmaps",
+ sample_col = "patient_id",
+ group_col = "sample_type",
+ save_dataframe = TRUE # also saves intermediate gene table as .tsv
+)
+# Produces the same output as heatmap_PA() for a single GSEA file
+```
+
+> **Which function to use?** Prefer `heatmap_PA()` when your data are already
+> in R (e.g., after calling `merge_PA()`, `getgenesPA()`, and `addgenesPA()`).
+> Use `heatmap_path_PA()` when you want a quick one-call solution that reads
+> directly from files on disk. Both functions produce identical heatmaps.
+
+---
+
+## Section 5 : General plot | Single-comparison
+`splot_PA()` generates a publication-quality multi-panel enrichment plot for
+**one comparison**. Gene sets appear on the y-axis (grouped by MSigDB
+collection), NES on the x-axis, and −log10(FDR) as fill color. Six panels
+are assembled side-by-side using `patchwork`.
+```{r splot_PA, eval = FALSE}
+
+single <- gsea_results[gsea_results$COMPARISON == "TumorVsNormal", ]
+splot_PA(
+ data = single,
+ geneset_col = "NAME",
+ collection_col = "COLLECTION",
+ nes_col = "NES",
+ fdr_col = "FDR",
+ order = "desc",
+ fill_limits = c(0, 2), # cap at -log10(FDR) = 5
+ fill_palette = c("white", "red")
+)
+```
+
+```{r splot_plot, echo = FALSE, fig.cap = "Single-comparison pathway enrichment plot (TumorVsNormal). Gene sets are sorted by NES; fill color encodes -log10(FDR), capped at 2. Collections are annotated to the right.", fig.width = 6, fig.height = 5}
+knitr::include_graphics("figures_PA/splot_PA.jpg")
+```
+> **Tip:** Use `fill_limits = c(0, 2)` to prevent a handful of extremely
+> significant gene sets from washing out the color contrast for the rest. Any
+> pathway with FDR ≤ 0.01 will be shown in the maximum color (red).
+
+---
+
+## Section 6 : General plot | Multi-comparison
+`multiplot_PA()` generates a **faceted barplot** showing NES across multiple
+comparisons for a selected set of gene sets. Each facet represents one gene
+set; bars represent the NES per comparison, colored by −log10(FDR). This
+layout makes it straightforward to assess how enrichment changes across
+conditions.
+```{r multiplot_PA, eval = FALSE}
+
+# Subset to pathways of interest across all comparisons
+pathways_of_interest <- c(
+ "HALLMARK_INTERFERON_GAMMA_RESPONSE",
+ "HALLMARK_INFLAMMATORY_RESPONSE",
+ "HALLMARK_G2M_CHECKPOINT",
+ "HALLMARK_E2F_TARGETS",
+ "HALLMARK_GLYCOLYSIS",
+ "HALLMARK_OXIDATIVE_PHOSPHORYLATION"
+)
+multi_data <- gsea_results[gsea_results$NAME %in% pathways_of_interest, ]
+multiplot_PA(
+ data = multi_data,
+ comparison_col = "COMPARISON",
+ facet_col = "NAME",
+ axis_y = "NES",
+ fdr_col = "FDR",
+ comparison_order = c("TumorVsNormal",
+ "MetastasisVsNormal",
+ "MetastasisVsTumor"),
+ custom_labels = c(
+ TumorVsNormal = "Tumor",
+ MetastasisVsNormal = "Meta",
+ MetastasisVsTumor = "Meta/Tumor"
+ ),
+ ncol_wrap = 3,
+ free_y = TRUE,
+ fill_limits = c(0, 5),
+ fill_palette = c("white", "red")
+)
+```
+
+```{r multiplot_plot, echo = FALSE, fig.cap = "Multi-comparison pathway enrichment plot. Each facet shows NES for one HALLMARK gene set across three pairwise comparisons. Fill color encodes -log10(FDR), capped at 5.", fig.width = 6, fig.height = 5}
+knitr::include_graphics("figures_PA/multiplot_PA.jpg")
+```
+
+> **Tip:** Use `comparison_order` to control the left-to-right arrangement of
+> comparisons on the x-axis of each facet, and `custom_labels` to replace
+> long comparison names with shorter axis labels.
+
+## Full workflow : summary
+```{r full_workflow, eval = FALSE}
+library(OmicsKit)
+
+# 1. Load gene sets
+gsl <- list_gmts("path/to/gmt_folder/")
+
+# 2. Merge GSEA output TSV files
+gsea_data <- merge_PA(
+ input_directory = "path/to/gsea_results/",
+ fdr_replace = 0.001
+)
+# 3. Build ranked gene list (from DESeq2 stat or log2FC)
+ranked <- deseq2_results$gene_id[
+ order(deseq2_results$stat, decreasing = TRUE)
+]
+# 4. Extract leading edge and top-ranked genes
+pa_single <- gsea_data[gsea_data$COMPARISON == "TumorVsNormal", ]
+pa_single$top <- 0.30
+gene_lists <- getgenesPA(pa_single, gsl, ranked, genes = c("all", "le", "top"))
+pa_annot <- addgenesPA(pa_single, gene_lists)
+# 5. Heatmaps
+heatmap_PA(
+ expression_data = vst_counts,
+ metadata = sampledata,
+ pa_data_annot = pa_annot,
+ ranked_genes = ranked,
+ plot_genes = c("all_genes", "le_genes", "top_genes"),
+ sample_col = "patient_id",
+ group_col = "sample_type",
+ out_dir = "heatmaps_PA"
+)
+# Alternative: file-path version (reads directly from disk)
+heatmap_path_PA(
+ main_dir = "path/to/analysis/",
+ expression_file = "vst_expression.tsv",
+ metadata_file = "metadata.xlsx",
+ gmt_file = "h.all.v2023.symbols.gmt",
+ ranked_genes_file = "ranked_genes.tsv",
+ gsea_file = "H.tsv",
+ output_dir = "leading_edge_heatmaps"
+)
+# 6. Single-comparison enrichment plot
+splot_PA(
+ data = pa_single,
+ geneset_col = "NAME",
+ collection_col = "COLLECTION",
+ nes_col = "NES",
+ fdr_col = "FDR",
+ fill_limits = c(0, 5)
+)
+# 7. Multi-comparison enrichment plot
+pathways_oi <- c(
+ "HALLMARK_INTERFERON_GAMMA_RESPONSE",
+ "HALLMARK_INFLAMMATORY_RESPONSE",
+ "HALLMARK_G2M_CHECKPOINT"
+)
+multiplot_PA(
+ data = gsea_data[gsea_data$NAME %in% pathways_oi, ],
+ comparison_col = "COMPARISON",
+ facet_col = "NAME",
+ fdr_col = "FDR",
+ comparison_order = c("TumorVsNormal", "MetastasisVsNormal", "MetastasisVsTumor"),
+ ncol_wrap = 3
+)
+```
+
+## Session info
+```{r session_info}
+sessionInfo()
+```
diff --git a/vignettes/figures/nice_PCA.png b/vignettes/figures/nice_PCA.png
new file mode 100644
index 0000000..348d6da
Binary files /dev/null and b/vignettes/figures/nice_PCA.png differ
diff --git a/vignettes/figures/nice_UMAP.png b/vignettes/figures/nice_UMAP.png
new file mode 100644
index 0000000..a6aa562
Binary files /dev/null and b/vignettes/figures/nice_UMAP.png differ
diff --git a/vignettes/figures/nice_VSB.png b/vignettes/figures/nice_VSB.png
new file mode 100644
index 0000000..caf0369
Binary files /dev/null and b/vignettes/figures/nice_VSB.png differ
diff --git a/vignettes/figures/nice_Volcano.png b/vignettes/figures/nice_Volcano.png
new file mode 100644
index 0000000..a84432b
Binary files /dev/null and b/vignettes/figures/nice_Volcano.png differ
diff --git a/vignettes/figures/nice_tSNE.png b/vignettes/figures/nice_tSNE.png
new file mode 100644
index 0000000..2736fe6
Binary files /dev/null and b/vignettes/figures/nice_tSNE.png differ
diff --git a/vignettes/figures/power_analysis.png b/vignettes/figures/power_analysis.png
new file mode 100644
index 0000000..5d1a639
Binary files /dev/null and b/vignettes/figures/power_analysis.png differ
diff --git a/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg b/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg
new file mode 100644
index 0000000..12a390a
Binary files /dev/null and b/vignettes/figures_PA/heatmaps/jpg/top_genes/HALLMARK_INTERFERON_GAMMA_RESPONSE_heatmap.jpg differ
diff --git a/vignettes/figures_PA/multiplot_PA.jpg b/vignettes/figures_PA/multiplot_PA.jpg
new file mode 100644
index 0000000..1b0cc70
Binary files /dev/null and b/vignettes/figures_PA/multiplot_PA.jpg differ
diff --git a/vignettes/figures_PA/splot_PA.jpg b/vignettes/figures_PA/splot_PA.jpg
new file mode 100644
index 0000000..ef4b363
Binary files /dev/null and b/vignettes/figures_PA/splot_PA.jpg differ