diff --git a/DESCRIPTION b/DESCRIPTION index e96e9be..46739cd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,9 +18,12 @@ License: CC BY-NC-SA 4.0 Encoding: UTF-8 Imports: dplyr, + broom, ggplot2, + ggpubr, tsne, cowplot, + ggraph, graphics, magrittr, rlang, @@ -35,9 +38,11 @@ Imports: GenomeInfoDb, GenomicRanges, stats, + survival, S4Vectors, grDevices, - tools + tools, + pROC Suggests: biomaRt, circlize, @@ -48,7 +53,6 @@ Suggests: rmarkdown, DESeq2, ggnewscale, - ggraph, ggrepel, igraph, matrixStats, @@ -56,7 +60,6 @@ Suggests: openxlsx, pheatmap, readr, - survival, survminer, tidygraph, tidyr, diff --git a/NAMESPACE b/NAMESPACE index 7eb0e5e..177cfc9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,12 +1,17 @@ # Generated by roxygen2: do not edit by hand S3method(print,circos_genome) +S3method(print,get_glm_result) export(add_annotations) export(addgenesPA) +export(concordanceDE) +export(crossLayerCorr) export(detect_filter) export(do_clust) export(geneset_similarity) export(get_annotations) +export(get_cox) +export(get_glm) export(get_network_communities) export(get_stars) export(get_superterm) @@ -21,14 +26,17 @@ export(merge_PA) export(multiplot_PA) export(network_clust) export(network_clust_gg) +export(nice_ConcordanceScatter) export(nice_GenomeTrack) export(nice_KM) export(nice_PCA) +export(nice_ROC) export(nice_UMAP) export(nice_VSB) export(nice_VSB_DEseq2) export(nice_Volcano) export(nice_circos) +export(nice_forest) export(nice_tSNE) export(power_analysis) export(resolve_genome) @@ -36,17 +44,58 @@ export(save_results) export(split_cases) export(splot_PA) export(tpm) +export(trend_filter) import(ggplot2) importFrom(S4Vectors,"mcols<-") importFrom(S4Vectors,mcols) +importFrom(broom,tidy) +importFrom(dplyr,bind_rows) +importFrom(dplyr,case_when) +importFrom(dplyr,filter) +importFrom(dplyr,mutate) +importFrom(dplyr,rename) +importFrom(dplyr,select) +importFrom(ggplot2,aes) +importFrom(ggplot2,annotate) +importFrom(ggplot2,element_blank) +importFrom(ggplot2,element_rect) +importFrom(ggplot2,element_text) +importFrom(ggplot2,geom_abline) +importFrom(ggplot2,geom_errorbar) +importFrom(ggplot2,geom_line) +importFrom(ggplot2,geom_point) +importFrom(ggplot2,geom_vline) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,labs) +importFrom(ggplot2,scale_color_manual) +importFrom(ggplot2,scale_x_log10) +importFrom(ggplot2,theme) +importFrom(ggplot2,theme_classic) +importFrom(ggplot2,theme_minimal) importFrom(grDevices,adjustcolor) importFrom(grDevices,dev.off) importFrom(grDevices,pdf) importFrom(grid,gpar) importFrom(magrittr,"%>%") +importFrom(pROC,auc) +importFrom(pROC,ci.auc) +importFrom(pROC,roc) +importFrom(pROC,roc.test) +importFrom(pROC,smooth) importFrom(patchwork,plot_layout) importFrom(rlang,.data) +importFrom(stats,AIC) +importFrom(stats,as.formula) +importFrom(stats,complete.cases) +importFrom(stats,glm) importFrom(stats,na.omit) +importFrom(stats,nobs) +importFrom(stats,p.adjust) +importFrom(stats,predict) +importFrom(stats,relevel) importFrom(stats,setNames) +importFrom(survival,Surv) +importFrom(survival,coxph) +importFrom(tibble,tibble) importFrom(tools,file_ext) importFrom(utils,modifyList) diff --git a/R/concordanceDE.R b/R/concordanceDE.R new file mode 100644 index 0000000..b9fef8c --- /dev/null +++ b/R/concordanceDE.R @@ -0,0 +1,183 @@ +############################## +# Function concordanceDE # +############################## + +#' Classify differential results by cross-layer concordance. +#' +#' Compares two differential-analysis tables from paired omics layers, such as +#' RNA-seq and total-protein RPPA, by merging shared genes and classifying each +#' gene according to statistical significance and log-fold-change direction. +#' +#' Genes are assigned to one of five categories: `concordant`, `discordant`, +#' `only_x`, `only_y`, or `not_significant`. A concordance score is also +#' computed as the fraction of genes significant in both layers that have the +#' same effect direction. +#' +#' @param de_x Data frame for layer X. Must contain columns defined by +#' `gene_col`, `logfc_col`, and `padj_col`. +#' @param de_y Data frame for layer Y. Must contain columns defined by +#' `gene_col`, `logfc_col`, and `padj_col`. +#' @param gene_col Column name containing gene identifiers. Default: `"gene"`. +#' @param logfc_col Column name containing log-fold changes. Default: `"logFC"`. +#' @param padj_col Column name containing adjusted p-values/FDR values. +#' Default: `"padj"`. +#' @param padj_threshold Numeric threshold for adjusted p-values. Use a single +#' value for both layers or a length-two vector for layer X and layer Y, +#' respectively. Default: `0.05`. +#' @param logfc_threshold Numeric absolute log-fold-change threshold. Use a +#' single value for both layers or a length-two vector for layer X and layer Y, +#' respectively. Default: `1`. +#' +#' @return A list with: +#' \itemize{ +#' \item `table`: merged data frame with one `category` column. +#' \item `concordance_score`: fraction of genes significant in both layers +#' with matching log-fold-change direction. +#' \item `summary`: table with counts per concordance category. +#' \item `n_shared_genes`: number of genes shared by both layers. +#' \item `n_both_significant`: number of genes significant in both layers. +#' } +#' +#' @examples +#' de_rna <- data.frame( +#' gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), +#' logFC = c(3.2, 2.1, -1.6, -1.4, 1.8), +#' padj = c(1e-6, 0.002, 0.01, 0.03, 0.04) +#' ) +#' +#' de_protein <- data.frame( +#' gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), +#' logFC = c(0.7, 0.4, -0.5, 0.3, 0.05), +#' padj = c(1e-4, 0.01, 0.02, 0.04, 0.40) +#' ) +#' +#' res <- concordanceDE( +#' de_x = de_rna, +#' de_y = de_protein, +#' logfc_threshold = c(1, 0.2) +#' ) +#' res$summary +#' res$concordance_score +#' +#' @importFrom dplyr case_when +#' +#' @seealso +#' [nice_ConcordanceScatter()] to visualize the output of `concordanceDE()`; +#' [crossLayerCorr()] to estimate sample-level concordance between two omics +#' layers. +#' +#' @export +concordanceDE <- function( + de_x, + de_y, + gene_col = "gene", + logfc_col = "logFC", + padj_col = "padj", + padj_threshold = 0.05, + logfc_threshold = 1 +) { + de_x <- as.data.frame(de_x) + de_y <- as.data.frame(de_y) + + required_cols <- c(gene_col, logfc_col, padj_col) + missing_x <- setdiff(required_cols, names(de_x)) + missing_y <- setdiff(required_cols, names(de_y)) + + if (length(missing_x) > 0) { + stop( + "`de_x` is missing required columns: ", + paste(missing_x, collapse = ", "), + call. = FALSE + ) + } + + if (length(missing_y) > 0) { + stop( + "`de_y` is missing required columns: ", + paste(missing_y, collapse = ", "), + call. = FALSE + ) + } + + de_x <- de_x[!is.na(de_x[[gene_col]]) & nzchar(trimws(de_x[[gene_col]])), , drop = FALSE] + de_y <- de_y[!is.na(de_y[[gene_col]]) & nzchar(trimws(de_y[[gene_col]])), , drop = FALSE] + + if (anyDuplicated(de_x[[gene_col]]) > 0) { + stop("`de_x` must contain one row per gene in `gene_col`.", call. = FALSE) + } + + if (anyDuplicated(de_y[[gene_col]]) > 0) { + stop("`de_y` must contain one row per gene in `gene_col`.", call. = FALSE) + } + + if (!is.numeric(padj_threshold) || length(padj_threshold) > 2) { + stop("`padj_threshold` must be a numeric vector of length 1 or 2.", call. = FALSE) + } + + if (!is.numeric(logfc_threshold) || length(logfc_threshold) > 2) { + stop("`logfc_threshold` must be a numeric vector of length 1 or 2.", call. = FALSE) + } + + padj_threshold <- rep(padj_threshold, length.out = 2) + logfc_threshold <- rep(logfc_threshold, length.out = 2) + + shared <- merge(de_x, de_y, by = gene_col, suffixes = c("_x", "_y")) + + if (nrow(shared) == 0) { + stop("No shared genes were found between `de_x` and `de_y`.", call. = FALSE) + } + + lfc_x_col <- paste0(logfc_col, "_x") + lfc_y_col <- paste0(logfc_col, "_y") + padj_x_col <- paste0(padj_col, "_x") + padj_y_col <- paste0(padj_col, "_y") + + lfc_x <- suppressWarnings(as.numeric(shared[[lfc_x_col]])) + lfc_y <- suppressWarnings(as.numeric(shared[[lfc_y_col]])) + padj_x <- suppressWarnings(as.numeric(shared[[padj_x_col]])) + padj_y <- suppressWarnings(as.numeric(shared[[padj_y_col]])) + + sig_x <- !is.na(lfc_x) & !is.na(padj_x) & + abs(lfc_x) >= logfc_threshold[1] & padj_x < padj_threshold[1] + sig_y <- !is.na(lfc_y) & !is.na(padj_y) & + abs(lfc_y) >= logfc_threshold[2] & padj_y < padj_threshold[2] + + dir_x <- sign(lfc_x) + dir_y <- sign(lfc_y) + + shared$category <- dplyr::case_when( + sig_x & sig_y & dir_x == dir_y ~ "concordant", + sig_x & sig_y & dir_x != dir_y ~ "discordant", + sig_x & !sig_y ~ "only_x", + !sig_x & sig_y ~ "only_y", + TRUE ~ "not_significant" + ) + + category_levels <- c( + "concordant", + "discordant", + "only_x", + "only_y", + "not_significant" + ) + + shared$category <- factor(shared$category, levels = category_levels) + + both_sig <- sig_x & sig_y + concordance_score <- if (any(both_sig)) { + mean(dir_x[both_sig] == dir_y[both_sig], na.rm = TRUE) + } else { + NA_real_ + } + + out <- list( + table = shared, + concordance_score = round(concordance_score, 4), + summary = table(shared$category), + n_shared_genes = nrow(shared), + n_both_significant = sum(both_sig, na.rm = TRUE) + ) + + class(out) <- c("omicskit_concordanceDE", "list") + out +} diff --git a/R/crossLayerCorr.R b/R/crossLayerCorr.R new file mode 100644 index 0000000..c82e89d --- /dev/null +++ b/R/crossLayerCorr.R @@ -0,0 +1,186 @@ +############################## +# Function crossLayerCorr # +############################## + +#' Correlate paired sample profiles between two omics layers. +#' +#' Computes one correlation per shared sample between two normalized matrices. +#' Matrices are first matched by shared sample names and shared feature names, +#' making the function suitable for RNA-vs-protein comparisons after mapping +#' protein features to gene symbols. +#' +#' By default, the function uses the top variable shared features to reduce noise +#' and returns both a sorted correlation table and an optional bar plot. +#' +#' @param mat_x Numeric matrix for layer X with features as rows and samples as +#' columns. Row names and column names are required. +#' @param mat_y Numeric matrix for layer Y with features as rows and samples as +#' columns. Row names and column names are required. +#' @param method Correlation method. One of `"spearman"` or `"pearson"`. +#' Default: `"spearman"`. +#' @param top_n Number of most variable shared features to use. If `NULL`, all +#' shared features are used. Default: `500`. +#' @param plot Logical; if `TRUE`, returns a `ggplot2` bar plot in the `plot` +#' element. Default: `TRUE`. +#' +#' @return A list with: +#' \itemize{ +#' \item `correlations`: data frame with `sample` and `correlation` columns. +#' \item `median_r`: median sample-level correlation. +#' \item `n_shared_samples`: number of shared samples used. +#' \item `n_shared_features`: number of shared features before top-variable +#' filtering. +#' \item `n_features_used`: number of features used for correlation. +#' \item `plot`: `ggplot2` object or `NULL`. +#' } +#' +#' @examples +#' set.seed(1) +#' mat_rna <- matrix(rnorm(60), nrow = 10) +#' mat_protein <- mat_rna + matrix(rnorm(60, sd = 0.4), nrow = 10) +#' rownames(mat_rna) <- rownames(mat_protein) <- paste0("GENE", seq_len(10)) +#' colnames(mat_rna) <- colnames(mat_protein) <- paste0("S", seq_len(6)) +#' +#' res <- crossLayerCorr(mat_rna, mat_protein, top_n = 8, plot = FALSE) +#' head(res$correlations) +#' res$median_r +#' +#' @import ggplot2 +#' @importFrom rlang .data +#' +#' @seealso +#' [concordanceDE()] to classify genes by cross-layer differential concordance; +#' [nice_ConcordanceScatter()] to visualize RNA-protein differential +#' concordance. +#' +#' @export +crossLayerCorr <- function( + mat_x, + mat_y, + method = c("spearman", "pearson"), + top_n = 500, + plot = TRUE +) { + method <- match.arg(method) + + mat_x <- as.matrix(mat_x) + mat_y <- as.matrix(mat_y) + + if (is.null(colnames(mat_x)) || is.null(colnames(mat_y))) { + stop("Both matrices must have sample names as column names.", call. = FALSE) + } + + if (is.null(rownames(mat_x)) || is.null(rownames(mat_y))) { + stop("Both matrices must have feature names as row names.", call. = FALSE) + } + + storage.mode(mat_x) <- "numeric" + storage.mode(mat_y) <- "numeric" + + shared_samples <- intersect(colnames(mat_x), colnames(mat_y)) + + if (length(shared_samples) == 0) { + stop("No shared sample names between `mat_x` and `mat_y`.", call. = FALSE) + } + + shared_features <- intersect(rownames(mat_x), rownames(mat_y)) + + if (length(shared_features) < 2) { + stop( + "At least two shared feature names are required between `mat_x` and `mat_y`.", + call. = FALSE + ) + } + + mat_x <- mat_x[shared_features, shared_samples, drop = FALSE] + mat_y <- mat_y[shared_features, shared_samples, drop = FALSE] + + var_x <- apply(mat_x, 1, stats::var, na.rm = TRUE) + var_y <- apply(mat_y, 1, stats::var, na.rm = TRUE) + combined_var <- rowMeans(cbind(var_x, var_y), na.rm = TRUE) + + keep_features <- is.finite(combined_var) & combined_var > 0 + + if (sum(keep_features) < 2) { + stop("Too few variable shared features for correlation.", call. = FALSE) + } + + mat_x <- mat_x[keep_features, , drop = FALSE] + mat_y <- mat_y[keep_features, , drop = FALSE] + combined_var <- combined_var[keep_features] + + n_shared_features <- length(shared_features) + + if (!is.null(top_n)) { + if (!is.numeric(top_n) || length(top_n) != 1 || top_n < 2) { + stop("`top_n` must be `NULL` or a numeric value greater than or equal to 2.", call. = FALSE) + } + + top_idx <- order(combined_var, decreasing = TRUE)[seq_len(min(top_n, length(combined_var)))] + mat_x <- mat_x[top_idx, , drop = FALSE] + mat_y <- mat_y[top_idx, , drop = FALSE] + } + + cors <- vapply(shared_samples, function(sample_id) { + x <- mat_x[, sample_id] + y <- mat_y[, sample_id] + keep <- is.finite(x) & is.finite(y) + + if (sum(keep) < 2) { + return(NA_real_) + } + + stats::cor(x[keep], y[keep], method = method, use = "complete.obs") + }, numeric(1)) + + result <- data.frame( + sample = shared_samples, + correlation = cors, + stringsAsFactors = FALSE + ) + + result <- result[order(result$correlation, decreasing = TRUE, na.last = TRUE), , drop = FALSE] + result$sample_ordered <- factor(result$sample, levels = result$sample) + + median_r <- stats::median(cors, na.rm = TRUE) + + p <- NULL + + if (isTRUE(plot)) { + p <- ggplot2::ggplot( + result, + ggplot2::aes( + x = .data[["sample_ordered"]], + y = .data[["correlation"]] + ) + ) + + ggplot2::geom_col(fill = "#7F77DD", alpha = 0.85, width = 0.75) + + ggplot2::geom_hline( + yintercept = median_r, + linetype = "dashed", + color = "#EF9F27" + ) + + ggplot2::labs( + x = "Sample", + y = paste0(tools::toTitleCase(method), " r"), + caption = paste0( + "Median r = ", round(median_r, 3), + " | n = ", length(shared_samples), " samples", + " | features = ", nrow(mat_x) + ) + ) + + ggplot2::theme_classic(base_size = 11) + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 7) + ) + } + + list( + correlations = result, + median_r = median_r, + n_shared_samples = length(shared_samples), + n_shared_features = n_shared_features, + n_features_used = nrow(mat_x), + plot = p + ) +} diff --git a/R/detect_filter.R b/R/detect_filter.R index 24db9b1..b300dd2 100644 --- a/R/detect_filter.R +++ b/R/detect_filter.R @@ -58,7 +58,7 @@ #' #' @seealso [nice_VSB()] to plot expression of detected genes; #' [norm_counts] for an example normalized counts matrix. -#' +#' [trend_filter()] to find genes with significant expression trends across conditions. #' @export diff --git a/R/get_cox.R b/R/get_cox.R new file mode 100644 index 0000000..7922bc9 --- /dev/null +++ b/R/get_cox.R @@ -0,0 +1,684 @@ +#' Fit Cox Proportional Hazards Models and Return a Tidy Table +#' +#' Fits Cox proportional hazards models against a selected survival endpoint and +#' returns a tidy table with hazard ratios, confidence intervals, p-values, model +#' metadata, and labels ready to be plotted with \code{nice_forest()}. +#' +#' @param data A \code{data.frame} containing the survival columns +#' (\code{time_col}, \code{event_col}) and the variables to test. +#' @param time_col Character. Name of the column with follow-up time. +#' The column should be numeric or coercible to numeric. Default: +#' \code{"PFI.time"}. +#' @param event_col Character. Name of the event indicator column. +#' It should be coded as 0 = censored and 1 = event, or be coercible +#' to numeric 0/1. Default: \code{"PFI"}. +#' @param vars Character vector. Variables of interest to test. If +#' \code{NULL}, variables are selected automatically after excluding +#' survival columns, ID-like columns, columns starting with \code{"_"}, +#' and optionally survival/outcome-like columns. +#' @param model Character. Type of Cox model to fit. One of +#' \code{"univariable"}, \code{"multivariable"}, or \code{"adjusted"}. +#' \code{"univariable"} fits one model per variable. \code{"multivariable"} +#' fits one model containing all variables in \code{vars}. \code{"adjusted"} +#' fits one model per variable of interest, adjusted by \code{adjust_vars}. +#' @param adjust_vars Character vector. Covariates used when +#' \code{model = "adjusted"}. Ignored for \code{"univariable"} and +#' \code{"multivariable"} models. Default: \code{NULL}. +#' @param keep_adjust_terms Logical. If \code{TRUE} and +#' \code{model = "adjusted"}, adjustment covariate terms are kept in the +#' returned table. If \code{FALSE}, only terms from the variable of interest +#' are returned. Default: \code{FALSE}. +#' @param ref_levels A named list to explicitly set reference levels for +#' categorical variables. Names must be column names and values must be the +#' desired reference categories. Example: +#' \code{list(ER_Status_nature2012 = "Negative", +#' AJCC_Stage_nature2012 = "Stage I")}. +#' @param min_n Integer. Minimum total number of complete observations required +#' for each Cox model. Default: \code{10}. +#' @param min_n_per_level Integer. For categorical predictors, minimum number +#' of complete observations required in each category. Default: \code{5}. +#' Set to \code{0} to disable. +#' @param min_events Integer. Minimum total number of events required for each +#' Cox model. Default: \code{5}. Set to \code{0} to disable. +#' @param min_events_per_level Integer. For categorical predictors, minimum +#' number of events required in each category. This is the main safeguard +#' against sparse categories and infinite Cox coefficients. Default: +#' \code{5}. Set to \code{0} to disable. +#' @param exclude_survival_like Logical. If \code{TRUE} and \code{vars = NULL}, +#' automatic variable selection excludes columns whose names suggest survival +#' endpoints or outcome leakage, such as \code{OS}, \code{DSS}, \code{DFI}, +#' \code{PFI}, \code{days_to}, \code{death}, \code{vital_status}, +#' \code{followup}, or \code{survival}. Default: \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints informative messages when +#' models or variables are skipped. Default: \code{TRUE}. +#' +#' @return A \code{data.frame} with one row per model term and columns including +#' \code{model}, \code{model_id}, \code{variable}, \code{term}, +#' \code{term_clean}, \code{reference}, \code{HR}, \code{CI_low}, +#' \code{CI_high}, \code{p.value}, \code{n_used}, \code{n_events}, and +#' \code{adjusted_for}. +#' +#' @details +#' The function performs several checks before fitting each model: +#' \enumerate{ +#' \item Removes empty strings and incomplete rows. +#' \item Converts character and logical predictors to factors. +#' \item Drops unused factor levels. +#' \item Applies explicit reference levels via \code{ref_levels}. +#' \item Skips sparse categorical variables using \code{min_n_per_level} and +#' \code{min_events_per_level}. +#' \item Skips models with possible perfect separation or infinite +#' coefficients. +#' \item Returns exponentiated Cox coefficients as hazard ratios. +#' } +#' +#' In this function, \code{"multivariable"} means a Cox model with multiple +#' predictors for one survival endpoint. This is different from +#' \code{"multivariate"}, which usually refers to multiple outcomes. +#' +#' @importFrom survival Surv coxph +#' @importFrom broom tidy +#' @importFrom dplyr bind_rows +#' @importFrom stats as.formula complete.cases relevel +#' +#' @examples +#' \dontrun{ +#' # Univariable Cox models +#' cox_uni <- get_cox( +#' data = df, +#' time_col = "PFI.time", +#' event_col = "PFI", +#' vars = c("ER_Status_nature2012", "PAM50Call_RNAseq"), +#' model = "univariable" +#' ) +#' +#' # Multivariable Cox model +#' cox_multi <- get_cox( +#' data = df, +#' time_col = "PFI.time", +#' event_col = "PFI", +#' vars = c("age_at_initial_pathologic_diagnosis", +#' "AJCC_Stage_nature2012", +#' "PAM50Call_RNAseq"), +#' model = "multivariable" +#' ) +#' +#' # Adjusted Cox models +#' cox_adj <- get_cox( +#' data = df, +#' time_col = "PFI.time", +#' event_col = "PFI", +#' vars = c("ER_Status_nature2012", "HER2_Final_Status_nature2012"), +#' adjust_vars = c("age_at_initial_pathologic_diagnosis", +#' "AJCC_Stage_nature2012"), +#' model = "adjusted" +#' ) +#' +#' nice_forest(cox_adj) +#' } + +#' @seealso +#' \code{\link{nice_forest}} for plotting the tidy Cox model results returned +#' by \code{get_cox()}. +#' +#' \code{\link{nice_KM}} for Kaplan-Meier survival curve visualization. +#' +#' \code{\link[survival]{coxph}} and \code{\link[survival]{Surv}} for the +#' underlying Cox proportional hazards model and survival object. +#' +#' \code{\link[broom]{tidy}} for tidying model outputs. +#' +#' @export +get_cox <- function(data, + time_col = "PFI.time", + event_col = "PFI", + vars = NULL, + model = c("univariable", "multivariable", "adjusted"), + adjust_vars = NULL, + keep_adjust_terms = FALSE, + ref_levels = NULL, + min_n = 10, + min_n_per_level = 5, + min_events = 5, + min_events_per_level = 5, + exclude_survival_like = TRUE, + verbose = TRUE) { + + model <- match.arg(model) + + # --- 1. Validations --- + if (!is.data.frame(data)) { + stop("'data' should be a data.frame.") + } + + if (!time_col %in% colnames(data)) { + stop("Column '", time_col, "' not found in 'data'.") + } + + if (!event_col %in% colnames(data)) { + stop("Column '", event_col, "' not found in 'data'.") + } + + if (!is.null(vars) && !is.character(vars)) { + stop("'vars' should be NULL or a character vector of column names.") + } + + if (!is.null(adjust_vars) && !is.character(adjust_vars)) { + stop("'adjust_vars' should be NULL or a character vector of column names.") + } + + if (!is.null(ref_levels)) { + if (!is.list(ref_levels) || is.null(names(ref_levels))) { + stop("'ref_levels' should be NULL or a named list.") + } + } + + is_single_number <- function(x) { + is.numeric(x) && length(x) == 1 && !is.na(x) + } + + if (!is_single_number(min_n) || min_n < 1) { + stop("'min_n' should be a single numeric value >= 1.") + } + + if (!is_single_number(min_n_per_level) || min_n_per_level < 0) { + stop("'min_n_per_level' should be a single numeric value >= 0.") + } + + if (!is_single_number(min_events) || min_events < 0) { + stop("'min_events' should be a single numeric value >= 0.") + } + + if (!is_single_number(min_events_per_level) || min_events_per_level < 0) { + stop("'min_events_per_level' should be a single numeric value >= 0.") + } + + if (model == "adjusted" && (is.null(adjust_vars) || length(adjust_vars) == 0)) { + stop("'adjust_vars' is required when model = 'adjusted'.") + } + + # --- 2. Variable selection --- + if (is.null(vars)) { + all_cols <- colnames(data) + + id_like <- grep( + "(^sample$|^sampleID$|sample_id|patient|barcode|^ID$|_ID$)", + all_cols, + ignore.case = TRUE, + value = TRUE + ) + + survival_like <- character(0) + + if (isTRUE(exclude_survival_like)) { + survival_like <- grep( + "^(OS|DSS|DFI|PFI)(\\.time)?$|days_to|death|vital_status|last_follow|followup|survival", + all_cols, + ignore.case = TRUE, + value = TRUE + ) + } + + exclude_auto <- unique(c( + time_col, + event_col, + all_cols[grepl("^_", all_cols)], + id_like, + survival_like, + adjust_vars + )) + + vars <- setdiff(all_cols, exclude_auto) + } else { + missing_vars <- setdiff(vars, colnames(data)) + + if (length(missing_vars) > 0) { + stop( + "The following variables were not found in 'data': ", + paste(missing_vars, collapse = ", ") + ) + } + + vars <- unique(vars) + } + + if (!is.null(adjust_vars)) { + missing_adjust <- setdiff(adjust_vars, colnames(data)) + + if (length(missing_adjust) > 0) { + stop( + "The following adjustment variables were not found in 'data': ", + paste(missing_adjust, collapse = ", ") + ) + } + + adjust_vars <- unique(adjust_vars) + } + + if (length(vars) == 0) { + stop("No variables available to test.") + } + + # --- 3. Helper functions --- + bt <- function(x) { + paste0("`", x, "`") + } + + skip_model <- function(model_id, reason) { + if (isTRUE(verbose)) { + message("Model '", model_id, "' skipped: ", reason) + } + return(NULL) + } + + clean_term_label <- function(term, variable, reference) { + out <- as.character(term) + + bt_var <- bt(variable) + + if (startsWith(out, bt_var)) { + out <- substring(out, nchar(bt_var) + 1) + } else if (startsWith(out, variable)) { + out <- substring(out, nchar(variable) + 1) + } + + out <- trimws(out) + + if (out %in% c("", "``") && identical(reference, "continuous")) { + out <- "per 1-unit increase" + } else if (out %in% c("", "``")) { + out <- "level vs reference" + } + + out + } + + guess_variable <- function(term, candidate_vars) { + candidate_vars <- candidate_vars[order(nchar(candidate_vars), decreasing = TRUE)] + + for (v in candidate_vars) { + bt_v <- bt(v) + + if (identical(term, bt_v) || + startsWith(term, bt_v) || + identical(term, v) || + startsWith(term, v)) { + return(v) + } + } + + NA_character_ + } + + make_formula <- function(model_vars) { + rhs <- paste(bt(model_vars), collapse = " + ") + + paste0( + "survival::Surv(", bt(time_col), ", ", bt(event_col), ") ~ ", + rhs + ) + } + + prepare_model_data <- function(model_vars, model_id) { + model_vars <- unique(model_vars) + needed_cols <- unique(c(time_col, event_col, model_vars)) + + tmp <- data[, needed_cols, drop = FALSE] + + # Convert survival time to numeric + tmp[[time_col]] <- suppressWarnings(as.numeric(as.character(tmp[[time_col]]))) + + # Convert event indicator to numeric 0/1 + if (is.logical(tmp[[event_col]])) { + tmp[[event_col]] <- as.integer(tmp[[event_col]]) + } else { + tmp[[event_col]] <- suppressWarnings(as.numeric(as.character(tmp[[event_col]]))) + } + + # Convert empty strings to NA in character/factor predictors + for (v in model_vars) { + if (is.character(tmp[[v]]) || is.factor(tmp[[v]])) { + empty_idx <- !is.na(tmp[[v]]) & tmp[[v]] == "" + tmp[[v]][empty_idx] <- NA + } + } + + # Remove incomplete rows + tmp <- tmp[stats::complete.cases(tmp), , drop = FALSE] + + # Remove non-finite survival values + finite_idx <- is.finite(tmp[[time_col]]) & + is.finite(tmp[[event_col]]) & + tmp[[time_col]] >= 0 + + # Remove non-finite numeric predictors + for (v in model_vars) { + if (is.numeric(tmp[[v]]) || is.integer(tmp[[v]])) { + finite_idx <- finite_idx & is.finite(tmp[[v]]) + } + } + + tmp <- tmp[finite_idx, , drop = FALSE] + + if (nrow(tmp) < min_n) { + return(skip_model( + model_id, + paste0("fewer than ", min_n, " complete observations.") + )) + } + + event_values <- sort(unique(tmp[[event_col]])) + + if (!all(event_values %in% c(0, 1))) { + return(skip_model( + model_id, + "event column should be coded as 0/1 after removing missing values." + )) + } + + n_events <- sum(tmp[[event_col]] == 1, na.rm = TRUE) + + if (min_events > 0 && n_events < min_events) { + return(skip_model( + model_id, + paste0("fewer than ", min_events, " total events.") + )) + } + + for (v in model_vars) { + # Convert character and logical predictors to factors + if (is.character(tmp[[v]]) || is.logical(tmp[[v]])) { + tmp[[v]] <- as.factor(tmp[[v]]) + } + + # Drop unused factor levels + if (is.factor(tmp[[v]])) { + tmp[[v]] <- droplevels(tmp[[v]]) + } + + # Skip categorical predictors with fewer than two levels + if (is.factor(tmp[[v]]) && nlevels(tmp[[v]]) < 2) { + return(skip_model( + model_id, + paste0("variable '", v, "' has fewer than two non-empty factor levels.") + )) + } + + # Skip non-categorical predictors with no variation + if (!is.factor(tmp[[v]]) && length(unique(tmp[[v]])) < 2) { + return(skip_model( + model_id, + paste0("variable '", v, "' has no variation.") + )) + } + + # Apply explicit reference level + if (is.factor(tmp[[v]]) && !is.null(ref_levels) && !is.null(ref_levels[[v]])) { + ref <- ref_levels[[v]] + + if (ref %in% levels(tmp[[v]])) { + tmp[[v]] <- stats::relevel(tmp[[v]], ref = ref) + } else { + warning( + "Reference '", ref, "' not found in variable '", v, + "'. Available levels: ", + paste(levels(tmp[[v]]), collapse = ", ") + ) + } + } + + # Check sample size per factor level + if (is.factor(tmp[[v]])) { + level_n <- table(tmp[[v]]) + + if (min_n_per_level > 0 && any(level_n < min_n_per_level)) { + small_levels <- names(level_n)[level_n < min_n_per_level] + + return(skip_model( + model_id, + paste0( + "variable '", v, "' has level(s) with fewer than ", + min_n_per_level, " observations: ", + paste(small_levels, collapse = ", "), + "." + ) + )) + } + + event_n <- tapply( + tmp[[event_col]] == 1, + tmp[[v]], + sum, + na.rm = TRUE + ) + + event_n <- event_n[levels(tmp[[v]])] + event_n[is.na(event_n)] <- 0 + + if (min_events_per_level > 0 && any(event_n < min_events_per_level)) { + small_event_levels <- names(event_n)[event_n < min_events_per_level] + + return(skip_model( + model_id, + paste0( + "variable '", v, "' has level(s) with fewer than ", + min_events_per_level, " events: ", + paste(small_event_levels, collapse = ", "), + "." + ) + )) + } + } + } + + list( + data = tmp, + n_used = nrow(tmp), + n_events = n_events + ) + } + + fit_one_model <- function(model_vars, + model_id, + model_type, + target_vars = model_vars, + adjusted_for = character(0)) { + prepared <- prepare_model_data(model_vars, model_id) + + if (is.null(prepared)) { + return(NULL) + } + + tmp <- prepared$data + formula_str <- make_formula(model_vars) + + separation_warning <- FALSE + + separation_pattern <- paste( + "Loglik converged before variable", + "coefficient may be infinite", + "ran out of iterations", + sep = "|" + ) + + fit <- tryCatch( + withCallingHandlers( + survival::coxph(stats::as.formula(formula_str), data = tmp), + warning = function(w) { + warning_msg <- conditionMessage(w) + + if (grepl(separation_pattern, warning_msg, ignore.case = TRUE)) { + separation_warning <<- TRUE + invokeRestart("muffleWarning") + } + } + ), + error = function(e) { + if (isTRUE(verbose)) { + message( + "Model '", model_id, "' skipped: Cox model could not be fitted. ", + "Reason: ", conditionMessage(e) + ) + } + return(NULL) + } + ) + + if (is.null(fit)) { + return(NULL) + } + + if (isTRUE(separation_warning)) { + return(skip_model( + model_id, + "possible perfect separation or infinite coefficient." + )) + } + + res <- tryCatch( + broom::tidy(fit, exponentiate = TRUE, conf.int = TRUE), + error = function(e) { + if (isTRUE(verbose)) { + message( + "Model '", model_id, "' skipped: model results could not be tidied. ", + "Reason: ", conditionMessage(e) + ) + } + return(NULL) + } + ) + + if (is.null(res) || nrow(res) == 0) { + return(skip_model(model_id, "no valid model results.")) + } + + required_cols <- c("term", "estimate", "std.error", "conf.low", "conf.high", "p.value") + + if (!all(required_cols %in% colnames(res))) { + return(skip_model( + model_id, + "model output does not contain the required columns." + )) + } + + if (any(!is.finite(res$estimate)) || + any(!is.finite(res$std.error)) || + any(!is.finite(res$conf.low)) || + any(!is.finite(res$conf.high)) || + any(is.na(res$p.value))) { + return(skip_model( + model_id, + "non-finite HR, SE, confidence interval, or p-value." + )) + } + + res$variable <- vapply( + res$term, + guess_variable, + character(1), + candidate_vars = model_vars + ) + + res <- res[!is.na(res$variable), , drop = FALSE] + + if (nrow(res) == 0) { + return(skip_model(model_id, "terms could not be mapped to variables.")) + } + + if (model_type == "adjusted" && !isTRUE(keep_adjust_terms)) { + res <- res[res$variable %in% target_vars, , drop = FALSE] + } + + if (nrow(res) == 0) { + return(skip_model(model_id, "no target-variable terms remained.")) + } + + res$reference <- vapply( + res$variable, + function(v) { + if (is.factor(tmp[[v]])) { + levels(tmp[[v]])[1] + } else { + "continuous" + } + }, + character(1) + ) + + res$term_clean <- mapply( + clean_term_label, + res$term, + res$variable, + res$reference, + USE.NAMES = FALSE + ) + + names(res)[names(res) == "estimate"] <- "HR" + names(res)[names(res) == "conf.low"] <- "CI_low" + names(res)[names(res) == "conf.high"] <- "CI_high" + + res$model <- model_type + res$model_id <- model_id + res$n_used <- prepared$n_used + res$n_events <- prepared$n_events + res$adjusted_for <- if (length(adjusted_for) > 0) { + paste(adjusted_for, collapse = " + ") + } else { + NA_character_ + } + + preferred_cols <- c( + "model", "model_id", "variable", "term", "term_clean", "reference", + "HR", "CI_low", "CI_high", "p.value", "std.error", "statistic", + "n_used", "n_events", "adjusted_for" + ) + + other_cols <- setdiff(colnames(res), preferred_cols) + + res[, c(intersect(preferred_cols, colnames(res)), other_cols), drop = FALSE] + } + + # --- 4. Fit requested model type --- + if (model == "univariable") { + results_list <- lapply(vars, function(v) { + fit_one_model( + model_vars = v, + model_id = v, + model_type = "univariable", + target_vars = v + ) + }) + + } else if (model == "multivariable") { + results_list <- list( + fit_one_model( + model_vars = vars, + model_id = "multivariable", + model_type = "multivariable", + target_vars = vars + ) + ) + + } else { + results_list <- lapply(vars, function(v) { + adj <- setdiff(adjust_vars, v) + + fit_one_model( + model_vars = unique(c(v, adj)), + model_id = paste0(v, " adjusted"), + model_type = "adjusted", + target_vars = v, + adjusted_for = adj + ) + }) + } + + results <- dplyr::bind_rows(results_list) + + if (is.null(results) || nrow(results) == 0) { + stop("No Cox model could be fitted. Please check the data and variables.") + } + + rownames(results) <- NULL + results +} diff --git a/R/get_glm.R b/R/get_glm.R new file mode 100644 index 0000000..6c0199c --- /dev/null +++ b/R/get_glm.R @@ -0,0 +1,513 @@ +##################### +# Function get_glm # +##################### + +# Helper: %||% (NULL coalescing) - defined locally to avoid external dependency +`%||%` <- function(a, b) if (!is.null(a)) a else b + + +# Helper: quote variable names safely for formula construction +.quote_name <- function(x) { + needs_quotes <- !grepl("^[A-Za-z.][A-Za-z0-9._]*$", x) || + x %in% c( + "if", "else", "repeat", "while", "function", "for", "in", "next", + "break", "TRUE", "FALSE", "NULL", "Inf", "NaN", "NA", "NA_integer_", + "NA_real_", "NA_complex_", "NA_character_" + ) + + if (needs_quotes) { + paste0("`", gsub("`", "\\\\`", x), "`") + } else { + x + } +} + + +# Helper: normalize family input into a proper family object +.normalize_glm_family <- function(family, envir = parent.frame()) { + + if (inherits(family, "family")) { + return(family) + } + + if (is.character(family)) { + if (length(family) != 1 || is.na(family)) { + stop("`family` must be a single character string, family function, or family object.", + call. = FALSE) + } + + fam_fun <- NULL + + # First look in the calling environment / attached packages + if (exists(family, mode = "function", envir = envir, inherits = TRUE)) { + fam_fun <- get(family, mode = "function", envir = envir, inherits = TRUE) + } + + # Then look explicitly in stats + if (is.null(fam_fun) && + exists(family, mode = "function", envir = asNamespace("stats"), + inherits = FALSE)) { + fam_fun <- get(family, mode = "function", envir = asNamespace("stats"), + inherits = FALSE) + } + + if (is.null(fam_fun)) { + stop(sprintf("Could not find a GLM family function called '%s'.", family), + call. = FALSE) + } + + family <- fam_fun + } + + if (is.function(family)) { + family <- tryCatch( + family(), + error = function(e) { + stop(sprintf( + "`family` function could not be evaluated without arguments: %s", + conditionMessage(e) + ), call. = FALSE) + } + ) + } + + if (!inherits(family, "family")) { + stop("`family` must be a character string, family function, or valid family object.", + call. = FALSE) + } + + family +} + + +#' Function to Fit a GLM and Return Tidy Results with Effect Sizes and FDR Correction +#' +#' @description +#' Fits a Generalized Linear Model (GLM) for binary, continuous, count, +#' proportion, rate, or positive skewed outcomes, depending on the specified +#' \code{family}. The function returns a tidy \code{tibble} with coefficients, +#' optional exponentiated effect sizes, 95% confidence intervals, Wald test +#' statistics, raw p-values, and multiple-testing-adjusted q-values. +#' +#' The fitted \code{glm} object is attached as an attribute so that it can be +#' passed directly to downstream functions such as \code{\link{nice_ROC}} +#' without re-fitting. +#' +#' @param data A \code{data.frame} or \code{tibble} containing all outcome and +#' predictor variables. +#' +#' @param outcome Character string of length 1. Name of the outcome column. +#' For logistic regression this is typically coded as \code{0}/\code{1}, +#' although other binomial formats accepted by \code{\link[stats]{glm}} can +#' also be used. +#' +#' @param predictors Character vector. Names of the predictor columns to include. +#' Categorical variables should be \code{factor} or \code{character}; they are +#' handled by \code{\link[stats]{glm}} contrast coding automatically. +#' +#' @param family Character string, family function, or \code{\link[stats]{family}} +#' object passed to \code{\link[stats]{glm}}. It specifies both the assumed +#' distribution of the outcome and the link function used in the GLM. +#' Common options include: +#' \itemize{ +#' \item \code{"binomial"} or \code{binomial(link = "logit")} for binary, +#' proportion, or success/failure outcomes; this corresponds to logistic +#' regression when the link is \code{"logit"}. +#' \item \code{"gaussian"} or \code{gaussian(link = "identity")} for +#' approximately normally distributed continuous outcomes. +#' \item \code{"poisson"} or \code{poisson(link = "log")} for count or rate +#' outcomes. +#' \item \code{"quasibinomial"} and \code{"quasipoisson"} for binomial or +#' Poisson-type outcomes with overdispersion. +#' \item \code{Gamma(link = "log")} for positive, right-skewed continuous +#' outcomes. +#' \item \code{inverse.gaussian()} for positive continuous outcomes with +#' variance increasing strongly with the mean. +#' } +#' Custom family objects can also be supplied. Default: \code{"binomial"}. +#' +#' @param adjust_method Character string. Method for p-value adjustment passed +#' to \code{\link[stats]{p.adjust}}. Options: \code{"BH"}, +#' \code{"bonferroni"}, \code{"holm"}, \code{"BY"}, \code{"fdr"}, +#' \code{"none"}. Default: \code{"BH"}. +#' +#' @param conf_level Numeric value in \code{(0, 1)}. Confidence level for the +#' interval. Default: \code{0.95}. +#' +#' @param exponentiate Logical or \code{NULL}. If \code{TRUE}, coefficients and +#' confidence intervals are exponentiated. If \code{NULL}, exponentiation is +#' applied automatically when the model link is \code{"logit"} or \code{"log"}. +#' This gives odds ratios for logistic models with logit link, rate ratios for +#' Poisson-type models with log link, and multiplicative effects for other +#' log-link models. Default: \code{NULL}. +#' +#' @param remove_intercept Logical. Whether to drop the \code{(Intercept)} row +#' from the returned table. Default: \code{TRUE}. +#' +#' @param verbose Logical. If \code{TRUE}, prints the model summary to console. +#' Default: \code{FALSE}. +#' +#' @return +#' A \code{tibble} of class +#' \code{c("get_glm_result", "tbl_df", "tbl", "data.frame")} with the following +#' columns: +#' \describe{ +#' \item{\code{term}}{Predictor name; for factors, includes the level +#' according to the contrast coding used by \code{glm}.} +#' \item{\code{estimate}}{Exponentiated coefficient if +#' \code{exponentiate = TRUE}; otherwise the raw coefficient on the +#' model linear predictor scale.} +#' \item{\code{ci_lower}}{Lower bound of the confidence interval on the same +#' scale as \code{estimate}.} +#' \item{\code{ci_upper}}{Upper bound of the confidence interval on the same +#' scale as \code{estimate}.} +#' \item{\code{std_error}}{Standard error of the coefficient on the linear +#' predictor scale.} +#' \item{\code{statistic}}{Wald test statistic. For fixed-dispersion families +#' such as binomial and Poisson this is typically a z-statistic; for +#' families with estimated dispersion it may be a t-statistic.} +#' \item{\code{p_value}}{Two-sided p-value from the Wald test.} +#' \item{\code{q_value}}{P-value adjusted for multiple comparisons using +#' the method specified in \code{adjust_method}.} +#' \item{\code{significance}}{Star annotation based on raw p-value: +#' \code{"***"} p<0.001, \code{"**"} p<0.01, \code{"*"} p<0.05, +#' \code{"."} p<0.10, \code{" "} otherwise.} +#' } +#' +#' The following attributes are attached to the returned object: +#' \describe{ +#' \item{\code{model}}{The fitted \code{glm} object.} +#' \item{\code{formula}}{Character string of the model formula.} +#' \item{\code{family}}{Family name used.} +#' \item{\code{link}}{Link function used.} +#' \item{\code{n_obs}}{Number of observations used in model fitting.} +#' \item{\code{AIC}}{Akaike Information Criterion. May be \code{NA} for +#' quasi-likelihood families.} +#' \item{\code{exponentiate}}{Logical indicating whether coefficients were +#' exponentiated.} +#' \item{\code{adjust_method}}{P-value adjustment method used.} +#' } +#' +#' @details +#' Q-values are computed across all reported terms after removing the intercept +#' if \code{remove_intercept = TRUE}. Therefore, the multiple-testing correction +#' pool matches the number of hypotheses shown in the returned table. +#' +#' Automatic exponentiation is based on the link function. Models with +#' \code{logit} link are reported as odds ratios; models with \code{log} link +#' are reported as multiplicative effects such as rate ratios or mean ratios. +#' Models with identity, probit, cloglog, inverse, or other links are not +#' exponentiated automatically. +#' +#' @seealso \code{\link{nice_ROC}} for ROC curve visualisation of logistic +#' models produced by \code{get_glm}. +#' +#' @examples +#' \dontrun{ +#' # Binary outcome: logistic regression +#' res_logit <- get_glm( +#' data = train_data, +#' outcome = "stage_advanced", +#' predictors = c("age", "ER", "PR", "HER2", "histology", "menopause"), +#' family = "binomial" +#' ) +#' +#' # Continuous outcome: linear model through glm +#' res_gaussian <- get_glm( +#' data = train_data, +#' outcome = "tumor_size", +#' predictors = c("age", "ER", "PR", "HER2"), +#' family = "gaussian" +#' ) +#' +#' # Count outcome: Poisson regression +#' res_pois <- get_glm( +#' data = train_data, +#' outcome = "n_mutations", +#' predictors = c("age", "stage", "histology"), +#' family = "poisson" +#' ) +#' +#' # Positive skewed continuous outcome +#' res_gamma <- get_glm( +#' data = train_data, +#' outcome = "cost", +#' predictors = c("age", "stage", "treatment"), +#' family = Gamma(link = "log") +#' ) +#' +#' # Retrieve fitted model +#' fit <- attr(res_logit, "model") +#' } +#' +#' @importFrom stats glm p.adjust nobs AIC +#' @importFrom broom tidy +#' @importFrom dplyr rename mutate select filter case_when +#' @importFrom rlang .data +#' +#' @export +get_glm <- function( + data, + outcome, + predictors, + family = "binomial", + adjust_method = "BH", + conf_level = 0.95, + exponentiate = NULL, + remove_intercept = TRUE, + verbose = FALSE +) { + + # -- 1. Input validation ---------------------------------------------------- + if (!is.data.frame(data)) { + stop("`data` must be a data.frame or tibble.", call. = FALSE) + } + + if (!is.character(outcome) || length(outcome) != 1 || is.na(outcome)) { + stop("`outcome` must be a single non-missing character string.", + call. = FALSE) + } + + if (!outcome %in% names(data)) { + stop(sprintf("`outcome` column '%s' not found in `data`.", outcome), + call. = FALSE) + } + + if (!is.character(predictors) || length(predictors) < 1 || + any(is.na(predictors))) { + stop("`predictors` must be a non-empty character vector with no missing values.", + call. = FALSE) + } + + missing_preds <- setdiff(predictors, names(data)) + + if (length(missing_preds) > 0) { + stop(sprintf( + "The following predictors are not in `data`: %s", + paste(missing_preds, collapse = ", ") + ), call. = FALSE) + } + + if (!is.numeric(conf_level) || length(conf_level) != 1 || + is.na(conf_level) || conf_level <= 0 || conf_level >= 1) { + stop("`conf_level` must be a single numeric value between 0 and 1.", + call. = FALSE) + } + + if (!is.logical(remove_intercept) || length(remove_intercept) != 1 || + is.na(remove_intercept)) { + stop("`remove_intercept` must be TRUE or FALSE.", call. = FALSE) + } + + if (!is.logical(verbose) || length(verbose) != 1 || is.na(verbose)) { + stop("`verbose` must be TRUE or FALSE.", call. = FALSE) + } + + if (!is.null(exponentiate) && + (!is.logical(exponentiate) || length(exponentiate) != 1 || + is.na(exponentiate))) { + stop("`exponentiate` must be TRUE, FALSE, or NULL.", call. = FALSE) + } + + adjust_method <- match.arg( + adjust_method, + choices = c("BH", "bonferroni", "holm", "BY", "fdr", "none") + ) + + # -- 2. Normalize family object --------------------------------------------- + family_obj <- .normalize_glm_family(family, envir = parent.frame()) + + fam_name <- family_obj$family + link_name <- family_obj$link + + # -- 3. Auto-set exponentiate based on link --------------------------------- + if (is.null(exponentiate)) { + exponentiate <- link_name %in% c("logit", "log") + } + + # -- 4. Build formula and fit model ----------------------------------------- + outcome_quoted <- .quote_name(outcome) + predictors_quoted <- vapply(predictors, .quote_name, character(1)) + + formula_str <- paste( + outcome_quoted, + "~", + paste(predictors_quoted, collapse = " + ") + ) + + model_formula <- stats::as.formula(formula_str) + + fit <- tryCatch( + stats::glm( + formula = model_formula, + data = data, + family = family_obj + ), + error = function(e) { + stop(sprintf("GLM fitting failed: %s", conditionMessage(e)), + call. = FALSE) + } + ) + + if (verbose) { + message("-- Model summary ------------------------------") + print(summary(fit)) + } + + # -- 5. Tidy extraction via broom ------------------------------------------- + tidy_res <- tryCatch( + broom::tidy( + fit, + conf.int = TRUE, + conf.level = conf_level, + exponentiate = exponentiate + ), + error = function(e) { + stop(sprintf("Could not extract tidy model results: %s", + conditionMessage(e)), call. = FALSE) + } + ) + + # Standardise column names + tidy_res <- dplyr::rename( + tidy_res, + dplyr::all_of(c( + std_error = "std.error", + p_value = "p.value", + ci_lower = "conf.low", + ci_upper = "conf.high" + )) + ) + + # -- 6. Remove intercept before FDR ----------------------------------------- + if (remove_intercept) { + tidy_res <- dplyr::filter(tidy_res, .data$term != "(Intercept)") + } + + # -- 7. Multiple testing correction and significance stars ------------------ + tidy_res <- dplyr::mutate( + tidy_res, + q_value = stats::p.adjust(.data$p_value, method = adjust_method), + significance = dplyr::case_when( + .data$p_value < 0.001 ~ "***", + .data$p_value < 0.01 ~ "**", + .data$p_value < 0.05 ~ "*", + .data$p_value < 0.10 ~ ".", + TRUE ~ " " + ) + ) + + tidy_res <- dplyr::select( + tidy_res, + dplyr::all_of(c( + "term", "estimate", "ci_lower", "ci_upper", + "std_error", "statistic", "p_value", "q_value", "significance" + )) + ) + + # -- 8. Attach model and metadata as attributes ----------------------------- + aic_value <- tryCatch(stats::AIC(fit), error = function(e) NA_real_) + + attr(tidy_res, "model") <- fit + attr(tidy_res, "formula") <- formula_str + attr(tidy_res, "family") <- fam_name + attr(tidy_res, "link") <- link_name + attr(tidy_res, "n_obs") <- stats::nobs(fit) + attr(tidy_res, "AIC") <- aic_value + attr(tidy_res, "exponentiate") <- exponentiate + attr(tidy_res, "adjust_method") <- adjust_method + + class(tidy_res) <- c("get_glm_result", class(tidy_res)) + + return(tidy_res) +} + +# -- S3 print method ---------------------------------------------------------- + +#' @export +#' @noRd +print.get_glm_result <- function(x, digits = 3, ...) { + + fam <- attr(x, "family") %||% "?" + link <- attr(x, "link") %||% "?" + n <- attr(x, "n_obs") %||% "?" + aic <- attr(x, "AIC") %||% NA_real_ + adj <- attr(x, "adjust_method") %||% "BH" + exp_tf <- attr(x, "exponentiate") + + required_cols <- c( + "term", "estimate", "ci_lower", "ci_upper", + "p_value", "q_value", "significance" + ) + + missing_cols <- setdiff(required_cols, names(x)) + + if (length(missing_cols) > 0) { + stop( + sprintf( + "Cannot print `get_glm_result`: missing columns: %s", + paste(missing_cols, collapse = ", ") + ), + call. = FALSE + ) + } + + label <- if (isTRUE(exp_tf)) { + if (identical(link, "logit")) { + "OR" + } else if (identical(link, "log")) { + "Ratio" + } else { + "exp(Estimate)" + } + } else { + "Estimate" + } + + aic_txt <- if (is.na(aic)) "NA" else sprintf("%.2f", aic) + + cat(sprintf( + "\n-- get_glm result (%s, link = %s) -----------------------------\n", + fam, link + )) + + cat(sprintf( + " n = %s | AIC = %s | q-value correction: %s\n\n", + n, aic_txt, adj + )) + + # Important: + # Drop custom S3 class before formatting/printing to avoid recursive dispatch. + print_tbl <- as.data.frame(x, stringsAsFactors = FALSE) + + print_tbl$estimate <- sprintf("%.3f", print_tbl$estimate) + print_tbl$ci <- sprintf( + "[%.3f, %.3f]", + print_tbl$ci_lower, + print_tbl$ci_upper + ) + + print_tbl$p_value <- format.pval( + print_tbl$p_value, + digits = digits, + eps = 0.001 + ) + + print_tbl$q_value <- format.pval( + print_tbl$q_value, + digits = digits, + eps = 0.001 + ) + + out <- print_tbl[, c( + "term", "estimate", "ci", "p_value", "q_value", "significance" + ), drop = FALSE] + + names(out)[c(2, 3)] <- c(label, "95% CI") + + base::print.data.frame(out, row.names = FALSE, ...) + + cat("\n") + + invisible(x) +} diff --git a/R/nice_ConcordanceScatter.R b/R/nice_ConcordanceScatter.R new file mode 100644 index 0000000..670e1d9 --- /dev/null +++ b/R/nice_ConcordanceScatter.R @@ -0,0 +1,248 @@ +#################################### +# Function nice_ConcordanceScatter # +#################################### + +#' Plot cross-layer log-fold-change concordance. +#' +#' Builds a scatter plot comparing log-fold changes from two omics layers after +#' classification with [concordanceDE()]. Points are colored by concordance +#' category, reference lines divide the plot into directional quadrants, and an +#' optional regression line is drawn for concordant genes. +#' +#' @param concordance_result Output from [concordanceDE()]. A data frame with a +#' `category` column can also be supplied through `concordance_result$table`. +#' @param x_label X-axis label. Default: `"log2FC (RNA)"`. +#' @param y_label Y-axis label. Default: `"log2FC (Protein)"`. +#' @param genes_label Optional character vector of genes to label with +#' `ggrepel`. +#' @param method Correlation method used by `ggpubr::stat_cor()`. One of +#' `"pearson"` or `"spearman"`. Default: `"pearson"`. +#' @param ci_level Confidence level for the linear-model confidence interval. +#' Default: `0.95`. +#' @param point_size Point size. Default: `1.5`. +#' @param alpha Point transparency. Default: `0.65`. +#' @param gene_col Optional gene column name. If `NULL`, the function uses the +#' first column containing `"gene"`, ignoring case. +#' @param logfc_col Base log-fold-change column name used in [concordanceDE()]. +#' Default: `"logFC"`, which expects columns `logFC_x` and `logFC_y`. +#' @param category_colors Optional named character vector with colors for +#' `concordant`, `discordant`, `only_x`, `only_y`, and `not_significant`. +#' +#' @return A `ggplot2` object. +#' +#' @examples +#' de_rna <- data.frame( +#' gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), +#' logFC = c(3.2, 2.1, -1.6, -1.4, 1.8), +#' padj = c(1e-6, 0.002, 0.01, 0.03, 0.04) +#' ) +#' +#' de_protein <- data.frame( +#' gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), +#' logFC = c(0.7, 0.4, -0.5, 0.3, 0.05), +#' padj = c(1e-4, 0.01, 0.02, 0.04, 0.40) +#' ) +#' +#' res <- concordanceDE( +#' de_x = de_rna, +#' de_y = de_protein, +#' logfc_threshold = c(1, 0.2) +#' ) +#' +#' if ( +#' requireNamespace("ggpubr", quietly = TRUE) && +#' requireNamespace("ggrepel", quietly = TRUE) +#' ) { +#' nice_ConcordanceScatter( +#' res, +#' genes_label = c("ESR1", "EGFR"), +#' method = "spearman" +#' ) +#' } +#' +#' @import ggplot2 +#' @importFrom rlang .data +#' +#' @seealso +#' [concordanceDE()] to classify genes by cross-layer differential concordance; +#' [crossLayerCorr()] to estimate sample-level concordance between two omics +#' layers. +#' +#' @export +nice_ConcordanceScatter <- function( + concordance_result, + x_label = "log2FC (RNA)", + y_label = "log2FC (Protein)", + genes_label = NULL, + method = c("pearson", "spearman"), + ci_level = 0.95, + point_size = 1.5, + alpha = 0.65, + gene_col = NULL, + logfc_col = "logFC", + category_colors = NULL +) { + method <- match.arg(method) + + if (!requireNamespace("ggpubr", quietly = TRUE)) { + stop( + "Package \"ggpubr\" must be installed to use `nice_ConcordanceScatter()`.", + call. = FALSE + ) + } + + if (!is.null(genes_label) && !requireNamespace("ggrepel", quietly = TRUE)) { + stop( + "Package \"ggrepel\" must be installed when `genes_label` is used.", + call. = FALSE + ) + } + + if (!is.list(concordance_result) || is.null(concordance_result$table)) { + stop("`concordance_result` must be the output of `concordanceDE()`.", call. = FALSE) + } + + df <- as.data.frame(concordance_result$table) + + if (!"category" %in% names(df)) { + stop("`concordance_result$table` must contain a `category` column.", call. = FALSE) + } + + lfc_x <- paste0(logfc_col, "_x") + lfc_y <- paste0(logfc_col, "_y") + + if (!all(c(lfc_x, lfc_y) %in% names(df))) { + lfc_candidates_x <- grep("_x$", names(df), value = TRUE) + lfc_candidates_y <- grep("_y$", names(df), value = TRUE) + + lfc_x <- lfc_candidates_x[grepl("log", lfc_candidates_x, ignore.case = TRUE)][1] + lfc_y <- lfc_candidates_y[grepl("log", lfc_candidates_y, ignore.case = TRUE)][1] + } + + if (is.na(lfc_x) || is.na(lfc_y) || !all(c(lfc_x, lfc_y) %in% names(df))) { + stop( + "Could not identify log-fold-change columns. Expected `", + paste0(logfc_col, "_x"), "` and `", paste0(logfc_col, "_y"), "`.", + call. = FALSE + ) + } + + if (is.null(gene_col)) { + gene_col <- grep("gene", names(df), ignore.case = TRUE, value = TRUE)[1] + } + + default_colors <- c( + concordant = "#1D9E75", + discordant = "#D85A30", + only_x = "#7F77DD", + only_y = "#EF9F27", + not_significant = "#B4B2A9" + ) + + if (is.null(category_colors)) { + category_colors <- default_colors + } else { + category_colors <- utils::modifyList(default_colors, as.list(category_colors)) + category_colors <- unlist(category_colors, use.names = TRUE) + } + + df$category <- factor( + as.character(df$category), + levels = names(default_colors) + ) + + category_labels <- c( + concordant = paste0("Concordant (score=", concordance_result$concordance_score, ")"), + discordant = "Discordant", + only_x = "Only layer X", + only_y = "Only layer Y", + not_significant = "NS" + ) + + p <- ggplot2::ggplot( + df, + ggplot2::aes( + x = .data[[lfc_x]], + y = .data[[lfc_y]], + color = .data[["category"]] + ) + ) + + ggplot2::geom_hline( + yintercept = 0, + linetype = "dashed", + color = "grey70", + linewidth = 0.4 + ) + + ggplot2::geom_vline( + xintercept = 0, + linetype = "dashed", + color = "grey70", + linewidth = 0.4 + ) + + ggplot2::geom_point(alpha = alpha, size = point_size) + + ggpubr::stat_cor( + ggplot2::aes(x = .data[[lfc_x]], y = .data[[lfc_y]]), + method = method, + label.x.npc = 0.05, + inherit.aes = FALSE, + size = 3 + ) + + ggplot2::scale_color_manual( + values = category_colors, + labels = category_labels, + drop = FALSE + ) + + ggplot2::labs(x = x_label, y = y_label, color = NULL) + + ggplot2::theme_classic(base_size = 11) + + ggplot2::theme(legend.position = "right") + + df_concordant <- df[df$category == "concordant", , drop = FALSE] + + var_concordant_x <- stats::var(df_concordant[[lfc_x]], na.rm = TRUE) + var_concordant_y <- stats::var(df_concordant[[lfc_y]], na.rm = TRUE) + + if ( + nrow(df_concordant) >= 2 && + is.finite(var_concordant_x) && var_concordant_x > 0 && + is.finite(var_concordant_y) && var_concordant_y > 0 + ) { + p <- p + + ggplot2::geom_smooth( + data = df_concordant, + ggplot2::aes(x = .data[[lfc_x]], y = .data[[lfc_y]]), + method = "lm", + se = TRUE, + level = ci_level, + color = "#085041", + fill = "#9FE1CB", + linewidth = 0.8, + inherit.aes = FALSE + ) + } + + if (!is.null(genes_label)) { + if (is.na(gene_col) || !gene_col %in% names(df)) { + stop("Could not identify a gene column for `genes_label`.", call. = FALSE) + } + + df_lbl <- df[df[[gene_col]] %in% genes_label, , drop = FALSE] + + if (nrow(df_lbl) > 0) { + p <- p + + ggrepel::geom_label_repel( + data = df_lbl, + ggplot2::aes( + x = .data[[lfc_x]], + y = .data[[lfc_y]], + label = .data[[gene_col]] + ), + inherit.aes = FALSE, + size = 3, + max.overlaps = 20, + fill = scales::alpha("white", 0.8) + ) + } + } + + p +} diff --git a/R/nice_ROC.R b/R/nice_ROC.R new file mode 100644 index 0000000..c63e91a --- /dev/null +++ b/R/nice_ROC.R @@ -0,0 +1,363 @@ +##################### +# Function nice_ROC # +##################### + +#' Plot and Compare ROC Curves for Binary Classification Models +#' +#' @description +#' Generates publication-ready ROC curves for one or more binary classification +#' models evaluated on a held-out dataset. For each model the function computes +#' AUC with a 95% CI using the DeLong method. When exactly two models are +#' supplied, a DeLong paired test is performed to compare their AUCs and the +#' result is annotated on the plot. +#' +#' Models can be supplied either as fitted \code{glm} objects or as named +#' numeric vectors of predicted probabilities, making the function flexible +#' for any classifier that outputs scores. +#' +#' @param models +#' A \emph{named} list. Each element is one of: +#' \itemize{ +#' \item A fitted \code{glm} object (predictions generated internally via +#' \code{predict(model, newdata = data, type = "response")}). +#' \item A numeric vector of predicted probabilities of length +#' \code{nrow(data)} (values in \[0, 1\]). +#' } +#' Names are used as legend labels (e.g., +#' \code{list("Clinical" = fit_A, "Clinical + Omics" = fit_B)}). +#' @param data +#' A \code{data.frame} or \code{tibble}. Test / evaluation dataset. Required +#' when any element of \code{models} is a \code{glm} object. +#' @param outcome +#' Character (length 1). Name of the binary outcome column in \code{data} +#' (must be \code{0}/\code{1} integer or numeric; \code{1} = event / positive +#' class). +#' @param colors +#' Character vector of colour codes for the ROC curves. Recycled if shorter +#' than the number of models. +#' Default: a four-colour colorblind-friendly palette +#' (\code{c("#E63946", "#457B9D", "#2A9D8F", "#E9C46A")}). +#' @param show_ci +#' Logical. Whether to display the DeLong 95% CI for each AUC in the legend. +#' Default: \code{TRUE}. +#' @param show_delong +#' Logical. When exactly two models are provided, annotate the plot with the +#' DeLong test p-value and delta AUC. Default: \code{TRUE}. +#' @param smooth +#' Logical. Apply kernel smoothing to the ROC curves +#' (\code{pROC::smooth}). Useful for small samples; may hide real variability +#' on large samples. Default: \code{FALSE}. +#' @param direction +#' Character. Direction for \code{pROC::roc}. Usually \code{"auto"} +#' (default). Set to \code{"<"} or \code{">"} if you need to enforce a +#' direction. +#' @param plot_title +#' Character string. Main title of the figure. +#' Default: \code{"ROC Curve Comparison"}. +#' @param plot_subtitle +#' Character string or \code{NULL}. Subtitle. Default: \code{NULL}. +#' @param theme_fn +#' A \code{ggplot2} theme function (without parentheses). Applied to the +#' plot. Default: \code{ggplot2::theme_classic}. +#' @param return_data +#' Logical. If \code{TRUE}, returns a named list containing the ggplot object +#' \emph{and} the underlying data/statistics. If \code{FALSE} (default), only +#' the \code{ggplot} object is returned. +#' +#' @return +#' When \code{return_data = FALSE} (default): a \code{ggplot2} object. +#' +#' When \code{return_data = TRUE}: a named list with elements: +#' \describe{ +#' \item{\code{plot}}{The \code{ggplot2} ROC figure.} +#' \item{\code{auc_table}}{A \code{tibble} with one row per model containing: +#' \code{model}, \code{auc}, \code{ci_lower}, \code{ci_upper}, +#' \code{n_cases}, \code{n_controls}.} +#' \item{\code{delong_test}}{Result of \code{pROC::roc.test} (DeLong) if +#' exactly two models were supplied; otherwise \code{NULL}.} +#' \item{\code{roc_objects}}{Named list of \code{pROC::roc} objects for +#' further downstream analysis.} +#' } +#' +#' @details +#' ## AUC confidence intervals +#' CIs are computed with the DeLong method via \code{pROC::ci.auc} which +#' accounts for the correlation between paired measurements (same subjects). +#' +#' ## DeLong test +#' When two models are compared on the \emph{same} test set the observations +#' are paired, so the DeLong paired test is used +#' (\code{pROC::roc.test(method = "delong", paired = TRUE)}). +#' +#' ## Diagonal reference line +#' The grey dashed diagonal represents random performance (AUC = 0.50). +#' +#' @seealso \code{\link{get_glm}} for fitting the models fed into +#' \code{nice_ROC}. +#' +#' +#' +#' @examples +#' \dontrun{ +#' # -- Passing glm objects (predictions generated internally) ----------------- +#' roc_result <- nice_ROC( +#' models = list("Clinical only" = fit_A, +#' "Clinical + Omics" = fit_B), +#' data = test_data, +#' outcome = "stage_advanced", +#' return_data = TRUE +#' ) +#' +#' roc_result$plot +#' roc_result$auc_table +#' roc_result$delong_test +#' +#' # -- Passing probability vectors directly ----------------------------------- +#' prob_A <- predict(fit_A, newdata = test_data, type = "response") +#' prob_B <- predict(fit_B, newdata = test_data, type = "response") +#' +#' nice_ROC( +#' models = list("Clinical" = prob_A, "Clinical + Omics" = prob_B), +#' data = test_data, +#' outcome = "stage_advanced" +#' ) +#' +#' # -- Single model (no comparison) ------------------------------------------- +#' nice_ROC( +#' models = list("Clinical only" = fit_A), +#' data = test_data, +#' outcome = "stage_advanced", +#' show_delong = FALSE +#' ) +#' } +#' +#' @importFrom pROC roc auc ci.auc roc.test smooth +#' @importFrom ggplot2 ggplot aes geom_line geom_abline scale_color_manual labs theme theme_classic element_text element_rect annotate +#' @importFrom dplyr bind_rows +#' @importFrom tibble tibble +#' @importFrom rlang .data +#' @importFrom stats predict +#' +#' @export +nice_ROC <- function( + models, + data = NULL, + outcome, + colors = c("#E63946", "#457B9D", "#2A9D8F", "#E9C46A"), + show_ci = TRUE, + show_delong = TRUE, + smooth = FALSE, + direction = "auto", + plot_title = "ROC Curve Comparison", + plot_subtitle = NULL, + theme_fn = ggplot2::theme_classic, + return_data = FALSE +) { + + # -- 1. Validate inputs ----------------------------------------------------- + if (!is.list(models) || is.null(names(models))) { + stop("`models` must be a *named* list.", call. = FALSE) + } + if (!is.character(outcome) || length(outcome) != 1) { + stop("`outcome` must be a single character string.", call. = FALSE) + } + if (!is.null(data) && !outcome %in% names(data)) { + stop(sprintf( + "`outcome` column '%s' not found in `data`.", outcome + ), call. = FALSE) + } + + # -- 2. Extract true labels ------------------------------------------------- + # Determine true labels - either from data or from the first model element + if (!is.null(data)) { + true_labels <- data[[outcome]] + } else { + # All elements must be probability vectors of the same length; we need + # true_labels supplied separately - error out. + stop( + "When `models` contains probability vectors, `data` must be provided ", + "so that `outcome` labels can be extracted.", + call. = FALSE + ) + } + + if (!all(true_labels %in% c(0, 1, NA))) { + stop( + "`outcome` must be a binary 0/1 variable.", + call. = FALSE + ) + } + + # -- 3. Compute predicted probabilities for each model --------------------- + pred_list <- lapply(seq_along(models), function(i) { + mod <- models[[i]] + name <- names(models)[i] + if (inherits(mod, "glm")) { + if (is.null(data)) { + stop(sprintf( + "Model '%s' is a glm object but `data` is NULL.", name + ), call. = FALSE) + } + probs <- stats::predict(mod, newdata = data, type = "response") + } else if (is.numeric(mod)) { + if (length(mod) != nrow(data)) { + stop(sprintf( + "Probability vector for model '%s' has length %d, expected %d.", + name, length(mod), nrow(data) + ), call. = FALSE) + } + probs <- mod + } else { + stop(sprintf( + "Model '%s' must be a glm object or numeric probability vector.", name + ), call. = FALSE) + } + return(probs) + }) + names(pred_list) <- names(models) + + # -- 4. Build pROC::roc objects --------------------------------------------- + roc_list <- lapply(names(pred_list), function(nm) { + roc_obj <- pROC::roc( + response = true_labels, + predictor = pred_list[[nm]], + direction = direction, + quiet = TRUE + ) + if (smooth) { + roc_obj <- pROC::smooth(roc_obj, method = "density") + } + return(roc_obj) + }) + names(roc_list) <- names(models) + + # -- 5. Compute AUC + DeLong 95% CI ---------------------------------------- + auc_rows <- lapply(names(roc_list), function(nm) { + roc_obj <- roc_list[[nm]] + auc_val <- as.numeric(pROC::auc(roc_obj)) + ci_val <- pROC::ci.auc(roc_obj, method = "delong", conf.level = 0.95) + tibble::tibble( + model = nm, + auc = round(auc_val, 4), + ci_lower = round(as.numeric(ci_val[1]), 4), + ci_upper = round(as.numeric(ci_val[3]), 4), + n_cases = sum(true_labels == 1, na.rm = TRUE), + n_controls = sum(true_labels == 0, na.rm = TRUE) + ) + }) + auc_table <- dplyr::bind_rows(auc_rows) + + # -- 6. DeLong test (only when exactly 2 models) --------------------------- + delong_result <- NULL + delong_label <- NULL + if (length(roc_list) == 2 && show_delong) { + delong_result <- pROC::roc.test( + roc1 = roc_list[[1]], + roc2 = roc_list[[2]], + method = "delong", + paired = TRUE + ) + delta_auc <- round( + as.numeric(pROC::auc(roc_list[[1]])) - + as.numeric(pROC::auc(roc_list[[2]])), + 4 + ) + p_delong <- delong_result$p.value + delong_label <- sprintf( + "DeLong test\n\u0394AUC = %.4f\np = %s", + delta_auc, + format.pval(p_delong, digits = 3, eps = 0.001) + ) + } + + # -- 7. Build long data.frame for ggplot ----------------------------------- + roc_df <- dplyr::bind_rows(lapply(names(roc_list), function(nm) { + roc_obj <- roc_list[[nm]] + auc_row <- auc_table[auc_table$model == nm, ] + + if (show_ci) { + legend_label <- sprintf( + "%s\nAUC = %.3f [%.3f\u2013%.3f]", + nm, auc_row$auc, auc_row$ci_lower, auc_row$ci_upper + ) + } else { + legend_label <- sprintf("%s (AUC = %.3f)", nm, auc_row$auc) + } + + data.frame( + fpr = 1 - roc_obj$specificities, + tpr = roc_obj$sensitivities, + model = legend_label, + stringsAsFactors = FALSE + ) + })) + + # Preserve legend order + roc_df$model <- factor(roc_df$model, levels = unique(roc_df$model)) + + # -- 8. Recycle colours if needed ------------------------------------------ + n_models <- length(models) + pal_colors <- rep_len(colors, n_models) + names(pal_colors) <- levels(roc_df$model) + + # -- 9. Build ggplot ------------------------------------------------------- + p <- ggplot2::ggplot( + roc_df, + ggplot2::aes(x = .data$fpr, y = .data$tpr, color = .data$model) + ) + + ggplot2::geom_abline( + intercept = 0, slope = 1, + linetype = "dashed", + color = "grey55", + linewidth = 0.5 + ) + + ggplot2::geom_line(linewidth = 1.1, alpha = 0.92) + + ggplot2::scale_color_manual(values = pal_colors) + + ggplot2::labs( + title = plot_title, + subtitle = plot_subtitle, + x = "False Positive Rate (1 \u2212 Specificity)", + y = "True Positive Rate (Sensitivity)", + color = NULL + ) + + theme_fn() + + ggplot2::theme( + legend.position = c(0.75, 0.20), + legend.text = ggplot2::element_text(size = 9), + legend.background = ggplot2::element_rect( + fill = "white", + colour = "grey80", + linewidth = 0.4 + ), + plot.title = ggplot2::element_text(face = "bold", size = 13), + plot.subtitle = ggplot2::element_text(size = 10, color = "grey40"), + axis.title = ggplot2::element_text(size = 11) + ) + + # -- 10. Annotate DeLong test result --------------------------------------- + if (!is.null(delong_label)) { + p <- p + + ggplot2::annotate( + "label", + x = 0.68, + y = 0.35, + label = delong_label, + size = 3.2, + color = "grey20", + fill = "white", + linewidth = 0.3 + ) + } + + # -- 11. Return ------------------------------------------------------------ + if (!return_data) { + return(p) + } + + return(list( + plot = p, + auc_table = auc_table, + delong_test = delong_result, + roc_objects = roc_list + )) +} diff --git a/R/nice_forest.R b/R/nice_forest.R new file mode 100644 index 0000000..7cbd99a --- /dev/null +++ b/R/nice_forest.R @@ -0,0 +1,284 @@ +#' Draw a Forest Plot from Cox Model Results +#' +#' Creates a publication-ready forest plot from a tidy table of model results, +#' such as the output produced by \code{get_cox()}. +#' +#' @param data A \code{data.frame} containing model results. By default, it is +#' expected to contain \code{HR}, \code{CI_low}, \code{CI_high}, and +#' \code{p.value} columns. +#' @param estimate_col Character. Name of the column containing the point +#' estimate. Default: \code{"HR"}. +#' @param ci_low_col Character. Name of the lower confidence interval column. +#' Default: \code{"CI_low"}. +#' @param ci_high_col Character. Name of the upper confidence interval column. +#' Default: \code{"CI_high"}. +#' @param p_col Character. Name of the p-value column. Default: +#' \code{"p.value"}. +#' @param label_col Character or \code{NULL}. Name of a precomputed label +#' column. If \code{NULL}, labels are built automatically from +#' \code{variable_col}, \code{term_col}, and \code{reference_col}. +#' @param variable_col Character. Name of the variable column. Default: +#' \code{"variable"}. +#' @param term_col Character. Name of the cleaned term column. Default: +#' \code{"term_clean"}. +#' @param reference_col Character. Name of the reference-level column. +#' Default: \code{"reference"}. +#' @param p_display Numeric. Only rows with p-value <= \code{p_display} are +#' shown. Default: \code{1} (show all rows). +#' @param title Character. Plot title. If \code{NULL}, a default title is used. +#' @param xlab Character. X-axis label. Default: +#' \code{"Hazard Ratio (log scale)"}. +#' @param log_scale Logical. If \code{TRUE}, the x-axis is shown on a log10 +#' scale. Default: \code{TRUE}. +#' @param vline Numeric. Reference line position. Default: \code{1}. +#' @param color_sig Color for significant points (p < 0.05). Default: +#' \code{"#c0392b"}. +#' @param color_ns Color for non-significant points. Default: +#' \code{"#7f8c8d"}. +#' @param ref_line_color Color for the vertical reference line. Default: +#' \code{"#2c3e50"}. +#' @param sort_by Character. How to order the plot rows. One of +#' \code{"estimate"}, \code{"p.value"}, or \code{"input"}. +#' Default: \code{"estimate"}. +#' @param point_size Numeric. Point size. Default: \code{3.5}. +#' @param base_size Numeric. Base font size for the ggplot theme. +#' Default: \code{12}. +#' @param return_table Logical. If \code{TRUE}, returns a list with +#' \code{$plot} and \code{$table}. If \code{FALSE}, returns only the plot. +#' Default: \code{FALSE}. +#' +#' @return A \code{ggplot} object if \code{return_table = FALSE}, or a named +#' list with \code{$plot} and \code{$table} if \code{return_table = TRUE}. +#' +#' @details +#' This function is intentionally model-agnostic. It only requires a table with +#' point estimates, confidence intervals, and p-values. For Cox models generated +#' with \code{get_cox()}, the default columns work without modification. +#' +#' @importFrom ggplot2 ggplot aes geom_point geom_errorbar geom_vline scale_x_log10 scale_color_manual labs theme_minimal theme element_text element_blank +#' +#' @examples +#' \dontrun{ +#' cox_tab <- get_cox( +#' data = df, +#' time_col = "PFI.time", +#' event_col = "PFI", +#' vars = c("ER_Status_nature2012", "PAM50Call_RNAseq"), +#' model = "univariable" +#' ) +#' +#' nice_forest(cox_tab) +#' +#' nice_forest( +#' cox_tab, +#' p_display = 0.05, +#' title = "PFI — Significant Cox Terms" +#' ) +#' } +#' +#' @seealso +#' \code{\link{get_cox}} for fitting Cox proportional hazards models and +#' returning tidy results that can be passed directly to \code{nice_forest()}. +#' +#' \code{\link{nice_KM}} for Kaplan-Meier survival curve visualization. +#' +#' \code{\link[ggplot2]{ggplot}} for the underlying plotting system. +#' +#' @export +nice_forest <- function(data, + estimate_col = "HR", + ci_low_col = "CI_low", + ci_high_col = "CI_high", + p_col = "p.value", + label_col = NULL, + variable_col = "variable", + term_col = "term_clean", + reference_col = "reference", + p_display = 1, + title = NULL, + xlab = "Hazard Ratio (log scale)", + log_scale = TRUE, + vline = 1, + color_sig = "#c0392b", + color_ns = "#7f8c8d", + ref_line_color = "#2c3e50", + sort_by = c("estimate", "p.value", "input"), + point_size = 3.5, + base_size = 12, + return_table = FALSE) { + + sort_by <- match.arg(sort_by) + + # --- 1. Validations --- + if (!is.data.frame(data)) { + stop("'data' should be a data.frame.") + } + + required_cols <- c(estimate_col, ci_low_col, ci_high_col, p_col) + missing_cols <- setdiff(required_cols, colnames(data)) + + if (length(missing_cols) > 0) { + stop( + "The following required columns were not found in 'data': ", + paste(missing_cols, collapse = ", ") + ) + } + + is_single_number <- function(x) { + is.numeric(x) && length(x) == 1 && !is.na(x) + } + + if (!is_single_number(p_display) || p_display < 0 || p_display > 1) { + stop("'p_display' should be a single numeric value between 0 and 1.") + } + + if (!is_single_number(vline)) { + stop("'vline' should be a single numeric value.") + } + + # --- 2. Prepare plot table --- + plot_data <- data + + plot_data$.estimate_plot <- suppressWarnings(as.numeric(plot_data[[estimate_col]])) + plot_data$.ci_low_plot <- suppressWarnings(as.numeric(plot_data[[ci_low_col]])) + plot_data$.ci_high_plot <- suppressWarnings(as.numeric(plot_data[[ci_high_col]])) + plot_data$.p_plot <- suppressWarnings(as.numeric(plot_data[[p_col]])) + + keep <- !is.na(plot_data$.estimate_plot) & + !is.na(plot_data$.ci_low_plot) & + !is.na(plot_data$.ci_high_plot) & + !is.na(plot_data$.p_plot) & + is.finite(plot_data$.estimate_plot) & + is.finite(plot_data$.ci_low_plot) & + is.finite(plot_data$.ci_high_plot) & + plot_data$.p_plot <= p_display + + plot_data <- plot_data[keep, , drop = FALSE] + + if (nrow(plot_data) == 0) { + stop("No rows remained after filtering by finite values and 'p_display'.") + } + + if (isTRUE(log_scale)) { + positive <- plot_data$.estimate_plot > 0 & + plot_data$.ci_low_plot > 0 & + plot_data$.ci_high_plot > 0 & + vline > 0 + + if (!all(positive)) { + stop( + "All estimates, confidence intervals, and 'vline' must be > 0 ", + "when 'log_scale = TRUE'." + ) + } + } + + # --- 3. Build labels --- + if (!is.null(label_col)) { + if (!label_col %in% colnames(plot_data)) { + stop("Column '", label_col, "' not found in 'data'.") + } + + plot_data$.label_plot <- as.character(plot_data[[label_col]]) + + } else if (all(c(variable_col, term_col, reference_col) %in% colnames(plot_data))) { + plot_data$.label_plot <- paste0( + plot_data[[variable_col]], + " [ref: ", + plot_data[[reference_col]], + "]\n", + plot_data[[term_col]], + "\n", + "HR=", + round(plot_data$.estimate_plot, 2), + " [", + round(plot_data$.ci_low_plot, 2), + "-", + round(plot_data$.ci_high_plot, 2), + "]", + " p=", + formatC(plot_data$.p_plot, digits = 2, format = "e") + ) + + } else if (variable_col %in% colnames(plot_data)) { + plot_data$.label_plot <- as.character(plot_data[[variable_col]]) + + } else { + plot_data$.label_plot <- rownames(plot_data) + } + + # --- 4. Sort rows --- + if (sort_by == "estimate") { + plot_data <- plot_data[order(plot_data$.estimate_plot), , drop = FALSE] + } else if (sort_by == "p.value") { + plot_data <- plot_data[order(plot_data$.p_plot), , drop = FALSE] + } + + plot_data$.label_plot <- factor( + plot_data$.label_plot, + levels = plot_data$.label_plot + ) + + plot_data$.significance_plot <- ifelse( + plot_data$.p_plot < 0.05, + "p < 0.05", + "p >= 0.05" + ) + + # --- 5. Plot --- + if (is.null(title)) { + title <- "Forest Plot" + } + + p <- ggplot2::ggplot( + plot_data, + ggplot2::aes( + x = .data$.estimate_plot, + y = .data$.label_plot, + color = .data$.significance_plot + ) + ) + + ggplot2::geom_vline( + xintercept = vline, + linetype = "dashed", + color = ref_line_color, + linewidth = 0.8 + ) + + ggplot2::geom_errorbar( + ggplot2::aes(xmin = .data$.ci_low_plot, xmax = .data$.ci_high_plot), + orientation = "y", + width = 0.3, + linewidth = 0.6 + ) + + ggplot2::geom_point(size = point_size) + + ggplot2::scale_color_manual( + values = c("p < 0.05" = color_sig, "p >= 0.05" = color_ns), + name = "Significance" + ) + + ggplot2::labs( + title = title, + x = xlab, + y = NULL + ) + + ggplot2::theme_minimal(base_size = base_size) + + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold", size = base_size + 2), + axis.text.y = ggplot2::element_text(size = base_size - 4), + legend.position = "top", + panel.grid.major.y = ggplot2::element_blank() + ) + + if (isTRUE(log_scale)) { + p <- p + + ggplot2::scale_x_log10( + breaks = c(0.125, 0.25, 0.5, 1, 2, 4, 8, 16), + labels = c("0.125", "0.25", "0.5", "1", "2", "4", "8", "16") + ) + } + + if (return_table) { + return(list(plot = p, table = plot_data)) + } + + return(p) +} diff --git a/R/trend_filter.R b/R/trend_filter.R new file mode 100644 index 0000000..8140ba7 --- /dev/null +++ b/R/trend_filter.R @@ -0,0 +1,573 @@ +######################### +# Function trend_filter # +######################### + +#' Filter genes by patient-level trend consistency. +#' +#' This function filters differentially expressed genes according to the +#' consistency of their expression trend across paired patients. For each gene +#' \eqn{g} and patient \eqn{p}, the function calculates the mean expression in +#' baseline samples and condition samples: +#' \eqn{\bar{X}_{N}(g,p)} and \eqn{\bar{X}_{T}(g,p)}. +#' +#' For genes classified as UP-regulated at the group level, a gene is removed if +#' at least one paired patient shows: +#' \eqn{\bar{X}_{N}(g,p) >= \bar{X}_{T}(g,p) * ratio}. +#' +#' For genes classified as DOWN-regulated at the group level, a gene is removed +#' if at least one paired patient shows: +#' \eqn{\bar{X}_{T}(g,p) >= \bar{X}_{N}(g,p) * ratio}. +#' +#' The direction of regulation is defined from the group-level log2 fold-change. +#' The patient-level consistency check is performed using the expression matrix +#' supplied in `expr`. +#' +#' @param expr Numeric matrix or data frame of expression values with genes as +#' rows and sample IDs as columns. Row names must contain gene IDs. +#' @param sampledata Data frame with sample metadata. +#' @param results Data frame or named list of data frames containing +#' differential expression results. Each data frame must contain a gene ID +#' column and a log2 fold-change column. +#' @param baseline Character vector identifying the baseline group or groups in +#' `group.col`. +#' @param conditions Character vector or named list identifying the condition +#' group for each comparison in `results`. If `results` is a named list, +#' names in `conditions` should match names in `results`. +#' @param sample.col Column in `sampledata` containing sample IDs. Default is +#' `"sample_id"`. +#' @param patient.col Column in `sampledata` containing patient IDs. Default is +#' `"patient_id"`. +#' @param group.col Column in `sampledata` containing group labels. Default is +#' `"sample_type"`. +#' @param gene.col Column in `results` containing gene IDs. Default is +#' `"ensembl"`. +#' @param lfc.col Column in `results` containing log2 fold changes. Default is +#' `"log2FoldChange"`. +#' @param ratio Numeric value greater than 1. Defines the tolerated reversal +#' threshold. Default is `1.1`. +#' @param lfc.cutoff Non-negative numeric value used to define UP and DOWN +#' regulation from `lfc.col`. Default is `0`. +#' @param scale Expression scale. Use `"linear"` for normalized counts, CPM, TPM +#' or similar linear-scale values. Use `"log2"` for log2-transformed +#' expression values. Default is `"linear"`. +#' @param require_complete_pairs Logical. If `TRUE`, the function stops when +#' unpaired patients are found. If `FALSE`, only patients with both baseline +#' and condition samples are used. Default is `FALSE`. +#' @param na.rm Logical. Should missing expression values be removed when +#' calculating patient-level means? Default is `FALSE`. +#' @param return_removed Logical. Should the output include a vector of removed +#' genes? Default is `TRUE`. +#' +#' @return A named list containing: +#' \itemize{ +#' \item One filtered data frame per comparison. +#' \item `TrendGenes`: unique genes passing the trend consistency filter. +#' \item `Diagnostics`: gene-level filtering diagnostics. +#' \item `Summary`: comparison-level filtering summary. +#' \item `RemovedGenes`: unique genes removed by the filter, if +#' `return_removed = TRUE`. +#' } +#' +#' @details +#' The multiplicative rule `ratio = 1.1` is appropriate for linear-scale +#' expression values. If `expr` is log2-transformed, set `scale = "log2"`; the +#' function will use `log2(ratio)` as an additive threshold. +#' +#' @references Requena D. et al. Nat Commun 15, 10887 (2024). +#' +#' @examples +#' \dontrun{ +#' trend_res <- trend_filter( +#' expr = norm_counts, +#' sampledata = sampledata, +#' results = list(Tumor_vs_Normal = deseq_res), +#' baseline = "normal", +#' conditions = c(Tumor_vs_Normal = "tumor"), +#' sample.col = "sample_id", +#' patient.col = "patient_id", +#' group.col = "sample_type" +#' ) +#' +#' length(trend_res$TrendGenes) +#' head(trend_res$Tumor_vs_Normal) +#' head(trend_res$Diagnostics) +#' trend_res$Summary +#' } +#' +#' @seealso [detect_filter()] +#' +#' @export + +trend_filter <- function(expr, + sampledata, + results, + baseline, + conditions, + sample.col = "sample_id", + patient.col = "patient_id", + group.col = "sample_type", + gene.col = "ensembl", + lfc.col = "log2FoldChange", + ratio = 1.1, + lfc.cutoff = 0, + scale = c("linear", "log2"), + require_complete_pairs = FALSE, + na.rm = FALSE, + return_removed = TRUE) { + + scale <- match.arg(scale) + + expr <- as.data.frame(expr, check.names = FALSE) + sampledata <- as.data.frame(sampledata) + + if (is.null(rownames(expr))) { + stop("expr must have gene IDs as row names.") + } + + if (anyDuplicated(rownames(expr)) > 0) { + stop("expr must have unique gene IDs as row names.") + } + + required_cols <- c(sample.col, patient.col, group.col) + missing_cols <- setdiff(required_cols, colnames(sampledata)) + + if (length(missing_cols) > 0) { + stop( + "The following columns are missing from sampledata: ", + paste(missing_cols, collapse = ", ") + ) + } + + if (anyDuplicated(sampledata[[sample.col]]) > 0) { + stop("sampledata must contain one row per sample ID.") + } + + if (!is.character(baseline) || length(baseline) < 1) { + stop("baseline must be a character vector with at least one group label.") + } + + if (!is.numeric(ratio) || length(ratio) != 1 || is.na(ratio) || ratio <= 1) { + stop("ratio must be a single numeric value greater than 1.") + } + + if ( + !is.numeric(lfc.cutoff) || + length(lfc.cutoff) != 1 || + is.na(lfc.cutoff) || + lfc.cutoff < 0 + ) { + stop("lfc.cutoff must be a single numeric value greater than or equal to 0.") + } + + if (!is.logical(require_complete_pairs) || length(require_complete_pairs) != 1) { + stop("require_complete_pairs must be TRUE or FALSE.") + } + + if (!is.logical(na.rm) || length(na.rm) != 1) { + stop("na.rm must be TRUE or FALSE.") + } + + if (!is.logical(return_removed) || length(return_removed) != 1) { + stop("return_removed must be TRUE or FALSE.") + } + + if (inherits(results, "data.frame")) { + results <- list(Comparison1 = results) + } + + if (!is.list(results) || length(results) == 0) { + stop("results must be a data frame or a non-empty list of data frames.") + } + + valid_results <- vapply(results, inherits, logical(1), what = "data.frame") + + if (!all(valid_results)) { + stop("Every element in results must be a data frame.") + } + + if (is.null(names(results)) || any(names(results) == "")) { + names(results) <- paste0("Comparison", seq_along(results)) + } + + reserved_names <- c("TrendGenes", "Diagnostics", "Summary", "RemovedGenes") + + if (any(names(results) %in% reserved_names)) { + stop( + "The following names are reserved and cannot be used as comparison names: ", + paste(reserved_names, collapse = ", ") + ) + } + + if (is.character(conditions)) { + condition_names <- names(conditions) + conditions <- as.list(conditions) + + if (!is.null(condition_names)) { + names(conditions) <- condition_names + } + } + + if (!is.list(conditions) || length(conditions) != length(results)) { + stop("conditions must have the same length as results.") + } + + valid_conditions <- vapply( + conditions, + function(x) is.character(x) && length(x) >= 1, + logical(1) + ) + + if (!all(valid_conditions)) { + stop("Every element in conditions must be a character vector.") + } + + if (is.null(names(conditions)) || any(names(conditions) == "")) { + names(conditions) <- names(results) + } + + if (!all(names(results) %in% names(conditions))) { + stop("Names in conditions must match names in results.") + } + + conditions <- conditions[names(results)] + + sampledata[[sample.col]] <- as.character(sampledata[[sample.col]]) + sampledata[[patient.col]] <- as.character(sampledata[[patient.col]]) + sampledata[[group.col]] <- as.character(sampledata[[group.col]]) + + get_patient_means <- function(condition_label, comparison_name) { + baseline_data <- sampledata[ + sampledata[[group.col]] %in% baseline, + , + drop = FALSE + ] + + condition_data <- sampledata[ + sampledata[[group.col]] %in% condition_label, + , + drop = FALSE + ] + + if (nrow(baseline_data) == 0) { + stop("No baseline samples found for comparison: ", comparison_name) + } + + if (nrow(condition_data) == 0) { + stop("No condition samples found for comparison: ", comparison_name) + } + + baseline_patients <- unique(baseline_data[[patient.col]]) + condition_patients <- unique(condition_data[[patient.col]]) + + paired_patients <- intersect(baseline_patients, condition_patients) + all_patients <- union(baseline_patients, condition_patients) + unpaired_patients <- setdiff(all_patients, paired_patients) + + if (length(paired_patients) == 0) { + stop( + "No paired patients found between baseline and condition for comparison: ", + comparison_name + ) + } + + if (length(unpaired_patients) > 0 && require_complete_pairs) { + stop( + "Unpaired patients found in comparison ", + comparison_name, + ": ", + paste(unpaired_patients, collapse = ", ") + ) + } + + if (length(unpaired_patients) > 0 && !require_complete_pairs) { + warning( + "Using only paired patients for comparison ", + comparison_name, + ". Unpaired patients were ignored: ", + paste(unpaired_patients, collapse = ", "), + call. = FALSE + ) + } + + baseline_data <- baseline_data[ + baseline_data[[patient.col]] %in% paired_patients, + , + drop = FALSE + ] + + condition_data <- condition_data[ + condition_data[[patient.col]] %in% paired_patients, + , + drop = FALSE + ] + + used_samples <- unique(c( + baseline_data[[sample.col]], + condition_data[[sample.col]] + )) + + missing_samples <- setdiff(used_samples, colnames(expr)) + + if (length(missing_samples) > 0) { + stop( + "The following samples are missing from expr: ", + paste(missing_samples, collapse = ", ") + ) + } + + non_numeric_samples <- used_samples[ + !vapply(expr[, used_samples, drop = FALSE], is.numeric, logical(1)) + ] + + if (length(non_numeric_samples) > 0) { + stop( + "The following samples in expr are not numeric: ", + paste(non_numeric_samples, collapse = ", ") + ) + } + + mean_baseline <- vapply( + paired_patients, + function(patient) { + samples <- baseline_data[ + baseline_data[[patient.col]] == patient, + sample.col + ] + + rowMeans(expr[, samples, drop = FALSE], na.rm = na.rm) + }, + numeric(nrow(expr)) + ) + + mean_condition <- vapply( + paired_patients, + function(patient) { + samples <- condition_data[ + condition_data[[patient.col]] == patient, + sample.col + ] + + rowMeans(expr[, samples, drop = FALSE], na.rm = na.rm) + }, + numeric(nrow(expr)) + ) + + rownames(mean_baseline) <- rownames(expr) + rownames(mean_condition) <- rownames(expr) + + colnames(mean_baseline) <- paired_patients + colnames(mean_condition) <- paired_patients + + list( + baseline = mean_baseline, + condition = mean_condition, + patients = paired_patients + ) + } + + filter_one_comparison <- function(df, condition_label, comparison_name) { + df <- as.data.frame(df) + + if (!gene.col %in% colnames(df)) { + stop( + "Column '", + gene.col, + "' was not found in comparison: ", + comparison_name + ) + } + + if (!lfc.col %in% colnames(df)) { + stop( + "Column '", + lfc.col, + "' was not found in comparison: ", + comparison_name + ) + } + + genes <- as.character(df[[gene.col]]) + + lfc <- suppressWarnings(as.numeric(df[[lfc.col]])) + + invalid_lfc <- is.na(lfc) & !is.na(df[[lfc.col]]) + + if (any(invalid_lfc)) { + stop( + "Column '", + lfc.col, + "' must be numeric in comparison: ", + comparison_name + ) + } + + direction <- rep("none", length(lfc)) + direction[!is.na(lfc) & lfc > lfc.cutoff] <- "up" + direction[!is.na(lfc) & lfc < -lfc.cutoff] <- "down" + + missing_gene <- !genes %in% rownames(expr) + + diagnostics <- data.frame( + gene = genes, + comparison = comparison_name, + log2FoldChange = lfc, + direction = direction, + passed_trend = FALSE, + reason = NA_character_, + failed_patients = NA_character_, + stringsAsFactors = FALSE + ) + + diagnostics$reason[is.na(lfc)] <- "missing_lfc" + diagnostics$reason[direction == "none" & !is.na(lfc)] <- "no_direction" + diagnostics$reason[missing_gene] <- "missing_in_expr" + + evaluable <- which(!missing_gene & direction %in% c("up", "down")) + + if (length(evaluable) == 0) { + filtered_df <- df[FALSE, , drop = FALSE] + + return(list( + filtered = filtered_df, + diagnostics = diagnostics + )) + } + + patient_means <- get_patient_means(condition_label, comparison_name) + + genes_eval <- genes[evaluable] + + mean_baseline <- patient_means$baseline[genes_eval, , drop = FALSE] + mean_condition <- patient_means$condition[genes_eval, , drop = FALSE] + patients <- patient_means$patients + + missing_expression <- is.na(mean_baseline) | is.na(mean_condition) + has_missing_expression <- rowSums(missing_expression) > 0 + + if (scale == "linear") { + inconsistent_up <- mean_baseline >= mean_condition * ratio + inconsistent_down <- mean_condition >= mean_baseline * ratio + } else { + log2_ratio <- log2(ratio) + + inconsistent_up <- mean_baseline >= mean_condition + log2_ratio + inconsistent_down <- mean_condition >= mean_baseline + log2_ratio + } + + direction_eval <- direction[evaluable] + + has_inconsistent_trend <- ifelse( + direction_eval == "up", + rowSums(inconsistent_up, na.rm = TRUE) > 0, + rowSums(inconsistent_down, na.rm = TRUE) > 0 + ) + + passed <- !has_missing_expression & !has_inconsistent_trend + + reason <- ifelse( + passed, + "passed", + ifelse( + has_missing_expression, + "missing_expression", + "inconsistent_trend" + ) + ) + + failed_patients <- vapply( + seq_along(evaluable), + function(i) { + if (has_missing_expression[i]) { + patient_ids <- patients[missing_expression[i, ]] + } else if (direction_eval[i] == "up") { + patient_ids <- patients[inconsistent_up[i, ]] + } else { + patient_ids <- patients[inconsistent_down[i, ]] + } + + if (length(patient_ids) == 0) { + NA_character_ + } else { + paste(patient_ids, collapse = ";") + } + }, + character(1) + ) + + diagnostics$passed_trend[evaluable] <- passed + diagnostics$reason[evaluable] <- reason + diagnostics$failed_patients[evaluable] <- failed_patients + + filtered_df <- df[diagnostics$passed_trend, , drop = FALSE] + + list( + filtered = filtered_df, + diagnostics = diagnostics + ) + } + + filtered_results <- list() + diagnostics_list <- list() + + for (comparison_name in names(results)) { + filtered <- filter_one_comparison( + df = results[[comparison_name]], + condition_label = conditions[[comparison_name]], + comparison_name = comparison_name + ) + + filtered_results[[comparison_name]] <- filtered$filtered + diagnostics_list[[comparison_name]] <- filtered$diagnostics + } + + all_diagnostics <- do.call(rbind, diagnostics_list) + rownames(all_diagnostics) <- NULL + + trend_genes <- unique(unlist( + lapply( + filtered_results, + function(x) as.character(x[[gene.col]]) + ), + use.names = FALSE + )) + + trend_genes <- as.character(trend_genes) + + summary_table <- do.call( + rbind, + lapply( + split(all_diagnostics, all_diagnostics$comparison), + function(x) { + data.frame( + comparison = x$comparison[1], + total_genes = nrow(x), + kept_genes = sum(x$passed_trend, na.rm = TRUE), + removed_genes = sum(!x$passed_trend, na.rm = TRUE), + up_genes = sum(x$direction == "up", na.rm = TRUE), + down_genes = sum(x$direction == "down", na.rm = TRUE), + no_direction = sum(x$reason == "no_direction", na.rm = TRUE), + missing_lfc = sum(x$reason == "missing_lfc", na.rm = TRUE), + missing_in_expr = sum(x$reason == "missing_in_expr", na.rm = TRUE), + inconsistent_trend = sum(x$reason == "inconsistent_trend", na.rm = TRUE), + missing_expression = sum(x$reason == "missing_expression", na.rm = TRUE), + stringsAsFactors = FALSE + ) + } + ) + ) + + rownames(summary_table) <- NULL + + output <- filtered_results + output$TrendGenes <- trend_genes + output$Diagnostics <- all_diagnostics + output$Summary <- summary_table + + if (return_removed) { + output$RemovedGenes <- unique(as.character( + all_diagnostics$gene[!all_diagnostics$passed_trend] + )) + } + + return(output) +} diff --git a/man/concordanceDE.Rd b/man/concordanceDE.Rd new file mode 100644 index 0000000..8c1ddae --- /dev/null +++ b/man/concordanceDE.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/concordanceDE.R +\name{concordanceDE} +\alias{concordanceDE} +\title{Classify differential results by cross-layer concordance.} +\usage{ +concordanceDE( + de_x, + de_y, + gene_col = "gene", + logfc_col = "logFC", + padj_col = "padj", + padj_threshold = 0.05, + logfc_threshold = 1 +) +} +\arguments{ +\item{de_x}{Data frame for layer X. Must contain columns defined by +\code{gene_col}, \code{logfc_col}, and \code{padj_col}.} + +\item{de_y}{Data frame for layer Y. Must contain columns defined by +\code{gene_col}, \code{logfc_col}, and \code{padj_col}.} + +\item{gene_col}{Column name containing gene identifiers. Default: \code{"gene"}.} + +\item{logfc_col}{Column name containing log-fold changes. Default: \code{"logFC"}.} + +\item{padj_col}{Column name containing adjusted p-values/FDR values. +Default: \code{"padj"}.} + +\item{padj_threshold}{Numeric threshold for adjusted p-values. Use a single +value for both layers or a length-two vector for layer X and layer Y, +respectively. Default: \code{0.05}.} + +\item{logfc_threshold}{Numeric absolute log-fold-change threshold. Use a +single value for both layers or a length-two vector for layer X and layer Y, +respectively. Default: \code{1}.} +} +\value{ +A list with: +\itemize{ +\item \code{table}: merged data frame with one \code{category} column. +\item \code{concordance_score}: fraction of genes significant in both layers +with matching log-fold-change direction. +\item \code{summary}: table with counts per concordance category. +\item \code{n_shared_genes}: number of genes shared by both layers. +\item \code{n_both_significant}: number of genes significant in both layers. +} +} +\description{ +Compares two differential-analysis tables from paired omics layers, such as +RNA-seq and total-protein RPPA, by merging shared genes and classifying each +gene according to statistical significance and log-fold-change direction. +} +\details{ +Genes are assigned to one of five categories: \code{concordant}, \code{discordant}, +\code{only_x}, \code{only_y}, or \code{not_significant}. A concordance score is also +computed as the fraction of genes significant in both layers that have the +same effect direction. +} +\examples{ +de_rna <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), + logFC = c(3.2, 2.1, -1.6, -1.4, 1.8), + padj = c(1e-6, 0.002, 0.01, 0.03, 0.04) +) + +de_protein <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), + logFC = c(0.7, 0.4, -0.5, 0.3, 0.05), + padj = c(1e-4, 0.01, 0.02, 0.04, 0.40) +) + +res <- concordanceDE( + de_x = de_rna, + de_y = de_protein, + logfc_threshold = c(1, 0.2) +) +res$summary +res$concordance_score + +} +\seealso{ +\code{\link[=nice_ConcordanceScatter]{nice_ConcordanceScatter()}} to visualize the output of \code{concordanceDE()}; +\code{\link[=crossLayerCorr]{crossLayerCorr()}} to estimate sample-level concordance between two omics +layers. +} diff --git a/man/crossLayerCorr.Rd b/man/crossLayerCorr.Rd new file mode 100644 index 0000000..0e1a2f6 --- /dev/null +++ b/man/crossLayerCorr.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/crossLayerCorr.R +\name{crossLayerCorr} +\alias{crossLayerCorr} +\title{Correlate paired sample profiles between two omics layers.} +\usage{ +crossLayerCorr( + mat_x, + mat_y, + method = c("spearman", "pearson"), + top_n = 500, + plot = TRUE +) +} +\arguments{ +\item{mat_x}{Numeric matrix for layer X with features as rows and samples as +columns. Row names and column names are required.} + +\item{mat_y}{Numeric matrix for layer Y with features as rows and samples as +columns. Row names and column names are required.} + +\item{method}{Correlation method. One of \code{"spearman"} or \code{"pearson"}. +Default: \code{"spearman"}.} + +\item{top_n}{Number of most variable shared features to use. If \code{NULL}, all +shared features are used. Default: \code{500}.} + +\item{plot}{Logical; if \code{TRUE}, returns a \code{ggplot2} bar plot in the \code{plot} +element. Default: \code{TRUE}.} +} +\value{ +A list with: +\itemize{ +\item \code{correlations}: data frame with \code{sample} and \code{correlation} columns. +\item \code{median_r}: median sample-level correlation. +\item \code{n_shared_samples}: number of shared samples used. +\item \code{n_shared_features}: number of shared features before top-variable +filtering. +\item \code{n_features_used}: number of features used for correlation. +\item \code{plot}: \code{ggplot2} object or \code{NULL}. +} +} +\description{ +Computes one correlation per shared sample between two normalized matrices. +Matrices are first matched by shared sample names and shared feature names, +making the function suitable for RNA-vs-protein comparisons after mapping +protein features to gene symbols. +} +\details{ +By default, the function uses the top variable shared features to reduce noise +and returns both a sorted correlation table and an optional bar plot. +} +\examples{ +set.seed(1) +mat_rna <- matrix(rnorm(60), nrow = 10) +mat_protein <- mat_rna + matrix(rnorm(60, sd = 0.4), nrow = 10) +rownames(mat_rna) <- rownames(mat_protein) <- paste0("GENE", seq_len(10)) +colnames(mat_rna) <- colnames(mat_protein) <- paste0("S", seq_len(6)) + +res <- crossLayerCorr(mat_rna, mat_protein, top_n = 8, plot = FALSE) +head(res$correlations) +res$median_r + +} +\seealso{ +\code{\link[=concordanceDE]{concordanceDE()}} to classify genes by cross-layer differential concordance; +\code{\link[=nice_ConcordanceScatter]{nice_ConcordanceScatter()}} to visualize RNA-protein differential +concordance. +} diff --git a/man/detect_filter.Rd b/man/detect_filter.Rd index a83d36b..72b000b 100644 --- a/man/detect_filter.Rd +++ b/man/detect_filter.Rd @@ -85,4 +85,5 @@ head(detected$Comparison1) \seealso{ \code{\link[=nice_VSB]{nice_VSB()}} to plot expression of detected genes; \link{norm_counts} for an example normalized counts matrix. +\code{\link[=trend_filter]{trend_filter()}} to find genes with significant expression trends across conditions. } diff --git a/man/get_cox.Rd b/man/get_cox.Rd new file mode 100644 index 0000000..6b538b7 --- /dev/null +++ b/man/get_cox.Rd @@ -0,0 +1,162 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_cox.R +\name{get_cox} +\alias{get_cox} +\title{Fit Cox Proportional Hazards Models and Return a Tidy Table} +\usage{ +get_cox( + data, + time_col = "PFI.time", + event_col = "PFI", + vars = NULL, + model = c("univariable", "multivariable", "adjusted"), + adjust_vars = NULL, + keep_adjust_terms = FALSE, + ref_levels = NULL, + min_n = 10, + min_n_per_level = 5, + min_events = 5, + min_events_per_level = 5, + exclude_survival_like = TRUE, + verbose = TRUE +) +} +\arguments{ +\item{data}{A \code{data.frame} containing the survival columns +(\code{time_col}, \code{event_col}) and the variables to test.} + +\item{time_col}{Character. Name of the column with follow-up time. +The column should be numeric or coercible to numeric. Default: +\code{"PFI.time"}.} + +\item{event_col}{Character. Name of the event indicator column. +It should be coded as 0 = censored and 1 = event, or be coercible +to numeric 0/1. Default: \code{"PFI"}.} + +\item{vars}{Character vector. Variables of interest to test. If +\code{NULL}, variables are selected automatically after excluding +survival columns, ID-like columns, columns starting with \code{"_"}, +and optionally survival/outcome-like columns.} + +\item{model}{Character. Type of Cox model to fit. One of +\code{"univariable"}, \code{"multivariable"}, or \code{"adjusted"}. +\code{"univariable"} fits one model per variable. \code{"multivariable"} +fits one model containing all variables in \code{vars}. \code{"adjusted"} +fits one model per variable of interest, adjusted by \code{adjust_vars}.} + +\item{adjust_vars}{Character vector. Covariates used when +\code{model = "adjusted"}. Ignored for \code{"univariable"} and +\code{"multivariable"} models. Default: \code{NULL}.} + +\item{keep_adjust_terms}{Logical. If \code{TRUE} and +\code{model = "adjusted"}, adjustment covariate terms are kept in the +returned table. If \code{FALSE}, only terms from the variable of interest +are returned. Default: \code{FALSE}.} + +\item{ref_levels}{A named list to explicitly set reference levels for +categorical variables. Names must be column names and values must be the +desired reference categories. Example: +\code{list(ER_Status_nature2012 = "Negative", + AJCC_Stage_nature2012 = "Stage I")}.} + +\item{min_n}{Integer. Minimum total number of complete observations required +for each Cox model. Default: \code{10}.} + +\item{min_n_per_level}{Integer. For categorical predictors, minimum number +of complete observations required in each category. Default: \code{5}. +Set to \code{0} to disable.} + +\item{min_events}{Integer. Minimum total number of events required for each +Cox model. Default: \code{5}. Set to \code{0} to disable.} + +\item{min_events_per_level}{Integer. For categorical predictors, minimum +number of events required in each category. This is the main safeguard +against sparse categories and infinite Cox coefficients. Default: +\code{5}. Set to \code{0} to disable.} + +\item{exclude_survival_like}{Logical. If \code{TRUE} and \code{vars = NULL}, +automatic variable selection excludes columns whose names suggest survival +endpoints or outcome leakage, such as \code{OS}, \code{DSS}, \code{DFI}, +\code{PFI}, \code{days_to}, \code{death}, \code{vital_status}, +\code{followup}, or \code{survival}. Default: \code{TRUE}.} + +\item{verbose}{Logical. If \code{TRUE}, prints informative messages when +models or variables are skipped. Default: \code{TRUE}.} +} +\value{ +A \code{data.frame} with one row per model term and columns including +\code{model}, \code{model_id}, \code{variable}, \code{term}, +\code{term_clean}, \code{reference}, \code{HR}, \code{CI_low}, +\code{CI_high}, \code{p.value}, \code{n_used}, \code{n_events}, and +\code{adjusted_for}. +} +\description{ +Fits Cox proportional hazards models against a selected survival endpoint and +returns a tidy table with hazard ratios, confidence intervals, p-values, model +metadata, and labels ready to be plotted with \code{nice_forest()}. +} +\details{ +The function performs several checks before fitting each model: +\enumerate{ +\item Removes empty strings and incomplete rows. +\item Converts character and logical predictors to factors. +\item Drops unused factor levels. +\item Applies explicit reference levels via \code{ref_levels}. +\item Skips sparse categorical variables using \code{min_n_per_level} and +\code{min_events_per_level}. +\item Skips models with possible perfect separation or infinite +coefficients. +\item Returns exponentiated Cox coefficients as hazard ratios. +} + +In this function, \code{"multivariable"} means a Cox model with multiple +predictors for one survival endpoint. This is different from +\code{"multivariate"}, which usually refers to multiple outcomes. +} +\examples{ +\dontrun{ +# Univariable Cox models +cox_uni <- get_cox( + data = df, + time_col = "PFI.time", + event_col = "PFI", + vars = c("ER_Status_nature2012", "PAM50Call_RNAseq"), + model = "univariable" +) + +# Multivariable Cox model +cox_multi <- get_cox( + data = df, + time_col = "PFI.time", + event_col = "PFI", + vars = c("age_at_initial_pathologic_diagnosis", + "AJCC_Stage_nature2012", + "PAM50Call_RNAseq"), + model = "multivariable" +) + +# Adjusted Cox models +cox_adj <- get_cox( + data = df, + time_col = "PFI.time", + event_col = "PFI", + vars = c("ER_Status_nature2012", "HER2_Final_Status_nature2012"), + adjust_vars = c("age_at_initial_pathologic_diagnosis", + "AJCC_Stage_nature2012"), + model = "adjusted" +) + +nice_forest(cox_adj) +} +} +\seealso{ +\code{\link{nice_forest}} for plotting the tidy Cox model results returned +by \code{get_cox()}. + +\code{\link{nice_KM}} for Kaplan-Meier survival curve visualization. + +\code{\link[survival]{coxph}} and \code{\link[survival]{Surv}} for the +underlying Cox proportional hazards model and survival object. + +\code{\link[broom]{tidy}} for tidying model outputs. +} diff --git a/man/get_glm.Rd b/man/get_glm.Rd new file mode 100644 index 0000000..2348382 --- /dev/null +++ b/man/get_glm.Rd @@ -0,0 +1,179 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_glm.R +\name{get_glm} +\alias{get_glm} +\title{Function to Fit a GLM and Return Tidy Results with Effect Sizes and FDR Correction} +\usage{ +get_glm( + data, + outcome, + predictors, + family = "binomial", + adjust_method = "BH", + conf_level = 0.95, + exponentiate = NULL, + remove_intercept = TRUE, + verbose = FALSE +) +} +\arguments{ +\item{data}{A \code{data.frame} or \code{tibble} containing all outcome and +predictor variables.} + +\item{outcome}{Character string of length 1. Name of the outcome column. +For logistic regression this is typically coded as \code{0}/\code{1}, +although other binomial formats accepted by \code{\link[stats]{glm}} can +also be used.} + +\item{predictors}{Character vector. Names of the predictor columns to include. +Categorical variables should be \code{factor} or \code{character}; they are +handled by \code{\link[stats]{glm}} contrast coding automatically.} + +\item{family}{Character string, family function, or \code{\link[stats]{family}} +object passed to \code{\link[stats]{glm}}. It specifies both the assumed +distribution of the outcome and the link function used in the GLM. +Common options include: +\itemize{ +\item \code{"binomial"} or \code{binomial(link = "logit")} for binary, +proportion, or success/failure outcomes; this corresponds to logistic +regression when the link is \code{"logit"}. +\item \code{"gaussian"} or \code{gaussian(link = "identity")} for +approximately normally distributed continuous outcomes. +\item \code{"poisson"} or \code{poisson(link = "log")} for count or rate +outcomes. +\item \code{"quasibinomial"} and \code{"quasipoisson"} for binomial or +Poisson-type outcomes with overdispersion. +\item \code{Gamma(link = "log")} for positive, right-skewed continuous +outcomes. +\item \code{inverse.gaussian()} for positive continuous outcomes with +variance increasing strongly with the mean. +} +Custom family objects can also be supplied. Default: \code{"binomial"}.} + +\item{adjust_method}{Character string. Method for p-value adjustment passed +to \code{\link[stats]{p.adjust}}. Options: \code{"BH"}, +\code{"bonferroni"}, \code{"holm"}, \code{"BY"}, \code{"fdr"}, +\code{"none"}. Default: \code{"BH"}.} + +\item{conf_level}{Numeric value in \code{(0, 1)}. Confidence level for the +interval. Default: \code{0.95}.} + +\item{exponentiate}{Logical or \code{NULL}. If \code{TRUE}, coefficients and +confidence intervals are exponentiated. If \code{NULL}, exponentiation is +applied automatically when the model link is \code{"logit"} or \code{"log"}. +This gives odds ratios for logistic models with logit link, rate ratios for +Poisson-type models with log link, and multiplicative effects for other +log-link models. Default: \code{NULL}.} + +\item{remove_intercept}{Logical. Whether to drop the \code{(Intercept)} row +from the returned table. Default: \code{TRUE}.} + +\item{verbose}{Logical. If \code{TRUE}, prints the model summary to console. +Default: \code{FALSE}.} +} +\value{ +A \code{tibble} of class +\code{c("get_glm_result", "tbl_df", "tbl", "data.frame")} with the following +columns: +\describe{ +\item{\code{term}}{Predictor name; for factors, includes the level +according to the contrast coding used by \code{glm}.} +\item{\code{estimate}}{Exponentiated coefficient if +\code{exponentiate = TRUE}; otherwise the raw coefficient on the +model linear predictor scale.} +\item{\code{ci_lower}}{Lower bound of the confidence interval on the same +scale as \code{estimate}.} +\item{\code{ci_upper}}{Upper bound of the confidence interval on the same +scale as \code{estimate}.} +\item{\code{std_error}}{Standard error of the coefficient on the linear +predictor scale.} +\item{\code{statistic}}{Wald test statistic. For fixed-dispersion families +such as binomial and Poisson this is typically a z-statistic; for +families with estimated dispersion it may be a t-statistic.} +\item{\code{p_value}}{Two-sided p-value from the Wald test.} +\item{\code{q_value}}{P-value adjusted for multiple comparisons using +the method specified in \code{adjust_method}.} +\item{\code{significance}}{Star annotation based on raw p-value: +\code{"***"} p<0.001, \code{"**"} p<0.01, \code{"*"} p<0.05, +\code{"."} p<0.10, \code{" "} otherwise.} +} + +The following attributes are attached to the returned object: +\describe{ +\item{\code{model}}{The fitted \code{glm} object.} +\item{\code{formula}}{Character string of the model formula.} +\item{\code{family}}{Family name used.} +\item{\code{link}}{Link function used.} +\item{\code{n_obs}}{Number of observations used in model fitting.} +\item{\code{AIC}}{Akaike Information Criterion. May be \code{NA} for +quasi-likelihood families.} +\item{\code{exponentiate}}{Logical indicating whether coefficients were +exponentiated.} +\item{\code{adjust_method}}{P-value adjustment method used.} +} +} +\description{ +Fits a Generalized Linear Model (GLM) for binary, continuous, count, +proportion, rate, or positive skewed outcomes, depending on the specified +\code{family}. The function returns a tidy \code{tibble} with coefficients, +optional exponentiated effect sizes, 95\% confidence intervals, Wald test +statistics, raw p-values, and multiple-testing-adjusted q-values. + +The fitted \code{glm} object is attached as an attribute so that it can be +passed directly to downstream functions such as \code{\link{nice_ROC}} +without re-fitting. +} +\details{ +Q-values are computed across all reported terms after removing the intercept +if \code{remove_intercept = TRUE}. Therefore, the multiple-testing correction +pool matches the number of hypotheses shown in the returned table. + +Automatic exponentiation is based on the link function. Models with +\code{logit} link are reported as odds ratios; models with \code{log} link +are reported as multiplicative effects such as rate ratios or mean ratios. +Models with identity, probit, cloglog, inverse, or other links are not +exponentiated automatically. +} +\examples{ +\dontrun{ +# Binary outcome: logistic regression +res_logit <- get_glm( + data = train_data, + outcome = "stage_advanced", + predictors = c("age", "ER", "PR", "HER2", "histology", "menopause"), + family = "binomial" +) + +# Continuous outcome: linear model through glm +res_gaussian <- get_glm( + data = train_data, + outcome = "tumor_size", + predictors = c("age", "ER", "PR", "HER2"), + family = "gaussian" +) + +# Count outcome: Poisson regression +res_pois <- get_glm( + data = train_data, + outcome = "n_mutations", + predictors = c("age", "stage", "histology"), + family = "poisson" +) + +# Positive skewed continuous outcome +res_gamma <- get_glm( + data = train_data, + outcome = "cost", + predictors = c("age", "stage", "treatment"), + family = Gamma(link = "log") +) + +# Retrieve fitted model +fit <- attr(res_logit, "model") +} + +} +\seealso{ +\code{\link{nice_ROC}} for ROC curve visualisation of logistic +models produced by \code{get_glm}. +} diff --git a/man/nice_ConcordanceScatter.Rd b/man/nice_ConcordanceScatter.Rd new file mode 100644 index 0000000..c23efe5 --- /dev/null +++ b/man/nice_ConcordanceScatter.Rd @@ -0,0 +1,95 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nice_ConcordanceScatter.R +\name{nice_ConcordanceScatter} +\alias{nice_ConcordanceScatter} +\title{Plot cross-layer log-fold-change concordance.} +\usage{ +nice_ConcordanceScatter( + concordance_result, + x_label = "log2FC (RNA)", + y_label = "log2FC (Protein)", + genes_label = NULL, + method = c("pearson", "spearman"), + ci_level = 0.95, + point_size = 1.5, + alpha = 0.65, + gene_col = NULL, + logfc_col = "logFC", + category_colors = NULL +) +} +\arguments{ +\item{concordance_result}{Output from \code{\link[=concordanceDE]{concordanceDE()}}. A data frame with a +\code{category} column can also be supplied through \code{concordance_result$table}.} + +\item{x_label}{X-axis label. Default: \code{"log2FC (RNA)"}.} + +\item{y_label}{Y-axis label. Default: \code{"log2FC (Protein)"}.} + +\item{genes_label}{Optional character vector of genes to label with +\code{ggrepel}.} + +\item{method}{Correlation method used by \code{ggpubr::stat_cor()}. One of +\code{"pearson"} or \code{"spearman"}. Default: \code{"pearson"}.} + +\item{ci_level}{Confidence level for the linear-model confidence interval. +Default: \code{0.95}.} + +\item{point_size}{Point size. Default: \code{1.5}.} + +\item{alpha}{Point transparency. Default: \code{0.65}.} + +\item{gene_col}{Optional gene column name. If \code{NULL}, the function uses the +first column containing \code{"gene"}, ignoring case.} + +\item{logfc_col}{Base log-fold-change column name used in \code{\link[=concordanceDE]{concordanceDE()}}. +Default: \code{"logFC"}, which expects columns \code{logFC_x} and \code{logFC_y}.} + +\item{category_colors}{Optional named character vector with colors for +\code{concordant}, \code{discordant}, \code{only_x}, \code{only_y}, and \code{not_significant}.} +} +\value{ +A \code{ggplot2} object. +} +\description{ +Builds a scatter plot comparing log-fold changes from two omics layers after +classification with \code{\link[=concordanceDE]{concordanceDE()}}. Points are colored by concordance +category, reference lines divide the plot into directional quadrants, and an +optional regression line is drawn for concordant genes. +} +\examples{ +de_rna <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), + logFC = c(3.2, 2.1, -1.6, -1.4, 1.8), + padj = c(1e-6, 0.002, 0.01, 0.03, 0.04) +) + +de_protein <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "MKI67", "GATA3"), + logFC = c(0.7, 0.4, -0.5, 0.3, 0.05), + padj = c(1e-4, 0.01, 0.02, 0.04, 0.40) +) + +res <- concordanceDE( + de_x = de_rna, + de_y = de_protein, + logfc_threshold = c(1, 0.2) +) + +if ( + requireNamespace("ggpubr", quietly = TRUE) && + requireNamespace("ggrepel", quietly = TRUE) +) { + nice_ConcordanceScatter( + res, + genes_label = c("ESR1", "EGFR"), + method = "spearman" + ) +} + +} +\seealso{ +\code{\link[=concordanceDE]{concordanceDE()}} to classify genes by cross-layer differential concordance; +\code{\link[=crossLayerCorr]{crossLayerCorr()}} to estimate sample-level concordance between two omics +layers. +} diff --git a/man/nice_ROC.Rd b/man/nice_ROC.Rd new file mode 100644 index 0000000..a8527f2 --- /dev/null +++ b/man/nice_ROC.Rd @@ -0,0 +1,154 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nice_ROC.R +\name{nice_ROC} +\alias{nice_ROC} +\title{Plot and Compare ROC Curves for Binary Classification Models} +\usage{ +nice_ROC( + models, + data = NULL, + outcome, + colors = c("#E63946", "#457B9D", "#2A9D8F", "#E9C46A"), + show_ci = TRUE, + show_delong = TRUE, + smooth = FALSE, + direction = "auto", + plot_title = "ROC Curve Comparison", + plot_subtitle = NULL, + theme_fn = ggplot2::theme_classic, + return_data = FALSE +) +} +\arguments{ +\item{models}{A \emph{named} list. Each element is one of: +\itemize{ +\item A fitted \code{glm} object (predictions generated internally via +\code{predict(model, newdata = data, type = "response")}). +\item A numeric vector of predicted probabilities of length +\code{nrow(data)} (values in [0, 1]). +} +Names are used as legend labels (e.g., +\code{list("Clinical" = fit_A, "Clinical + Omics" = fit_B)}).} + +\item{data}{A \code{data.frame} or \code{tibble}. Test / evaluation dataset. Required +when any element of \code{models} is a \code{glm} object.} + +\item{outcome}{Character (length 1). Name of the binary outcome column in \code{data} +(must be \code{0}/\code{1} integer or numeric; \code{1} = event / positive +class).} + +\item{colors}{Character vector of colour codes for the ROC curves. Recycled if shorter +than the number of models. +Default: a four-colour colorblind-friendly palette +(\code{c("#E63946", "#457B9D", "#2A9D8F", "#E9C46A")}).} + +\item{show_ci}{Logical. Whether to display the DeLong 95\% CI for each AUC in the legend. +Default: \code{TRUE}.} + +\item{show_delong}{Logical. When exactly two models are provided, annotate the plot with the +DeLong test p-value and delta AUC. Default: \code{TRUE}.} + +\item{smooth}{Logical. Apply kernel smoothing to the ROC curves +(\code{pROC::smooth}). Useful for small samples; may hide real variability +on large samples. Default: \code{FALSE}.} + +\item{direction}{Character. Direction for \code{pROC::roc}. Usually \code{"auto"} +(default). Set to \code{"<"} or \code{">"} if you need to enforce a +direction.} + +\item{plot_title}{Character string. Main title of the figure. +Default: \code{"ROC Curve Comparison"}.} + +\item{plot_subtitle}{Character string or \code{NULL}. Subtitle. Default: \code{NULL}.} + +\item{theme_fn}{A \code{ggplot2} theme function (without parentheses). Applied to the +plot. Default: \code{ggplot2::theme_classic}.} + +\item{return_data}{Logical. If \code{TRUE}, returns a named list containing the ggplot object +\emph{and} the underlying data/statistics. If \code{FALSE} (default), only +the \code{ggplot} object is returned.} +} +\value{ +When \code{return_data = FALSE} (default): a \code{ggplot2} object. + +When \code{return_data = TRUE}: a named list with elements: +\describe{ +\item{\code{plot}}{The \code{ggplot2} ROC figure.} +\item{\code{auc_table}}{A \code{tibble} with one row per model containing: +\code{model}, \code{auc}, \code{ci_lower}, \code{ci_upper}, +\code{n_cases}, \code{n_controls}.} +\item{\code{delong_test}}{Result of \code{pROC::roc.test} (DeLong) if +exactly two models were supplied; otherwise \code{NULL}.} +\item{\code{roc_objects}}{Named list of \code{pROC::roc} objects for +further downstream analysis.} +} +} +\description{ +Generates publication-ready ROC curves for one or more binary classification +models evaluated on a held-out dataset. For each model the function computes +AUC with a 95\% CI using the DeLong method. When exactly two models are +supplied, a DeLong paired test is performed to compare their AUCs and the +result is annotated on the plot. + +Models can be supplied either as fitted \code{glm} objects or as named +numeric vectors of predicted probabilities, making the function flexible +for any classifier that outputs scores. +} +\details{ +\subsection{AUC confidence intervals}{ + +CIs are computed with the DeLong method via \code{pROC::ci.auc} which +accounts for the correlation between paired measurements (same subjects). +} + +\subsection{DeLong test}{ + +When two models are compared on the \emph{same} test set the observations +are paired, so the DeLong paired test is used +(\code{pROC::roc.test(method = "delong", paired = TRUE)}). +} + +\subsection{Diagonal reference line}{ + +The grey dashed diagonal represents random performance (AUC = 0.50). +} +} +\examples{ +\dontrun{ +# -- Passing glm objects (predictions generated internally) ----------------- +roc_result <- nice_ROC( + models = list("Clinical only" = fit_A, + "Clinical + Omics" = fit_B), + data = test_data, + outcome = "stage_advanced", + return_data = TRUE +) + +roc_result$plot +roc_result$auc_table +roc_result$delong_test + +# -- Passing probability vectors directly ----------------------------------- +prob_A <- predict(fit_A, newdata = test_data, type = "response") +prob_B <- predict(fit_B, newdata = test_data, type = "response") + +nice_ROC( + models = list("Clinical" = prob_A, "Clinical + Omics" = prob_B), + data = test_data, + outcome = "stage_advanced" +) + +# -- Single model (no comparison) ------------------------------------------- +nice_ROC( + models = list("Clinical only" = fit_A), + data = test_data, + outcome = "stage_advanced", + show_delong = FALSE +) +} + +} +\seealso{ +\code{\link{get_glm}} for fitting the models fed into +\code{nice_ROC}. +} diff --git a/man/nice_forest.Rd b/man/nice_forest.Rd new file mode 100644 index 0000000..3f410b5 --- /dev/null +++ b/man/nice_forest.Rd @@ -0,0 +1,136 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nice_forest.R +\name{nice_forest} +\alias{nice_forest} +\title{Draw a Forest Plot from Cox Model Results} +\usage{ +nice_forest( + data, + estimate_col = "HR", + ci_low_col = "CI_low", + ci_high_col = "CI_high", + p_col = "p.value", + label_col = NULL, + variable_col = "variable", + term_col = "term_clean", + reference_col = "reference", + p_display = 1, + title = NULL, + xlab = "Hazard Ratio (log scale)", + log_scale = TRUE, + vline = 1, + color_sig = "#c0392b", + color_ns = "#7f8c8d", + ref_line_color = "#2c3e50", + sort_by = c("estimate", "p.value", "input"), + point_size = 3.5, + base_size = 12, + return_table = FALSE +) +} +\arguments{ +\item{data}{A \code{data.frame} containing model results. By default, it is +expected to contain \code{HR}, \code{CI_low}, \code{CI_high}, and +\code{p.value} columns.} + +\item{estimate_col}{Character. Name of the column containing the point +estimate. Default: \code{"HR"}.} + +\item{ci_low_col}{Character. Name of the lower confidence interval column. +Default: \code{"CI_low"}.} + +\item{ci_high_col}{Character. Name of the upper confidence interval column. +Default: \code{"CI_high"}.} + +\item{p_col}{Character. Name of the p-value column. Default: +\code{"p.value"}.} + +\item{label_col}{Character or \code{NULL}. Name of a precomputed label +column. If \code{NULL}, labels are built automatically from +\code{variable_col}, \code{term_col}, and \code{reference_col}.} + +\item{variable_col}{Character. Name of the variable column. Default: +\code{"variable"}.} + +\item{term_col}{Character. Name of the cleaned term column. Default: +\code{"term_clean"}.} + +\item{reference_col}{Character. Name of the reference-level column. +Default: \code{"reference"}.} + +\item{p_display}{Numeric. Only rows with p-value <= \code{p_display} are +shown. Default: \code{1} (show all rows).} + +\item{title}{Character. Plot title. If \code{NULL}, a default title is used.} + +\item{xlab}{Character. X-axis label. Default: +\code{"Hazard Ratio (log scale)"}.} + +\item{log_scale}{Logical. If \code{TRUE}, the x-axis is shown on a log10 +scale. Default: \code{TRUE}.} + +\item{vline}{Numeric. Reference line position. Default: \code{1}.} + +\item{color_sig}{Color for significant points (p < 0.05). Default: +\code{"#c0392b"}.} + +\item{color_ns}{Color for non-significant points. Default: +\code{"#7f8c8d"}.} + +\item{ref_line_color}{Color for the vertical reference line. Default: +\code{"#2c3e50"}.} + +\item{sort_by}{Character. How to order the plot rows. One of +\code{"estimate"}, \code{"p.value"}, or \code{"input"}. +Default: \code{"estimate"}.} + +\item{point_size}{Numeric. Point size. Default: \code{3.5}.} + +\item{base_size}{Numeric. Base font size for the ggplot theme. +Default: \code{12}.} + +\item{return_table}{Logical. If \code{TRUE}, returns a list with +\code{$plot} and \code{$table}. If \code{FALSE}, returns only the plot. +Default: \code{FALSE}.} +} +\value{ +A \code{ggplot} object if \code{return_table = FALSE}, or a named +list with \code{$plot} and \code{$table} if \code{return_table = TRUE}. +} +\description{ +Creates a publication-ready forest plot from a tidy table of model results, +such as the output produced by \code{get_cox()}. +} +\details{ +This function is intentionally model-agnostic. It only requires a table with +point estimates, confidence intervals, and p-values. For Cox models generated +with \code{get_cox()}, the default columns work without modification. +} +\examples{ +\dontrun{ +cox_tab <- get_cox( + data = df, + time_col = "PFI.time", + event_col = "PFI", + vars = c("ER_Status_nature2012", "PAM50Call_RNAseq"), + model = "univariable" +) + +nice_forest(cox_tab) + +nice_forest( + cox_tab, + p_display = 0.05, + title = "PFI — Significant Cox Terms" +) +} + +} +\seealso{ +\code{\link{get_cox}} for fitting Cox proportional hazards models and +returning tidy results that can be passed directly to \code{nice_forest()}. + +\code{\link{nice_KM}} for Kaplan-Meier survival curve visualization. + +\code{\link[ggplot2]{ggplot}} for the underlying plotting system. +} diff --git a/man/trend_filter.Rd b/man/trend_filter.Rd new file mode 100644 index 0000000..4326813 --- /dev/null +++ b/man/trend_filter.Rd @@ -0,0 +1,138 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/trend_filter.R +\name{trend_filter} +\alias{trend_filter} +\title{Filter genes by patient-level trend consistency.} +\usage{ +trend_filter( + expr, + sampledata, + results, + baseline, + conditions, + sample.col = "sample_id", + patient.col = "patient_id", + group.col = "sample_type", + gene.col = "ensembl", + lfc.col = "log2FoldChange", + ratio = 1.1, + lfc.cutoff = 0, + scale = c("linear", "log2"), + require_complete_pairs = FALSE, + na.rm = FALSE, + return_removed = TRUE +) +} +\arguments{ +\item{expr}{Numeric matrix or data frame of expression values with genes as +rows and sample IDs as columns. Row names must contain gene IDs.} + +\item{sampledata}{Data frame with sample metadata.} + +\item{results}{Data frame or named list of data frames containing +differential expression results. Each data frame must contain a gene ID +column and a log2 fold-change column.} + +\item{baseline}{Character vector identifying the baseline group or groups in +\code{group.col}.} + +\item{conditions}{Character vector or named list identifying the condition +group for each comparison in \code{results}. If \code{results} is a named list, +names in \code{conditions} should match names in \code{results}.} + +\item{sample.col}{Column in \code{sampledata} containing sample IDs. Default is +\code{"sample_id"}.} + +\item{patient.col}{Column in \code{sampledata} containing patient IDs. Default is +\code{"patient_id"}.} + +\item{group.col}{Column in \code{sampledata} containing group labels. Default is +\code{"sample_type"}.} + +\item{gene.col}{Column in \code{results} containing gene IDs. Default is +\code{"ensembl"}.} + +\item{lfc.col}{Column in \code{results} containing log2 fold changes. Default is +\code{"log2FoldChange"}.} + +\item{ratio}{Numeric value greater than 1. Defines the tolerated reversal +threshold. Default is \code{1.1}.} + +\item{lfc.cutoff}{Non-negative numeric value used to define UP and DOWN +regulation from \code{lfc.col}. Default is \code{0}.} + +\item{scale}{Expression scale. Use \code{"linear"} for normalized counts, CPM, TPM +or similar linear-scale values. Use \code{"log2"} for log2-transformed +expression values. Default is \code{"linear"}.} + +\item{require_complete_pairs}{Logical. If \code{TRUE}, the function stops when +unpaired patients are found. If \code{FALSE}, only patients with both baseline +and condition samples are used. Default is \code{FALSE}.} + +\item{na.rm}{Logical. Should missing expression values be removed when +calculating patient-level means? Default is \code{FALSE}.} + +\item{return_removed}{Logical. Should the output include a vector of removed +genes? Default is \code{TRUE}.} +} +\value{ +A named list containing: +\itemize{ +\item One filtered data frame per comparison. +\item \code{TrendGenes}: unique genes passing the trend consistency filter. +\item \code{Diagnostics}: gene-level filtering diagnostics. +\item \code{Summary}: comparison-level filtering summary. +\item \code{RemovedGenes}: unique genes removed by the filter, if +\code{return_removed = TRUE}. +} +} +\description{ +This function filters differentially expressed genes according to the +consistency of their expression trend across paired patients. For each gene +\eqn{g} and patient \eqn{p}, the function calculates the mean expression in +baseline samples and condition samples: +\eqn{\bar{X}_{N}(g,p)} and \eqn{\bar{X}_{T}(g,p)}. +} +\details{ +For genes classified as UP-regulated at the group level, a gene is removed if +at least one paired patient shows: +\eqn{\bar{X}_{N}(g,p) >= \bar{X}_{T}(g,p) * ratio}. + +For genes classified as DOWN-regulated at the group level, a gene is removed +if at least one paired patient shows: +\eqn{\bar{X}_{T}(g,p) >= \bar{X}_{N}(g,p) * ratio}. + +The direction of regulation is defined from the group-level log2 fold-change. +The patient-level consistency check is performed using the expression matrix +supplied in \code{expr}. + +The multiplicative rule \code{ratio = 1.1} is appropriate for linear-scale +expression values. If \code{expr} is log2-transformed, set \code{scale = "log2"}; the +function will use \code{log2(ratio)} as an additive threshold. +} +\examples{ +\dontrun{ +trend_res <- trend_filter( + expr = norm_counts, + sampledata = sampledata, + results = list(Tumor_vs_Normal = deseq_res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + sample.col = "sample_id", + patient.col = "patient_id", + group.col = "sample_type" +) + +length(trend_res$TrendGenes) +head(trend_res$Tumor_vs_Normal) +head(trend_res$Diagnostics) +trend_res$Summary +} + +} +\references{ +Requena D. et al. Nat Commun 15, 10887 (2024). +} +\seealso{ +\code{\link[=detect_filter]{detect_filter()}} +} diff --git a/tests/testthat/helper-filter-data.R b/tests/testthat/helper-filter-data.R new file mode 100644 index 0000000..0a8727f --- /dev/null +++ b/tests/testthat/helper-filter-data.R @@ -0,0 +1,130 @@ +# Helper fixtures for detect_filter() and trend_filter() tests. +# These fixtures are intentionally small and deterministic. + +make_detect_fixture <- function() { + norm_counts <- data.frame( + N1 = c(10, 100, 100, 1, 20, 80, 10), + N2 = c(20, 120, 110, 2, 25, 90, 12), + B1 = c(100, 10, 100, 20, 5, 80, 15), + B2 = c(120, 20, 110, 25, 10, 90, 16), + C1 = c(15, 15, 100, 10, 10, 80, 100), + C2 = c(20, 20, 110, 10, 10, 90, 120), + D1 = c(15, 15, 100, 10, 10, 80, 5), + D2 = c(20, 20, 110, 10, 10, 90, 10), + check.names = FALSE + ) + + rownames(norm_counts) <- c( + "g_up_detect", + "g_down_detect", + "g_low_baseMean", + "g_up_low_expr", + "g_down_low_baseline", + "g_no_direction", + "g_c_detect" + ) + + df_b <- data.frame( + ensembl = rownames(norm_counts)[1:6], + baseMean = c(100, 120, 20, 100, 100, 100), + log2FoldChange = c(2, -2, 2, 2, -2, 0), + padj = c(0.01, 0.01, 0.01, 0.02, 0.03, 0.04), + stringsAsFactors = FALSE + ) + + df_c <- data.frame( + ensembl = c("g_c_detect", "g_up_low_expr", "g_no_direction"), + baseMean = c(100, 100, 100), + log2FoldChange = c(2, 2, 0), + padj = c(0.01, 0.02, 0.05), + stringsAsFactors = FALSE + ) + + df_d <- data.frame( + ensembl = c("g_down_detect", "g_c_detect"), + baseMean = c(100, 100), + log2FoldChange = c(-2, 2), + padj = c(0.01, 0.02), + stringsAsFactors = FALSE + ) + + list( + norm_counts = norm_counts, + df_b = df_b, + df_c = df_c, + df_d = df_d, + samples_baseline = c("N1", "N2"), + samples_condition1 = c("B1", "B2"), + samples_condition2 = c("C1", "C2"), + samples_condition3 = c("D1", "D2") + ) +} + +make_trend_fixture <- function(log_scale = FALSE) { + expr_linear <- data.frame( + N_P1 = c(10, 30, 30, 10, 50), + T_P1 = c(20, 20, 10, 12, 60), + N_P2 = c(12, 10, 40, 10, 50), + T_P2 = c(24, 20, 15, 20, 55), + check.names = FALSE + ) + + rownames(expr_linear) <- c( + "g_up_pass", + "g_up_fail", + "g_down_pass", + "g_down_fail", + "g_no_direction" + ) + + expr <- if (log_scale) { + as.data.frame(log2(expr_linear)) + } else { + expr_linear + } + + sampledata <- data.frame( + sample_id = c("N_P1", "T_P1", "N_P2", "T_P2"), + patient_id = c("P1", "P1", "P2", "P2"), + sample_type = c("normal", "tumor", "normal", "tumor"), + stringsAsFactors = FALSE + ) + + res <- data.frame( + ensembl = c( + "g_up_pass", + "g_up_fail", + "g_down_pass", + "g_down_fail", + "g_no_direction", + "g_missing_expr" + ), + log2FoldChange = c(1.5, 1.2, -1.4, -1.1, 0, 2), + padj = c(0.01, 0.01, 0.02, 0.03, 0.20, 0.01), + stringsAsFactors = FALSE + ) + + list( + expr = expr, + sampledata = sampledata, + res = res + ) +} + +make_trend_unpaired_fixture <- function() { + fixture <- make_trend_fixture() + + fixture$expr$N_P3 <- c(10, 10, 10, 10, 10) + + fixture$sampledata <- rbind( + fixture$sampledata, + data.frame( + sample_id = "N_P3", + patient_id = "P3", + sample_type = "normal", + stringsAsFactors = FALSE + ) + ) + + fixture +} diff --git a/tests/testthat/helper-glm-test-data.R b/tests/testthat/helper-glm-test-data.R new file mode 100644 index 0000000..6116402 --- /dev/null +++ b/tests/testthat/helper-glm-test-data.R @@ -0,0 +1,36 @@ +# Helper data for get_glm() and nice_ROC() tests. +# testthat automatically sources files named helper*.R before tests. + +make_binary_glm_data <- function(n = 160, seed = 123) { + set.seed(seed) + x1 <- stats::rnorm(n) + x2 <- factor(sample(c("A", "B"), n, replace = TRUE), levels = c("A", "B")) + eta <- -0.25 + 0.75 * x1 + 0.45 * (x2 == "B") + p <- stats::plogis(eta) + y <- stats::rbinom(n, size = 1, prob = p) + + data.frame( + y = as.integer(y), + x1 = x1, + x2 = x2 + ) +} + +make_gaussian_glm_data <- function(n = 120, seed = 456) { + set.seed(seed) + x1 <- stats::rnorm(n) + x2 <- stats::rnorm(n) + y <- 1.5 + 0.8 * x1 - 0.35 * x2 + stats::rnorm(n, sd = 0.7) + + data.frame(y = y, x1 = x1, x2 = x2) +} + +make_count_glm_data <- function(n = 140, seed = 789) { + set.seed(seed) + x1 <- stats::rnorm(n) + exposure <- stats::runif(n, min = 0.8, max = 1.5) + lambda <- exp(0.2 + 0.35 * x1 + log(exposure)) + y <- stats::rpois(n, lambda = lambda) + + data.frame(y = y, x1 = x1, exposure = exposure) +} diff --git a/tests/testthat/helper-survival-test-data.R b/tests/testthat/helper-survival-test-data.R new file mode 100644 index 0000000..bf824ad --- /dev/null +++ b/tests/testthat/helper-survival-test-data.R @@ -0,0 +1,59 @@ +# Helper objects used by the survival plotting/modeling tests. +# Place this file in tests/testthat/. + +make_cox_test_data <- function(n = 120, seed = 123) { + set.seed(seed) + + gene_status <- factor( + rep(c("WT", "Mut"), each = n / 2), + levels = c("WT", "Mut") + ) + age <- round(seq(42, 78, length.out = n) + rnorm(n, sd = 3), 1) + x <- as.integer(gene_status == "Mut") + + true_time <- stats::rexp(n, rate = 0.004 * exp(0.45 * x + 0.015 * (age - 60))) + censor_time <- stats::rexp(n, rate = 0.0025) + + data.frame( + sample = paste0("S", seq_len(n)), + PFI.time = round(pmin(true_time, censor_time) * 100, 2), + PFI = as.integer(true_time <= censor_time), + gene_status = gene_status, + age = age, + protein_score = stats::rnorm(n), + sparse_var = factor(c(rep("Rare", 3), rep("Common", n - 3))), + days_to_death = round(true_time * 100, 2), + `_hidden_omic` = stats::rnorm(n), + check.names = FALSE + ) +} + +make_forest_test_data <- function() { + data.frame( + model = "adjusted", + model_id = c("PAM50 adjusted", "DNAmethyl adjusted", "RPPA adjusted"), + variable = c("PAM50Call_RNAseq", "_PANCAN_DNAMethyl_BRCA", "RPPA_Clusters_nature2012"), + term_clean = c("Her2", "cluster 3", "Reactive"), + reference = c("LumA", "cluster 4", "Basal"), + HR = c(2.25, 0.43, 0.22), + CI_low = c(1.25, 0.19, 0.06), + CI_high = c(4.07, 0.98, 0.78), + p.value = c(0.00715, 0.0435, 0.0186), + n_used = c(100, 100, 100), + n_events = c(30, 30, 30), + adjusted_for = "age", + stringsAsFactors = FALSE + ) +} + +make_km_test_data <- function(n = 60) { + data.frame( + PFI.time = c(seq(25, 750, length.out = n / 2), seq(40, 900, length.out = n / 2)), + PFI = rep(c(1, 0, 1, 1, 0, 0), length.out = n), + GENE_muts = factor(rep(c("No", "Yes"), each = n / 2), levels = c("No", "Yes")), + PAM50_group = factor( + rep(c("LumA", "HER2", "Basal"), length.out = n), + levels = c("LumA", "HER2", "Basal") + ) + ) +} diff --git a/tests/testthat/test-concordanceDE.R b/tests/testthat/test-concordanceDE.R new file mode 100644 index 0000000..c227ce4 --- /dev/null +++ b/tests/testthat/test-concordanceDE.R @@ -0,0 +1,23 @@ +test_that("concordanceDE classifies categories correctly", { + de_x <- data.frame( + gene = c("A", "B", "C", "D", "E"), + logFC = c(2, 2, 2, 0.2, 0.1), + padj = c(0.01, 0.01, 0.01, 0.80, 0.80) + ) + + de_y <- data.frame( + gene = c("A", "B", "C", "D", "E"), + logFC = c(0.5, -0.5, 0.1, -0.5, 0.1), + padj = c(0.01, 0.01, 0.80, 0.01, 0.80) + ) + + res <- concordanceDE(de_x, de_y, logfc_threshold = c(1, 0.2)) + cats <- setNames(as.character(res$table$category), res$table$gene) + + expect_equal(cats[["A"]], "concordant") + expect_equal(cats[["B"]], "discordant") + expect_equal(cats[["C"]], "only_x") + expect_equal(cats[["D"]], "only_y") + expect_equal(cats[["E"]], "not_significant") + expect_equal(res$concordance_score, 0.5) +}) diff --git a/tests/testthat/test-crossLayerCorr.R b/tests/testthat/test-crossLayerCorr.R new file mode 100644 index 0000000..f33391e --- /dev/null +++ b/tests/testthat/test-crossLayerCorr.R @@ -0,0 +1,15 @@ +test_that("crossLayerCorr matches shared features and samples", { + set.seed(1) + mat_x <- matrix(rnorm(30), nrow = 5) + mat_y <- mat_x + matrix(rnorm(30, sd = 0.1), nrow = 5) + + rownames(mat_x) <- rownames(mat_y) <- paste0("G", 1:5) + colnames(mat_x) <- colnames(mat_y) <- paste0("S", 1:6) + + res <- crossLayerCorr(mat_x, mat_y, top_n = 4, plot = FALSE) + + expect_equal(res$n_shared_samples, 6) + expect_equal(res$n_shared_features, 5) + expect_equal(res$n_features_used, 4) + expect_true(all(c("sample", "correlation") %in% names(res$correlations))) +}) diff --git a/tests/testthat/test-detect_filter.R b/tests/testthat/test-detect_filter.R new file mode 100644 index 0000000..71f7bd8 --- /dev/null +++ b/tests/testthat/test-detect_filter.R @@ -0,0 +1,90 @@ +test_that("detect_filter keeps detectable UP and DOWN genes in one comparison", { + fx <- make_detect_fixture() + + out <- detect_filter( + norm.counts = fx$norm_counts, + df.BvsA = fx$df_b, + samples.baseline = fx$samples_baseline, + samples.condition1 = fx$samples_condition1, + cutoffs = c(50, 50, 0) + ) + + expect_type(out, "list") + expect_named(out, c("Comparison1", "DetectGenes")) + + expect_setequal( + out$DetectGenes, + c("g_up_detect", "g_down_detect") + ) + + expect_setequal( + out$Comparison1$ensembl, + c("g_up_detect", "g_down_detect") + ) + + expect_false("g_low_baseMean" %in% out$DetectGenes) + expect_false("g_up_low_expr" %in% out$DetectGenes) + expect_false("g_down_low_baseline" %in% out$DetectGenes) + expect_false("g_no_direction" %in% out$DetectGenes) +}) + +test_that("detect_filter supports optional second and third comparisons", { + fx <- make_detect_fixture() + + out <- detect_filter( + norm.counts = fx$norm_counts, + df.BvsA = fx$df_b, + df.CvsA = fx$df_c, + df.DvsA = fx$df_d, + samples.baseline = fx$samples_baseline, + samples.condition1 = fx$samples_condition1, + samples.condition2 = fx$samples_condition2, + samples.condition3 = fx$samples_condition3, + cutoffs = c(50, 50, 0) + ) + + expect_named( + out, + c("Comparison1", "DetectGenes", "Comparison2", "Comparison3") + ) + + expect_true("g_c_detect" %in% out$DetectGenes) + expect_true("g_c_detect" %in% out$Comparison2$ensembl) + + expect_setequal( + out$Comparison1$ensembl, + c("g_up_detect", "g_down_detect") + ) +}) + +test_that("detect_filter removes duplicated genes from DetectGenes", { + fx <- make_detect_fixture() + + df_b_dup <- rbind(fx$df_b, fx$df_b[fx$df_b$ensembl == "g_up_detect", ]) + + out <- detect_filter( + norm.counts = fx$norm_counts, + df.BvsA = df_b_dup, + samples.baseline = fx$samples_baseline, + samples.condition1 = fx$samples_condition1, + cutoffs = c(50, 50, 0) + ) + + expect_equal(length(out$DetectGenes), length(unique(out$DetectGenes))) +}) + +test_that("detect_filter validates cutoff length", { + fx <- make_detect_fixture() + + expect_error( + detect_filter( + norm.counts = fx$norm_counts, + df.BvsA = fx$df_b, + samples.baseline = fx$samples_baseline, + samples.condition1 = fx$samples_condition1, + cutoffs = c(50, 50) + ), + "Cutoffs vector must contain three values", + fixed = TRUE + ) +}) diff --git a/tests/testthat/test-get_cox.R b/tests/testthat/test-get_cox.R new file mode 100644 index 0000000..45347f1 --- /dev/null +++ b/tests/testthat/test-get_cox.R @@ -0,0 +1,201 @@ +# Tests for get_cox(). +# Place this file in tests/testthat/. + +test_that("get_cox validates required inputs", { + skip_if_not_installed("survival") + skip_if_not_installed("broom") + skip_if_not_installed("dplyr") + + dat <- make_cox_test_data() + + expect_error( + get_cox(list(), vars = "gene_status", verbose = FALSE), + "'data' should be a data.frame", + fixed = TRUE + ) + + expect_error( + get_cox(dat, time_col = "missing_time", vars = "gene_status", verbose = FALSE), + "Column 'missing_time' not found" + ) + + expect_error( + get_cox(dat, event_col = "missing_event", vars = "gene_status", verbose = FALSE), + "Column 'missing_event' not found" + ) + + expect_error( + get_cox(dat, vars = "does_not_exist", verbose = FALSE), + "variables were not found" + ) + + expect_error( + get_cox(dat, vars = "gene_status", model = "adjusted", verbose = FALSE), + "'adjust_vars' is required", + fixed = TRUE + ) +}) + +test_that("get_cox fits univariable categorical Cox models", { + skip_if_not_installed("survival") + skip_if_not_installed("broom") + skip_if_not_installed("dplyr") + + dat <- make_cox_test_data() + + out <- get_cox( + data = dat, + time_col = "PFI.time", + event_col = "PFI", + vars = "gene_status", + model = "univariable", + ref_levels = list(gene_status = "WT"), + min_events_per_level = 1, + verbose = FALSE + ) + + expect_s3_class(out, "data.frame") + expect_true(all(c( + "model", "model_id", "variable", "term", "term_clean", "reference", + "HR", "CI_low", "CI_high", "p.value", "n_used", "n_events" + ) %in% names(out))) + expect_equal(unique(out$model), "univariable") + expect_equal(unique(out$variable), "gene_status") + expect_equal(unique(out$reference), "WT") + expect_true(all(is.finite(out$HR))) + expect_true(all(out$CI_low > 0 & out$CI_high > 0)) + expect_true(all(out$p.value >= 0 & out$p.value <= 1)) +}) + +test_that("get_cox fits multivariable models with categorical and continuous terms", { + skip_if_not_installed("survival") + skip_if_not_installed("broom") + skip_if_not_installed("dplyr") + + dat <- make_cox_test_data() + + out <- get_cox( + data = dat, + time_col = "PFI.time", + event_col = "PFI", + vars = c("gene_status", "age"), + model = "multivariable", + ref_levels = list(gene_status = "WT"), + min_events_per_level = 1, + verbose = FALSE + ) + + expect_equal(unique(out$model), "multivariable") + expect_equal(unique(out$model_id), "multivariable") + expect_setequal(unique(out$variable), c("gene_status", "age")) + expect_true(any(out$reference == "continuous")) + expect_true(all(is.finite(out$HR))) +}) + +test_that("get_cox adjusted models drop adjustment terms unless requested", { + skip_if_not_installed("survival") + skip_if_not_installed("broom") + skip_if_not_installed("dplyr") + + dat <- make_cox_test_data() + + out_drop <- get_cox( + data = dat, + time_col = "PFI.time", + event_col = "PFI", + vars = "gene_status", + model = "adjusted", + adjust_vars = "age", + keep_adjust_terms = FALSE, + ref_levels = list(gene_status = "WT"), + min_events_per_level = 1, + verbose = FALSE + ) + + expect_equal(unique(out_drop$model), "adjusted") + expect_equal(unique(out_drop$variable), "gene_status") + expect_equal(unique(out_drop$adjusted_for), "age") + + out_keep <- get_cox( + data = dat, + time_col = "PFI.time", + event_col = "PFI", + vars = "gene_status", + model = "adjusted", + adjust_vars = "age", + keep_adjust_terms = TRUE, + ref_levels = list(gene_status = "WT"), + min_events_per_level = 1, + verbose = FALSE + ) + + expect_setequal(unique(out_keep$variable), c("gene_status", "age")) +}) + +test_that("get_cox skips sparse categorical variables but keeps valid models", { + skip_if_not_installed("survival") + skip_if_not_installed("broom") + skip_if_not_installed("dplyr") + + dat <- make_cox_test_data() + + out <- get_cox( + data = dat, + time_col = "PFI.time", + event_col = "PFI", + vars = c("gene_status", "sparse_var"), + model = "univariable", + ref_levels = list(gene_status = "WT"), + min_n_per_level = 10, + min_events_per_level = 1, + verbose = FALSE + ) + + expect_equal(unique(out$variable), "gene_status") + expect_false("sparse_var" %in% out$variable) +}) + +test_that("get_cox automatic variable selection excludes ID-like, survival-like, and underscored columns", { + skip_if_not_installed("survival") + skip_if_not_installed("broom") + skip_if_not_installed("dplyr") + + dat <- make_cox_test_data()[, c( + "sample", "PFI.time", "PFI", "gene_status", "days_to_death", "_hidden_omic" + )] + + out <- get_cox( + data = dat, + time_col = "PFI.time", + event_col = "PFI", + vars = NULL, + model = "univariable", + ref_levels = list(gene_status = "WT"), + min_events_per_level = 1, + verbose = FALSE + ) + + expect_equal(unique(out$variable), "gene_status") +}) + +test_that("get_cox errors when no Cox model can be fitted", { + skip_if_not_installed("survival") + skip_if_not_installed("broom") + skip_if_not_installed("dplyr") + + dat <- make_cox_test_data() + dat$PFI <- 2L + + expect_error( + get_cox( + data = dat, + time_col = "PFI.time", + event_col = "PFI", + vars = "gene_status", + model = "univariable", + min_events_per_level = 1, + verbose = FALSE + ), + "No Cox model could be fitted" + ) +}) diff --git a/tests/testthat/test-get_glm.R b/tests/testthat/test-get_glm.R new file mode 100644 index 0000000..61b5a27 --- /dev/null +++ b/tests/testthat/test-get_glm.R @@ -0,0 +1,194 @@ +testthat::skip_if_not_installed("broom") +testthat::skip_if_not_installed("dplyr") +testthat::skip_if_not_installed("tibble") + +test_that("get_glm returns tidy GLM results and model metadata", { + dat <- make_binary_glm_data() + + res <- suppressMessages(get_glm( + data = dat, + outcome = "y", + predictors = c("x1", "x2"), + family = "binomial", + adjust_method = "BH" + )) + + expect_s3_class(res, "get_glm_result") + expect_s3_class(res, "tbl_df") + expect_equal( + names(res), + c( + "term", "estimate", "ci_lower", "ci_upper", "std_error", + "statistic", "p_value", "q_value", "significance" + ) + ) + + expect_false("(Intercept)" %in% res$term) + expect_true(all(c("x1", "x2B") %in% res$term)) + expect_true(all(is.finite(res$estimate))) + expect_true(all(res$estimate > 0)) + + expect_s3_class(attr(res, "model"), "glm") + expect_equal(attr(res, "formula"), "y ~ x1 + x2") + expect_equal(attr(res, "family"), "binomial") + expect_equal(attr(res, "link"), "logit") + expect_equal(attr(res, "n_obs"), nrow(dat)) + expect_true(is.finite(attr(res, "AIC"))) + expect_true(attr(res, "exponentiate")) + expect_equal(attr(res, "adjust_method"), "BH") + + expect_equal( + res$q_value, + stats::p.adjust(res$p_value, method = "BH"), + tolerance = 1e-12, + ignore_attr = TRUE + ) +}) + +test_that("get_glm can keep the intercept and adjust all reported p-values", { + dat <- make_binary_glm_data() + + res <- suppressMessages(get_glm( + data = dat, + outcome = "y", + predictors = c("x1", "x2"), + family = stats::binomial(), + adjust_method = "holm", + remove_intercept = FALSE + )) + + expect_true("(Intercept)" %in% res$term) + expect_equal(attr(res, "adjust_method"), "holm") + expect_equal( + res$q_value, + stats::p.adjust(res$p_value, method = "holm"), + tolerance = 1e-12, + ignore_attr = TRUE + ) +}) + +test_that("get_glm auto exponentiation follows the link and can be overridden", { + bin_dat <- make_binary_glm_data() + gauss_dat <- make_gaussian_glm_data() + pois_dat <- make_count_glm_data() + + res_gauss <- suppressMessages(get_glm( + data = gauss_dat, + outcome = "y", + predictors = c("x1", "x2"), + family = stats::gaussian() + )) + expect_false(attr(res_gauss, "exponentiate")) + expect_equal(attr(res_gauss, "link"), "identity") + + res_pois <- suppressMessages(get_glm( + data = pois_dat, + outcome = "y", + predictors = "x1", + family = stats::poisson + )) + expect_true(attr(res_pois, "exponentiate")) + expect_equal(attr(res_pois, "link"), "log") + expect_true(all(res_pois$estimate > 0)) + + res_no_exp <- suppressMessages(get_glm( + data = bin_dat, + outcome = "y", + predictors = c("x1", "x2"), + family = "binomial", + exponentiate = FALSE + )) + expect_false(attr(res_no_exp, "exponentiate")) +}) + +test_that("get_glm safely quotes non-syntactic and reserved variable names", { + dat <- make_binary_glm_data() + weird_dat <- data.frame( + "case status" = dat$y, + "age years" = dat$x1, + "if" = as.numeric(dat$x2 == "B"), + check.names = FALSE + ) + + res <- suppressMessages(get_glm( + data = weird_dat, + outcome = "case status", + predictors = c("age years", "if"), + family = "binomial" + )) + + expect_equal(attr(res, "formula"), "`case status` ~ `age years` + `if`") + expect_equal(nrow(res), 2L) + expect_s3_class(attr(res, "model"), "glm") +}) + +test_that("get_glm validates inputs before fitting", { + dat <- make_binary_glm_data() + + expect_error( + get_glm(as.list(dat), "y", c("x1", "x2")), + "`data` must be" + ) + expect_error( + get_glm(dat, c("y", "other"), c("x1", "x2")), + "`outcome` must be" + ) + expect_error( + get_glm(dat, "missing_y", c("x1", "x2")), + "not found" + ) + expect_error( + get_glm(dat, "y", character()), + "`predictors` must be" + ) + expect_error( + get_glm(dat, "y", c("x1", "missing_x")), + "not in `data`" + ) + expect_error( + get_glm(dat, "y", c("x1", "x2"), conf_level = 1), + "`conf_level` must be" + ) + expect_error( + get_glm(dat, "y", c("x1", "x2"), exponentiate = NA), + "`exponentiate` must be" + ) + expect_error( + get_glm(dat, "y", c("x1", "x2"), remove_intercept = NA), + "`remove_intercept` must be" + ) + expect_error( + get_glm(dat, "y", c("x1", "x2"), verbose = NA), + "`verbose` must be" + ) + expect_error( + get_glm(dat, "y", c("x1", "x2"), adjust_method = "bad_method"), + "should be one of" + ) + expect_error( + get_glm(dat, "y", c("x1", "x2"), family = "not_a_family"), + "Could not find" + ) + expect_error( + get_glm( + dat, "y", c("x1", "x2"), + family = function(link) stats::binomial(link = link) + ), + "could not be evaluated" + ) +}) + +test_that("print.get_glm_result prints a compact summary and returns input invisibly", { + dat <- make_binary_glm_data() + res <- suppressMessages(get_glm(dat, "y", c("x1", "x2"), family = "binomial")) + + printed <- capture.output(returned <- print(res)) + + expect_identical(returned, res) + expect_true(any(grepl("get_glm result", printed, fixed = TRUE))) + expect_true(any(grepl("q-value correction: BH", printed, fixed = TRUE))) + + bad <- res + bad$p_value <- NULL + expect_error(print(bad), "missing columns") +}) diff --git a/tests/testthat/test-nice_ConcordanceScatter.R b/tests/testthat/test-nice_ConcordanceScatter.R new file mode 100644 index 0000000..456db24 --- /dev/null +++ b/tests/testthat/test-nice_ConcordanceScatter.R @@ -0,0 +1,59 @@ +# tests/testthat/test-nice_ConcordanceScatter.R +test_that("nice_ConcordanceScatter returns a ggplot object", { + de_x <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "ERBB2", "TP53"), + logFC = c(2.5, 1.8, -1.6, 0.4, -0.2), + padj = c(0.001, 0.01, 0.02, 0.50, 0.80) + ) + + de_y <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "ERBB2", "TP53"), + logFC = c(0.7, 0.4, -0.5, 0.3, -0.1), + padj = c(0.002, 0.03, 0.04, 0.20, 0.90) + ) + + conc <- concordanceDE( + de_x = de_x, + de_y = de_y, + gene_col = "gene", + logfc_col = "logFC", + padj_col = "padj", + padj_threshold = 0.05, + logfc_threshold = c(1, 0.20) + ) + + p <- nice_ConcordanceScatter( + concordance_result = conc, + genes_label = c("ESR1", "EGFR"), + method = "spearman" + ) + + expect_s3_class(p, "ggplot") +}) +test_that("nice_ConcordanceScatter works without gene labels", { + de_x <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "ERBB2", "TP53"), + logFC = c(2.5, 1.8, -1.6, 0.4, -0.2), + padj = c(0.001, 0.01, 0.02, 0.50, 0.80) + ) + + de_y <- data.frame( + gene = c("ESR1", "PGR", "EGFR", "ERBB2", "TP53"), + logFC = c(0.7, 0.4, -0.5, 0.3, -0.1), + padj = c(0.002, 0.03, 0.04, 0.20, 0.90) + ) + + conc <- concordanceDE( + de_x = de_x, + de_y = de_y, + gene_col = "gene", + logfc_col = "logFC", + padj_col = "padj", + padj_threshold = 0.05, + logfc_threshold = c(1, 0.20) + ) + + p <- nice_ConcordanceScatter(conc) + + expect_s3_class(p, "ggplot") +}) \ No newline at end of file diff --git a/tests/testthat/test-nice_KM.R b/tests/testthat/test-nice_KM.R new file mode 100644 index 0000000..defd2a2 --- /dev/null +++ b/tests/testthat/test-nice_KM.R @@ -0,0 +1,104 @@ +test_that("nice_KM returns a ggplot object for two strata", { + skip_if_not_installed("ggplot2") + skip_if_not_installed("survival") + skip_if_not_installed("survminer") + + dat <- make_km_test_data() + + p <- nice_KM( + data = dat, + gene = "GENE_muts", + time_var = "PFI.time", + event_var = "PFI", + conf_int = FALSE, + pval_size = 3 + ) + + expect_s3_class(p, "ggplot") +}) + +test_that("nice_KM returnData returns the survfit object and plot", { + skip_if_not_installed("ggplot2") + skip_if_not_installed("survival") + skip_if_not_installed("survminer") + + dat <- make_km_test_data() + + out <- nice_KM( + data = dat, + gene = "GENE_muts", + time_var = "PFI.time", + event_var = "PFI", + conf_int = FALSE, + returnData = TRUE, + pval_size = 3 + ) + + expect_true(identical(names(out), c("km_fit", "plot"))) + expect_s3_class(out$km_fit, "survfit") + expect_s3_class(out$plot, "ggplot") +}) + +test_that("nice_KM supports three strata when a matching palette is supplied", { + skip_if_not_installed("ggplot2") + skip_if_not_installed("survival") + skip_if_not_installed("survminer") + + dat <- make_km_test_data() + + p <- nice_KM( + data = dat, + gene = "PAM50_group", + time_var = "PFI.time", + event_var = "PFI", + title_prefix = "Subtype ", + colors = c("#1F8FFF", "#ED4D4D", "#2ECC71"), + conf_int = FALSE, + pval_size = 3 + ) + + expect_s3_class(p, "ggplot") +}) + +test_that("nice_KM handles variables with only one observed category", { + skip_if_not_installed("ggplot2") + + dat <- make_km_test_data() + dat$GENE_muts <- factor("No", levels = c("No", "Yes")) + + p <- NULL + expect_warning( + p <- nice_KM( + data = dat, + gene = "GENE_muts", + time_var = "PFI.time", + event_var = "PFI" + ), + "Only one category" + ) + + expect_s3_class(p, "ggplot") +}) + +test_that("nice_KM singleton-category branch also works with returnData", { + skip_if_not_installed("ggplot2") + + dat <- make_km_test_data() + dat$GENE_muts <- "No" + + out <- NULL + expect_warning( + out <- nice_KM( + data = dat, + gene = "GENE_muts", + time_var = "PFI.time", + event_var = "PFI", + returnData = TRUE + ), + "Only one category" + ) + + expect_true(identical(names(out), c("km_fit", "plot"))) + expect_null(out$km_fit) + expect_s3_class(out$plot, "ggplot") +}) diff --git a/tests/testthat/test-nice_ROC.R b/tests/testthat/test-nice_ROC.R new file mode 100644 index 0000000..826ef35 --- /dev/null +++ b/tests/testthat/test-nice_ROC.R @@ -0,0 +1,128 @@ +testthat::skip_if_not_installed("dplyr") +testthat::skip_if_not_installed("ggplot2") +testthat::skip_if_not_installed("pROC") +testthat::skip_if_not_installed("tibble") + +test_that("nice_ROC returns plot, AUC table, DeLong test, and ROC objects", { + dat <- make_binary_glm_data(n = 180, seed = 321) + fit_x1 <- stats::glm(y ~ x1, data = dat, family = stats::binomial()) + fit_x1_x2 <- stats::glm(y ~ x1 + x2, data = dat, family = stats::binomial()) + + out <- nice_ROC( + models = list("x1 only" = fit_x1, "x1 + x2" = fit_x1_x2), + data = dat, + outcome = "y", + return_data = TRUE + ) + + expect_type(out, "list") + expect_named(out, c("plot", "auc_table", "delong_test", "roc_objects")) + expect_s3_class(out$plot, "ggplot") + + expect_s3_class(out$auc_table, "tbl_df") + expect_equal(nrow(out$auc_table), 2L) + expect_named( + out$auc_table, + c("model", "auc", "ci_lower", "ci_upper", "n_cases", "n_controls") + ) + expect_equal(out$auc_table$model, c("x1 only", "x1 + x2")) + expect_true(all(out$auc_table$auc >= 0 & out$auc_table$auc <= 1)) + expect_true(all(out$auc_table$ci_lower >= 0 & out$auc_table$ci_lower <= 1)) + expect_true(all(out$auc_table$ci_upper >= 0 & out$auc_table$ci_upper <= 1)) + expect_equal(unique(out$auc_table$n_cases), sum(dat$y == 1)) + expect_equal(unique(out$auc_table$n_controls), sum(dat$y == 0)) + + expect_s3_class(out$delong_test, "htest") + expect_named(out$roc_objects, c("x1 only", "x1 + x2")) + expect_true(all(vapply(out$roc_objects, inherits, logical(1), "roc"))) +}) + +test_that("nice_ROC returns a ggplot object when return_data is FALSE", { + dat <- make_binary_glm_data(n = 150, seed = 654) + fit <- stats::glm(y ~ x1 + x2, data = dat, family = stats::binomial()) + + p <- nice_ROC( + models = list("full model" = fit), + data = dat, + outcome = "y", + plot_title = "Test ROC", + plot_subtitle = "Held-out sample" + ) + + expect_s3_class(p, "ggplot") + expect_equal(p$labels$title, "Test ROC") + expect_equal(p$labels$subtitle, "Held-out sample") +}) + +test_that("nice_ROC accepts numeric probability vectors", { + dat <- make_binary_glm_data(n = 150, seed = 987) + fit <- stats::glm(y ~ x1 + x2, data = dat, family = stats::binomial()) + probs <- stats::predict(fit, newdata = dat, type = "response") + + out <- nice_ROC( + models = list("probabilities" = as.numeric(probs)), + data = dat, + outcome = "y", + show_ci = FALSE, + return_data = TRUE + ) + + expect_equal(out$auc_table$model, "probabilities") + expect_null(out$delong_test) + expect_named(out$roc_objects, "probabilities") + expect_s3_class(out$plot, "ggplot") +}) + +test_that("nice_ROC can compare two models without adding DeLong output", { + dat <- make_binary_glm_data(n = 160, seed = 246) + fit_x1 <- stats::glm(y ~ x1, data = dat, family = stats::binomial()) + fit_x1_x2 <- stats::glm(y ~ x1 + x2, data = dat, family = stats::binomial()) + + out <- nice_ROC( + models = list("x1 only" = fit_x1, "x1 + x2" = fit_x1_x2), + data = dat, + outcome = "y", + show_delong = FALSE, + return_data = TRUE + ) + + expect_null(out$delong_test) + expect_equal(nrow(out$auc_table), 2L) +}) + +test_that("nice_ROC validates inputs", { + dat <- make_binary_glm_data(n = 120, seed = 135) + fit <- stats::glm(y ~ x1 + x2, data = dat, family = stats::binomial()) + + expect_error( + nice_ROC(models = list(fit), data = dat, outcome = "y"), + "named" + ) + expect_error( + nice_ROC(models = list("model" = fit), data = dat, outcome = c("y", "z")), + "single character string" + ) + expect_error( + nice_ROC(models = list("model" = fit), data = dat, outcome = "missing_y"), + "not found" + ) + + non_binary <- dat + non_binary$y[1] <- 2L + expect_error( + nice_ROC(models = list("model" = fit), data = non_binary, outcome = "y"), + "binary 0/1" + ) + expect_error( + nice_ROC(models = list("prob" = rep(0.5, nrow(dat))), data = NULL, outcome = "y"), + "data` must be provided" + ) + expect_error( + nice_ROC(models = list("prob" = rep(0.5, 3)), data = dat, outcome = "y"), + "has length" + ) + expect_error( + nice_ROC(models = list("bad" = "not a model"), data = dat, outcome = "y"), + "glm object or numeric probability vector" + ) +}) diff --git a/tests/testthat/test-nice_forest.R b/tests/testthat/test-nice_forest.R new file mode 100644 index 0000000..6e7e845 --- /dev/null +++ b/tests/testthat/test-nice_forest.R @@ -0,0 +1,108 @@ +# Tests for nice_forest(). +# Place this file in tests/testthat/. + +test_that("nice_forest returns a ggplot object", { + skip_if_not_installed("ggplot2") + + tab <- make_forest_test_data() + + p <- nice_forest(tab, title = "PFI adjusted Cox forest plot") + + expect_s3_class(p, "ggplot") +}) + +test_that("nice_forest can return the filtered plot table", { + skip_if_not_installed("ggplot2") + + tab <- make_forest_test_data() + + out <- nice_forest( + tab, + p_display = 0.02, + sort_by = "input", + return_table = TRUE + ) + + expect_named(out, c("plot", "table")) + expect_s3_class(out$plot, "ggplot") + expect_s3_class(out$table, "data.frame") + expect_equal(nrow(out$table), 2) + expect_true(all(out$table$.p_plot <= 0.02)) + expect_true(all(out$table$.significance_plot == "p < 0.05")) +}) + +test_that("nice_forest sorts rows by estimate or p-value", { + skip_if_not_installed("ggplot2") + + tab <- make_forest_test_data() + + by_estimate <- nice_forest(tab, sort_by = "estimate", return_table = TRUE)$table + expect_equal(by_estimate$.estimate_plot, sort(tab$HR)) + + by_p <- nice_forest(tab, sort_by = "p.value", return_table = TRUE)$table + expect_equal(by_p$.p_plot, sort(tab$p.value)) +}) + +test_that("nice_forest supports user-provided label columns", { + skip_if_not_installed("ggplot2") + + tab <- make_forest_test_data() + tab$pretty_label <- c("PAM50 HER2", "DNA methylation cluster 3", "RPPA Reactive") + + out <- nice_forest( + tab, + label_col = "pretty_label", + sort_by = "input", + return_table = TRUE + ) + + expect_equal(as.character(out$table$.label_plot), tab$pretty_label) +}) + +test_that("nice_forest validates input columns and scalar arguments", { + skip_if_not_installed("ggplot2") + + tab <- make_forest_test_data() + + expect_error( + nice_forest(tab[, setdiff(names(tab), "CI_high")]), + "required columns were not found" + ) + + expect_error( + nice_forest(tab, p_display = 2), + "'p_display' should be a single numeric value between 0 and 1", + fixed = TRUE + ) + + expect_error( + nice_forest(tab, vline = c(1, 2)), + "'vline' should be a single numeric value", + fixed = TRUE + ) +}) + +test_that("nice_forest errors when filtering removes all rows", { + skip_if_not_installed("ggplot2") + + tab <- make_forest_test_data() + + expect_error( + nice_forest(tab, p_display = 0.001), + "No rows remained after filtering" + ) +}) + +test_that("nice_forest requires positive estimates on a log scale", { + skip_if_not_installed("ggplot2") + + tab <- make_forest_test_data() + tab$HR[1] <- 0 + + expect_error( + nice_forest(tab, log_scale = TRUE), + "must be > 0" + ) + + expect_s3_class(nice_forest(tab, log_scale = FALSE), "ggplot") +}) diff --git a/tests/testthat/test-trend_filter.R b/tests/testthat/test-trend_filter.R new file mode 100644 index 0000000..ecce336 --- /dev/null +++ b/tests/testthat/test-trend_filter.R @@ -0,0 +1,201 @@ +test_that("trend_filter keeps genes with patient-level trend consistency", { + fx <- make_trend_fixture() + + out <- trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + sample.col = "sample_id", + patient.col = "patient_id", + group.col = "sample_type", + ratio = 1.1, + scale = "linear" + ) + + expect_type(out, "list") + expect_named( + out, + c("Tumor_vs_Normal", "TrendGenes", "Diagnostics", "Summary", "RemovedGenes") + ) + + expect_setequal(out$TrendGenes, c("g_up_pass", "g_down_pass")) + expect_setequal(out$Tumor_vs_Normal$ensembl, c("g_up_pass", "g_down_pass")) + + expect_s3_class(out$Diagnostics, "data.frame") + expect_s3_class(out$Summary, "data.frame") +}) + +test_that("trend_filter reports diagnostic reasons for removed genes", { + fx <- make_trend_fixture() + + out <- trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + ratio = 1.1, + scale = "linear" + ) + + diagnostics <- out$Diagnostics + + expect_equal( + diagnostics$reason[diagnostics$gene == "g_up_fail"], + "inconsistent_trend" + ) + + expect_equal( + diagnostics$reason[diagnostics$gene == "g_down_fail"], + "inconsistent_trend" + ) + + expect_equal( + diagnostics$reason[diagnostics$gene == "g_no_direction"], + "no_direction" + ) + + expect_equal( + diagnostics$reason[diagnostics$gene == "g_missing_expr"], + "missing_in_expr" + ) + + expect_true("g_up_fail" %in% out$RemovedGenes) + expect_true("g_down_fail" %in% out$RemovedGenes) +}) + +test_that("trend_filter supports one data frame in results", { + fx <- make_trend_fixture() + + out <- trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = fx$res, + baseline = "normal", + conditions = "tumor", + ratio = 1.1, + scale = "linear" + ) + + expect_named(out, c("Comparison1", "TrendGenes", "Diagnostics", "Summary", "RemovedGenes")) + expect_setequal(out$TrendGenes, c("g_up_pass", "g_down_pass")) +}) + +test_that("trend_filter handles log2-scale expression values", { + fx <- make_trend_fixture(log_scale = TRUE) + + out <- trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + ratio = 1.1, + scale = "log2" + ) + + expect_setequal(out$TrendGenes, c("g_up_pass", "g_down_pass")) +}) + +test_that("trend_filter warns or errors on unpaired patients depending on option", { + fx <- make_trend_unpaired_fixture() + + expect_warning( + trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + require_complete_pairs = FALSE + ), + "Using only paired patients", + fixed = TRUE + ) + + expect_error( + trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + require_complete_pairs = TRUE + ), + "Unpaired patients found", + fixed = TRUE + ) +}) + +test_that("trend_filter validates required inputs", { + fx <- make_trend_fixture() + + sampledata_missing_col <- fx$sampledata + sampledata_missing_col$patient_id <- NULL + + expect_error( + trend_filter( + expr = fx$expr, + sampledata = sampledata_missing_col, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor") + ), + "columns are missing from sampledata", + fixed = TRUE + ) + + expect_error( + trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + ratio = 1 + ), + "ratio must be a single numeric value greater than 1", + fixed = TRUE + ) + + expect_error( + trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list(Tumor_vs_Normal = fx$res), + baseline = "normal", + conditions = c(Tumor_vs_Normal = "tumor"), + scale = "raw" + ) + ) +}) + +test_that("trend_filter supports multiple comparisons", { + fx <- make_trend_fixture() + + res2 <- fx$res + res2$log2FoldChange <- -res2$log2FoldChange + + out <- trend_filter( + expr = fx$expr, + sampledata = fx$sampledata, + results = list( + Tumor_vs_Normal = fx$res, + Tumor_reversed = res2 + ), + baseline = "normal", + conditions = c( + Tumor_vs_Normal = "tumor", + Tumor_reversed = "tumor" + ), + ratio = 1.1, + scale = "linear" + ) + + expect_true("Tumor_vs_Normal" %in% names(out)) + expect_true("Tumor_reversed" %in% names(out)) + expect_s3_class(out$Summary, "data.frame") + expect_equal(nrow(out$Summary), 2) +})