diff --git a/.Rbuildignore b/.Rbuildignore index c57d8d6..e2f3e9c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,7 @@ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ +^\.positai$ +^\.claude$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 90cd220..7ed16d8 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,6 @@ rsconnect/ .quarto inst/doc docs +.positai +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 177895e..d5c8788 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,7 +16,6 @@ Authors@R: c( Description: This package contains functions that help in manipulating tables and generating plots for multi-omics analysis including genomics, transcriptomics, proteomics, methylomics and immunoinformatics. License: CC BY-NC-SA 4.0 Encoding: UTF-8 -RoxygenNote: 7.3.3 Imports: dplyr, ggplot2, @@ -25,13 +24,18 @@ Imports: graphics, magrittr, rlang, - stats, umap, tibble, scales, grid, utils, - patchwork + patchwork, + Gviz, + rtracklayer, + GenomeInfoDb, + GenomicRanges, + stats, + S4Vectors Suggests: biomaRt, circlize, @@ -55,12 +59,16 @@ Suggests: tidygraph, tidyr, tidyselect, - tm + tm, + testthat (>= 3.0.0), + withr, + Rsamtools Roxygen: list(markdown = TRUE) Depends: - R (>= 3.5) + R (>= 4.4.0) LazyData: true LazyDataCompression: xz VignetteBuilder: knitr URL: https://bigmindlab.github.io/OmicsKit - +Config/testthat/edition: 3 +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index 44d21fc..277bfa9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,11 +12,14 @@ export(get_superterm) export(getgenesPA) export(heatmap_PA) export(heatmap_path_PA) +export(list_ensembl_species) +export(list_ensembl_versions) export(list_gmts) export(merge_PA) export(multiplot_PA) export(network_clust) export(network_clust_gg) +export(nice_GenomeTrack) export(nice_KM) export(nice_PCA) export(nice_UMAP) @@ -30,7 +33,10 @@ export(split_cases) export(splot_PA) export(tpm) import(ggplot2) +importFrom(S4Vectors,"mcols<-") +importFrom(S4Vectors,mcols) importFrom(magrittr,"%>%") importFrom(patchwork,plot_layout) importFrom(rlang,.data) +importFrom(stats,na.omit) importFrom(utils,modifyList) diff --git a/R/get_annotations.R b/R/get_annotations.R index ff25e15..e4a91ef 100644 --- a/R/get_annotations.R +++ b/R/get_annotations.R @@ -1,139 +1,605 @@ +# ============================================================================= +# OmicsKit — Ensembl annotation utilities +# Functions: list_ensembl_versions, list_ensembl_species, get_annotations +# Internal helpers: .resolve_ensembl_host, .get_symbol_attribute, +# .resolve_symbol_column, .handle_biomart_connection_error, +# .handle_biomart_query_error +# ============================================================================= + + ############################ # Function get_annotations # ############################ -#' Get annotations from Ensembl. +#' Get gene or transcript annotations from Ensembl BioMart. #' #' This function annotates a column of transcripts or gene IDs (ENSEMBL) with information of the Biomart. -#' If transcript IDs are provided, they are also annotated with information of the genes to which they belong. #' #' The Gene information added include: -#' - Gene ENSEMBL ID, HGNC Symbol, Description, Biotype and Chromosome. +#' - Gene ENSEMBL ID, gene Symbol, Description, Biotype and Chromosome. #' - Gene start, end and length #' -#' @param ensembl_ids The column of transcripts to be used as input. -#' @param mode To specify the IDs provided, between "transcripts" or "genes". Default = genes. -#' @param version This function can use the versions 102, 103, and 112 of Ensembl. Default = "Current". -#' @param filename The name of the output file, which is table. Default = gene_annotations. -#' @param format The output is saved in .csv or .xlsx formats. Default = csv. +#' Annotates a vector of Ensembl gene or transcript IDs using BioMart. If +#' transcript IDs are provided, they are also annotated with information +#' of the genes to which they belong. Works for any species available in +#' Ensembl — use [list_ensembl_species()] to find the correct `species` value +#' for your organism, and [list_ensembl_versions()] to verify version availability. +#' +#' @details +#' **Workflow:** +#' \enumerate{ +#' \item `list_ensembl_species()` → find dataset name. +#' \item `list_ensembl_versions()` → confirm version exists. +#' \item `get_annotations(ensembl_ids, species = "drerio_gene_ensembl")`. +#' } +#' +#' **Symbol resolution by species:** +#' \itemize{ +#' \item Human (`hsapiens`): uses `hgnc_symbol`, falling back to +#' `external_gene_name` when HGNC symbol is absent. +#' \item Mouse (`mmusculus`): uses `mgi_symbol`. +#' \item Rat (`rnorvegicus`): uses `rgd_symbol`. +#' \item Zebrafish (`drerio`): uses `zfin_id_symbol`. +#' \item Drosophila (`dmelanogaster`): uses `flybasename_gene`. +#' \item All other species: uses `external_gene_name` (universal fallback). +#' } +#' +#' @param ensembl_ids A character vector of Ensembl gene or transcript IDs as inputs. +#' @param species The BioMart dataset identifier. Default = +#' `"hsapiens_gene_ensembl"` (human). Use [list_ensembl_species()] to find +#' the identifier for other organisms. +#' @param mode Either `"genes"` or `"transcripts"`. Default = `"genes"`. +#' @param version Ensembl release version as a string (e.g. `"112"`, `"114"`), +#' or `"current"`. Use [list_ensembl_versions()] to see available versions, +#' and [list_ensembl_species()] to confirm your species exists in that +#' version. Default = `"current"`. +#' @param filename Name of the output file without extension. If `NULL`, the +#' result is returned but **not** saved to disk. Default = +#' `"gene_annotations"`. +#' @param format Output format: `"csv"` or `"xlsx"`. Ignored when +#' `filename = NULL`. Default = `"csv"`. +#' #' @importFrom magrittr %>% #' @importFrom rlang .data #' -#' @return A data frame with one row per input ID and the following columns: -#' `geneID`, `symbol`, `biotype`, `chromosome`, `gene_start`, `gene_end`, -#' `gene_length`, `description`. For `mode = "transcripts"`, an additional -#' `transcriptID` column is included. The data frame is also saved to disk -#' as a `.csv` or `.xlsx` file (see `filename` and `format`). +#' @return A data frame with one row per input ID and columns: `geneID`, +#' `symbol`, `biotype`, `chromosome`, `gene_start`, `gene_end`, +#' `gene_length`, `description`. For `mode = "transcripts"`, a `transcriptID` +#' column is prepended. If `filename` is not `NULL`, the data frame is also +#' written to disk. #' #' @examples #' \dontrun{ -#' # Annotate genes from Normalized counts (requires internet connection) -#' data(norm_counts) +#' # Step 1 — find your species dataset +#' list_ensembl_species(version = "112", filter = "zebrafish") +#' # -> "drerio_gene_ensembl" #' -#' # Requires a reference table with a "geneID" column. -#' # Use get_annotations() to generate it: +#' # Step 2 — annotate (human, default) #' annotations <- get_annotations( -#' ensembl_ids = rownames(norm_counts), -#' mode = "genes" +#' ensembl_ids = c("ENSG00000141510", "ENSG00000012048"), +#' species = "hsapiens_gene_ensembl", +#' version = "112", +#' filename = NULL #' ) #' -#' head(annotations) +#' # Mouse +#' annotations_mouse <- get_annotations( +#' ensembl_ids = c("ENSMUSG00000059552", "ENSMUSG00000024610"), +#' species = "mmusculus_gene_ensembl", +#' version = "112", +#' filename = NULL +#' ) #' -#' # Use with add_annotations() -#' norm_counts_annot <- add_annotations( -#' object = norm_counts, -#' reference = annotations, -#' variables = c("symbol", "biotype") +#' # Zebrafish +#' annotations_zf <- get_annotations( +#' ensembl_ids = c("ENSDARG00000002333"), +#' species = "drerio_gene_ensembl", +#' version = "112", +#' filename = NULL #' ) #' } #' #' @note Requires an active internet connection to query the Ensembl BioMart. -#' `gene_length` is computed as `gene_end - gene_start + 1` (genomic length). -#' For TPM calculation with [tpm()], this is an approximation, -#' use transcript-level lengths for higher accuracy. +#' `gene_length` is computed as `gene_end - gene_start + 1` (genomic span). +#' For TPM calculation with [tpm()], transcript-level lengths are more +#' accurate. #' -#' @seealso [add_annotations()] to join annotations to a counts matrix; -#' [tpm()] which requires gene lengths from this function. +#' @seealso [list_ensembl_versions()], [list_ensembl_species()] to find +#' the identifier for other organisms, [add_annotations()], [tpm()] #' #' @export -get_annotations <- function(ensembl_ids, mode = "genes", filename = "gene_annotations", version = "Current", format = "csv") { - - if (!requireNamespace("biomaRt", quietly = TRUE)) stop("Package \"biomaRt\" must be installed to use this function.", call. = FALSE) - - if (version == "102") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://nov2020.archive.ensembl.org") - } else if (version == "103") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://feb2021.archive.ensembl.org") - } else if (version == "112") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://may2024.archive.ensembl.org") - } else if (version == "Current") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://useast.ensembl.org") +get_annotations <- function(ensembl_ids, + species = "hsapiens_gene_ensembl", + mode = "genes", + version = "current", + filename = "gene_annotations", + format = "csv") { + + # --- Dependency check ------------------------------------------------------- + if (!requireNamespace("biomaRt", quietly = TRUE)) { + stop( + "Package \"biomaRt\" must be installed to use this function.\n", + "Install it with: BiocManager::install(\"biomaRt\")", + call. = FALSE + ) } - annotations <- c("ensembl_gene_id", "hgnc_symbol", "gene_biotype", - "chromosome_name", "start_position", "end_position", - "description") + # --- Resolve host and connect ----------------------------------------------- + host <- .resolve_ensembl_host(version) + + message("Connecting to Ensembl BioMart [version: ", version, + " | species: ", species, "] ...") + + ensembl <- tryCatch( + biomaRt::useMart("ENSEMBL_MART_ENSEMBL", + dataset = species, + host = host), + error = function(e) .handle_biomart_connection_error(e, host, species, version) + ) + + # --- Detect appropriate symbol attribute for this species ------------------- + symbol_info <- .get_symbol_attribute(ensembl, species) + is_human <- grepl("^hsapiens", species) - new_names <- c("geneID", "symbol", "biotype", "chromosome", - "gene_start", "gene_end", "description") + # --- Define BioMart attributes ---------------------------------------------- + if (is_human) { + # Human: fetch both hgnc_symbol + external_gene_name, merge later + annotations_attrs <- c( + "ensembl_gene_id", "hgnc_symbol", "external_gene_name", + "gene_biotype", "chromosome_name", + "start_position", "end_position", "description" + ) + } else { + annotations_attrs <- c( + "ensembl_gene_id", symbol_info$attr, + "gene_biotype", "chromosome_name", + "start_position", "end_position", "description" + ) + } - # There are more annotations available in the biomaRt, check listAttributes(mart = ensembl) - # The terms "go_id" and "name_1006" can be added in a future release. + final_col_names <- c( + "geneID", "symbol", "biotype", "chromosome", + "gene_start", "gene_end", "description" + ) + # --- Query BioMart ---------------------------------------------------------- if (mode == "transcripts") { df <- data.frame(transcriptID = ensembl_ids) - genemap <- biomaRt::getBM(attributes = c("ensembl_transcript_id_version", annotations), - filters = "ensembl_transcript_id_version", - values = df$transcriptID, - mart = ensembl) + genemap <- tryCatch( + biomaRt::getBM( + attributes = c("ensembl_transcript_id_version", annotations_attrs), + filters = "ensembl_transcript_id_version", + values = df$transcriptID, + mart = ensembl + ), + error = function(e) .handle_biomart_query_error(e, host) + ) + + genemap <- .resolve_symbol_column(genemap, is_human, symbol_info$attr) + + keep_cols <- c("ensembl_transcript_id_version", "ensembl_gene_id", + "symbol", "gene_biotype", "chromosome_name", + "start_position", "end_position", "description") idx <- match(df$transcriptID, genemap$ensembl_transcript_id_version) - df <- merge(df, genemap[idx, c("ensembl_transcript_id_version", annotations)], - by.x = "transcriptID", by.y = "ensembl_transcript_id_version") - names(df) <- c("transcriptID", new_names) + df <- merge( + df, + genemap[idx, keep_cols], + by.x = "transcriptID", + by.y = "ensembl_transcript_id_version" + ) + names(df) <- c("transcriptID", final_col_names) } else { df <- data.frame(geneID = ensembl_ids) - genemap <- biomaRt::getBM(attributes = annotations, - filters = "ensembl_gene_id", - values = df$geneID, - mart = ensembl) + genemap <- tryCatch( + biomaRt::getBM( + attributes = annotations_attrs, + filters = "ensembl_gene_id", + values = df$geneID, + mart = ensembl + ), + error = function(e) .handle_biomart_query_error(e, host) + ) - idx <- match(df$geneID, genemap$ensembl_gene_id) - df <- merge(df, genemap[idx, annotations], by.x = "geneID", by.y = "ensembl_gene_id") - names(df) <- new_names + genemap <- .resolve_symbol_column(genemap, is_human, symbol_info$attr) + + keep_cols <- c("ensembl_gene_id", "symbol", "gene_biotype", + "chromosome_name", "start_position", + "end_position", "description") + idx <- match(df$geneID, genemap$ensembl_gene_id) + df <- merge( + df, + genemap[idx, keep_cols], + by.x = "geneID", + by.y = "ensembl_gene_id" + ) + names(df) <- final_col_names } + # --- Compute gene length ---------------------------------------------------- df$gene_length <- df$gene_end - df$gene_start + 1 - df <- df %>% dplyr::relocate(.data$gene_length, .before = "description") + df <- df %>% dplyr::relocate("gene_length", .before = "description") + + # --- Save to disk (optional) ------------------------------------------------ + if (!is.null(filename)) { + + if (format == "xlsx") { + if (!requireNamespace("openxlsx", quietly = TRUE)) { + stop( + "Package \"openxlsx\" must be installed to save in xlsx format.\n", + "Install it with: install.packages(\"openxlsx\")", + call. = FALSE + ) + } + openxlsx::write.xlsx(df, + file = paste0(filename, ".xlsx"), + colNames = TRUE, + rowNames = FALSE, + append = FALSE) + } else { + utils::write.csv(df, row.names = FALSE, file = paste0(filename, ".csv")) + } + + message("Annotations saved to: ", paste0(filename, ".", format)) + } + + return(df) +} + +############################ +# Function list_ensembl_versions # +############################ + +#' List available Ensembl releases and their BioMart hosts. +#' +#' A convenience wrapper around [biomaRt::listEnsemblArchives()] that returns +#' all available Ensembl release versions with their dates and host URLs. +#' +#' @details +#' Ensembl uses a **universal release numbering system**: every release (e.g. +#' v112, v113) covers *all* species simultaneously. What varies between species +#' is whether the species is present in a given release and which genome +#' assembly is used. To check whether your species of interest is available in +#' a particular version, use [list_ensembl_species()]. +#' +#' The recommended workflow is: +#' \enumerate{ +#' \item Find the dataset name for your species: `list_ensembl_species()`. +#' \item Find available versions and confirm species presence: +#' `list_ensembl_versions()` + `list_ensembl_species(version = "X")`. +#' \item Annotate: `get_annotations(species = "...", version = "...")`. +#' } +#' +#' @return A data frame with columns `name`, `date`, `url`, `version`, and +#' `current_release` (marked with `*` for the active release). +#' +#' @examples +#' \dontrun{ +#' # See all available Ensembl versions +#' list_ensembl_versions() +#' +#' # Then check if your species is in a specific version +#' list_ensembl_species(version = "112") +#' } +#' +#' @seealso [list_ensembl_species()], [get_annotations()] +#' @export + +list_ensembl_versions <- function() { - if (format == "xlsx") { - if (!requireNamespace("openxlsx", quietly = TRUE)) { + if (!requireNamespace("biomaRt", quietly = TRUE)) { + stop( + "Package \"biomaRt\" must be installed to use this function.\n", + "Install it with: BiocManager::install(\"biomaRt\")", + call. = FALSE + ) + } + + archives <- tryCatch( + biomaRt::listEnsemblArchives(), + error = function(e) { stop( - "Package \"openxlsx\" must be installed to use this function.", + "Could not retrieve Ensembl archive list.\n", + "This is likely a connectivity issue with the Ensembl server.\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Try again later.\n", + " - Visit https://www.ensembl.org/info/website/archives/index.html", call. = FALSE + ) + } + ) + + df <- archives[, c("name", "date", "url", "version", "current_release")] + # Exclude the legacy GRCh37 entry — it is a fixed human-only site, + # not a versioned multi-species release. + df <- df[df$version != "GRCh37", ] + rownames(df) <- NULL + + message( + "NOTE: Ensembl release numbers are UNIVERSAL across all species.\n", + "A species may not exist in every version. Use list_ensembl_species(version)\n", + "to confirm that your species dataset is available in a given release." + ) + + return(df) +} + + +############################ +# Function list_ensembl_species # +############################ + +#' List all species datasets available in a given Ensembl version. +#' +#' Queries BioMart at the specified Ensembl version and returns a data frame +#' of all available gene datasets. Use this to: +#' \itemize{ +#' \item Find the exact `species` string to pass to [get_annotations()]. +#' \item Confirm that a species is available in a specific Ensembl version. +#' } +#' +#' @details +#' Because Ensembl releases are universal (same version number for all +#' species), this function simply lists which species datasets exist in the +#' BioMart of the requested version. Species added in later releases will not +#' appear in older versions. +#' +#' @param version Ensembl release version as a string (e.g. `"112"`, `"114"`), +#' or `"current"` for the latest release. Default = `"current"`. +#' @param filter Optional. A search string to filter results by dataset name +#' or description (case-insensitive). Useful for quickly finding a species +#' without scrolling through hundreds of rows. Default = `NULL` (no filter). +#' +#' @return A data frame with columns: +#' \describe{ +#' \item{dataset}{The dataset identifier to pass to `species` in +#' [get_annotations()] (e.g. `"hsapiens_gene_ensembl"`).} +#' \item{description}{Human-readable species name and assembly +#' (e.g. `"Human genes (GRCh38.p14)"`).} +#' \item{version}{Assembly version string.} +#' } +#' +#' @examples +#' \dontrun{ +#' # All species in version 112 +#' list_ensembl_species(version = "112") +#' +#' # Find zebrafish dataset in version 112 +#' list_ensembl_species(version = "112", filter = "zebrafish") +#' +#' # Find mouse dataset +#' list_ensembl_species(filter = "mouse") +#' +#' # Find all fish species +#' list_ensembl_species(filter = "fish") +#' } +#' +#' @seealso [list_ensembl_versions()], [get_annotations()] +#' @export + +list_ensembl_species <- function(version = "current", filter = NULL) { + + if (!requireNamespace("biomaRt", quietly = TRUE)) { + stop( + "Package \"biomaRt\" must be installed to use this function.\n", + "Install it with: BiocManager::install(\"biomaRt\")", + call. = FALSE + ) + } + + host <- .resolve_ensembl_host(version) + + message("Fetching species list from Ensembl version ", version, "...") + + mart <- tryCatch( + biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host = host), + error = function(e) .handle_biomart_connection_error(e, host) + ) + + datasets <- tryCatch( + biomaRt::listDatasets(mart), + error = function(e) .handle_biomart_query_error(e, host) + ) + + # Keep only gene datasets + df <- datasets[grepl("_gene_ensembl$", datasets$dataset), ] + rownames(df) <- NULL + + # Apply optional text filter across dataset name and description + if (!is.null(filter) && nchar(trimws(filter)) > 0) { + mask <- grepl(filter, df$dataset, ignore.case = TRUE) | + grepl(filter, df$description, ignore.case = TRUE) + df <- df[mask, ] + + if (nrow(df) == 0) { + message( + "No species found matching \"", filter, "\" in Ensembl version ", + version, ".\n", + "Try a broader search term or check list_ensembl_species(version = \"", + version, "\") without a filter." + ) + return(invisible(df)) + } + } + + message( + nrow(df), " species datasets found", + if (!is.null(filter)) paste0(" matching \"", filter, "\"") else "", + " (Ensembl v", version, ").\n", + "Pass the `dataset` column value to the `species` argument of get_annotations()." + ) + + return(df) +} + + +# ============================================================================= +# Internal helpers +# ============================================================================= + +# Resolve BioMart host URL from version string +.resolve_ensembl_host <- function(version) { + + if (tolower(as.character(version)) == "current") { + return("https://www.ensembl.org") + } + + archives <- tryCatch( + biomaRt::listEnsemblArchives(), + error = function(e) { + stop( + "Could not retrieve the list of Ensembl archives.\n", + "This may be due to a network or server issue.\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Use version = \"current\" to try the main server.\n", + " - Visit https://www.ensembl.org/info/website/archives/index.html", + call. = FALSE + ) + } + ) + + matched <- archives[archives$version == as.character(version), ] + + if (nrow(matched) == 0) { + available <- paste( + sort(archives$version[archives$version != "GRCh37"]), + collapse = ", " + ) + stop( + "Ensembl version \"", version, "\" was not found.\n", + "Available versions: ", available, "\n", + "Use list_ensembl_versions() to see the full list.", + call. = FALSE + ) + } + + return(matched$url[1]) +} + + +# Detect the best symbol attribute for a species +# Returns a list: list(attr = "attribute_name", source = "label for messages") +.get_symbol_attribute <- function(mart, species) { + + if (grepl("^hsapiens", species)) { + return(list(attr = "hgnc_symbol", source = "HGNC")) + } + + # Curated map: species prefix → preferred symbol attribute + source label + curated_map <- list( + mmusculus = list(attr = "mgi_symbol", source = "MGI"), + rnorvegicus = list(attr = "rgd_symbol", source = "RGD"), + drerio = list(attr = "zfin_id_symbol", source = "ZFIN"), + dmelanogaster = list(attr = "flybasename_gene", source = "FlyBase"), + celegans = list(attr = "wormbase_gene", source = "WormBase") + ) + + # Check if this species matches any curated prefix + for (prefix in names(curated_map)) { + if (grepl(paste0("^", prefix), species)) { + entry <- curated_map[[prefix]] + + # Verify the attribute actually exists in this mart/version + available_attrs <- tryCatch( + biomaRt::listAttributes(mart)$name, + error = function(e) character(0) + ) + + if (entry$attr %in% available_attrs) { + return(entry) + } else { + message( + "Note: preferred symbol attribute '", entry$attr, "' (", + entry$source, ") is not available for this species/version.\n", + "Falling back to 'external_gene_name'." ) + return(list(attr = "external_gene_name", source = "External")) + } + } + } + + # Default fallback for non-curated species + return(list(attr = "external_gene_name", source = "External")) +} + + +# Resolve symbol column (human: HGNC fallback to external; others: rename attr) +.resolve_symbol_column <- function(genemap, is_human, symbol_attr) { + + if (is_human) { + # Ensure both columns exist + if (!"hgnc_symbol" %in% names(genemap)) { + genemap$hgnc_symbol <- NA_character_ + } + if (!"external_gene_name" %in% names(genemap)) { + genemap$external_gene_name <- NA_character_ } - openxlsx::write.xlsx(df, file = paste0(filename, ".xlsx"), - colNames = TRUE, rowNames = FALSE, append = FALSE) + # Prefer HGNC, fallback to external gene name + genemap$symbol <- ifelse( + is.na(genemap$hgnc_symbol) | genemap$hgnc_symbol == "", + genemap$external_gene_name, + genemap$hgnc_symbol + ) } else { - utils::write.csv(df, row.names = FALSE, file = paste0(filename, ".csv")) + # Non-human: rename preferred attribute to "symbol" if present + if (symbol_attr %in% names(genemap)) { + genemap$symbol <- genemap[[symbol_attr]] + } else if ("external_gene_name" %in% names(genemap)) { + genemap$symbol <- genemap[["external_gene_name"]] + } else { + genemap$symbol <- NA_character_ + } } - return(df) + return(genemap) +} + + +# Standardized connection error handler +.handle_biomart_connection_error <- function(e, host, species = NULL, version = NULL) { + + ctx <- "" + if (!is.null(version)) { + ctx <- paste0(ctx, "version: ", version) + } + if (!is.null(species)) { + ctx <- paste0(ctx, if (nchar(ctx) > 0) " | " else "", "species: ", species) + } + if (nchar(ctx) > 0) ctx <- paste0(" [", ctx, "]") + + stop( + "Could not connect to Ensembl BioMart", ctx, ".\n", + "Host: ", host, "\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Try again later.\n", + " - Use version = \"current\" to try the main server.\n", + " - Visit https://www.ensembl.org/info/website/archives/index.html", + call. = FALSE + ) +} + + +# Standardized query error handler +.handle_biomart_query_error <- function(e, host) { + stop( + "BioMart query failed.\n", + "Host: ", host, "\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Try again later.\n", + " - Confirm that the species dataset exists in this version.\n", + " - Use list_ensembl_species(version = \"X\") to verify.", + call. = FALSE + ) } diff --git a/R/nice_GenomeTrack.R b/R/nice_GenomeTrack.R new file mode 100644 index 0000000..abc040c --- /dev/null +++ b/R/nice_GenomeTrack.R @@ -0,0 +1,624 @@ +############################## +# Function nice_GenomeTrack # +############################## + +#' Plot genomic tracks with gene annotations and optional data tracks. +#' +#' Builds a multi-track genomic visualization using `Gviz`. The function can +#' annotate all genes in a genomic region or restrict annotations to user-supplied +#' gene IDs/symbols. Optional data tracks include BAM, BigWig, and BED files. +#' +#' @param region Genomic region to visualize. Either a `GRanges` object or a +#' named vector with `chr`, `start`, and `end` entries +#' (e.g. `c(chr = "chr1", start = 1e6, end = 2e6)`). +#' @param genome_label Assembly label used by `Gviz` (e.g. "hg38", "mm10"). +#' This is not a BioMart parameter; it is only used for track metadata. +#' @param organism Ensembl BioMart dataset name (e.g. "hsapiens_gene_ensembl"). +#' Use [list_ensembl_species()] to find valid dataset identifiers. +#' @param ensembl_version Ensembl release version (e.g. "112") or "current". +#' Determines which Ensembl archive host is queried. +#' @param annotations Optional data frame of precomputed annotations (for +#' example from [get_annotations()]). Must include columns `geneID`, `symbol`, +#' `chromosome`, `gene_start`, `gene_end`. +#' @param gene_ids Optional character vector of gene identifiers (Ensembl IDs +#' or symbols). If provided, annotations are queried internally. +#' @param tracks Named list of file paths for data tracks. Supported formats: +#' `bam`, `bw`/`bigwig`, `bed`. +#' @param highlight_genes Character vector of gene symbols to highlight. +#' @param gene_color Color for highlighted genes. +#' @param other_color Color for non-highlighted genes. +#' @param show_transcripts Logical; if `TRUE`, adds a `GeneRegionTrack` with +#' transcripts from Ensembl. +#' @param track_sizes Optional numeric vector of relative heights for tracks. +#' If provided, its length must match `track_list`. If `NULL`, sizes are +#' computed automatically. +#' @param export_pdf Optional file path to save a PDF. If `NULL`, the plot is +#' rendered to the active device. +#' +#' @return A list of `Gviz` track objects (invisibly). +#' +#' @examples +#' \dontrun{ +#' nice_GenomeTrack( +#' region = c(chr = "chr17", start = 7e6, end = 8e6), +#' genome_label = "hg38", +#' organism = "hsapiens_gene_ensembl", +#' ensembl_version = "current", +#' tracks = list(ChIP = "chip_signal.bw") +#' ) +#' } +#' +#' @importFrom stats na.omit +#' @importFrom S4Vectors mcols mcols<- +#' +#' @note +#' BAM files require an index file with the `.bai` suffix in the same directory +#' (e.g. `sample.bam.bai`). If it is missing, an error is raised. +#' +#' @export + +nice_GenomeTrack <- function( + region, + genome_label = "hg38", + organism = "hsapiens_gene_ensembl", + ensembl_version = "current", + annotations = NULL, + gene_ids = NULL, + tracks = list(), + highlight_genes = NULL, + gene_color = "orangered", + other_color = "#7B68EE", + show_transcripts = FALSE, + track_sizes = NULL, + export_pdf = NULL +) { + # ---- Package checks --------------------------------------------------------- + if (!requireNamespace("Gviz", quietly = TRUE)) { + stop( + "Package \"Gviz\" must be installed to use this function.", + call. = FALSE + ) + } + if (!requireNamespace("biomaRt", quietly = TRUE)) { + stop( + "Package \"biomaRt\" must be installed to use this function.", + call. = FALSE + ) + } + if (!requireNamespace("GenomicRanges", quietly = TRUE)) { + stop( + "Package \"GenomicRanges\" must be installed to use this function.", + call. = FALSE + ) + } + if (!requireNamespace("GenomeInfoDb", quietly = TRUE)) { + stop( + "Package \"GenomeInfoDb\" must be installed to use this function.", + call. = FALSE + ) + } + + # ---- Validate mutually exclusive inputs ----------------------------------- + if (!is.null(annotations) && !is.null(gene_ids)) { + stop("Provide only one of `annotations` or `gene_ids`, not both.", call. = FALSE) + } + + # ---- Parse genomic region -------------------------------------------------- + if (inherits(region, "GRanges")) { + chr <- as.character(GenomicRanges::seqnames(region))[1] + start <- min(GenomicRanges::start(region)) + end <- max(GenomicRanges::end(region)) + } else { + if (is.null(names(region)) || !all(c("chr", "start", "end") %in% names(region))) { + stop("`region` must be a GRanges or a named vector with chr/start/end.", call. = FALSE) + } + chr <- as.character(region["chr"]) + start <- as.numeric(region["start"]) + end <- as.numeric(region["end"]) + } + + if (is.na(start) || is.na(end) || is.na(chr)) { + stop("`region` must include valid chr/start/end values.", call. = FALSE) + } + if (start > end) { + stop("`region` start must be less than or equal to end.", call. = FALSE) + } + + # ---- Resolve Ensembl host -------------------------------------------------- + if (!exists(".resolve_ensembl_host", + mode = "function")) { + stop( + "Internal helper `.resolve_ensembl_host()` not found. ", + "Please update OmicsKit to include the Ensembl helpers from PR #44.", + call. = FALSE + ) + } + + host <- .resolve_ensembl_host(ensembl_version) + + # ---- Connect to BioMart ---------------------------------------------------- + mart <- tryCatch( + biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = organism, host = host), + error = function(e) { + if (exists(".handle_biomart_connection_error", mode = "function")) { + .handle_biomart_connection_error(e, host, organism, ensembl_version) + } + stop(e$message, call. = FALSE) + } + ) + + # ---- Basic metadata -------------------------------------------------------- + chr_num <- sub("^chr", "", chr) + is_human <- grepl("^hsapiens", organism) + + # Resolve the best symbol attribute for non-human species if available + symbol_info <- if (exists(".get_symbol_attribute", mode = "function")) { + .get_symbol_attribute(mart, organism) + } else { + list(attr = "external_gene_name", source = "External") + } + + # ---- Define BioMart attributes -------------------------------------------- + if (is_human) { + attributes <- c( + "ensembl_gene_id", "hgnc_symbol", "external_gene_name", + "chromosome_name", "start_position", "end_position", "strand" + ) + } else { + attributes <- c( + "ensembl_gene_id", symbol_info$attr, + "chromosome_name", "start_position", "end_position", "strand" + ) + } + + # ---- Helper: BioMart query with error handling ----------------------------- + query_biomart <- function(filters, values) { + tryCatch( + biomaRt::getBM( + attributes = attributes, + filters = filters, + values = values, + mart = mart + ), + error = function(e) { + if (exists(".handle_biomart_query_error", mode = "function")) { + .handle_biomart_query_error(e, host) + } + stop(e$message, call. = FALSE) + } + ) + } + + # ---- Helper: resolve symbol column ---------------------------------------- + resolve_symbol <- function(df) { + if (exists(".resolve_symbol_column", mode = "function")) { + return(.resolve_symbol_column(df, is_human, symbol_info$attr)) + } + + if (is_human) { + if (!"hgnc_symbol" %in% names(df)) df$hgnc_symbol <- NA_character_ + if (!"external_gene_name" %in% names(df)) df$external_gene_name <- NA_character_ + df$symbol <- ifelse( + is.na(df$hgnc_symbol) | df$hgnc_symbol == "", + df$external_gene_name, + df$hgnc_symbol + ) + } else if (symbol_info$attr %in% names(df)) { + df$symbol <- df[[symbol_info$attr]] + } else if ("external_gene_name" %in% names(df)) { + df$symbol <- df[["external_gene_name"]] + } else { + df$symbol <- NA_character_ + } + + df + } + + # ---- Build annotation data ----------------------------------------------- + if (!is.null(annotations)) { + required_cols <- c("geneID", "symbol", "chromosome", "gene_start", "gene_end") + if (!all(required_cols %in% names(annotations))) { + stop( + "`annotations` must include columns: ", + paste(required_cols, collapse = ", "), + call. = FALSE + ) + } + + # Use provided annotations directly + anno_data <- data.frame( + ensembl_gene_id = annotations$geneID, + symbol = annotations$symbol, + chromosome_name = annotations$chromosome, + start_position = annotations$gene_start, + end_position = annotations$gene_end, + strand = if ("strand" %in% names(annotations)) annotations$strand else "*", + stringsAsFactors = FALSE + ) + + } else if (!is.null(gene_ids)) { + # Query only the requested genes + gene_ids <- as.character(gene_ids) + ensembl_ids <- gene_ids[grepl("^ENS", gene_ids)] + symbol_ids <- setdiff(gene_ids, ensembl_ids) + + res_ids <- if (length(ensembl_ids) > 0) { + query_biomart("ensembl_gene_id", ensembl_ids) + } else { + data.frame() + } + + res_symbols <- data.frame() + if (length(symbol_ids) > 0) { + if (is_human) { + res_hgnc <- query_biomart("hgnc_symbol", symbol_ids) + res_ext <- query_biomart("external_gene_name", symbol_ids) + res_symbols <- rbind(res_hgnc, res_ext) + # Remove duplicate Ensembl IDs introduced by dual queries + res_symbols <- res_symbols[!duplicated(res_symbols$ensembl_gene_id), ] + } else { + res_symbols <- query_biomart(symbol_info$attr, symbol_ids) + } + } + + anno_data <- rbind(res_ids, res_symbols) + + if (nrow(anno_data) == 0) { + stop("No annotations found for the provided gene IDs.", call. = FALSE) + } + + anno_data <- resolve_symbol(anno_data) + + } else { + # Query all genes in the region + anno_data <- query_biomart( + filters = c("chromosome_name", "start", "end"), + values = list(chromosome_name = chr_num, start = start, end = end) + ) + + if (nrow(anno_data) == 0) { + stop("No genes found in the requested region.", call. = FALSE) + } + + anno_data <- resolve_symbol(anno_data) + } + + # Ensure symbol column exists + if (!"symbol" %in% names(anno_data)) { + anno_data <- resolve_symbol(anno_data) + } + + # Ensure required coordinate columns exist + if (!"chromosome_name" %in% names(anno_data)) { + if ("chromosome" %in% names(anno_data)) { + anno_data$chromosome_name <- anno_data$chromosome + } else { + stop("Annotation data must include chromosome information.", call. = FALSE) + } + } + + if (!"start_position" %in% names(anno_data)) { + if ("gene_start" %in% names(anno_data)) { + anno_data$start_position <- anno_data$gene_start + } else { + stop("Annotation data must include gene start positions.", call. = FALSE) + } + } + + if (!"end_position" %in% names(anno_data)) { + if ("gene_end" %in% names(anno_data)) { + anno_data$end_position <- anno_data$gene_end + } else { + stop("Annotation data must include gene end positions.", call. = FALSE) + } + } + + # ---- If user provided genes, ensure they are within the region ------------- + if (!is.null(gene_ids) || !is.null(annotations)) { + outside <- anno_data$chromosome_name != chr_num | + anno_data$start_position < start | + anno_data$end_position > end + + if (any(outside, na.rm = TRUE)) { + outside_genes <- unique(na.omit(anno_data$symbol[outside])) + outside_genes <- outside_genes[nzchar(outside_genes)] + + same_chr <- anno_data$chromosome_name == chr_num + min_start <- min(anno_data$start_position[same_chr], na.rm = TRUE) + max_end <- max(anno_data$end_position[same_chr], na.rm = TRUE) + + left_expand <- if (is.finite(min_start) && min_start < start) start - min_start else 0 + right_expand <- if (is.finite(max_end) && max_end > end) max_end - end else 0 + + suggestion <- paste0( + "Current region: ", chr, ":", start, "-", end, "." + ) + + if (left_expand > 0 || right_expand > 0) { + suggestion <- paste0( + suggestion, + " Consider extending to ", chr, ":", + min(start, min_start, na.rm = TRUE), "-", + max(end, max_end, na.rm = TRUE), + " (", left_expand, " bp upstream, ", right_expand, " bp downstream)." + ) + } + + if (length(outside_genes) == 0) { + outside_genes <- unique(na.omit(anno_data$ensembl_gene_id[outside])) + } + + stop( + "Some requested genes fall outside the region.\n", + if (length(outside_genes) > 0) { + paste0("Outside genes: ", paste(outside_genes, collapse = ", "), "\n") + } else { + "" + }, + suggestion, + call. = FALSE + ) + } + } + + # ---- Keep only genes fully inside the region ------------------------------- + anno_data <- anno_data[anno_data$chromosome_name == chr_num & + anno_data$start_position >= start & + anno_data$end_position <= end, , drop = FALSE] + + # Fix strand encoding from BioMart (numeric -> "+/-") + anno_data$strand <- ifelse( + anno_data$strand == 1, "+", + ifelse(anno_data$strand == -1, "-", "*") + ) + + # ---- Build GRanges for genes ---------------------------------------------- + gr <- GenomicRanges::makeGRangesFromDataFrame( + anno_data, + seqnames.field = "chromosome_name", + start.field = "start_position", + end.field = "end_position", + keep.extra.columns = TRUE + ) + GenomeInfoDb::seqlevelsStyle(gr) <- "UCSC" + + # Replace missing symbols with Ensembl IDs + anno_data$symbol <- ifelse( + is.na(anno_data$symbol) | anno_data$symbol == "", + anno_data$ensembl_gene_id, + anno_data$symbol + ) + + # ---- Build fill colors for gene track ------------------------------------- + if (is.null(highlight_genes)) { + fill_colors <- rep(other_color, length.out = nrow(anno_data)) + } else { + fill_colors <- ifelse( + anno_data$symbol %in% highlight_genes, + gene_color, + other_color + ) + } + + # ---- Axis track ------------------------------------------------------------ + axis_track <- Gviz::GenomeAxisTrack() + + # Store symbol as "gene" for labeling + gr$gene <- anno_data$symbol + + # ---- Gene annotation track ------------------------------------------------- + anno_track <- Gviz::AnnotationTrack( + gr, + group = gr$gene, + groupAnnotation = "group", + just.group = "below", + fill = fill_colors, + col = NA, + lwd.border = 0, + shape = "box", + name = "Genes", + fontcolor.group = "black", + cex.group = 0.8, + fontface.group = 1, + just.title = "right" + ) + + track_list <- list(axis_track, anno_track) + + # ---- Optional transcript track -------------------------------------------- + if (isTRUE(show_transcripts)) { + bm_transcripts <- biomaRt::getBM( + attributes = c( + "ensembl_gene_id", "ensembl_transcript_id", + "chromosome_name", "transcript_start", + "transcript_end", "strand", + "external_gene_name", "gene_biotype", + "transcript_biotype" + ), + filters = c("chromosome_name", "start", "end"), + values = list( + chromosome_name = chr_num, + start = start, + end = end + ), + mart = mart + ) + + # If gene_ids are provided, keep only those transcripts + if (!is.null(gene_ids)) { + bm_transcripts <- bm_transcripts[bm_transcripts$external_gene_name %in% gene_ids, ] + } + + if (nrow(bm_transcripts) == 0) { + warning("No transcripts found in the requested region.", call. = FALSE) + } else { + # Fix strand encoding for transcripts + bm_transcripts$strand <- ifelse( + bm_transcripts$strand == 1, "+", + ifelse(bm_transcripts$strand == -1, "-", "*") + ) + + # Build GRanges for transcripts + gr_tx <- GenomicRanges::makeGRangesFromDataFrame( + bm_transcripts, + seqnames.field = "chromosome_name", + start.field = "transcript_start", + end.field = "transcript_end", + strand.field = "strand", + keep.extra.columns = TRUE + ) + GenomeInfoDb::seqlevelsStyle(gr_tx) <- "UCSC" + + # Force standard metadata column names for labels + mcols(gr_tx)$gene <- mcols(gr_tx)$ensembl_gene_id + mcols(gr_tx)$transcript <- mcols(gr_tx)$ensembl_transcript_id + + transcript_track <- Gviz::GeneRegionTrack( + gr_tx, + genome = genome_label, + chromosome = chr, + name = "Transcripts", + transcriptAnnotation = "transcript", + col = NULL, + fill = other_color + ) + + track_list <- c(track_list, list(transcript_track)) + } + } + + # ---- Optional data tracks -------------------------------------------------- + if (length(tracks) > 0) { + for (nm in names(tracks)) { + f <- tracks[[nm]] + ext <- tolower(tools::file_ext(f)) + + if (!file.exists(f)) { + stop("Track file not found: ", f, call. = FALSE) + } + + tr <- switch( + ext, + bam = { + if (!requireNamespace("Rsamtools", quietly = TRUE)) { + stop("Package \"Rsamtools\" must be installed for BAM tracks.", call. = FALSE) + } + if (!file.exists(paste0(f, ".bai"))) { + stop("BAM index not found: ", paste0(f, ".bai"), call. = FALSE) + } + Gviz::AlignmentsTrack( + f, + isPaired = TRUE, + name = nm, + genome = genome_label, + chromosome = chr + ) + }, + bw =, + bigwig = { + if (!requireNamespace("rtracklayer", quietly = TRUE)) { + stop("Package \"rtracklayer\" must be installed for BigWig tracks.", call. = FALSE) + } + Gviz::DataTrack( + f, + genome = genome_label, + chromosome = chr, + name = nm, + type = "h", + fill.histogram = other_color, + just.title = "right" + ) + }, + bed = { + if (!requireNamespace("rtracklayer", quietly = TRUE)) { + stop("Package \"rtracklayer\" must be installed for BED tracks.", call. = FALSE) + } + bed_gr <- rtracklayer::import(f) + + # Use BED "name" column as labels when available + if ("name" %in% names(mcols(bed_gr))) { + mcols(bed_gr)$label <- mcols(bed_gr)$name + } else { + mcols(bed_gr)$label <- NA_character_ + } + + Gviz::AnnotationTrack( + bed_gr, + genome = genome_label, + chromosome = chr, + name = nm, + shape = "box", + group = bed_gr$label, + groupAnnotation = "group", + just.group = "below", + showFeatureId = FALSE, + fontcolor.group = "black", + cex.group = 0.8, + just.title = "right" + ) + }, + stop( + "Unsupported file format: .", ext, + "\nSupported: .bam, .bw, .bigwig, .bed", + call. = FALSE + ) + ) + + track_list <- c(track_list, list(tr)) + } + } + + # ---- Optional PDF export --------------------------------------------------- + if (!is.null(export_pdf)) { + grDevices::pdf(export_pdf, width = 13, height = 1.8 * length(track_list)) + } + + # ---- Track sizes ----------------------------------------------------------- + if (!is.null(track_sizes)) { + if (length(track_sizes) != length(track_list)) { + stop("`track_sizes` must have the same length as `track_list`.", call. = FALSE) + } + sizes <- track_sizes + } else { + n_extra <- length(track_list) - 2 - ifelse(isTRUE(show_transcripts), 1, 0) + sizes <- c( + 1, # GenomeAxis + 4, # Genes + if (isTRUE(show_transcripts)) 10 else NULL, # Transcripts + rep(3, n_extra) # Additional tracks + ) + } + + # ---- Plot tracks ----------------------------------------------------------- + Gviz::plotTracks( + track_list, + chromosome = chr, + from = start, + to = end, + genome = genome_label, + rotation.title = 0, + title.width = 3, + cex.title = 1.2, + fontface.title = 1, + col.title = "black", + background.title = "white", + col.border.title = "white", + lwd.title = 0, + just.title = "right", + col.axis = "black", + cex.axis = 0.6, + col.frame = "white", + frame = TRUE, + main = paste0("Genomic Coordinates (", genome_label, ")"), + cex.main = 1.2, + fontface.main = 1, + sizes = sizes + ) + + # ---- Close PDF device if needed ------------------------------------------- + if (!is.null(export_pdf)) { + grDevices::dev.off() + } + + invisible(track_list) +} diff --git a/man/OmicsKit-package.Rd b/man/OmicsKit-package.Rd index 8e9a101..71d4bcc 100644 --- a/man/OmicsKit-package.Rd +++ b/man/OmicsKit-package.Rd @@ -7,12 +7,20 @@ \title{OmicsKit: A bioinformatics toolset for multiomics analysis} \description{ This package contains functions that help in manipulating tables and generating plots for multi-omics analysis including genomics, transcriptomics, proteomics, methylomics and immunoinformatics. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://bigmindlab.github.io/OmicsKit} +} + } \author{ \strong{Maintainer}: Daniel Guevara \email{dgdiaz011202@gmail.com} (\href{https://orcid.org/0009-0001-2786-8729}{ORCID}) Authors: \itemize{ + \item Daniel Guevara \email{dgdiaz011202@gmail.com} (\href{https://orcid.org/0009-0001-2786-8729}{ORCID}) \item Daniel Garbozo \email{dan.garbozo.1201.urp@gmail.com} (\href{https://orcid.org/0009-0003-2495-6568}{ORCID}) \item Angela Alarcon \email{alarcondcg@gmail.com} (\href{https://orcid.org/0000-0003-0293-5603}{ORCID}) \item Jennifer Almeyda \email{jalmeydatz@gmail.com} (\href{https://orcid.org/0009-0003-9589-1167}{ORCID}) diff --git a/man/get_annotations.Rd b/man/get_annotations.Rd index f741973..a5144f0 100644 --- a/man/get_annotations.Rd +++ b/man/get_annotations.Rd @@ -2,75 +2,118 @@ % Please edit documentation in R/get_annotations.R \name{get_annotations} \alias{get_annotations} -\title{Get annotations from Ensembl.} +\title{Get gene or transcript annotations from Ensembl BioMart.} \usage{ get_annotations( ensembl_ids, + species = "hsapiens_gene_ensembl", mode = "genes", + version = "current", filename = "gene_annotations", - version = "Current", format = "csv" ) } \arguments{ -\item{ensembl_ids}{The column of transcripts to be used as input.} +\item{ensembl_ids}{A character vector of Ensembl gene or transcript IDs as inputs.} -\item{mode}{To specify the IDs provided, between "transcripts" or "genes". Default = genes.} +\item{species}{The BioMart dataset identifier. Default = +\code{"hsapiens_gene_ensembl"} (human). Use \code{\link[=list_ensembl_species]{list_ensembl_species()}} to find +the identifier for other organisms.} -\item{filename}{The name of the output file, which is table. Default = gene_annotations.} +\item{mode}{Either \code{"genes"} or \code{"transcripts"}. Default = \code{"genes"}.} -\item{version}{This function can use the versions 102, 103, and 112 of Ensembl. Default = "Current".} +\item{version}{Ensembl release version as a string (e.g. \code{"112"}, \code{"114"}), +or \code{"current"}. Use \code{\link[=list_ensembl_versions]{list_ensembl_versions()}} to see available versions, +and \code{\link[=list_ensembl_species]{list_ensembl_species()}} to confirm your species exists in that +version. Default = \code{"current"}.} -\item{format}{The output is saved in .csv or .xlsx formats. Default = csv.} +\item{filename}{Name of the output file without extension. If \code{NULL}, the +result is returned but \strong{not} saved to disk. Default = +\code{"gene_annotations"}.} + +\item{format}{Output format: \code{"csv"} or \code{"xlsx"}. Ignored when +\code{filename = NULL}. Default = \code{"csv"}.} } \value{ -A data frame with one row per input ID and the following columns: -\code{geneID}, \code{symbol}, \code{biotype}, \code{chromosome}, \code{gene_start}, \code{gene_end}, -\code{gene_length}, \code{description}. For \code{mode = "transcripts"}, an additional -\code{transcriptID} column is included. The data frame is also saved to disk -as a \code{.csv} or \code{.xlsx} file (see \code{filename} and \code{format}). +A data frame with one row per input ID and columns: \code{geneID}, +\code{symbol}, \code{biotype}, \code{chromosome}, \code{gene_start}, \code{gene_end}, +\code{gene_length}, \code{description}. For \code{mode = "transcripts"}, a \code{transcriptID} +column is prepended. If \code{filename} is not \code{NULL}, the data frame is also +written to disk. } \description{ This function annotates a column of transcripts or gene IDs (ENSEMBL) with information of the Biomart. -If transcript IDs are provided, they are also annotated with information of the genes to which they belong. } \details{ The Gene information added include: \itemize{ -\item Gene ENSEMBL ID, HGNC Symbol, Description, Biotype and Chromosome. +\item Gene ENSEMBL ID, gene Symbol, Description, Biotype and Chromosome. \item Gene start, end and length } + +Annotates a vector of Ensembl gene or transcript IDs using BioMart. If +transcript IDs are provided, they are also annotated with information +of the genes to which they belong. Works for any species available in +Ensembl — use \code{\link[=list_ensembl_species]{list_ensembl_species()}} to find the correct \code{species} value +for your organism, and \code{\link[=list_ensembl_versions]{list_ensembl_versions()}} to verify version availability. + +\strong{Workflow:} +\enumerate{ +\item \code{list_ensembl_species()} → find dataset name. +\item \code{list_ensembl_versions()} → confirm version exists. +\item \code{get_annotations(ensembl_ids, species = "drerio_gene_ensembl")}. +} + +\strong{Symbol resolution by species:} +\itemize{ +\item Human (\code{hsapiens}): uses \code{hgnc_symbol}, falling back to +\code{external_gene_name} when HGNC symbol is absent. +\item Mouse (\code{mmusculus}): uses \code{mgi_symbol}. +\item Rat (\code{rnorvegicus}): uses \code{rgd_symbol}. +\item Zebrafish (\code{drerio}): uses \code{zfin_id_symbol}. +\item Drosophila (\code{dmelanogaster}): uses \code{flybasename_gene}. +\item All other species: uses \code{external_gene_name} (universal fallback). +} } \note{ Requires an active internet connection to query the Ensembl BioMart. -\code{gene_length} is computed as \code{gene_end - gene_start + 1} (genomic length). -For TPM calculation with \code{\link[=tpm]{tpm()}}, this is an approximation, -use transcript-level lengths for higher accuracy. +\code{gene_length} is computed as \code{gene_end - gene_start + 1} (genomic span). +For TPM calculation with \code{\link[=tpm]{tpm()}}, transcript-level lengths are more +accurate. } \examples{ \dontrun{ -# Annotate genes from Normalized counts (requires internet connection) -data(norm_counts) +# Step 1 — find your species dataset +list_ensembl_species(version = "112", filter = "zebrafish") +# -> "drerio_gene_ensembl" -# Requires a reference table with a "geneID" column. -# Use get_annotations() to generate it: +# Step 2 — annotate (human, default) annotations <- get_annotations( - ensembl_ids = rownames(norm_counts), - mode = "genes" + ensembl_ids = c("ENSG00000141510", "ENSG00000012048"), + species = "hsapiens_gene_ensembl", + version = "112", + filename = NULL ) -head(annotations) +# Mouse +annotations_mouse <- get_annotations( + ensembl_ids = c("ENSMUSG00000059552", "ENSMUSG00000024610"), + species = "mmusculus_gene_ensembl", + version = "112", + filename = NULL +) -# Use with add_annotations() -norm_counts_annot <- add_annotations( - object = norm_counts, - reference = annotations, - variables = c("symbol", "biotype") +# Zebrafish +annotations_zf <- get_annotations( + ensembl_ids = c("ENSDARG00000002333"), + species = "drerio_gene_ensembl", + version = "112", + filename = NULL ) } } \seealso{ -\code{\link[=add_annotations]{add_annotations()}} to join annotations to a counts matrix; -\code{\link[=tpm]{tpm()}} which requires gene lengths from this function. +\code{\link[=list_ensembl_versions]{list_ensembl_versions()}}, \code{\link[=list_ensembl_species]{list_ensembl_species()}} to find +the identifier for other organisms, \code{\link[=add_annotations]{add_annotations()}}, \code{\link[=tpm]{tpm()}} } diff --git a/man/get_superterm.Rd b/man/get_superterm.Rd index 3df8568..fc5c51e 100644 --- a/man/get_superterm.Rd +++ b/man/get_superterm.Rd @@ -17,7 +17,7 @@ network).} \item{community_membership}{A named numeric or integer vector mapping each gene set to its community ID. Typically the output of -\code{\link[igraph:communities]{igraph::membership()}} applied to a community detection result (e.g., +\code{\link[igraph:membership]{igraph::membership()}} applied to a community detection result (e.g., \code{\link[igraph:cluster_louvain]{igraph::cluster_louvain()}}). Must have the same length as \code{geneset_names}. See \code{\link[=get_network_communities]{get_network_communities()}} for a simpler workflow.} diff --git a/man/list_ensembl_species.Rd b/man/list_ensembl_species.Rd new file mode 100644 index 0000000..b425778 --- /dev/null +++ b/man/list_ensembl_species.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_annotations.R +\name{list_ensembl_species} +\alias{list_ensembl_species} +\title{List all species datasets available in a given Ensembl version.} +\usage{ +list_ensembl_species(version = "current", filter = NULL) +} +\arguments{ +\item{version}{Ensembl release version as a string (e.g. \code{"112"}, \code{"114"}), +or \code{"current"} for the latest release. Default = \code{"current"}.} + +\item{filter}{Optional. A search string to filter results by dataset name +or description (case-insensitive). Useful for quickly finding a species +without scrolling through hundreds of rows. Default = \code{NULL} (no filter).} +} +\value{ +A data frame with columns: +\describe{ +\item{dataset}{The dataset identifier to pass to \code{species} in +\code{\link[=get_annotations]{get_annotations()}} (e.g. \code{"hsapiens_gene_ensembl"}).} +\item{description}{Human-readable species name and assembly +(e.g. \code{"Human genes (GRCh38.p14)"}).} +\item{version}{Assembly version string.} +} +} +\description{ +Queries BioMart at the specified Ensembl version and returns a data frame +of all available gene datasets. Use this to: +\itemize{ +\item Find the exact \code{species} string to pass to \code{\link[=get_annotations]{get_annotations()}}. +\item Confirm that a species is available in a specific Ensembl version. +} +} +\details{ +Because Ensembl releases are universal (same version number for all +species), this function simply lists which species datasets exist in the +BioMart of the requested version. Species added in later releases will not +appear in older versions. +} +\examples{ +\dontrun{ +# All species in version 112 +list_ensembl_species(version = "112") + +# Find zebrafish dataset in version 112 +list_ensembl_species(version = "112", filter = "zebrafish") + +# Find mouse dataset +list_ensembl_species(filter = "mouse") + +# Find all fish species +list_ensembl_species(filter = "fish") +} + +} +\seealso{ +\code{\link[=list_ensembl_versions]{list_ensembl_versions()}}, \code{\link[=get_annotations]{get_annotations()}} +} diff --git a/man/list_ensembl_versions.Rd b/man/list_ensembl_versions.Rd new file mode 100644 index 0000000..d6e1480 --- /dev/null +++ b/man/list_ensembl_versions.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_annotations.R +\name{list_ensembl_versions} +\alias{list_ensembl_versions} +\title{List available Ensembl releases and their BioMart hosts.} +\usage{ +list_ensembl_versions() +} +\value{ +A data frame with columns \code{name}, \code{date}, \code{url}, \code{version}, and +\code{current_release} (marked with \code{*} for the active release). +} +\description{ +A convenience wrapper around \code{\link[biomaRt:listEnsemblArchives]{biomaRt::listEnsemblArchives()}} that returns +all available Ensembl release versions with their dates and host URLs. +} +\details{ +Ensembl uses a \strong{universal release numbering system}: every release (e.g. +v112, v113) covers \emph{all} species simultaneously. What varies between species +is whether the species is present in a given release and which genome +assembly is used. To check whether your species of interest is available in +a particular version, use \code{\link[=list_ensembl_species]{list_ensembl_species()}}. + +The recommended workflow is: +\enumerate{ +\item Find the dataset name for your species: \code{list_ensembl_species()}. +\item Find available versions and confirm species presence: +\code{list_ensembl_versions()} + \code{list_ensembl_species(version = "X")}. +\item Annotate: \code{get_annotations(species = "...", version = "...")}. +} +} +\examples{ +\dontrun{ +# See all available Ensembl versions +list_ensembl_versions() + +# Then check if your species is in a specific version +list_ensembl_species(version = "112") +} + +} +\seealso{ +\code{\link[=list_ensembl_species]{list_ensembl_species()}}, \code{\link[=get_annotations]{get_annotations()}} +} diff --git a/man/nice_GenomeTrack.Rd b/man/nice_GenomeTrack.Rd new file mode 100644 index 0000000..4605d08 --- /dev/null +++ b/man/nice_GenomeTrack.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nice_GenomeTrack.R +\name{nice_GenomeTrack} +\alias{nice_GenomeTrack} +\title{Plot genomic tracks with gene annotations and optional data tracks.} +\usage{ +nice_GenomeTrack( + region, + genome_label = "hg38", + organism = "hsapiens_gene_ensembl", + ensembl_version = "current", + annotations = NULL, + gene_ids = NULL, + tracks = list(), + highlight_genes = NULL, + gene_color = "orangered", + other_color = "#7B68EE", + show_transcripts = FALSE, + track_sizes = NULL, + export_pdf = NULL +) +} +\arguments{ +\item{region}{Genomic region to visualize. Either a \code{GRanges} object or a +named vector with \code{chr}, \code{start}, and \code{end} entries +(e.g. \code{c(chr = "chr1", start = 1e6, end = 2e6)}).} + +\item{genome_label}{Assembly label used by \code{Gviz} (e.g. "hg38", "mm10"). +This is not a BioMart parameter; it is only used for track metadata.} + +\item{organism}{Ensembl BioMart dataset name (e.g. "hsapiens_gene_ensembl"). +Use \code{\link[=list_ensembl_species]{list_ensembl_species()}} to find valid dataset identifiers.} + +\item{ensembl_version}{Ensembl release version (e.g. "112") or "current". +Determines which Ensembl archive host is queried.} + +\item{annotations}{Optional data frame of precomputed annotations (for +example from \code{\link[=get_annotations]{get_annotations()}}). Must include columns \code{geneID}, \code{symbol}, +\code{chromosome}, \code{gene_start}, \code{gene_end}.} + +\item{gene_ids}{Optional character vector of gene identifiers (Ensembl IDs +or symbols). If provided, annotations are queried internally.} + +\item{tracks}{Named list of file paths for data tracks. Supported formats: +\code{bam}, \code{bw}/\code{bigwig}, \code{bed}.} + +\item{highlight_genes}{Character vector of gene symbols to highlight.} + +\item{gene_color}{Color for highlighted genes.} + +\item{other_color}{Color for non-highlighted genes.} + +\item{show_transcripts}{Logical; if \code{TRUE}, adds a \code{GeneRegionTrack} with +transcripts from Ensembl.} + +\item{track_sizes}{Optional numeric vector of relative heights for tracks. +If provided, its length must match \code{track_list}. If \code{NULL}, sizes are +computed automatically.} + +\item{export_pdf}{Optional file path to save a PDF. If \code{NULL}, the plot is +rendered to the active device.} +} +\value{ +A list of \code{Gviz} track objects (invisibly). +} +\description{ +Builds a multi-track genomic visualization using \code{Gviz}. The function can +annotate all genes in a genomic region or restrict annotations to user-supplied +gene IDs/symbols. Optional data tracks include BAM, BigWig, and BED files. +} +\note{ +BAM files require an index file with the \code{.bai} suffix in the same directory +(e.g. \code{sample.bam.bai}). If it is missing, an error is raised. +} +\examples{ +\dontrun{ +nice_GenomeTrack( + region = c(chr = "chr17", start = 7e6, end = 8e6), + genome_label = "hg38", + organism = "hsapiens_gene_ensembl", + ensembl_version = "current", + tracks = list(ChIP = "chip_signal.bw") +) +} + +} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..e4be3fc --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(OmicsKit) + +test_check("OmicsKit") diff --git a/tests/testthat/test-get_annotations.R b/tests/testthat/test-get_annotations.R new file mode 100644 index 0000000..382513c --- /dev/null +++ b/tests/testthat/test-get_annotations.R @@ -0,0 +1,94 @@ +testthat::test_that("get_annotations returns expected columns for genes", { + testthat::skip_if_not_installed("biomaRt") + testthat::skip_on_cran() + + mock_useMart <- function(...) structure(list(), class = "mock_mart") + mock_getBM <- function(attributes, filters, values, mart) { + if (filters == "ensembl_gene_id") { + data.frame( + ensembl_gene_id = "ENSG00000141510", + hgnc_symbol = "TP53", + gene_biotype = "protein_coding", + chromosome_name = "17", + start_position = 7661779, + end_position = 7687550, + description = "tumor protein p53", + stringsAsFactors = FALSE + ) + } else { + stop("Unexpected filters in mock_getBM()") + } + } + + testthat::local_mocked_bindings( + useMart = mock_useMart, + getBM = mock_getBM, + .package = "biomaRt" + ) + + ids <- c("ENSG00000141510") # TP53 + res <- get_annotations( + ensembl_ids = ids, + species = "hsapiens_gene_ensembl", + version = "112", + filename = NULL + ) + + testthat::expect_s3_class(res, "data.frame") + testthat::expect_true(all(c("geneID", "symbol", "biotype", "chromosome", + "gene_start", "gene_end", "gene_length", "description") %in% names(res))) + testthat::expect_equal(res$geneID[1], "ENSG00000141510") +}) + +testthat::test_that("get_annotations works for transcripts", { + testthat::skip_if_not_installed("biomaRt") + testthat::skip_on_cran() + + mock_useMart <- function(...) structure(list(), class = "mock_mart") + mock_getBM <- function(attributes, filters, values, mart) { + if (filters == "ensembl_transcript_id_version") { + data.frame( + ensembl_transcript_id_version = "ENST00000269305.9", + ensembl_gene_id = "ENSG00000141510", + hgnc_symbol = "TP53", + gene_biotype = "protein_coding", + chromosome_name = "17", + start_position = 7661779, + end_position = 7687550, + description = "tumor protein p53", + stringsAsFactors = FALSE + ) + } else { + stop("Unexpected filters in mock_getBM()") + } + } + + testthat::local_mocked_bindings( + useMart = mock_useMart, + getBM = mock_getBM, + .package = "biomaRt" + ) + + ids <- "ENST00000269305.9" + + res <- get_annotations( + ensembl_ids = ids, + species = "hsapiens_gene_ensembl", + mode = "transcripts", + version = "112", + filename = NULL + ) + + testthat::expect_s3_class(res, "data.frame") + testthat::expect_true("transcriptID" %in% names(res)) + testthat::expect_equal(res$transcriptID[1], ids) +}) + +testthat::test_that("get_annotations errors when biomaRt is missing", { + testthat::skip_if_not_installed("withr") + + withr::with_namespace("biomaRt", { + # If biomaRt is installed, skip this test + testthat::skip("biomaRt is installed; cannot simulate missing package here.") + }) +}) diff --git a/tests/testthat/test-helpers.R b/tests/testthat/test-helpers.R new file mode 100644 index 0000000..255dec7 --- /dev/null +++ b/tests/testthat/test-helpers.R @@ -0,0 +1,31 @@ +testthat::test_that(".resolve_ensembl_host handles current", { + expect_equal(.resolve_ensembl_host("current"), "https://www.ensembl.org") +}) + +testthat::test_that(".get_symbol_attribute returns HGNC for human", { + res <- .get_symbol_attribute(NULL, "hsapiens_gene_ensembl") + expect_equal(res$attr, "hgnc_symbol") + expect_equal(res$source, "HGNC") +}) + +testthat::test_that(".resolve_symbol_column sets symbol for human", { + df <- data.frame( + hgnc_symbol = c("", "TP53"), + external_gene_name = c("TP53", "TP53"), + stringsAsFactors = FALSE + ) + + out <- .resolve_symbol_column(df, is_human = TRUE, symbol_attr = "hgnc_symbol") + expect_equal(out$symbol[1], "TP53") # fallback to external_gene_name + expect_equal(out$symbol[2], "TP53") # hgnc_symbol preferred +}) + +testthat::test_that(".resolve_symbol_column sets symbol for non-human", { + df <- data.frame( + mgi_symbol = c("Actb"), + stringsAsFactors = FALSE + ) + + out <- .resolve_symbol_column(df, is_human = FALSE, symbol_attr = "mgi_symbol") + expect_equal(out$symbol[1], "Actb") +}) diff --git a/tests/testthat/test-nice_GenomeTrack.R b/tests/testthat/test-nice_GenomeTrack.R new file mode 100644 index 0000000..e67f50b --- /dev/null +++ b/tests/testthat/test-nice_GenomeTrack.R @@ -0,0 +1,78 @@ +test_that("nice_GenomeTrack validates inputs and returns tracks", { + skip_if_not_installed("Gviz") + skip_if_not_installed("biomaRt") + skip_if_not_installed("GenomicRanges") + skip_if_not_installed("GenomeInfoDb") + + host <- .resolve_ensembl_host("113") + testthat::skip_if_offline(gsub("^https?://", "", host)) + + + # ---- Minimal annotations (no BioMart queries) ----------------------------- + annotations <- data.frame( + geneID = c("ENSG000001", "ENSG000002"), + symbol = c("GENE1", "GENE2"), + chromosome = c("1", "1"), + gene_start = c(150, 400), + gene_end = c(200, 450), + strand = c("+", "-"), + stringsAsFactors = FALSE + ) + + region <- c(chr = "chr1", start = 100, end = 1000) + + # ---- Error when both annotations and gene_ids are provided ---------------- + expect_error( + nice_GenomeTrack( + region = region, + annotations = annotations, + gene_ids = c("GENE1") + ), + "Provide only one" + ) + + # ---- Error on invalid region --------------------------------------------- + expect_error( + nice_GenomeTrack( + region = c(chr = "chr1", start = 200, end = 100), + annotations = annotations + ), + "start must be less than or equal to end" + ) + + # ---- Error when annotations missing required columns ---------------------- + bad_annotations <- data.frame( + geneID = "ENSG000001", + symbol = "GENE1" + ) + expect_error( + nice_GenomeTrack( + region = region, + annotations = bad_annotations + ), + "must include columns" + ) + + # ---- Successful run with annotations -------------------------------------- + out_pdf <- tempfile(fileext = ".pdf") + tracks <- nice_GenomeTrack( + region = region, + annotations = annotations, + show_transcripts = FALSE, + export_pdf = out_pdf + ) + + expect_type(tracks, "list") + expect_length(tracks, 2) # GenomeAxis + Genes + + # ---- Error when track_sizes length does not match track_list -------------- + expect_error( + nice_GenomeTrack( + region = region, + annotations = annotations, + show_transcripts = FALSE, + track_sizes = c(1) # wrong length + ), + "track_sizes" + ) +}) diff --git a/vignettes/DEA_workflow.Rmd b/vignettes/DEA_workflow.Rmd index b8388b7..a02f104 100644 --- a/vignettes/DEA_workflow.Rmd +++ b/vignettes/DEA_workflow.Rmd @@ -36,6 +36,7 @@ The workflow covers: ```{r packages} library(OmicsKit) library(ggplot2) +library(biomaRt) # for get_annotations ``` --- @@ -195,15 +196,21 @@ sum(deseq2_results$padj < 0.05, na.rm = TRUE) `get_annotations()` queries Ensembl via biomaRt to retrieve gene symbols, biotype, chromosomal location, and length. Requires an internet connection. +Use `list_ensembl_species()` to find the correct dataset name for your organism, +and `list_ensembl_versions()` to see available releases. + ```{r get_annotations, eval = FALSE} +# Example: list datasets (optional) +# list_ensembl_species(version = "current", filter = "human") + annotations <- get_annotations( ensembl_ids = deseq2_results$gene_id, + species = "hsapiens_gene_ensembl", mode = "genes", - version = "Current", + version = "current", filename = "luad_gene_annotations", format = "csv" ) - head(annotations) ``` @@ -217,7 +224,8 @@ Ensembl IDs as the key: norm_counts_annot <- add_annotations( object = norm_counts, reference = annotations, - variables = c("symbol", "biotype") + variables = c("symbol", "biotype"), + data_frame = TRUE ) head(norm_counts_annot[, c("geneID", "symbol", "biotype")]) diff --git a/vignettes/PA_workflow.Rmd b/vignettes/PA_workflow.Rmd index 97a42c9..e0f9918 100644 --- a/vignettes/PA_workflow.Rmd +++ b/vignettes/PA_workflow.Rmd @@ -67,12 +67,30 @@ In this vignette, we use a built-in example `geneset_list`, which contains immune response, and metabolism), and the built-in `gsea_results` dataset, which mimics the output of `merge_PA()` for three comparisons across HALLMARK, KEGG, and GO collections. -```{r load_data} +```{r load_data, eval = FALSE} data(geneset_list) data(gsea_results) data(deseq2_results) data(vst_counts) data(sampledata) + +# Retrieve HGNC symbols +annotations <- get_annotations( + ensembl_ids = deseq2_results$gene_id, + species = "hsapiens_gene_ensembl", + mode = "genes", + version = "112", + filename = "luad_gene_annotations", + format = "csv" +) + +# Set format to add_annotations +rownames(deseq2_results) <- deseq2_results$gene_id +deseq2_results_annot <- add_annotations(object = deseq2_results, + reference = annotations, + variables = c("symbol", "biotype"), + data_frame = TRUE +) # Ranked gene list: DESeq2 Wald stat from most positive to most negative ranked_genes <- deseq2_results$gene_id[ order(deseq2_results$stat, decreasing = TRUE) @@ -173,12 +191,27 @@ files are organized into subdirectories by format (PDF / JPG) and gene selection mode (`all_genes`, `le_genes`, `top_genes`). ```{r heatmap_PA, eval = FALSE} +# Change rownames from vst to hgnc symbols and clean format +idx_vst <- match(rownames(vst_counts), annotations$geneID) +new_rownames <- rownames(vst_counts) + +valid_vst <- !is.na(idx_vst) & + !is.na(annotations$symbol[idx_vst]) & + annotations$symbol[idx_vst] != "" + +new_rownames[valid_vst] <- annotations$symbol[idx_vst[valid_vst]] + +vst_sym <- vst_counts +rownames(vst_sym) <- new_rownames +vst_sym <- vst_sym[!duplicated(rownames(vst_sym)), ] + +# Plot heatmap_PA( - expression_data = vst_counts, + expression_data = vst_sym, metadata = sampledata, pa_data_annot = pa_annot, ranked_genes = ranked_genes, - plot_genes = c("all_genes", "le_genes", "top_genes"), + plot_genes = c("all_genes"), sample_col = "patient_id", group_col = "sample_type", out_dir = "heatmaps_PA",