diff --git a/.Rbuildignore b/.Rbuildignore index c57d8d6..e2f3e9c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,7 @@ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ +^\.positai$ +^\.claude$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 90cd220..7ed16d8 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,6 @@ rsconnect/ .quarto inst/doc docs +.positai +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 177895e..60a4790 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,12 +55,14 @@ Suggests: tidygraph, tidyr, tidyselect, - tm + tm, + testthat (>= 3.0.0), + withr Roxygen: list(markdown = TRUE) Depends: - R (>= 3.5) + R (>= 4.4.0) LazyData: true LazyDataCompression: xz VignetteBuilder: knitr URL: https://bigmindlab.github.io/OmicsKit - +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 44d21fc..23bc00e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,8 @@ export(get_superterm) export(getgenesPA) export(heatmap_PA) export(heatmap_path_PA) +export(list_ensembl_species) +export(list_ensembl_versions) export(list_gmts) export(merge_PA) export(multiplot_PA) diff --git a/R/get_annotations.R b/R/get_annotations.R index ff25e15..e4a91ef 100644 --- a/R/get_annotations.R +++ b/R/get_annotations.R @@ -1,139 +1,605 @@ +# ============================================================================= +# OmicsKit — Ensembl annotation utilities +# Functions: list_ensembl_versions, list_ensembl_species, get_annotations +# Internal helpers: .resolve_ensembl_host, .get_symbol_attribute, +# .resolve_symbol_column, .handle_biomart_connection_error, +# .handle_biomart_query_error +# ============================================================================= + + ############################ # Function get_annotations # ############################ -#' Get annotations from Ensembl. +#' Get gene or transcript annotations from Ensembl BioMart. #' #' This function annotates a column of transcripts or gene IDs (ENSEMBL) with information of the Biomart. -#' If transcript IDs are provided, they are also annotated with information of the genes to which they belong. #' #' The Gene information added include: -#' - Gene ENSEMBL ID, HGNC Symbol, Description, Biotype and Chromosome. +#' - Gene ENSEMBL ID, gene Symbol, Description, Biotype and Chromosome. #' - Gene start, end and length #' -#' @param ensembl_ids The column of transcripts to be used as input. -#' @param mode To specify the IDs provided, between "transcripts" or "genes". Default = genes. -#' @param version This function can use the versions 102, 103, and 112 of Ensembl. Default = "Current". -#' @param filename The name of the output file, which is table. Default = gene_annotations. -#' @param format The output is saved in .csv or .xlsx formats. Default = csv. +#' Annotates a vector of Ensembl gene or transcript IDs using BioMart. If +#' transcript IDs are provided, they are also annotated with information +#' of the genes to which they belong. Works for any species available in +#' Ensembl — use [list_ensembl_species()] to find the correct `species` value +#' for your organism, and [list_ensembl_versions()] to verify version availability. +#' +#' @details +#' **Workflow:** +#' \enumerate{ +#' \item `list_ensembl_species()` → find dataset name. +#' \item `list_ensembl_versions()` → confirm version exists. +#' \item `get_annotations(ensembl_ids, species = "drerio_gene_ensembl")`. +#' } +#' +#' **Symbol resolution by species:** +#' \itemize{ +#' \item Human (`hsapiens`): uses `hgnc_symbol`, falling back to +#' `external_gene_name` when HGNC symbol is absent. +#' \item Mouse (`mmusculus`): uses `mgi_symbol`. +#' \item Rat (`rnorvegicus`): uses `rgd_symbol`. +#' \item Zebrafish (`drerio`): uses `zfin_id_symbol`. +#' \item Drosophila (`dmelanogaster`): uses `flybasename_gene`. +#' \item All other species: uses `external_gene_name` (universal fallback). +#' } +#' +#' @param ensembl_ids A character vector of Ensembl gene or transcript IDs as inputs. +#' @param species The BioMart dataset identifier. Default = +#' `"hsapiens_gene_ensembl"` (human). Use [list_ensembl_species()] to find +#' the identifier for other organisms. +#' @param mode Either `"genes"` or `"transcripts"`. Default = `"genes"`. +#' @param version Ensembl release version as a string (e.g. `"112"`, `"114"`), +#' or `"current"`. Use [list_ensembl_versions()] to see available versions, +#' and [list_ensembl_species()] to confirm your species exists in that +#' version. Default = `"current"`. +#' @param filename Name of the output file without extension. If `NULL`, the +#' result is returned but **not** saved to disk. Default = +#' `"gene_annotations"`. +#' @param format Output format: `"csv"` or `"xlsx"`. Ignored when +#' `filename = NULL`. Default = `"csv"`. +#' #' @importFrom magrittr %>% #' @importFrom rlang .data #' -#' @return A data frame with one row per input ID and the following columns: -#' `geneID`, `symbol`, `biotype`, `chromosome`, `gene_start`, `gene_end`, -#' `gene_length`, `description`. For `mode = "transcripts"`, an additional -#' `transcriptID` column is included. The data frame is also saved to disk -#' as a `.csv` or `.xlsx` file (see `filename` and `format`). +#' @return A data frame with one row per input ID and columns: `geneID`, +#' `symbol`, `biotype`, `chromosome`, `gene_start`, `gene_end`, +#' `gene_length`, `description`. For `mode = "transcripts"`, a `transcriptID` +#' column is prepended. If `filename` is not `NULL`, the data frame is also +#' written to disk. #' #' @examples #' \dontrun{ -#' # Annotate genes from Normalized counts (requires internet connection) -#' data(norm_counts) +#' # Step 1 — find your species dataset +#' list_ensembl_species(version = "112", filter = "zebrafish") +#' # -> "drerio_gene_ensembl" #' -#' # Requires a reference table with a "geneID" column. -#' # Use get_annotations() to generate it: +#' # Step 2 — annotate (human, default) #' annotations <- get_annotations( -#' ensembl_ids = rownames(norm_counts), -#' mode = "genes" +#' ensembl_ids = c("ENSG00000141510", "ENSG00000012048"), +#' species = "hsapiens_gene_ensembl", +#' version = "112", +#' filename = NULL #' ) #' -#' head(annotations) +#' # Mouse +#' annotations_mouse <- get_annotations( +#' ensembl_ids = c("ENSMUSG00000059552", "ENSMUSG00000024610"), +#' species = "mmusculus_gene_ensembl", +#' version = "112", +#' filename = NULL +#' ) #' -#' # Use with add_annotations() -#' norm_counts_annot <- add_annotations( -#' object = norm_counts, -#' reference = annotations, -#' variables = c("symbol", "biotype") +#' # Zebrafish +#' annotations_zf <- get_annotations( +#' ensembl_ids = c("ENSDARG00000002333"), +#' species = "drerio_gene_ensembl", +#' version = "112", +#' filename = NULL #' ) #' } #' #' @note Requires an active internet connection to query the Ensembl BioMart. -#' `gene_length` is computed as `gene_end - gene_start + 1` (genomic length). -#' For TPM calculation with [tpm()], this is an approximation, -#' use transcript-level lengths for higher accuracy. +#' `gene_length` is computed as `gene_end - gene_start + 1` (genomic span). +#' For TPM calculation with [tpm()], transcript-level lengths are more +#' accurate. #' -#' @seealso [add_annotations()] to join annotations to a counts matrix; -#' [tpm()] which requires gene lengths from this function. +#' @seealso [list_ensembl_versions()], [list_ensembl_species()] to find +#' the identifier for other organisms, [add_annotations()], [tpm()] #' #' @export -get_annotations <- function(ensembl_ids, mode = "genes", filename = "gene_annotations", version = "Current", format = "csv") { - - if (!requireNamespace("biomaRt", quietly = TRUE)) stop("Package \"biomaRt\" must be installed to use this function.", call. = FALSE) - - if (version == "102") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://nov2020.archive.ensembl.org") - } else if (version == "103") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://feb2021.archive.ensembl.org") - } else if (version == "112") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://may2024.archive.ensembl.org") - } else if (version == "Current") { - ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", - dataset = "hsapiens_gene_ensembl", - host = "https://useast.ensembl.org") +get_annotations <- function(ensembl_ids, + species = "hsapiens_gene_ensembl", + mode = "genes", + version = "current", + filename = "gene_annotations", + format = "csv") { + + # --- Dependency check ------------------------------------------------------- + if (!requireNamespace("biomaRt", quietly = TRUE)) { + stop( + "Package \"biomaRt\" must be installed to use this function.\n", + "Install it with: BiocManager::install(\"biomaRt\")", + call. = FALSE + ) } - annotations <- c("ensembl_gene_id", "hgnc_symbol", "gene_biotype", - "chromosome_name", "start_position", "end_position", - "description") + # --- Resolve host and connect ----------------------------------------------- + host <- .resolve_ensembl_host(version) + + message("Connecting to Ensembl BioMart [version: ", version, + " | species: ", species, "] ...") + + ensembl <- tryCatch( + biomaRt::useMart("ENSEMBL_MART_ENSEMBL", + dataset = species, + host = host), + error = function(e) .handle_biomart_connection_error(e, host, species, version) + ) + + # --- Detect appropriate symbol attribute for this species ------------------- + symbol_info <- .get_symbol_attribute(ensembl, species) + is_human <- grepl("^hsapiens", species) - new_names <- c("geneID", "symbol", "biotype", "chromosome", - "gene_start", "gene_end", "description") + # --- Define BioMart attributes ---------------------------------------------- + if (is_human) { + # Human: fetch both hgnc_symbol + external_gene_name, merge later + annotations_attrs <- c( + "ensembl_gene_id", "hgnc_symbol", "external_gene_name", + "gene_biotype", "chromosome_name", + "start_position", "end_position", "description" + ) + } else { + annotations_attrs <- c( + "ensembl_gene_id", symbol_info$attr, + "gene_biotype", "chromosome_name", + "start_position", "end_position", "description" + ) + } - # There are more annotations available in the biomaRt, check listAttributes(mart = ensembl) - # The terms "go_id" and "name_1006" can be added in a future release. + final_col_names <- c( + "geneID", "symbol", "biotype", "chromosome", + "gene_start", "gene_end", "description" + ) + # --- Query BioMart ---------------------------------------------------------- if (mode == "transcripts") { df <- data.frame(transcriptID = ensembl_ids) - genemap <- biomaRt::getBM(attributes = c("ensembl_transcript_id_version", annotations), - filters = "ensembl_transcript_id_version", - values = df$transcriptID, - mart = ensembl) + genemap <- tryCatch( + biomaRt::getBM( + attributes = c("ensembl_transcript_id_version", annotations_attrs), + filters = "ensembl_transcript_id_version", + values = df$transcriptID, + mart = ensembl + ), + error = function(e) .handle_biomart_query_error(e, host) + ) + + genemap <- .resolve_symbol_column(genemap, is_human, symbol_info$attr) + + keep_cols <- c("ensembl_transcript_id_version", "ensembl_gene_id", + "symbol", "gene_biotype", "chromosome_name", + "start_position", "end_position", "description") idx <- match(df$transcriptID, genemap$ensembl_transcript_id_version) - df <- merge(df, genemap[idx, c("ensembl_transcript_id_version", annotations)], - by.x = "transcriptID", by.y = "ensembl_transcript_id_version") - names(df) <- c("transcriptID", new_names) + df <- merge( + df, + genemap[idx, keep_cols], + by.x = "transcriptID", + by.y = "ensembl_transcript_id_version" + ) + names(df) <- c("transcriptID", final_col_names) } else { df <- data.frame(geneID = ensembl_ids) - genemap <- biomaRt::getBM(attributes = annotations, - filters = "ensembl_gene_id", - values = df$geneID, - mart = ensembl) + genemap <- tryCatch( + biomaRt::getBM( + attributes = annotations_attrs, + filters = "ensembl_gene_id", + values = df$geneID, + mart = ensembl + ), + error = function(e) .handle_biomart_query_error(e, host) + ) - idx <- match(df$geneID, genemap$ensembl_gene_id) - df <- merge(df, genemap[idx, annotations], by.x = "geneID", by.y = "ensembl_gene_id") - names(df) <- new_names + genemap <- .resolve_symbol_column(genemap, is_human, symbol_info$attr) + + keep_cols <- c("ensembl_gene_id", "symbol", "gene_biotype", + "chromosome_name", "start_position", + "end_position", "description") + idx <- match(df$geneID, genemap$ensembl_gene_id) + df <- merge( + df, + genemap[idx, keep_cols], + by.x = "geneID", + by.y = "ensembl_gene_id" + ) + names(df) <- final_col_names } + # --- Compute gene length ---------------------------------------------------- df$gene_length <- df$gene_end - df$gene_start + 1 - df <- df %>% dplyr::relocate(.data$gene_length, .before = "description") + df <- df %>% dplyr::relocate("gene_length", .before = "description") + + # --- Save to disk (optional) ------------------------------------------------ + if (!is.null(filename)) { + + if (format == "xlsx") { + if (!requireNamespace("openxlsx", quietly = TRUE)) { + stop( + "Package \"openxlsx\" must be installed to save in xlsx format.\n", + "Install it with: install.packages(\"openxlsx\")", + call. = FALSE + ) + } + openxlsx::write.xlsx(df, + file = paste0(filename, ".xlsx"), + colNames = TRUE, + rowNames = FALSE, + append = FALSE) + } else { + utils::write.csv(df, row.names = FALSE, file = paste0(filename, ".csv")) + } + + message("Annotations saved to: ", paste0(filename, ".", format)) + } + + return(df) +} + +############################ +# Function list_ensembl_versions # +############################ + +#' List available Ensembl releases and their BioMart hosts. +#' +#' A convenience wrapper around [biomaRt::listEnsemblArchives()] that returns +#' all available Ensembl release versions with their dates and host URLs. +#' +#' @details +#' Ensembl uses a **universal release numbering system**: every release (e.g. +#' v112, v113) covers *all* species simultaneously. What varies between species +#' is whether the species is present in a given release and which genome +#' assembly is used. To check whether your species of interest is available in +#' a particular version, use [list_ensembl_species()]. +#' +#' The recommended workflow is: +#' \enumerate{ +#' \item Find the dataset name for your species: `list_ensembl_species()`. +#' \item Find available versions and confirm species presence: +#' `list_ensembl_versions()` + `list_ensembl_species(version = "X")`. +#' \item Annotate: `get_annotations(species = "...", version = "...")`. +#' } +#' +#' @return A data frame with columns `name`, `date`, `url`, `version`, and +#' `current_release` (marked with `*` for the active release). +#' +#' @examples +#' \dontrun{ +#' # See all available Ensembl versions +#' list_ensembl_versions() +#' +#' # Then check if your species is in a specific version +#' list_ensembl_species(version = "112") +#' } +#' +#' @seealso [list_ensembl_species()], [get_annotations()] +#' @export + +list_ensembl_versions <- function() { - if (format == "xlsx") { - if (!requireNamespace("openxlsx", quietly = TRUE)) { + if (!requireNamespace("biomaRt", quietly = TRUE)) { + stop( + "Package \"biomaRt\" must be installed to use this function.\n", + "Install it with: BiocManager::install(\"biomaRt\")", + call. = FALSE + ) + } + + archives <- tryCatch( + biomaRt::listEnsemblArchives(), + error = function(e) { stop( - "Package \"openxlsx\" must be installed to use this function.", + "Could not retrieve Ensembl archive list.\n", + "This is likely a connectivity issue with the Ensembl server.\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Try again later.\n", + " - Visit https://www.ensembl.org/info/website/archives/index.html", call. = FALSE + ) + } + ) + + df <- archives[, c("name", "date", "url", "version", "current_release")] + # Exclude the legacy GRCh37 entry — it is a fixed human-only site, + # not a versioned multi-species release. + df <- df[df$version != "GRCh37", ] + rownames(df) <- NULL + + message( + "NOTE: Ensembl release numbers are UNIVERSAL across all species.\n", + "A species may not exist in every version. Use list_ensembl_species(version)\n", + "to confirm that your species dataset is available in a given release." + ) + + return(df) +} + + +############################ +# Function list_ensembl_species # +############################ + +#' List all species datasets available in a given Ensembl version. +#' +#' Queries BioMart at the specified Ensembl version and returns a data frame +#' of all available gene datasets. Use this to: +#' \itemize{ +#' \item Find the exact `species` string to pass to [get_annotations()]. +#' \item Confirm that a species is available in a specific Ensembl version. +#' } +#' +#' @details +#' Because Ensembl releases are universal (same version number for all +#' species), this function simply lists which species datasets exist in the +#' BioMart of the requested version. Species added in later releases will not +#' appear in older versions. +#' +#' @param version Ensembl release version as a string (e.g. `"112"`, `"114"`), +#' or `"current"` for the latest release. Default = `"current"`. +#' @param filter Optional. A search string to filter results by dataset name +#' or description (case-insensitive). Useful for quickly finding a species +#' without scrolling through hundreds of rows. Default = `NULL` (no filter). +#' +#' @return A data frame with columns: +#' \describe{ +#' \item{dataset}{The dataset identifier to pass to `species` in +#' [get_annotations()] (e.g. `"hsapiens_gene_ensembl"`).} +#' \item{description}{Human-readable species name and assembly +#' (e.g. `"Human genes (GRCh38.p14)"`).} +#' \item{version}{Assembly version string.} +#' } +#' +#' @examples +#' \dontrun{ +#' # All species in version 112 +#' list_ensembl_species(version = "112") +#' +#' # Find zebrafish dataset in version 112 +#' list_ensembl_species(version = "112", filter = "zebrafish") +#' +#' # Find mouse dataset +#' list_ensembl_species(filter = "mouse") +#' +#' # Find all fish species +#' list_ensembl_species(filter = "fish") +#' } +#' +#' @seealso [list_ensembl_versions()], [get_annotations()] +#' @export + +list_ensembl_species <- function(version = "current", filter = NULL) { + + if (!requireNamespace("biomaRt", quietly = TRUE)) { + stop( + "Package \"biomaRt\" must be installed to use this function.\n", + "Install it with: BiocManager::install(\"biomaRt\")", + call. = FALSE + ) + } + + host <- .resolve_ensembl_host(version) + + message("Fetching species list from Ensembl version ", version, "...") + + mart <- tryCatch( + biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host = host), + error = function(e) .handle_biomart_connection_error(e, host) + ) + + datasets <- tryCatch( + biomaRt::listDatasets(mart), + error = function(e) .handle_biomart_query_error(e, host) + ) + + # Keep only gene datasets + df <- datasets[grepl("_gene_ensembl$", datasets$dataset), ] + rownames(df) <- NULL + + # Apply optional text filter across dataset name and description + if (!is.null(filter) && nchar(trimws(filter)) > 0) { + mask <- grepl(filter, df$dataset, ignore.case = TRUE) | + grepl(filter, df$description, ignore.case = TRUE) + df <- df[mask, ] + + if (nrow(df) == 0) { + message( + "No species found matching \"", filter, "\" in Ensembl version ", + version, ".\n", + "Try a broader search term or check list_ensembl_species(version = \"", + version, "\") without a filter." + ) + return(invisible(df)) + } + } + + message( + nrow(df), " species datasets found", + if (!is.null(filter)) paste0(" matching \"", filter, "\"") else "", + " (Ensembl v", version, ").\n", + "Pass the `dataset` column value to the `species` argument of get_annotations()." + ) + + return(df) +} + + +# ============================================================================= +# Internal helpers +# ============================================================================= + +# Resolve BioMart host URL from version string +.resolve_ensembl_host <- function(version) { + + if (tolower(as.character(version)) == "current") { + return("https://www.ensembl.org") + } + + archives <- tryCatch( + biomaRt::listEnsemblArchives(), + error = function(e) { + stop( + "Could not retrieve the list of Ensembl archives.\n", + "This may be due to a network or server issue.\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Use version = \"current\" to try the main server.\n", + " - Visit https://www.ensembl.org/info/website/archives/index.html", + call. = FALSE + ) + } + ) + + matched <- archives[archives$version == as.character(version), ] + + if (nrow(matched) == 0) { + available <- paste( + sort(archives$version[archives$version != "GRCh37"]), + collapse = ", " + ) + stop( + "Ensembl version \"", version, "\" was not found.\n", + "Available versions: ", available, "\n", + "Use list_ensembl_versions() to see the full list.", + call. = FALSE + ) + } + + return(matched$url[1]) +} + + +# Detect the best symbol attribute for a species +# Returns a list: list(attr = "attribute_name", source = "label for messages") +.get_symbol_attribute <- function(mart, species) { + + if (grepl("^hsapiens", species)) { + return(list(attr = "hgnc_symbol", source = "HGNC")) + } + + # Curated map: species prefix → preferred symbol attribute + source label + curated_map <- list( + mmusculus = list(attr = "mgi_symbol", source = "MGI"), + rnorvegicus = list(attr = "rgd_symbol", source = "RGD"), + drerio = list(attr = "zfin_id_symbol", source = "ZFIN"), + dmelanogaster = list(attr = "flybasename_gene", source = "FlyBase"), + celegans = list(attr = "wormbase_gene", source = "WormBase") + ) + + # Check if this species matches any curated prefix + for (prefix in names(curated_map)) { + if (grepl(paste0("^", prefix), species)) { + entry <- curated_map[[prefix]] + + # Verify the attribute actually exists in this mart/version + available_attrs <- tryCatch( + biomaRt::listAttributes(mart)$name, + error = function(e) character(0) + ) + + if (entry$attr %in% available_attrs) { + return(entry) + } else { + message( + "Note: preferred symbol attribute '", entry$attr, "' (", + entry$source, ") is not available for this species/version.\n", + "Falling back to 'external_gene_name'." ) + return(list(attr = "external_gene_name", source = "External")) + } + } + } + + # Default fallback for non-curated species + return(list(attr = "external_gene_name", source = "External")) +} + + +# Resolve symbol column (human: HGNC fallback to external; others: rename attr) +.resolve_symbol_column <- function(genemap, is_human, symbol_attr) { + + if (is_human) { + # Ensure both columns exist + if (!"hgnc_symbol" %in% names(genemap)) { + genemap$hgnc_symbol <- NA_character_ + } + if (!"external_gene_name" %in% names(genemap)) { + genemap$external_gene_name <- NA_character_ } - openxlsx::write.xlsx(df, file = paste0(filename, ".xlsx"), - colNames = TRUE, rowNames = FALSE, append = FALSE) + # Prefer HGNC, fallback to external gene name + genemap$symbol <- ifelse( + is.na(genemap$hgnc_symbol) | genemap$hgnc_symbol == "", + genemap$external_gene_name, + genemap$hgnc_symbol + ) } else { - utils::write.csv(df, row.names = FALSE, file = paste0(filename, ".csv")) + # Non-human: rename preferred attribute to "symbol" if present + if (symbol_attr %in% names(genemap)) { + genemap$symbol <- genemap[[symbol_attr]] + } else if ("external_gene_name" %in% names(genemap)) { + genemap$symbol <- genemap[["external_gene_name"]] + } else { + genemap$symbol <- NA_character_ + } } - return(df) + return(genemap) +} + + +# Standardized connection error handler +.handle_biomart_connection_error <- function(e, host, species = NULL, version = NULL) { + + ctx <- "" + if (!is.null(version)) { + ctx <- paste0(ctx, "version: ", version) + } + if (!is.null(species)) { + ctx <- paste0(ctx, if (nchar(ctx) > 0) " | " else "", "species: ", species) + } + if (nchar(ctx) > 0) ctx <- paste0(" [", ctx, "]") + + stop( + "Could not connect to Ensembl BioMart", ctx, ".\n", + "Host: ", host, "\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Try again later.\n", + " - Use version = \"current\" to try the main server.\n", + " - Visit https://www.ensembl.org/info/website/archives/index.html", + call. = FALSE + ) +} + + +# Standardized query error handler +.handle_biomart_query_error <- function(e, host) { + stop( + "BioMart query failed.\n", + "Host: ", host, "\n\n", + "Suggestions:\n", + " - Check your internet connection.\n", + " - Try again later.\n", + " - Confirm that the species dataset exists in this version.\n", + " - Use list_ensembl_species(version = \"X\") to verify.", + call. = FALSE + ) } diff --git a/man/OmicsKit-package.Rd b/man/OmicsKit-package.Rd index 8e9a101..b06f94c 100644 --- a/man/OmicsKit-package.Rd +++ b/man/OmicsKit-package.Rd @@ -7,6 +7,13 @@ \title{OmicsKit: A bioinformatics toolset for multiomics analysis} \description{ This package contains functions that help in manipulating tables and generating plots for multi-omics analysis including genomics, transcriptomics, proteomics, methylomics and immunoinformatics. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://bigmindlab.github.io/OmicsKit} +} + } \author{ \strong{Maintainer}: Daniel Guevara \email{dgdiaz011202@gmail.com} (\href{https://orcid.org/0009-0001-2786-8729}{ORCID}) diff --git a/man/get_annotations.Rd b/man/get_annotations.Rd index f741973..a5144f0 100644 --- a/man/get_annotations.Rd +++ b/man/get_annotations.Rd @@ -2,75 +2,118 @@ % Please edit documentation in R/get_annotations.R \name{get_annotations} \alias{get_annotations} -\title{Get annotations from Ensembl.} +\title{Get gene or transcript annotations from Ensembl BioMart.} \usage{ get_annotations( ensembl_ids, + species = "hsapiens_gene_ensembl", mode = "genes", + version = "current", filename = "gene_annotations", - version = "Current", format = "csv" ) } \arguments{ -\item{ensembl_ids}{The column of transcripts to be used as input.} +\item{ensembl_ids}{A character vector of Ensembl gene or transcript IDs as inputs.} -\item{mode}{To specify the IDs provided, between "transcripts" or "genes". Default = genes.} +\item{species}{The BioMart dataset identifier. Default = +\code{"hsapiens_gene_ensembl"} (human). Use \code{\link[=list_ensembl_species]{list_ensembl_species()}} to find +the identifier for other organisms.} -\item{filename}{The name of the output file, which is table. Default = gene_annotations.} +\item{mode}{Either \code{"genes"} or \code{"transcripts"}. Default = \code{"genes"}.} -\item{version}{This function can use the versions 102, 103, and 112 of Ensembl. Default = "Current".} +\item{version}{Ensembl release version as a string (e.g. \code{"112"}, \code{"114"}), +or \code{"current"}. Use \code{\link[=list_ensembl_versions]{list_ensembl_versions()}} to see available versions, +and \code{\link[=list_ensembl_species]{list_ensembl_species()}} to confirm your species exists in that +version. Default = \code{"current"}.} -\item{format}{The output is saved in .csv or .xlsx formats. Default = csv.} +\item{filename}{Name of the output file without extension. If \code{NULL}, the +result is returned but \strong{not} saved to disk. Default = +\code{"gene_annotations"}.} + +\item{format}{Output format: \code{"csv"} or \code{"xlsx"}. Ignored when +\code{filename = NULL}. Default = \code{"csv"}.} } \value{ -A data frame with one row per input ID and the following columns: -\code{geneID}, \code{symbol}, \code{biotype}, \code{chromosome}, \code{gene_start}, \code{gene_end}, -\code{gene_length}, \code{description}. For \code{mode = "transcripts"}, an additional -\code{transcriptID} column is included. The data frame is also saved to disk -as a \code{.csv} or \code{.xlsx} file (see \code{filename} and \code{format}). +A data frame with one row per input ID and columns: \code{geneID}, +\code{symbol}, \code{biotype}, \code{chromosome}, \code{gene_start}, \code{gene_end}, +\code{gene_length}, \code{description}. For \code{mode = "transcripts"}, a \code{transcriptID} +column is prepended. If \code{filename} is not \code{NULL}, the data frame is also +written to disk. } \description{ This function annotates a column of transcripts or gene IDs (ENSEMBL) with information of the Biomart. -If transcript IDs are provided, they are also annotated with information of the genes to which they belong. } \details{ The Gene information added include: \itemize{ -\item Gene ENSEMBL ID, HGNC Symbol, Description, Biotype and Chromosome. +\item Gene ENSEMBL ID, gene Symbol, Description, Biotype and Chromosome. \item Gene start, end and length } + +Annotates a vector of Ensembl gene or transcript IDs using BioMart. If +transcript IDs are provided, they are also annotated with information +of the genes to which they belong. Works for any species available in +Ensembl — use \code{\link[=list_ensembl_species]{list_ensembl_species()}} to find the correct \code{species} value +for your organism, and \code{\link[=list_ensembl_versions]{list_ensembl_versions()}} to verify version availability. + +\strong{Workflow:} +\enumerate{ +\item \code{list_ensembl_species()} → find dataset name. +\item \code{list_ensembl_versions()} → confirm version exists. +\item \code{get_annotations(ensembl_ids, species = "drerio_gene_ensembl")}. +} + +\strong{Symbol resolution by species:} +\itemize{ +\item Human (\code{hsapiens}): uses \code{hgnc_symbol}, falling back to +\code{external_gene_name} when HGNC symbol is absent. +\item Mouse (\code{mmusculus}): uses \code{mgi_symbol}. +\item Rat (\code{rnorvegicus}): uses \code{rgd_symbol}. +\item Zebrafish (\code{drerio}): uses \code{zfin_id_symbol}. +\item Drosophila (\code{dmelanogaster}): uses \code{flybasename_gene}. +\item All other species: uses \code{external_gene_name} (universal fallback). +} } \note{ Requires an active internet connection to query the Ensembl BioMart. -\code{gene_length} is computed as \code{gene_end - gene_start + 1} (genomic length). -For TPM calculation with \code{\link[=tpm]{tpm()}}, this is an approximation, -use transcript-level lengths for higher accuracy. +\code{gene_length} is computed as \code{gene_end - gene_start + 1} (genomic span). +For TPM calculation with \code{\link[=tpm]{tpm()}}, transcript-level lengths are more +accurate. } \examples{ \dontrun{ -# Annotate genes from Normalized counts (requires internet connection) -data(norm_counts) +# Step 1 — find your species dataset +list_ensembl_species(version = "112", filter = "zebrafish") +# -> "drerio_gene_ensembl" -# Requires a reference table with a "geneID" column. -# Use get_annotations() to generate it: +# Step 2 — annotate (human, default) annotations <- get_annotations( - ensembl_ids = rownames(norm_counts), - mode = "genes" + ensembl_ids = c("ENSG00000141510", "ENSG00000012048"), + species = "hsapiens_gene_ensembl", + version = "112", + filename = NULL ) -head(annotations) +# Mouse +annotations_mouse <- get_annotations( + ensembl_ids = c("ENSMUSG00000059552", "ENSMUSG00000024610"), + species = "mmusculus_gene_ensembl", + version = "112", + filename = NULL +) -# Use with add_annotations() -norm_counts_annot <- add_annotations( - object = norm_counts, - reference = annotations, - variables = c("symbol", "biotype") +# Zebrafish +annotations_zf <- get_annotations( + ensembl_ids = c("ENSDARG00000002333"), + species = "drerio_gene_ensembl", + version = "112", + filename = NULL ) } } \seealso{ -\code{\link[=add_annotations]{add_annotations()}} to join annotations to a counts matrix; -\code{\link[=tpm]{tpm()}} which requires gene lengths from this function. +\code{\link[=list_ensembl_versions]{list_ensembl_versions()}}, \code{\link[=list_ensembl_species]{list_ensembl_species()}} to find +the identifier for other organisms, \code{\link[=add_annotations]{add_annotations()}}, \code{\link[=tpm]{tpm()}} } diff --git a/man/list_ensembl_species.Rd b/man/list_ensembl_species.Rd new file mode 100644 index 0000000..b425778 --- /dev/null +++ b/man/list_ensembl_species.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_annotations.R +\name{list_ensembl_species} +\alias{list_ensembl_species} +\title{List all species datasets available in a given Ensembl version.} +\usage{ +list_ensembl_species(version = "current", filter = NULL) +} +\arguments{ +\item{version}{Ensembl release version as a string (e.g. \code{"112"}, \code{"114"}), +or \code{"current"} for the latest release. Default = \code{"current"}.} + +\item{filter}{Optional. A search string to filter results by dataset name +or description (case-insensitive). Useful for quickly finding a species +without scrolling through hundreds of rows. Default = \code{NULL} (no filter).} +} +\value{ +A data frame with columns: +\describe{ +\item{dataset}{The dataset identifier to pass to \code{species} in +\code{\link[=get_annotations]{get_annotations()}} (e.g. \code{"hsapiens_gene_ensembl"}).} +\item{description}{Human-readable species name and assembly +(e.g. \code{"Human genes (GRCh38.p14)"}).} +\item{version}{Assembly version string.} +} +} +\description{ +Queries BioMart at the specified Ensembl version and returns a data frame +of all available gene datasets. Use this to: +\itemize{ +\item Find the exact \code{species} string to pass to \code{\link[=get_annotations]{get_annotations()}}. +\item Confirm that a species is available in a specific Ensembl version. +} +} +\details{ +Because Ensembl releases are universal (same version number for all +species), this function simply lists which species datasets exist in the +BioMart of the requested version. Species added in later releases will not +appear in older versions. +} +\examples{ +\dontrun{ +# All species in version 112 +list_ensembl_species(version = "112") + +# Find zebrafish dataset in version 112 +list_ensembl_species(version = "112", filter = "zebrafish") + +# Find mouse dataset +list_ensembl_species(filter = "mouse") + +# Find all fish species +list_ensembl_species(filter = "fish") +} + +} +\seealso{ +\code{\link[=list_ensembl_versions]{list_ensembl_versions()}}, \code{\link[=get_annotations]{get_annotations()}} +} diff --git a/man/list_ensembl_versions.Rd b/man/list_ensembl_versions.Rd new file mode 100644 index 0000000..d6e1480 --- /dev/null +++ b/man/list_ensembl_versions.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_annotations.R +\name{list_ensembl_versions} +\alias{list_ensembl_versions} +\title{List available Ensembl releases and their BioMart hosts.} +\usage{ +list_ensembl_versions() +} +\value{ +A data frame with columns \code{name}, \code{date}, \code{url}, \code{version}, and +\code{current_release} (marked with \code{*} for the active release). +} +\description{ +A convenience wrapper around \code{\link[biomaRt:listEnsemblArchives]{biomaRt::listEnsemblArchives()}} that returns +all available Ensembl release versions with their dates and host URLs. +} +\details{ +Ensembl uses a \strong{universal release numbering system}: every release (e.g. +v112, v113) covers \emph{all} species simultaneously. What varies between species +is whether the species is present in a given release and which genome +assembly is used. To check whether your species of interest is available in +a particular version, use \code{\link[=list_ensembl_species]{list_ensembl_species()}}. + +The recommended workflow is: +\enumerate{ +\item Find the dataset name for your species: \code{list_ensembl_species()}. +\item Find available versions and confirm species presence: +\code{list_ensembl_versions()} + \code{list_ensembl_species(version = "X")}. +\item Annotate: \code{get_annotations(species = "...", version = "...")}. +} +} +\examples{ +\dontrun{ +# See all available Ensembl versions +list_ensembl_versions() + +# Then check if your species is in a specific version +list_ensembl_species(version = "112") +} + +} +\seealso{ +\code{\link[=list_ensembl_species]{list_ensembl_species()}}, \code{\link[=get_annotations]{get_annotations()}} +} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..e4be3fc --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(OmicsKit) + +test_check("OmicsKit") diff --git a/tests/testthat/test-get_annotations.R b/tests/testthat/test-get_annotations.R new file mode 100644 index 0000000..8f256b9 --- /dev/null +++ b/tests/testthat/test-get_annotations.R @@ -0,0 +1,60 @@ +testthat::test_that("get_annotations returns expected columns for genes", { + testthat::skip_if_not_installed("biomaRt") + testthat::skip_if_offline("www.ensembl.org") + testthat::skip_on_cran() + + ids <- c("ENSG00000141510") # TP53 + res <- get_annotations( + ensembl_ids = ids, + species = "hsapiens_gene_ensembl", + version = "112", + filename = NULL + ) + + expect_s3_class(res, "data.frame") + expect_true(all(c("geneID", "symbol", "biotype", "chromosome", + "gene_start", "gene_end", "gene_length", "description") %in% names(res))) + expect_equal(res$geneID[1], "ENSG00000141510") +}) + +testthat::test_that("get_annotations works for transcripts", { + testthat::skip_if_not_installed("biomaRt") + testthat::skip_if_offline("www.ensembl.org") + testthat::skip_on_cran() + + # Obtener un transcript válido del release 112 + ensembl <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", + dataset = "hsapiens_gene_ensembl", + host = .resolve_ensembl_host("112")) + + ids <- biomaRt::getBM( + attributes = "ensembl_transcript_id_version", + filters = "ensembl_gene_id", + values = "ENSG00000141510", + mart = ensembl + )$ensembl_transcript_id_version + + testthat::skip_if(length(ids) == 0) + + res <- get_annotations( + ensembl_ids = ids[1], + species = "hsapiens_gene_ensembl", + mode = "transcripts", + version = "112", + filename = NULL + ) + + expect_s3_class(res, "data.frame") + expect_true("transcriptID" %in% names(res)) + expect_equal(res$transcriptID[1], ids[1]) +}) + + +testthat::test_that("get_annotations errors when biomaRt is missing", { + testthat::skip_if_not_installed("withr") + + withr::with_namespace("biomaRt", { + # If biomaRt is installed, skip this test + testthat::skip("biomaRt is installed; cannot simulate missing package here.") + }) +}) diff --git a/tests/testthat/test-helpers.R b/tests/testthat/test-helpers.R new file mode 100644 index 0000000..255dec7 --- /dev/null +++ b/tests/testthat/test-helpers.R @@ -0,0 +1,31 @@ +testthat::test_that(".resolve_ensembl_host handles current", { + expect_equal(.resolve_ensembl_host("current"), "https://www.ensembl.org") +}) + +testthat::test_that(".get_symbol_attribute returns HGNC for human", { + res <- .get_symbol_attribute(NULL, "hsapiens_gene_ensembl") + expect_equal(res$attr, "hgnc_symbol") + expect_equal(res$source, "HGNC") +}) + +testthat::test_that(".resolve_symbol_column sets symbol for human", { + df <- data.frame( + hgnc_symbol = c("", "TP53"), + external_gene_name = c("TP53", "TP53"), + stringsAsFactors = FALSE + ) + + out <- .resolve_symbol_column(df, is_human = TRUE, symbol_attr = "hgnc_symbol") + expect_equal(out$symbol[1], "TP53") # fallback to external_gene_name + expect_equal(out$symbol[2], "TP53") # hgnc_symbol preferred +}) + +testthat::test_that(".resolve_symbol_column sets symbol for non-human", { + df <- data.frame( + mgi_symbol = c("Actb"), + stringsAsFactors = FALSE + ) + + out <- .resolve_symbol_column(df, is_human = FALSE, symbol_attr = "mgi_symbol") + expect_equal(out$symbol[1], "Actb") +}) diff --git a/vignettes/DEA_workflow.Rmd b/vignettes/DEA_workflow.Rmd index b8388b7..a02f104 100644 --- a/vignettes/DEA_workflow.Rmd +++ b/vignettes/DEA_workflow.Rmd @@ -36,6 +36,7 @@ The workflow covers: ```{r packages} library(OmicsKit) library(ggplot2) +library(biomaRt) # for get_annotations ``` --- @@ -195,15 +196,21 @@ sum(deseq2_results$padj < 0.05, na.rm = TRUE) `get_annotations()` queries Ensembl via biomaRt to retrieve gene symbols, biotype, chromosomal location, and length. Requires an internet connection. +Use `list_ensembl_species()` to find the correct dataset name for your organism, +and `list_ensembl_versions()` to see available releases. + ```{r get_annotations, eval = FALSE} +# Example: list datasets (optional) +# list_ensembl_species(version = "current", filter = "human") + annotations <- get_annotations( ensembl_ids = deseq2_results$gene_id, + species = "hsapiens_gene_ensembl", mode = "genes", - version = "Current", + version = "current", filename = "luad_gene_annotations", format = "csv" ) - head(annotations) ``` @@ -217,7 +224,8 @@ Ensembl IDs as the key: norm_counts_annot <- add_annotations( object = norm_counts, reference = annotations, - variables = c("symbol", "biotype") + variables = c("symbol", "biotype"), + data_frame = TRUE ) head(norm_counts_annot[, c("geneID", "symbol", "biotype")]) diff --git a/vignettes/PA_workflow.Rmd b/vignettes/PA_workflow.Rmd index 97a42c9..e0f9918 100644 --- a/vignettes/PA_workflow.Rmd +++ b/vignettes/PA_workflow.Rmd @@ -67,12 +67,30 @@ In this vignette, we use a built-in example `geneset_list`, which contains immune response, and metabolism), and the built-in `gsea_results` dataset, which mimics the output of `merge_PA()` for three comparisons across HALLMARK, KEGG, and GO collections. -```{r load_data} +```{r load_data, eval = FALSE} data(geneset_list) data(gsea_results) data(deseq2_results) data(vst_counts) data(sampledata) + +# Retrieve HGNC symbols +annotations <- get_annotations( + ensembl_ids = deseq2_results$gene_id, + species = "hsapiens_gene_ensembl", + mode = "genes", + version = "112", + filename = "luad_gene_annotations", + format = "csv" +) + +# Set format to add_annotations +rownames(deseq2_results) <- deseq2_results$gene_id +deseq2_results_annot <- add_annotations(object = deseq2_results, + reference = annotations, + variables = c("symbol", "biotype"), + data_frame = TRUE +) # Ranked gene list: DESeq2 Wald stat from most positive to most negative ranked_genes <- deseq2_results$gene_id[ order(deseq2_results$stat, decreasing = TRUE) @@ -173,12 +191,27 @@ files are organized into subdirectories by format (PDF / JPG) and gene selection mode (`all_genes`, `le_genes`, `top_genes`). ```{r heatmap_PA, eval = FALSE} +# Change rownames from vst to hgnc symbols and clean format +idx_vst <- match(rownames(vst_counts), annotations$geneID) +new_rownames <- rownames(vst_counts) + +valid_vst <- !is.na(idx_vst) & + !is.na(annotations$symbol[idx_vst]) & + annotations$symbol[idx_vst] != "" + +new_rownames[valid_vst] <- annotations$symbol[idx_vst[valid_vst]] + +vst_sym <- vst_counts +rownames(vst_sym) <- new_rownames +vst_sym <- vst_sym[!duplicated(rownames(vst_sym)), ] + +# Plot heatmap_PA( - expression_data = vst_counts, + expression_data = vst_sym, metadata = sampledata, pa_data_annot = pa_annot, ranked_genes = ranked_genes, - plot_genes = c("all_genes", "le_genes", "top_genes"), + plot_genes = c("all_genes"), sample_col = "patient_id", group_col = "sample_type", out_dir = "heatmaps_PA",