Skip to content

Commit

Permalink
Add identifiers table to traits.build output (#195)
Browse files Browse the repository at this point in the history
Add a dataframe to a traits.build output that documents "identifiers"
that link specimens/individuals/etc. across datasets and to
museum/herbarium collections.

Six types of identifiers have been added to the traits.build schema,
reflecting key identifiers types specified by DarwinCore. This is not a
comprehensive list of all DarwinCore identifiers - but instead of adding
others, this seems like a good starting point, and traits.build has been
coded to NOT throw an error is a different `identifier_type` is used.

Note, `catalog_number` is for non-universal identifiers that represent
an individual researchers plant tag system while `collectionID` and
`materialSampleID` are meant to be universally unique identifiers.

Notes:
- coded so that metadata.yml files are not required to have an
`identifiers` section, so people already using traits.build do not need
to retrofit their metadata files
- there is a corresponding branch in {austraits} that makes changes to
functions to accommodate the changed output structure. The DESCRIPTION
specifies that {austraits} must be installed from this commit - and will
need to be changed back to master when new releases of {austraits} and
{traits.build} are simultaneously released.
- pull request includes: tests, new function to add identifiers to
metadata files

_I don't understand why changes that were previously pushed to master
are appearing here. For instance, the renaming of functions. Confirming
this pull request is from `add_specimen_ID` branch to `develop`_
  • Loading branch information
ehwenk authored Jan 29, 2025
1 parent e825a40 commit 27a6517
Show file tree
Hide file tree
Showing 35 changed files with 4,266 additions and 120 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Suggests:
covr
Remotes:
richfitz/remake,
traitecoevo/austraits
traitecoevo/austraits@3585fab
Encoding: UTF-8
VignetteBuilder: knitr
RoxygenNote: 7.3.2
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export(flatten_database)
export(get_schema)
export(get_unit_conversions)
export(metadata_add_contexts)
export(metadata_add_identifiers)
export(metadata_add_locations)
export(metadata_add_source_bibtex)
export(metadata_add_source_doi)
Expand Down
86 changes: 80 additions & 6 deletions R/process.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,23 @@ dataset_process <- function(filename_data_raw,
contexts <-
metadata$contexts %>%
process_format_contexts(dataset_id, traits)

# Load identifiers
if ("identifiers" %in% names(metadata) & !all(is.na(metadata[["identifiers"]]))) {
identifiers <-
metadata[["identifiers"]] %>% austraits::convert_list_to_df2()
} else {
identifiers <- list(
"var_in",
"identifier_type"
)
}

# Load and clean trait data
traits <-
traits %>%
process_parse_data(dataset_id, metadata, contexts, schema)

process_parse_data(dataset_id, metadata, contexts, schema, identifiers)
# Context ids needed to continue processing
context_ids <- traits$context_ids

Expand All @@ -112,9 +123,9 @@ dataset_process <- function(filename_data_raw,
traits$traits %>%
process_add_all_columns(
c(names(schema[["austraits"]][["elements"]][["traits"]][["elements"]]),
"parsing_id", "location_name", "taxonomic_resolution", "methods", "unit_in")
"parsing_id", "location_name", "taxonomic_resolution", "methods", "unit_in", identifiers$var_in)
)

# Replace old `location_id` with a new `location_id`
if (nrow(locations) > 0) {
traits <-
Expand Down Expand Up @@ -178,6 +189,33 @@ dataset_process <- function(filename_data_raw,
dplyr::arrange(.data$observation_id, .data$trait_name, .data$value_type) %>%
# Ensure everything converted to character type
util_df_convert_character()

# Separate identifiers into a standalone table that is in long format.
# Needs to happen now after `observation_id` is set.

identifiers_tmp <- traits %>%
dplyr::select(dplyr::all_of(c("dataset_id", "observation_id", identifiers$var_in)))

if (ncol(identifiers_tmp) >= 3) {
identifiers_tmp <- identifiers_tmp %>%
tidyr::pivot_longer(cols = 3:ncol(identifiers_tmp)) %>%
dplyr::rename(identifier_value = value, var_in = name) %>%
dplyr::left_join(identifiers, by = join_by(var_in)) %>%
dplyr::select(dataset_id, observation_id, identifier_type, identifier_value) %>%
dplyr::filter(!is.na(.data$identifier_value), !is.na(.data$observation_id)) %>%
dplyr::arrange(observation_id, identifier_type) %>%
dplyr::distinct()
} else {
identifiers_tmp <- tibble::tibble(
dataset_id = character(0),
observation_id = character(0),
identifier_type = character(0),
identifier_value = character(0)
)
}

traits <- traits %>%
dplyr::select(-dplyr::all_of(identifiers$var_in))

# Record contributors
contributors <-
Expand Down Expand Up @@ -287,6 +325,7 @@ dataset_process <- function(filename_data_raw,
dplyr::select(dplyr::all_of(c(taxon_name = "aligned_name"))) %>%
dplyr::distinct(),
contributors = contributors,
identifiers = identifiers_tmp,
sources = sources,
definitions = definitions,
schema = schema,
Expand Down Expand Up @@ -1320,7 +1359,7 @@ process_add_all_columns <- function(data, vars, add_error_column = TRUE) {
#' substitutions and unique observation id added
#' @importFrom dplyr select mutate filter arrange distinct case_when full_join everything any_of bind_cols
#' @importFrom rlang .data
process_parse_data <- function(data, dataset_id, metadata, contexts, schema) {
process_parse_data <- function(data, dataset_id, metadata, contexts, schema, identifiers) {

# Get config data for dataset
data_is_long_format <- metadata[["dataset"]][["data_is_long_format"]]
Expand All @@ -1336,7 +1375,7 @@ process_parse_data <- function(data, dataset_id, metadata, contexts, schema) {

df <- data %>%
# Next step selects and renames columns based on named vector
dplyr::select(dplyr::any_of(c(var_in[i], v, contexts$var_in))) %>% # Why select v? When would those ids ever be in the data?
dplyr::select(dplyr::any_of(c(var_in[i], v, contexts$var_in, identifiers$var_in))) %>% # Why select v? When would those ids ever be in the data?
dplyr::mutate(dataset_id = dataset_id)

# Step 1b. Import any values that aren't columns of data
Expand Down Expand Up @@ -1618,6 +1657,41 @@ process_format_contributors <- function(my_list, dataset_id, schema) {
contributors
}

#' Format identifiers from list into tibble
#'
#' Format identifiers, read in from the `metadata.yml` file. Converts from list to tibble.
#'
#' @param my_list List of input information
#' @param dataset_id Identifier for a particular study in the AusTraits database
#' @param schema Schema for traits.build
#'
#' @return Tibble with details of identifiers
#' @importFrom rlang .data
#'
#' @examples
#' \dontrun{
#' process_format_identifiers(read_metadata("data/Falster_2003/metadata.yml")$identifiers)
#' }
process_format_identifiers <- function(my_list, dataset_id, traits) {

if (length(unlist(my_list$identifiers)) > 1) {
identifiers <-
my_list$identifiers %>%
austraits::convert_list_to_df2() %>%
dplyr::mutate(dataset_id = dataset_id)
} else {
identifiers <- tibble::tibble(dataset_id = character())
}

identifiers <-
identifiers %>%
process_add_all_columns(
names(schema[["austraits"]][["elements"]][["identifiers"]][["elements"]]),
add_error_column = FALSE)

identifiers
}

process_format_methods <- function(metadata, dataset_id, sources, contributors) {

# Identify sources as being `primary`, `secondary` or `original`
Expand Down
72 changes: 72 additions & 0 deletions R/setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,78 @@ metadata_add_contexts <- function(dataset_id, overwrite = FALSE, user_responses

}

#' For specified `dataset_id` import database cross-referencing identifer data from a dataframe
#'
#' This functions asks users which columns in the dataframe they would like to keep
#' and records this appropriately in the metadata. The input data is assumed to be
#' in wide format.
#' The output may require additional manual editing if the user selects a non-standard identifier type.
#'
#' @inheritParams metadata_path_dataset_id
#' @param overwrite Overwrite existing information
#' @param user_responses Named list containing simulated user input for manual selection
#' of variables, mainly for testing purposes
#'
#' @importFrom rlang .data
#' @export
metadata_add_identifiers <- function(dataset_id, overwrite = FALSE) {

# Read metadata
metadata <- read_metadata_dataset(dataset_id)

# Load and clean trait data
data <-
readr::read_csv(file.path("data", dataset_id, "data.csv"), col_types = cols()) %>%
process_custom_code(metadata[["dataset"]][["custom_R_code"]])()

# Get list of potential columns
v <- names(data)

# Check for existing info and if it exists, retain information if overwrite = FALSE
if (!overwrite && !is.na(metadata$identifiers[1])) {

identifiers <- metadata$identifiers
n_existing <- length(metadata$identifiers)

message(
sprintf(
red("Existing identifier information detected, from the following columns in the dataset: ") %+%
green("'%s'\n\t") %+% red("Metadata is being appended; please review duplicates manually"),
identifiers %>% purrr::map_chr(~.x[["var_in"]]) %>% paste(collapse = "', '"))
)

} else {

identifiers <- list()
n_existing <- 0

}

var_in <- metadata_user_select_names(
paste("Indicate all columns that contain identifiers that cross-reference between observations in ", dataset_id, " and an herbarium voucher or another database."), v)

types <-
c("catalogNumber", "collectionID", "institutionCode", "institutionID", "materialSampleID", "occurrenceID")

for (i in seq_along(var_in)) {

ii <- n_existing + i
identifier_type <- metadata_user_select_names(
paste("What identifier type does ", var_in[i], "fit in?"), types)

identifiers[[ii]] <-
list(
var_in = var_in[i],
identifier_type = identifier_type
)

}

metadata$identifiers <- identifiers
write_metadata_dataset(metadata, dataset_id)
return(invisible(metadata))

}

#' Adds citation details to a metadata file for given study
#'
Expand Down
45 changes: 40 additions & 5 deletions R/testdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,18 @@ dataset_test_worker <-
f <- files[2]
test_expect_allowed_text(readLines(f, encoding = "UTF-8"), info = paste0(red(f), "\tmetadata"), label = "metadata")
testthat::expect_silent(metadata <- yaml::read_yaml(f))
test_expect_list_names_exact(
metadata, schema$metadata$elements %>% names(),
info = red(f), label = "metadata sections"
)

if (!is.null(metadata[["identifiers"]])) {
test_expect_list_names_exact(
metadata, schema$metadata$elements %>% names(),
info = red(f), label = "metadata sections"
)
} else {
test_expect_list_names_allowed(
metadata, schema$metadata$elements %>% names(),
info = red(f), label = "metadata sections"
)
}

## Custom R code
txt <- metadata[["dataset"]][["custom_R_code"]]
Expand Down Expand Up @@ -214,6 +222,23 @@ dataset_test_worker <-
info = paste0(red(f), "\tdataset"), label = "metadata"
)

## Identifiers
if (!"identifiers" %in% names(metadata)) {
testthat::expect_silent(
identifiers <-
metadata$identifiers %>%
process_format_identifiers(dataset_id, data)
)
}

if (!"identifiers" %in% names(metadata)) {
test_expect_list_elements_exact_names(
metadata$identifiers,
schema$metadata$elements$identifiers$values %>% names(),
info = paste0(red(f), "\tidentifiers")
)
}

## Locations

testthat::expect_silent(
Expand Down Expand Up @@ -679,10 +704,20 @@ dataset_test_worker <-

}

# Load identifiers
if ("identifiers" %in% names(metadata) & !all(is.na(metadata[["identifiers"]]))) {
identifiers <-
metadata[["identifiers"]] %>% austraits::convert_list_to_df2()
} else {
identifiers <- list(
"var_in",
"identifier_type"
)
}
## Check that special characters do not make it into the data
test_expect_no_error(
parsed_data <- data %>%
process_parse_data(dataset_id, metadata, contexts, schema),
process_parse_data(dataset_id, metadata, contexts, schema, identifiers),
info = sprintf("%s\t`process_parse_data`", red(dataset_id)))

test_expect_allowed_text(
Expand Down
4 changes: 4 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ read_metadata <- function(path) {
paste(collapse = "\n")
}

if (!"identifier" %in% names(data)) {
data["identifier"] <- NA
}

data
}

Expand Down
Loading

0 comments on commit 27a6517

Please sign in to comment.