Add identifiers table to traits.build output (#195)

Add a dataframe to a traits.build output that documents "identifiers" that link specimens/individuals/etc. across datasets and to museum/herbarium collections. Six types of identifiers have been added to the traits.build schema, reflecting key identifiers types specified by DarwinCore. This is not a comprehensive list of all DarwinCore identifiers - but instead of adding others, this seems like a good starting point, and traits.build has been coded to NOT throw an error is a different `identifier_type` is used. Note, `catalog_number` is for non-universal identifiers that represent an individual researchers plant tag system while `collectionID` and `materialSampleID` are meant to be universally unique identifiers. Notes: - coded so that metadata.yml files are not required to have an `identifiers` section, so people already using traits.build do not need to retrofit their metadata files - there is a corresponding branch in {austraits} that makes changes to functions to accommodate the changed output structure. The DESCRIPTION specifies that {austraits} must be installed from this commit - and will need to be changed back to master when new releases of {austraits} and {traits.build} are simultaneously released. - pull request includes: tests, new function to add identifiers to metadata files _I don't understand why changes that were previously pushed to master are appearing here. For instance, the renaming of functions. Confirming this pull request is from `add_specimen_ID` branch to `develop`_
traitecoevo · Jan 29, 2025 · 27a6517 · 27a6517
1 parent e825a40
commit 27a6517
Show file tree

Hide file tree

Showing 35 changed files with 4,266 additions and 120 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -60,7 +60,7 @@ Suggests:
     covr
 Remotes:
     richfitz/remake,
-    traitecoevo/austraits
+    traitecoevo/austraits@3585fab
 Encoding: UTF-8
 VignetteBuilder: knitr
 RoxygenNote: 7.3.2

diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,7 @@ export(flatten_database)
 export(get_schema)
 export(get_unit_conversions)
 export(metadata_add_contexts)
+export(metadata_add_identifiers)
 export(metadata_add_locations)
 export(metadata_add_source_bibtex)
 export(metadata_add_source_doi)

diff --git a/R/process.R b/R/process.R
@@ -94,12 +94,23 @@ dataset_process <- function(filename_data_raw,
   contexts <-
     metadata$contexts %>%
     process_format_contexts(dataset_id, traits)
+
+  # Load identifiers
+  if ("identifiers" %in% names(metadata) & !all(is.na(metadata[["identifiers"]]))) {
+    identifiers <-
+      metadata[["identifiers"]] %>% austraits::convert_list_to_df2()
+  } else {
+    identifiers <- list(
+      "var_in",
+      "identifier_type"
+    )
+  }
 
   # Load and clean trait data
   traits <-
     traits %>%
-    process_parse_data(dataset_id, metadata, contexts, schema)
-
+    process_parse_data(dataset_id, metadata, contexts, schema, identifiers)
+  
   # Context ids needed to continue processing
   context_ids <- traits$context_ids
 
@@ -112,9 +123,9 @@ dataset_process <- function(filename_data_raw,
     traits$traits %>%
     process_add_all_columns(
       c(names(schema[["austraits"]][["elements"]][["traits"]][["elements"]]),
-        "parsing_id", "location_name", "taxonomic_resolution", "methods", "unit_in")
+        "parsing_id", "location_name", "taxonomic_resolution", "methods", "unit_in", identifiers$var_in)
     )
-
+  
   # Replace old `location_id` with a new `location_id`
   if (nrow(locations) > 0) {
     traits <-
@@ -178,6 +189,33 @@ dataset_process <- function(filename_data_raw,
     dplyr::arrange(.data$observation_id, .data$trait_name, .data$value_type) %>%
     # Ensure everything converted to character type
     util_df_convert_character()
+
+  # Separate identifiers into a standalone table that is in long format.
+  # Needs to happen now after `observation_id` is set.
+
+  identifiers_tmp <- traits %>% 
+    dplyr::select(dplyr::all_of(c("dataset_id", "observation_id", identifiers$var_in)))
+
+  if (ncol(identifiers_tmp) >= 3) {
+    identifiers_tmp <- identifiers_tmp %>%
+      tidyr::pivot_longer(cols = 3:ncol(identifiers_tmp)) %>%
+      dplyr::rename(identifier_value = value, var_in = name) %>%
+      dplyr::left_join(identifiers, by = join_by(var_in)) %>%
+      dplyr::select(dataset_id, observation_id, identifier_type, identifier_value) %>%
+      dplyr::filter(!is.na(.data$identifier_value), !is.na(.data$observation_id)) %>%
+      dplyr::arrange(observation_id, identifier_type) %>%
+      dplyr::distinct()
+  } else {
+    identifiers_tmp <- tibble::tibble(
+      dataset_id = character(0),
+      observation_id = character(0),
+      identifier_type = character(0),
+      identifier_value = character(0)
+    )
+  }
+
+  traits <- traits %>%
+    dplyr::select(-dplyr::all_of(identifiers$var_in))
 
   # Record contributors
   contributors <-
@@ -287,6 +325,7 @@ dataset_process <- function(filename_data_raw,
         dplyr::select(dplyr::all_of(c(taxon_name = "aligned_name"))) %>%
         dplyr::distinct(),
       contributors = contributors,
+      identifiers = identifiers_tmp,
       sources = sources,
       definitions = definitions,
       schema = schema,
@@ -1320,7 +1359,7 @@ process_add_all_columns <- function(data, vars, add_error_column = TRUE) {
 #' substitutions and unique observation id added
 #' @importFrom dplyr select mutate filter arrange distinct case_when full_join everything any_of bind_cols
 #' @importFrom rlang .data
-process_parse_data <- function(data, dataset_id, metadata, contexts, schema) {
+process_parse_data <- function(data, dataset_id, metadata, contexts, schema, identifiers) {
 
   # Get config data for dataset
   data_is_long_format <- metadata[["dataset"]][["data_is_long_format"]]
@@ -1336,7 +1375,7 @@ process_parse_data <- function(data, dataset_id, metadata, contexts, schema) {
 
   df <- data %>%
     # Next step selects and renames columns based on named vector
-    dplyr::select(dplyr::any_of(c(var_in[i], v, contexts$var_in))) %>% # Why select v? When would those ids ever be in the data?
+    dplyr::select(dplyr::any_of(c(var_in[i], v, contexts$var_in, identifiers$var_in))) %>% # Why select v? When would those ids ever be in the data?
     dplyr::mutate(dataset_id = dataset_id)
 
   # Step 1b. Import any values that aren't columns of data
@@ -1618,6 +1657,41 @@ process_format_contributors <- function(my_list, dataset_id, schema) {
   contributors
 }
 
+#' Format identifiers from list into tibble
+#'
+#' Format identifiers, read in from the `metadata.yml` file. Converts from list to tibble.
+#'
+#' @param my_list List of input information
+#' @param dataset_id Identifier for a particular study in the AusTraits database
+#' @param schema Schema for traits.build
+#'
+#' @return Tibble with details of identifiers
+#' @importFrom rlang .data
+#'
+#' @examples
+#' \dontrun{
+#' process_format_identifiers(read_metadata("data/Falster_2003/metadata.yml")$identifiers)
+#' }
+process_format_identifiers <- function(my_list, dataset_id, traits) {
+
+    if (length(unlist(my_list$identifiers)) > 1) {
+      identifiers <-
+        my_list$identifiers %>%
+        austraits::convert_list_to_df2() %>%
+        dplyr::mutate(dataset_id = dataset_id)
+    } else {
+      identifiers <- tibble::tibble(dataset_id = character())
+    }
+
+    identifiers <-
+      identifiers %>%
+      process_add_all_columns(
+        names(schema[["austraits"]][["elements"]][["identifiers"]][["elements"]]),
+        add_error_column = FALSE)
+
+  identifiers
+}
+
 process_format_methods <- function(metadata, dataset_id, sources, contributors) {
 
   # Identify sources as being `primary`, `secondary` or `original`

diff --git a/R/setup.R b/R/setup.R
@@ -558,6 +558,78 @@ metadata_add_contexts <- function(dataset_id, overwrite = FALSE, user_responses
 
 }
 
+#' For specified `dataset_id` import database cross-referencing identifer data from a dataframe
+#'
+#' This functions asks users which columns in the dataframe they would like to keep
+#' and records this appropriately in the metadata. The input data is assumed to be
+#' in wide format.
+#' The output may require additional manual editing if the user selects a non-standard identifier type.
+#'
+#' @inheritParams metadata_path_dataset_id
+#' @param overwrite Overwrite existing information
+#' @param user_responses Named list containing simulated user input for manual selection
+#' of variables, mainly for testing purposes
+#'
+#' @importFrom rlang .data
+#' @export
+metadata_add_identifiers <- function(dataset_id, overwrite = FALSE) {
+
+  # Read metadata
+  metadata <- read_metadata_dataset(dataset_id)
+
+  # Load and clean trait data
+  data <-
+    readr::read_csv(file.path("data", dataset_id, "data.csv"), col_types = cols()) %>%
+    process_custom_code(metadata[["dataset"]][["custom_R_code"]])()
+
+  # Get list of potential columns
+  v <- names(data)
+
+  # Check for existing info and if it exists, retain information if overwrite = FALSE
+  if (!overwrite && !is.na(metadata$identifiers[1])) {
+
+    identifiers <- metadata$identifiers
+    n_existing <- length(metadata$identifiers)
+
+    message(
+      sprintf(
+        red("Existing identifier information detected, from the following columns in the dataset: ") %+%
+        green("'%s'\n\t") %+% red("Metadata is being appended; please review duplicates manually"),
+      identifiers %>% purrr::map_chr(~.x[["var_in"]]) %>% paste(collapse = "', '"))
+    )
+
+  } else {
+
+    identifiers <- list()
+    n_existing <- 0
+
+  }
+
+    var_in <- metadata_user_select_names(
+      paste("Indicate all columns that contain identifiers that cross-reference between observations in ", dataset_id, " and an herbarium voucher or another database."), v)
+
+    types <-
+      c("catalogNumber", "collectionID", "institutionCode", "institutionID", "materialSampleID", "occurrenceID")
+
+    for (i in seq_along(var_in)) {
+
+      ii <- n_existing + i
+      identifier_type <- metadata_user_select_names(
+        paste("What identifier type does ", var_in[i], "fit in?"), types)
+
+      identifiers[[ii]] <-
+        list(
+          var_in = var_in[i],
+          identifier_type = identifier_type
+        )
+
+    }
+
+  metadata$identifiers <- identifiers
+  write_metadata_dataset(metadata, dataset_id)
+  return(invisible(metadata))
+
+}
 
 #' Adds citation details to a metadata file for given study
 #'

diff --git a/R/testdata.R b/R/testdata.R
@@ -100,10 +100,18 @@ dataset_test_worker <-
         f <- files[2]
         test_expect_allowed_text(readLines(f, encoding = "UTF-8"), info = paste0(red(f), "\tmetadata"), label = "metadata")
         testthat::expect_silent(metadata <- yaml::read_yaml(f))
-        test_expect_list_names_exact(
-          metadata, schema$metadata$elements %>% names(),
-          info = red(f), label = "metadata sections"
-        )
+
+        if (!is.null(metadata[["identifiers"]])) {
+          test_expect_list_names_exact(
+            metadata, schema$metadata$elements %>% names(),
+            info = red(f), label = "metadata sections"
+          )
+        } else {
+          test_expect_list_names_allowed(
+            metadata, schema$metadata$elements %>% names(),
+            info = red(f), label = "metadata sections"
+          )
+        }
 
         ## Custom R code
         txt <- metadata[["dataset"]][["custom_R_code"]]
@@ -214,6 +222,23 @@ dataset_test_worker <-
           info = paste0(red(f), "\tdataset"), label = "metadata"
         )
 
+        ## Identifiers
+        if (!"identifiers" %in% names(metadata)) {
+          testthat::expect_silent(
+            identifiers <-
+              metadata$identifiers %>%
+              process_format_identifiers(dataset_id, data)
+          )
+        }
+
+        if (!"identifiers" %in% names(metadata)) {
+          test_expect_list_elements_exact_names(
+            metadata$identifiers,
+            schema$metadata$elements$identifiers$values %>% names(),
+            info = paste0(red(f), "\tidentifiers")
+          )
+        }
+
         ## Locations
 
         testthat::expect_silent(
@@ -679,10 +704,20 @@ dataset_test_worker <-
 
         }
 
+        # Load identifiers
+        if ("identifiers" %in% names(metadata) & !all(is.na(metadata[["identifiers"]]))) {
+          identifiers <-
+            metadata[["identifiers"]] %>% austraits::convert_list_to_df2()
+        } else {
+          identifiers <- list(
+            "var_in",
+            "identifier_type"
+          )
+        }
         ## Check that special characters do not make it into the data
         test_expect_no_error(
           parsed_data <- data %>%
-            process_parse_data(dataset_id, metadata, contexts, schema),
+            process_parse_data(dataset_id, metadata, contexts, schema, identifiers),
           info = sprintf("%s\t`process_parse_data`", red(dataset_id)))
 
         test_expect_allowed_text(

diff --git a/R/utils.R b/R/utils.R
@@ -169,6 +169,10 @@ read_metadata <- function(path) {
       paste(collapse = "\n")
   }
 
+  if (!"identifier" %in% names(data)) {
+    data["identifier"] <- NA
+  }
+
   data
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -169,6 +169,10 @@ read_metadata <- function(path) { @@
           paste(collapse = "\n")
       }
+      if (!"identifier" %in% names(data)) {
+        data["identifier"] <- NA
+      }
       data
     }
@@ Expand Down @@