extract add parameter for exact matches (#156)

* Add option to extract_ functions to only allow exact matches - versus the default which uses partial string matches. There are instances where partial matches return too much data: For instance - searching on trait = "leaf_area" returns data for "leaf_area" and "leaf_area_ratio" - searching for taxonomic_status = "accepted" also returns data for "genus accepted" and therefore more taxon names than are sought. The default is still for partial matches, but there is now a parameter "partial_matches_allowed" which can be set to FALSE. closes issue #150 * Bug fix in `extract_data` that turned up through AusFizz. It was possible for the grouping variables designated in the #Rejoining Contexts section of this function to be identical across multiple context properties (dataset_id, category, link_id, value, description). There were 4 datasets in AusFizz where this was the case. This led to duplicate listings of link_vals and then as a follow-on join_contexts didn't work because there were duplicate link_vals. closes issue #154
traitecoevo · Jan 21, 2025 · e78c8fa · e78c8fa
1 parent cc5dc3e
commit e78c8fa
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 18 deletions.
diff --git a/R/extract_data.R b/R/extract_data.R
@@ -19,7 +19,7 @@
 #' extract_data(database = traits.build_database, table = "traits", 
 #' col = "trait_name", col_value = "leaf_area")
 #' }
-extract_data <- function(database, table = NA, col, col_value) {
+extract_data <- function(database, table = NA, col, col_value, partial_matches_allowed = TRUE) {
 
   # Check missingness
   check_arg_missingness(database, col, col_value)
@@ -40,10 +40,16 @@ extract_data <- function(database, table = NA, col, col_value) {
 
     check_col_exists_in_table(database, table, col)
 
-    indicies_tmp <- purrr::map(col_value, ~{
-      stringr::str_which(database[[col]], 
-                         pattern = stringr::regex(.x, ignore_case = TRUE))
-    })
+    if (partial_matches_allowed == TRUE) {
+      indicies_tmp <- purrr::map(col_value, ~{
+        stringr::str_which(database[[col]], 
+                          pattern = stringr::regex(.x, ignore_case = TRUE))
+      })
+    } else {
+      indicies_tmp <- purrr::map(col_value, ~{
+        which(database[[col]] == .x)
+      })
+    }
 
     found_indicies <- purrr::reduce(indicies_tmp, union)
 
@@ -164,10 +170,16 @@ extract_data <- function(database, table = NA, col, col_value) {
         # chose columns to select, ensuring "value" isn't among the columns, since it has a different meaning for each table
         columns_to_select <- intersect(setdiff(names(database$traits), "value"), names(database[[table[[i]]]]))
 
-        indicies_tmp <- purrr::map(col_value, ~{
-          stringr::str_which(database[[table[[i]]]][[col]], 
-                             pattern = stringr::regex(.x, ignore_case = TRUE))
-        })
+        if (partial_matches_allowed == TRUE) {
+          indicies_tmp <- purrr::map(col_value, ~{
+            stringr::str_which(database[[table[[i]]]][[col]], 
+                               pattern = stringr::regex(.x, ignore_case = TRUE))
+          })
+        } else {
+          indicies_tmp <- purrr::map(col_value, ~{
+            which(database[[table[[i]]]][[col]] == .x)
+          })
+        }
 
         found_indicies <- purrr::reduce(indicies_tmp, union)
 
@@ -224,6 +236,7 @@ extract_data <- function(database, table = NA, col, col_value) {
                 ret[["treatment_context_id"]]) %>%
       dplyr::select(-dplyr::any_of(c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id"))) %>%
       dplyr::group_by(dataset_id, category, link_id, value, description) %>%
+        dplyr::distinct(link_vals, .keep_all = TRUE) %>%
         dplyr::mutate(link_vals = paste0(link_vals, collapse = ", ")) %>%
       dplyr::ungroup() %>%
       dplyr::distinct()

diff --git a/R/extract_dataset.R b/R/extract_dataset.R
@@ -24,7 +24,7 @@
 #' @export
 
 
-extract_dataset <- function(database, dataset_id) {
+extract_dataset <- function(database, dataset_id, partial_matches_allowed = TRUE) {
 
   # Check compatability
   status <- check_compatibility(database, single_table_allowed = TRUE)
@@ -34,7 +34,7 @@ extract_dataset <- function(database, dataset_id) {
     function_not_supported(database)
   }
 
-  extract_data(database, "traits", "dataset_id", col_value = dataset_id)
+  extract_data(database, "traits", "dataset_id", col_value = dataset_id, partial_matches_allowed = partial_matches_allowed)
 }
 
 
diff --git a/R/extract_taxa.R b/R/extract_taxa.R
@@ -25,7 +25,7 @@
 #' @author Fonti Kar - [email protected]
 #' @export
 
-extract_taxa <- function(database, family = NULL, genus = NULL, taxon_name = NULL){
+extract_taxa <- function(database, family = NULL, genus = NULL, taxon_name = NULL, partial_matches_allowed = TRUE){
   # Check compatability
   status <- check_compatibility(database, single_table_allowed = TRUE)
 
@@ -40,15 +40,15 @@ extract_taxa <- function(database, family = NULL, genus = NULL, taxon_name = NUL
   }
 
   if( ! is.null(family) ){
-    return(extract_data(database, "taxa", "family", col_value = family))
+    return(extract_data(database, "taxa", "family", col_value = family, partial_matches_allowed = partial_matches_allowed))
   }
 
   if( ! is.null(genus) ){
-    return(extract_data(database, "taxa", "genus", col_value = genus))
+    return(extract_data(database, "taxa", "genus", col_value = genus, partial_matches_allowed = partial_matches_allowed))
   }
 
   if( ! is.null(taxon_name))
-    return(extract_data(database, "traits", "taxon_name", col_value = taxon_name))
+    return(extract_data(database, "traits", "taxon_name", col_value = taxon_name, partial_matches_allowed = partial_matches_allowed))
 }
 
 

diff --git a/R/extract_trait.R b/R/extract_trait.R
@@ -26,7 +26,7 @@
 
 
 
-extract_trait <- function(database, trait_names, taxon_names=NULL) {
+extract_trait <- function(database, trait_names, taxon_names=NULL, partial_matches_allowed = TRUE) {
   # Check compatability
   status <- check_compatibility(database, single_table_allowed = TRUE)
 
@@ -35,10 +35,10 @@ extract_trait <- function(database, trait_names, taxon_names=NULL) {
     function_not_supported(database)
   } 
 
-  ret <- extract_data(database, "traits", "trait_name", col_value = trait_names)
+  ret <- extract_data(database, "traits", "trait_name", col_value = trait_names, partial_matches_allowed = partial_matches_allowed)
 
   if(!is.null(taxon_names))
-    ret <- extract_data(ret, "traits", "taxon_name", col_value = taxon_names)
+    ret <- extract_data(ret, "traits", "taxon_name", col_value = taxon_names, partial_matches_allowed = partial_matches_allowed)
 
   return(ret)
 }
diff --git a/tests/testthat/test-extract_.R b/tests/testthat/test-extract_.R
@@ -242,3 +242,27 @@ test_that("Extract function works when just traits table is read in", {
   expect_silent(extract_data(database = join_then_extract, col = "longitude (deg)", col_value = "145"))
 })
 
+test_that("Extract function works with partial match parameter", {
+  expect_silent(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster"))
+  expect_silent(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster", partial_matches_allowed = T))
+  expect_error(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster", partial_matches_allowed = F))
+  expect_silent(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster_2003", partial_matches_allowed = F))
+  expect_silent(extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster"))
+  expect_silent(Falster_partial <- extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster", partial_matches_allowed = T))
+  expect_error(extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster", partial_matches_allowed = F))
+  expect_silent(Falster_exact <- extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster_2003", partial_matches_allowed = F))
+  expect_equal(nrow(Falster_partial %>% dplyr::distinct(dataset_id)), 3)
+  expect_equal(nrow(Falster_exact %>% dplyr::distinct(dataset_id)), 1)
+  expect_silent(extract_dataset(database = austraits_5.0.0_lite, "Falster", partial_matches_allowed = T))
+  expect_error(extract_dataset(database = austraits_5.0.0_lite, "Falster", partial_matches_allowed = F))
+  expect_silent(extract_dataset(database = austraits_5.0.0_lite, "Falster_2003", partial_matches_allowed = F))
+  expect_silent(extract_trait(database = austraits_5.0.0_lite, "photosynth", partial_matches_allowed = T))
+  expect_error(extract_trait(database = austraits_5.0.0_lite, "photosynth", partial_matches_allowed = F))
+  expect_silent(extract_trait(database = austraits_5.0.0_lite, "photosynthetic_pathway", partial_matches_allowed = F))
+  expect_silent(Acacia_dealbata_partial <- extract_taxa(database = austraits_5.0.0_lite, taxon_name = "Acacia dealbata", partial_matches_allowed = T))
+  expect_silent(Acacia_dealbata_exact <- extract_taxa(database = austraits_5.0.0_lite, taxon_name = "Acacia dealbata", partial_matches_allowed = F))
+  expect_gt(nrow(Acacia_dealbata_partial$traits),nrow(Acacia_dealbata_exact$traits))
+  expect_equal(Acacia_dealbata_partial$traits %>% dplyr::distinct(taxon_name) %>% nrow(), 3)
+  expect_equal(Acacia_dealbata_exact$traits %>% dplyr::distinct(taxon_name) %>% nrow(), 1)
+})
+