Skip to content

Commit

Permalink
extract add parameter for exact matches (#156)
Browse files Browse the repository at this point in the history
* Add option to extract_ functions to only allow exact matches - versus the default which uses partial string matches.

There are instances where partial matches return too much data:

For instance

- searching on trait = "leaf_area" returns data for "leaf_area" and "leaf_area_ratio"
- searching for taxonomic_status = "accepted" also returns data for "genus accepted" and therefore more taxon names than are sought.

The default is still for partial matches, but there is now a parameter "partial_matches_allowed" which can be set to FALSE.

closes issue #150 

* Bug fix in `extract_data` that turned up through AusFizz. 

It was possible for the grouping variables designated in the #Rejoining Contexts section of this function to be identical across multiple context properties (dataset_id, category, link_id, value, description). There were 4 datasets in AusFizz where this was the case. This led to duplicate listings of link_vals and then as a follow-on join_contexts didn't work because there were duplicate link_vals.

closes issue #154
  • Loading branch information
ehwenk authored Jan 21, 2025
1 parent cc5dc3e commit e78c8fa
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 18 deletions.
31 changes: 22 additions & 9 deletions R/extract_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#' extract_data(database = traits.build_database, table = "traits",
#' col = "trait_name", col_value = "leaf_area")
#' }
extract_data <- function(database, table = NA, col, col_value) {
extract_data <- function(database, table = NA, col, col_value, partial_matches_allowed = TRUE) {

# Check missingness
check_arg_missingness(database, col, col_value)
Expand All @@ -40,10 +40,16 @@ extract_data <- function(database, table = NA, col, col_value) {

check_col_exists_in_table(database, table, col)

indicies_tmp <- purrr::map(col_value, ~{
stringr::str_which(database[[col]],
pattern = stringr::regex(.x, ignore_case = TRUE))
})
if (partial_matches_allowed == TRUE) {
indicies_tmp <- purrr::map(col_value, ~{
stringr::str_which(database[[col]],
pattern = stringr::regex(.x, ignore_case = TRUE))
})
} else {
indicies_tmp <- purrr::map(col_value, ~{
which(database[[col]] == .x)
})
}

found_indicies <- purrr::reduce(indicies_tmp, union)

Expand Down Expand Up @@ -164,10 +170,16 @@ extract_data <- function(database, table = NA, col, col_value) {
# chose columns to select, ensuring "value" isn't among the columns, since it has a different meaning for each table
columns_to_select <- intersect(setdiff(names(database$traits), "value"), names(database[[table[[i]]]]))

indicies_tmp <- purrr::map(col_value, ~{
stringr::str_which(database[[table[[i]]]][[col]],
pattern = stringr::regex(.x, ignore_case = TRUE))
})
if (partial_matches_allowed == TRUE) {
indicies_tmp <- purrr::map(col_value, ~{
stringr::str_which(database[[table[[i]]]][[col]],
pattern = stringr::regex(.x, ignore_case = TRUE))
})
} else {
indicies_tmp <- purrr::map(col_value, ~{
which(database[[table[[i]]]][[col]] == .x)
})
}

found_indicies <- purrr::reduce(indicies_tmp, union)

Expand Down Expand Up @@ -224,6 +236,7 @@ extract_data <- function(database, table = NA, col, col_value) {
ret[["treatment_context_id"]]) %>%
dplyr::select(-dplyr::any_of(c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id"))) %>%
dplyr::group_by(dataset_id, category, link_id, value, description) %>%
dplyr::distinct(link_vals, .keep_all = TRUE) %>%
dplyr::mutate(link_vals = paste0(link_vals, collapse = ", ")) %>%
dplyr::ungroup() %>%
dplyr::distinct()
Expand Down
4 changes: 2 additions & 2 deletions R/extract_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#' @export


extract_dataset <- function(database, dataset_id) {
extract_dataset <- function(database, dataset_id, partial_matches_allowed = TRUE) {

# Check compatability
status <- check_compatibility(database, single_table_allowed = TRUE)
Expand All @@ -34,7 +34,7 @@ extract_dataset <- function(database, dataset_id) {
function_not_supported(database)
}

extract_data(database, "traits", "dataset_id", col_value = dataset_id)
extract_data(database, "traits", "dataset_id", col_value = dataset_id, partial_matches_allowed = partial_matches_allowed)
}


8 changes: 4 additions & 4 deletions R/extract_taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#' @author Fonti Kar - [email protected]
#' @export

extract_taxa <- function(database, family = NULL, genus = NULL, taxon_name = NULL){
extract_taxa <- function(database, family = NULL, genus = NULL, taxon_name = NULL, partial_matches_allowed = TRUE){
# Check compatability
status <- check_compatibility(database, single_table_allowed = TRUE)

Expand All @@ -40,15 +40,15 @@ extract_taxa <- function(database, family = NULL, genus = NULL, taxon_name = NUL
}

if( ! is.null(family) ){
return(extract_data(database, "taxa", "family", col_value = family))
return(extract_data(database, "taxa", "family", col_value = family, partial_matches_allowed = partial_matches_allowed))
}

if( ! is.null(genus) ){
return(extract_data(database, "taxa", "genus", col_value = genus))
return(extract_data(database, "taxa", "genus", col_value = genus, partial_matches_allowed = partial_matches_allowed))
}

if( ! is.null(taxon_name))
return(extract_data(database, "traits", "taxon_name", col_value = taxon_name))
return(extract_data(database, "traits", "taxon_name", col_value = taxon_name, partial_matches_allowed = partial_matches_allowed))
}


Expand Down
6 changes: 3 additions & 3 deletions R/extract_trait.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@



extract_trait <- function(database, trait_names, taxon_names=NULL) {
extract_trait <- function(database, trait_names, taxon_names=NULL, partial_matches_allowed = TRUE) {
# Check compatability
status <- check_compatibility(database, single_table_allowed = TRUE)

Expand All @@ -35,10 +35,10 @@ extract_trait <- function(database, trait_names, taxon_names=NULL) {
function_not_supported(database)
}

ret <- extract_data(database, "traits", "trait_name", col_value = trait_names)
ret <- extract_data(database, "traits", "trait_name", col_value = trait_names, partial_matches_allowed = partial_matches_allowed)

if(!is.null(taxon_names))
ret <- extract_data(ret, "traits", "taxon_name", col_value = taxon_names)
ret <- extract_data(ret, "traits", "taxon_name", col_value = taxon_names, partial_matches_allowed = partial_matches_allowed)

return(ret)
}
24 changes: 24 additions & 0 deletions tests/testthat/test-extract_.R
Original file line number Diff line number Diff line change
Expand Up @@ -242,3 +242,27 @@ test_that("Extract function works when just traits table is read in", {
expect_silent(extract_data(database = join_then_extract, col = "longitude (deg)", col_value = "145"))
})

test_that("Extract function works with partial match parameter", {
expect_silent(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster"))
expect_silent(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster", partial_matches_allowed = T))
expect_error(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster", partial_matches_allowed = F))
expect_silent(extract_data(database = austraits_5.0.0_lite, table = "traits", col = "dataset_id", col_value = "Falster_2003", partial_matches_allowed = F))
expect_silent(extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster"))
expect_silent(Falster_partial <- extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster", partial_matches_allowed = T))
expect_error(extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster", partial_matches_allowed = F))
expect_silent(Falster_exact <- extract_data(database = austraits_5.0.0_lite$traits, col = "dataset_id", col_value = "Falster_2003", partial_matches_allowed = F))
expect_equal(nrow(Falster_partial %>% dplyr::distinct(dataset_id)), 3)
expect_equal(nrow(Falster_exact %>% dplyr::distinct(dataset_id)), 1)
expect_silent(extract_dataset(database = austraits_5.0.0_lite, "Falster", partial_matches_allowed = T))
expect_error(extract_dataset(database = austraits_5.0.0_lite, "Falster", partial_matches_allowed = F))
expect_silent(extract_dataset(database = austraits_5.0.0_lite, "Falster_2003", partial_matches_allowed = F))
expect_silent(extract_trait(database = austraits_5.0.0_lite, "photosynth", partial_matches_allowed = T))
expect_error(extract_trait(database = austraits_5.0.0_lite, "photosynth", partial_matches_allowed = F))
expect_silent(extract_trait(database = austraits_5.0.0_lite, "photosynthetic_pathway", partial_matches_allowed = F))
expect_silent(Acacia_dealbata_partial <- extract_taxa(database = austraits_5.0.0_lite, taxon_name = "Acacia dealbata", partial_matches_allowed = T))
expect_silent(Acacia_dealbata_exact <- extract_taxa(database = austraits_5.0.0_lite, taxon_name = "Acacia dealbata", partial_matches_allowed = F))
expect_gt(nrow(Acacia_dealbata_partial$traits),nrow(Acacia_dealbata_exact$traits))
expect_equal(Acacia_dealbata_partial$traits %>% dplyr::distinct(taxon_name) %>% nrow(), 3)
expect_equal(Acacia_dealbata_exact$traits %>% dplyr::distinct(taxon_name) %>% nrow(), 1)
})

0 comments on commit e78c8fa

Please sign in to comment.