final changes
- Remove the naming of individual databases from `bind_databases`. Once we use this version of the function in traits.build, we will have to create the `dataset_id` column in `taxonomic_updates` elsewhere (see the sketch below). That was the only use of the database names, and relying on them introduces an error when the function is used in generic situations.
- Add loops to `extract_data` for more compact code.
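
A minimal sketch of what that downstream step in traits.build might look like, mirroring the `dataset_id` line now commented out inside `bind_databases`; the object name `combined` and the exact placement are assumptions, not part of this commit:

```r
# Illustrative only: after bind_databases() has combined the taxonomic_updates
# tables, collapse the contributing dataset_ids into one space-separated string
# per taxon alignment, as the removed in-function code used to do.
combined$taxonomic_updates <- combined$taxonomic_updates %>%
  dplyr::group_by(original_name, aligned_name, taxon_name, taxonomic_resolution) %>%
  dplyr::mutate(dataset_id = paste(dataset_id, collapse = " ")) %>%
  dplyr::ungroup() %>%
  dplyr::distinct()
```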
ehwenk committed Nov 12, 2024
1 parent e697775 commit 881fddc
Showing 3 changed files with 55 additions and 111 deletions.
12 changes: 1 addition & 11 deletions R/bind_databases.R
@@ -14,10 +14,6 @@ bind_databases <- function(database_1, ...) {
# List of databases to combine
databases = list(database_1, ...)

# Capture names of databases input; required below to assign names to databases in list
database_names <- match.call() %>% as.character()
database_names <- database_names[-1]

combine <- function(name, databases) {
dplyr::bind_rows(lapply(databases, "[[", name)) %>% dplyr::distinct()
}
@@ -35,18 +31,12 @@ bind_databases <- function(database_1, ...) {
# Drop null datasets
databases[sapply(databases, is.null)] <- NULL

# Assign names to databases
names(databases) <- database_names
# This is the initial code used in traits.build, where the names of the individual databases
# were always dataset_id's
#names(databases) <- sapply(databases, "[[", database_names)

# Taxonomy

taxonomic_updates <-
combine("taxonomic_updates", databases) %>%
dplyr::group_by(.data$original_name, .data$aligned_name, .data$taxon_name, .data$taxonomic_resolution) %>%
dplyr::mutate(dataset_id = paste(.data$dataset_id, collapse = " ")) %>%
#dplyr::mutate(dataset_id = paste(.data$dataset_id, collapse = " ")) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::arrange(.data$original_name, .data$aligned_name, .data$taxon_name, .data$taxonomic_resolution)
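
As a quick illustration of the `combine()` helper used in this file (toy tibbles standing in for real traits.build databases; `db_a` and `db_b` are invented for the example):

```r
library(dplyr)

# Two tiny stand-in databases, each a named list of tibbles
db_a <- list(traits = tibble(dataset_id = "Dataset_A", trait_name = "leaf_area", value = "12"))
db_b <- list(traits = tibble(dataset_id = "Dataset_B", trait_name = "leaf_area", value = "15"))

# Same pattern as the helper above: pull one named element from every database,
# row-bind the results, and drop duplicates
combine <- function(name, databases) {
  bind_rows(lapply(databases, "[[", name)) %>% distinct()
}

combine("traits", list(db_a, db_b))
#> A tibble with one row per input database
```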
152 changes: 53 additions & 99 deletions R/extract_data.R
@@ -20,7 +20,7 @@
#' col = "trait_name", col_value = "leaf_area")
#' }
extract_data <- function(database, table, col, col_value) {

database$contexts <- database$contexts %>% tidyr::separate_longer_delim(link_vals, delim = ", ")

database$contexts_tmp <- split(database$contexts, database$contexts$link_id)
@@ -45,71 +48,48 @@ extract_data <- function(database, table, col, col_value) {
# Rename the generic `link_vals` to the specific context category they represent and
# move the tables from database_tmp to the main database list.

#rename_contexts_link_vals <- function(table, name) {
# table %>%
# dplyr::rename(!!rlang::quo_name(name) := link_vals)
#}


# this overwrites all of database

#database <- purrr::map(database$contexts_tmp,
# ~rename_contexts_link_vals(.x, .x$link_id |> unique()))


database$entity_context_id <- database$contexts_tmp$entity_context_id %>%
dplyr::rename(entity_context_id = link_vals)

database$method_context_id <- database$contexts_tmp$method_context_id %>%
dplyr::rename(method_context_id = link_vals)

database$temporal_context_id <- database$contexts_tmp$temporal_context_id %>%
dplyr::rename(temporal_context_id = link_vals)

database$plot_context_id <- database$contexts_tmp$plot_context_id %>%
dplyr::rename(plot_context_id = link_vals)

database$treatment_context_id <- database$contexts_tmp$treatment_context_id %>%
dplyr::rename(treatment_context_id = link_vals)
for (z in c("entity_context_id", "method_context_id", "temporal_context_id",
"plot_context_id", "treatment_context_id")) {
database[[z]] <- database$contexts_tmp[[z]] %>%
dplyr::rename(!!z := link_vals)
}

# Create an empty database list
ret <- list(
traits = dplyr::tibble(),
ret <- list(
locations = dplyr::tibble(),
contexts = dplyr::tibble(),
entity_context_id = dplyr::tibble(),
method_context_id = dplyr::tibble(),
temporal_context_id = dplyr::tibble(),
plot_context_id = dplyr::tibble(),
treatment_context_id = dplyr::tibble(),
methods = dplyr::tibble(),
excluded_data = dplyr::tibble(),
taxonomic_updates = dplyr::tibble(),
taxa = dplyr::tibble(),
contributors = dplyr::tibble()
taxonomic_updates = dplyr::tibble(),
contributors = dplyr::tibble(),
traits = dplyr::tibble(),
excluded_data = dplyr::tibble(),
contexts = dplyr::tibble()
)

ret_tmp <- list()
ret_tmp <- ret[1:10]

# Cookie cutters

# XX - could be a list, but then problems lower down

locations_cc <- c("dataset_id", "location_id")
entity_contexts_cc <- c("dataset_id", "entity_context_id")
temporal_contexts_cc <- c("dataset_id", "temporal_context_id")
method_contexts_cc <- c("dataset_id", "method_context_id")
plot_contexts_cc <- c("dataset_id", "plot_context_id")
treatment_contexts_cc <- c("dataset_id", "treatment_context_id")
methods_cc <- c("dataset_id", "trait_name", "method_id")
taxonomic_updates_cc <- c("dataset_id", "taxon_name", "original_name")
taxa_cc <- c("taxon_name")
contributors_cc <- c("dataset_id")

cookie_cutters <- list(
locations_cc = c("dataset_id", "location_id"),
entity_contexts_cc = c("dataset_id", "entity_context_id"),
method_contexts_cc = c("dataset_id", "method_context_id"),
temporal_contexts_cc = c("dataset_id", "temporal_context_id"),
plot_contexts_cc = c("dataset_id", "plot_context_id"),
treatment_contexts_cc = c("dataset_id", "treatment_context_id"),
methods_cc = c("dataset_id", "trait_name", "method_id"),
taxa_cc = c("taxon_name"),
taxonomic_updates_cc = c("dataset_id", "taxon_name", "original_name"),
contributors_cc = c("dataset_id")
)

# Create table of various look-up values to be used below

# Create vectors for table
# Create additional vectors for table
tables_to_cut <- c("locations", "entity_context_id", "method_context_id", "temporal_context_id",
"plot_context_id", "treatment_context_id",
"methods", "taxa", "taxonomic_updates", "contributors")
@@ -119,13 +96,9 @@ extract_data <- function(database, table, col, col_value) {
"database$plot_context_id", "database$treatment_context_id",
"database$methods", "database$taxa", "database$taxonomic_updates", "database$contributors")

cookie_cutters <- c("locations_cc", "entity_contexts_cc", "method_contexts_cc", "temporal_contexts_cc",
"plot_contexts_cc", "treatment_contexts_cc", "methods_cc", "taxa_cc", "taxonomic_updates_cc",
"contributors_cc")

# Create table
tables <- dplyr::tibble(
cookie_cutters = cookie_cutters, # XX - or = names(cookie_cutters) if cookie_cutters is a list
cookie_cutters = names(cookie_cutters),
tables_to_cut = tables_to_cut,
tables_complete_path = tables_complete_path
)
@@ -152,9 +125,6 @@ extract_data <- function(database, table, col, col_value) {
# chose columns to select, ensuring "value" isn't among the columns, since it has a different meaning for each table
columns_to_select <- intersect(setdiff(names(database$traits), "value"), names(database[[table[[i]]]]))

# Trim reference table i.e. locations with pattern matching
#found_indicies <- database[[table[[i]]]][[col]] |> stringr::str_which(pattern = stringr::regex(col_value, ignore_case = TRUE))

indicies_tmp <- purrr::map(col_value, ~{
stringr::str_which(database[[table[[i]]]][[col]],
pattern = stringr::regex(.x, ignore_case = TRUE))
@@ -163,82 +133,66 @@ extract_data <- function(database, table, col, col_value) {
found_indicies <- purrr::reduce(indicies_tmp, union)

# Trim traits, based on the columns identified as being common between the traits table and target table
cc_traits <- database[[table[[i]]]] |>
dplyr::slice(found_indicies) |>
dplyr::select(tidyselect::all_of(columns_to_select)) |>
cc_traits <- database[[table[[i]]]] %>%
dplyr::slice(found_indicies) %>%
dplyr::select(tidyselect::all_of(columns_to_select)) %>%
dplyr::distinct()

# Filtering join
## It will quite literally cookie cutting the traits table if the columns match what is in cc_traits
ret_tmp[["traits"]] <- database[["traits"]]|>
ret_tmp[["traits"]] <- database[["traits"]]%>%
dplyr::semi_join(cc_traits, by = columns_to_select)

# Use same filtering join to trim excluded data
ret_tmp[["excluded_data"]] <- database[["excluded_data"]] |>
ret_tmp[["excluded_data"]] <- database[["excluded_data"]] %>%
dplyr::semi_join(cc_traits, by = columns_to_select)


for (j in seq_along(tables_tmp$tables_to_cut)) {

cut_traits <- ret_tmp[["traits"]] |>
dplyr::select(get(tables_tmp$cookie_cutters[[j]])) |> # XX - this line no longer works with a list and tried other variants to query new vector location
cut_traits <- ret_tmp[["traits"]] %>%
dplyr::select(cookie_cutters[[j]]) %>%
dplyr::distinct()

cut_traits <- cut_traits |>
cut_traits <- cut_traits %>%
dplyr::filter(dplyr::if_all(tidyselect::everything(), ~ !is.na(.)))

cut_table <- eval(parse(text = tables_tmp$tables_complete_path[[j]])) |>
dplyr::semi_join(cut_traits, by = get(tables_tmp$cookie_cutters[[j]])) %>%
cut_table <- eval(parse(text = tables_tmp$tables_complete_path[[j]])) %>%
dplyr::semi_join(cut_traits, by = cookie_cutters[[j]]) %>%
dplyr::rename(link_vals = tidyselect::contains("context_id"))

assign(paste0("ret_tmp[[\"", tables_tmp$tables_to_cut[[j]], "\"]]"), cut_table )
ret_tmp[[j]] <- cut_table

}

# Bind together rows extracted from each table
ret[["traits"]] <- ret[["traits"]] %>% dplyr::bind_rows(ret_tmp[["traits"]]) %>% dplyr::distinct()
ret[["locations"]] <- ret[["locations"]] %>% dplyr::bind_rows(`ret_tmp[["locations"]]`) %>% dplyr::distinct()
ret[["entity_context_id"]] <- ret[["entity_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["entity_context_id"]]`) %>% dplyr::distinct()
ret[["method_context_id"]] <- ret[["method_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["method_context_id"]]`) %>% dplyr::distinct()
ret[["temporal_context_id"]] <- ret[["temporal_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["temporal_context_id"]]`) %>% dplyr::distinct()
ret[["plot_context_id"]] <- ret[["plot_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["plot_context_id"]]`) %>% dplyr::distinct()
ret[["treatment_context_id"]] <- ret[["treatment_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["treatment_context_id"]]`) %>% dplyr::distinct()
ret[["methods"]] <- ret[["methods"]] %>% dplyr::bind_rows(`ret_tmp[["methods"]]`) %>% dplyr::distinct()
ret[["excluded_data"]] <- ret[["excluded_data"]] %>% dplyr::bind_rows(ret_tmp[["excluded_data"]]) %>% dplyr::distinct()
ret[["taxonomic_updates"]] <- ret[["taxonomic_updates"]] %>% dplyr::bind_rows(`ret_tmp[["taxonomic_updates"]]`) %>% dplyr::distinct()
ret[["taxa"]] <- ret[["taxa"]] %>% dplyr::bind_rows(`ret_tmp[["taxa"]]`) %>% dplyr::distinct()
ret[["contributors"]] <- ret[["contributors"]] %>% dplyr::bind_rows(`ret_tmp[["contributors"]]`) %>% dplyr::distinct()

# getting error with loop
# for (v in c("traits", tables$tables_to_cut)) {

# ret[[v]] <- ret_tmp[[v]] %>%
# dplyr::bind_rows(`ret_tmp[[v]]`) %>%
# dplyr::distinct()

# }

for (v in seq_along(c(tables$tables_to_cut, "traits", "excluded_data"))) {

ret[[v]] <- ret[[v]] %>%
dplyr::bind_rows(ret_tmp[[v]]) %>%
dplyr::distinct()

}

}

# Rejoin contexts
ret[["contexts"]] <- ret[["entity_context_id"]] |>
ret[["contexts"]] <- ret[["entity_context_id"]] %>%
dplyr::bind_rows(ret[["method_context_id"]],
ret[["plot_context_id"]],
ret[["temporal_context_id"]],
ret[["treatment_context_id"]]) |>
dplyr::select(-dplyr::any_of(c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id"))) |>
dplyr::group_by(dataset_id, category, link_id, value, description) |>
dplyr::mutate(link_vals = paste0(link_vals, collapse = ", ")) |>
dplyr::ungroup() |>
ret[["treatment_context_id"]]) %>%
dplyr::select(-dplyr::any_of(c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id"))) %>%
dplyr::group_by(dataset_id, category, link_id, value, description) %>%
dplyr::mutate(link_vals = paste0(link_vals, collapse = ", ")) %>%
dplyr::ungroup() %>%
dplyr::distinct()

ret <- ret[!names(ret) %in% c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id")]

# Trim sources - Are these just dataset_ids...
from_methods_to_sources_cc <- dplyr::union(ret$methods$source_primary_key, # Is this part really needed, aren't these just dataset_ids?
ret$methods$source_secondary_key |> strsplit("; ") |> unlist()) |>
unique() |> stats::na.omit() |> as.character()
ret$methods$source_secondary_key %>% strsplit("; ") %>% unlist()) %>%
unique() %>% stats::na.omit() %>% as.character()

ret[["sources"]] <- database[["sources"]][from_methods_to_sources_cc]
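
For readers new to the filtering-join ("cookie cutter") pattern used throughout this function, a toy illustration with invented tibbles (not traits.build objects): `dplyr::semi_join()` keeps only the rows of the larger table whose key columns match a row in the trimmed traits table.

```r
library(dplyr)

# The trimmed traits table, reduced to its key ("cookie cutter") columns
cc_traits <- tibble(dataset_id = "Smith_2010", location_id = "01")

# A larger table to be cut down to matching rows
locations <- tibble(
  dataset_id        = c("Smith_2010", "Smith_2010", "Jones_2015"),
  location_id       = c("01", "02", "01"),
  location_property = "latitude (deg)",
  value             = c("-33.6", "-34.1", "-27.5")
)

# Only the Smith_2010 / location 01 row survives the filtering join
locations %>% semi_join(cc_traits, by = c("dataset_id", "location_id"))
```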

2 changes: 1 addition & 1 deletion man/bind_databases.Rd
