final changes
- Remove the naming of individual databases from `bind_databases`. Once we use this version of the function in traits.build, we will have to create the `dataset_id` column in `taxonomic_updates` elsewhere (see the sketch below). That was the only use of the database names, and relying on them introduces an error when the function is used in generic situations.
- Add loops to `extract_data` for more compact code.
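
A minimal sketch of what that downstream step in traits.build might look like, mirroring the `dataset_id` line now commented out inside `bind_databases`; the object name `combined` and the exact placement are assumptions, not part of this commit:

```r
# Illustrative only: after bind_databases() has combined the taxonomic_updates
# tables, collapse the contributing dataset_ids into one space-separated string
# per taxon alignment, as the removed in-function code used to do.
combined$taxonomic_updates <- combined$taxonomic_updates %>%
  dplyr::group_by(original_name, aligned_name, taxon_name, taxonomic_resolution) %>%
  dplyr::mutate(dataset_id = paste(dataset_id, collapse = " ")) %>%
  dplyr::ungroup() %>%
  dplyr::distinct()
```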
ehwenk committed Nov 12, 2024
1 parent e697775 commit 881fddc
Showing 3 changed files with 55 additions and 111 deletions.
12 changes: 1 addition & 11 deletions R/bind_databases.R
@@ -14,10 +14,6 @@ bind_databases <- function(database_1, ...) {
# List of databases to combine
databases = list(database_1, ...)

# Capture names of databases input; required below to assign names to databases in list
database_names <- match.call() %>% as.character()
database_names <- database_names[-1]

combine <- function(name, databases) {
dplyr::bind_rows(lapply(databases, "[[", name)) %>% dplyr::distinct()
}
@@ -35,18 +31,12 @@ bind_databases <- function(database_1, ...) {
# Drop null datasets
databases[sapply(databases, is.null)] <- NULL

# Assign names to databases
names(databases) <- database_names
# This is the initial code used in traits.build, where the names of the individual databases
# were always dataset_id's
#names(databases) <- sapply(databases, "[[", database_names)

# Taxonomy

taxonomic_updates <-
combine("taxonomic_updates", databases) %>%
dplyr::group_by(.data$original_name, .data$aligned_name, .data$taxon_name, .data$taxonomic_resolution) %>%
dplyr::mutate(dataset_id = paste(.data$dataset_id, collapse = " ")) %>%
#dplyr::mutate(dataset_id = paste(.data$dataset_id, collapse = " ")) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::arrange(.data$original_name, .data$aligned_name, .data$taxon_name, .data$taxonomic_resolution)
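
As a quick illustration of the `combine()` helper used in this file (toy tibbles standing in for real traits.build databases; `db_a` and `db_b` are invented for the example):

```r
library(dplyr)

# Two tiny stand-in databases, each a named list of tibbles
db_a <- list(traits = tibble(dataset_id = "Dataset_A", trait_name = "leaf_area", value = "12"))
db_b <- list(traits = tibble(dataset_id = "Dataset_B", trait_name = "leaf_area", value = "15"))

# Same pattern as the helper above: pull one named element from every database,
# row-bind the results, and drop duplicates
combine <- function(name, databases) {
  bind_rows(lapply(databases, "[[", name)) %>% distinct()
}

combine("traits", list(db_a, db_b))
#> A tibble with one row per input database
```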
152 changes: 53 additions & 99 deletions R/extract_data.R
@@ -20,7 +20,7 @@
#' col = "trait_name", col_value = "leaf_area")
#' }
extract_data <- function(database, table, col, col_value) {

database$contexts <- database$contexts %>% tidyr::separate_longer_delim(link_vals, delim = ", ")

database$contexts_tmp <- split(database$contexts, database$contexts$link_id)
@@ -45,71 +48,48 @@ extract_data <- function(database, table, col, col_value) {
# Rename the generic `link_vals` to the specific context category they represent and
# move the tables from database_tmp to the main database list.

#rename_contexts_link_vals <- function(table, name) {
# table %>%
# dplyr::rename(!!rlang::quo_name(name) := link_vals)
#}


# this overwrites all of database

#database <- purrr::map(database$contexts_tmp,
# ~rename_contexts_link_vals(.x, .x$link_id |> unique()))


database$entity_context_id <- database$contexts_tmp$entity_context_id %>%
dplyr::rename(entity_context_id = link_vals)

database$method_context_id <- database$contexts_tmp$method_context_id %>%
dplyr::rename(method_context_id = link_vals)

database$temporal_context_id <- database$contexts_tmp$temporal_context_id %>%
dplyr::rename(temporal_context_id = link_vals)

database$plot_context_id <- database$contexts_tmp$plot_context_id %>%
dplyr::rename(plot_context_id = link_vals)

database$treatment_context_id <- database$contexts_tmp$treatment_context_id %>%
dplyr::rename(treatment_context_id = link_vals)
for (z in c("entity_context_id", "method_context_id", "temporal_context_id",
"plot_context_id", "treatment_context_id")) {
database[[z]] <- database$contexts_tmp[[z]] %>%
dplyr::rename(!!z := link_vals)
}

# Create an empty database list
ret <- list(
traits = dplyr::tibble(),
ret <- list(
locations = dplyr::tibble(),
contexts = dplyr::tibble(),
entity_context_id = dplyr::tibble(),
method_context_id = dplyr::tibble(),
temporal_context_id = dplyr::tibble(),
plot_context_id = dplyr::tibble(),
treatment_context_id = dplyr::tibble(),
methods = dplyr::tibble(),
excluded_data = dplyr::tibble(),
taxonomic_updates = dplyr::tibble(),
taxa = dplyr::tibble(),
contributors = dplyr::tibble()
taxonomic_updates = dplyr::tibble(),
contributors = dplyr::tibble(),
traits = dplyr::tibble(),
excluded_data = dplyr::tibble(),
contexts = dplyr::tibble()
)

ret_tmp <- list()
ret_tmp <- ret[1:10]

# Cookie cutters

# XX - could be a list, but then problems lower down

locations_cc <- c("dataset_id", "location_id")
entity_contexts_cc <- c("dataset_id", "entity_context_id")
temporal_contexts_cc <- c("dataset_id", "temporal_context_id")
method_contexts_cc <- c("dataset_id", "method_context_id")
plot_contexts_cc <- c("dataset_id", "plot_context_id")
treatment_contexts_cc <- c("dataset_id", "treatment_context_id")
methods_cc <- c("dataset_id", "trait_name", "method_id")
taxonomic_updates_cc <- c("dataset_id", "taxon_name", "original_name")
taxa_cc <- c("taxon_name")
contributors_cc <- c("dataset_id")

cookie_cutters <- list(
locations_cc = c("dataset_id", "location_id"),
entity_contexts_cc = c("dataset_id", "entity_context_id"),
method_contexts_cc = c("dataset_id", "method_context_id"),
temporal_contexts_cc = c("dataset_id", "temporal_context_id"),
plot_contexts_cc = c("dataset_id", "plot_context_id"),
treatment_contexts_cc = c("dataset_id", "treatment_context_id"),
methods_cc = c("dataset_id", "trait_name", "method_id"),
taxa_cc = c("taxon_name"),
taxonomic_updates_cc = c("dataset_id", "taxon_name", "original_name"),
contributors_cc = c("dataset_id")
)

# Create table of various look-up values to be used below

# Create vectors for table
# Create additional vectors for table
tables_to_cut <- c("locations", "entity_context_id", "method_context_id", "temporal_context_id",
"plot_context_id", "treatment_context_id",
"methods", "taxa", "taxonomic_updates", "contributors")
@@ -119,13 +96,9 @@ extract_data <- function(database, table, col, col_value) {
"database$plot_context_id", "database$treatment_context_id",
"database$methods", "database$taxa", "database$taxonomic_updates", "database$contributors")

cookie_cutters <- c("locations_cc", "entity_contexts_cc", "method_contexts_cc", "temporal_contexts_cc",
"plot_contexts_cc", "treatment_contexts_cc", "methods_cc", "taxa_cc", "taxonomic_updates_cc",
"contributors_cc")

# Create table
tables <- dplyr::tibble(
cookie_cutters = cookie_cutters, # XX - or = names(cookie_cutters) if cookie_cutters is a list
cookie_cutters = names(cookie_cutters),
tables_to_cut = tables_to_cut,
tables_complete_path = tables_complete_path
)
@@ -152,9 +125,6 @@ extract_data <- function(database, table, col, col_value) {
# chose columns to select, ensuring "value" isn't among the columns, since it has a different meaning for each table
columns_to_select <- intersect(setdiff(names(database$traits), "value"), names(database[[table[[i]]]]))

# Trim reference table i.e. locations with pattern matching
#found_indicies <- database[[table[[i]]]][[col]] |> stringr::str_which(pattern = stringr::regex(col_value, ignore_case = TRUE))

indicies_tmp <- purrr::map(col_value, ~{
stringr::str_which(database[[table[[i]]]][[col]],
pattern = stringr::regex(.x, ignore_case = TRUE))
@@ -163,82 +133,66 @@ extract_data <- function(database, table, col, col_value) {
found_indicies <- purrr::reduce(indicies_tmp, union)

# Trim traits, based on the columns identified as being common between the traits table and target table
cc_traits <- database[[table[[i]]]] |>
dplyr::slice(found_indicies) |>
dplyr::select(tidyselect::all_of(columns_to_select)) |>
cc_traits <- database[[table[[i]]]] %>%
dplyr::slice(found_indicies) %>%
dplyr::select(tidyselect::all_of(columns_to_select)) %>%
dplyr::distinct()

# Filtering join
## It will quite literally cookie cutting the traits table if the columns match what is in cc_traits
ret_tmp[["traits"]] <- database[["traits"]]|>
ret_tmp[["traits"]] <- database[["traits"]]%>%
dplyr::semi_join(cc_traits, by = columns_to_select)

# Use same filtering join to trim excluded data
ret_tmp[["excluded_data"]] <- database[["excluded_data"]] |>
ret_tmp[["excluded_data"]] <- database[["excluded_data"]] %>%
dplyr::semi_join(cc_traits, by = columns_to_select)


for (j in seq_along(tables_tmp$tables_to_cut)) {

cut_traits <- ret_tmp[["traits"]] |>
dplyr::select(get(tables_tmp$cookie_cutters[[j]])) |> # XX - this line no longer works with a list and tried other variants to query new vector location
cut_traits <- ret_tmp[["traits"]] %>%
dplyr::select(cookie_cutters[[j]]) %>%
dplyr::distinct()

cut_traits <- cut_traits |>
cut_traits <- cut_traits %>%
dplyr::filter(dplyr::if_all(tidyselect::everything(), ~ !is.na(.)))

cut_table <- eval(parse(text = tables_tmp$tables_complete_path[[j]])) |>
dplyr::semi_join(cut_traits, by = get(tables_tmp$cookie_cutters[[j]])) %>%
cut_table <- eval(parse(text = tables_tmp$tables_complete_path[[j]])) %>%
dplyr::semi_join(cut_traits, by = cookie_cutters[[j]]) %>%
dplyr::rename(link_vals = tidyselect::contains("context_id"))

assign(paste0("ret_tmp[[\"", tables_tmp$tables_to_cut[[j]], "\"]]"), cut_table )
ret_tmp[[j]] <- cut_table

}

# Bind together rows extracted from each table
ret[["traits"]] <- ret[["traits"]] %>% dplyr::bind_rows(ret_tmp[["traits"]]) %>% dplyr::distinct()
ret[["locations"]] <- ret[["locations"]] %>% dplyr::bind_rows(`ret_tmp[["locations"]]`) %>% dplyr::distinct()
ret[["entity_context_id"]] <- ret[["entity_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["entity_context_id"]]`) %>% dplyr::distinct()
ret[["method_context_id"]] <- ret[["method_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["method_context_id"]]`) %>% dplyr::distinct()
ret[["temporal_context_id"]] <- ret[["temporal_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["temporal_context_id"]]`) %>% dplyr::distinct()
ret[["plot_context_id"]] <- ret[["plot_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["plot_context_id"]]`) %>% dplyr::distinct()
ret[["treatment_context_id"]] <- ret[["treatment_context_id"]] %>% dplyr::bind_rows(`ret_tmp[["treatment_context_id"]]`) %>% dplyr::distinct()
ret[["methods"]] <- ret[["methods"]] %>% dplyr::bind_rows(`ret_tmp[["methods"]]`) %>% dplyr::distinct()
ret[["excluded_data"]] <- ret[["excluded_data"]] %>% dplyr::bind_rows(ret_tmp[["excluded_data"]]) %>% dplyr::distinct()
ret[["taxonomic_updates"]] <- ret[["taxonomic_updates"]] %>% dplyr::bind_rows(`ret_tmp[["taxonomic_updates"]]`) %>% dplyr::distinct()
ret[["taxa"]] <- ret[["taxa"]] %>% dplyr::bind_rows(`ret_tmp[["taxa"]]`) %>% dplyr::distinct()
ret[["contributors"]] <- ret[["contributors"]] %>% dplyr::bind_rows(`ret_tmp[["contributors"]]`) %>% dplyr::distinct()

# getting error with loop
# for (v in c("traits", tables$tables_to_cut)) {

# ret[[v]] <- ret_tmp[[v]] %>%
# dplyr::bind_rows(`ret_tmp[[v]]`) %>%
# dplyr::distinct()

# }

for (v in seq_along(c(tables$tables_to_cut, "traits", "excluded_data"))) {

ret[[v]] <- ret[[v]] %>%
dplyr::bind_rows(ret_tmp[[v]]) %>%
dplyr::distinct()

}

}

# Rejoin contexts
ret[["contexts"]] <- ret[["entity_context_id"]] |>
ret[["contexts"]] <- ret[["entity_context_id"]] %>%
dplyr::bind_rows(ret[["method_context_id"]],
ret[["plot_context_id"]],
ret[["temporal_context_id"]],
ret[["treatment_context_id"]]) |>
dplyr::select(-dplyr::any_of(c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id"))) |>
dplyr::group_by(dataset_id, category, link_id, value, description) |>
dplyr::mutate(link_vals = paste0(link_vals, collapse = ", ")) |>
dplyr::ungroup() |>
ret[["treatment_context_id"]]) %>%
dplyr::select(-dplyr::any_of(c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id"))) %>%
dplyr::group_by(dataset_id, category, link_id, value, description) %>%
dplyr::mutate(link_vals = paste0(link_vals, collapse = ", ")) %>%
dplyr::ungroup() %>%
dplyr::distinct()

ret <- ret[!names(ret) %in% c("entity_context_id", "method_context_id", "plot_context_id", "temporal_context_id", "treatment_context_id")]

# Trim sources - Are these just dataset_ids...
from_methods_to_sources_cc <- dplyr::union(ret$methods$source_primary_key, # Is this part really needed, aren't these just dataset_ids?
ret$methods$source_secondary_key |> strsplit("; ") |> unlist()) |>
unique() |> stats::na.omit() |> as.character()
ret$methods$source_secondary_key %>% strsplit("; ") %>% unlist()) %>%
unique() %>% stats::na.omit() %>% as.character()

ret[["sources"]] <- database[["sources"]][from_methods_to_sources_cc]
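
For readers new to the filtering-join ("cookie cutter") pattern used throughout this function, a toy illustration with invented tibbles (not traits.build objects): `dplyr::semi_join()` keeps only the rows of the larger table whose key columns match a row in the trimmed traits table.

```r
library(dplyr)

# The trimmed traits table, reduced to its key ("cookie cutter") columns
cc_traits <- tibble(dataset_id = "Smith_2010", location_id = "01")

# A larger table to be cut down to matching rows
locations <- tibble(
  dataset_id        = c("Smith_2010", "Smith_2010", "Jones_2015"),
  location_id       = c("01", "02", "01"),
  location_property = "latitude (deg)",
  value             = c("-33.6", "-34.1", "-27.5")
)

# Only the Smith_2010 / location 01 row survives the filtering join
locations %>% semi_join(cc_traits, by = c("dataset_id", "location_id"))
```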

2 changes: 1 addition & 1 deletion man/bind_databases.Rd
