-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
234 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
#' Fetch Records by Year Ranges from Finna API (Including NA Dates) | ||
#' | ||
#' This function fetches records from the Finna API in chunks divided by year ranges, handling missing date values. | ||
#' | ||
#' @param base_query The base query string, defaults to "*". | ||
#' @param base_filters A character vector of filters for the search, e.g., `c('collection:"VIO"')`. | ||
#' @param year_ranges A list of numeric vectors specifying year ranges, e.g., `list(c(2000, 2005), c(2006, 2010))`. | ||
#' @param include_na Whether to include records with missing `main_date_str`. Default is `TRUE`. | ||
#' @param limit_per_query Maximum number of records to fetch per query. Default is 100000. | ||
#' @param total_limit Maximum number of records to fetch overall. Default is `Inf`. | ||
#' @param delay_after_query Delay in seconds between queries. Default is 5. | ||
#' @return A tibble containing all fetched records. | ||
#' @export | ||
fetch_viola_records <- function(base_query = "*", | ||
base_filters = c('collection:"VIO"'), | ||
year_ranges = list(c(0, as.numeric(format(Sys.Date(), "%Y")))), | ||
include_na = TRUE, | ||
limit_per_query = 100000, | ||
total_limit = Inf, | ||
delay_after_query = 5) { | ||
library(dplyr) | ||
library(purrr) | ||
|
||
# Input validation | ||
if (!all(sapply(year_ranges, function(x) length(x) == 2 && is.numeric(x) && x[1] <= x[2]))) { | ||
stop("Each year range must be a numeric vector of length 2, where the first element <= the second.") | ||
} | ||
|
||
if (!is.numeric(limit_per_query) || limit_per_query <= 0) { | ||
stop("limit_per_query must be a positive integer.") | ||
} | ||
|
||
if (!is.numeric(total_limit) || total_limit <= 0) { | ||
stop("total_limit must be a positive number.") | ||
} | ||
|
||
message("Fetching records started...") | ||
|
||
all_results <- list() | ||
total_fetched <- 0 | ||
|
||
# Iterate over year ranges | ||
for (range in year_ranges) { | ||
if (total_fetched >= total_limit) { | ||
message("Reached the total record limit.") | ||
break | ||
} | ||
|
||
# Construct the range-specific filter | ||
range_filter <- paste0('main_date_str:["', range[1], '" TO "', range[2], '"]') | ||
filters <- c(base_filters, range_filter) | ||
|
||
# Fetch results for the range | ||
results <- tryCatch({ | ||
search_finna( | ||
query = base_query, | ||
filters = filters, | ||
limit = limit_per_query | ||
) | ||
}, error = function(e) { | ||
warning(sprintf("Error fetching data for range %s-%s: %s", range[1], range[2], e$message)) | ||
return(NULL) | ||
}) | ||
|
||
# Append valid results | ||
if (!is.null(results) && nrow(results) > 0) { | ||
all_results <- c(all_results, list(results)) | ||
total_fetched <- total_fetched + nrow(results) | ||
} | ||
|
||
# Log progress | ||
message(sprintf("Range %d-%d: Fetched %d records (Total: %d)", range[1], range[2], ifelse(is.null(results), 0, nrow(results)), total_fetched)) | ||
|
||
# Stop if total limit is reached | ||
if (total_fetched >= total_limit) { | ||
message("Reached the total record limit.") | ||
break | ||
} | ||
|
||
# Add delay between queries | ||
Sys.sleep(delay_after_query) | ||
} | ||
|
||
# Optionally fetch NA values | ||
if (include_na && total_fetched < total_limit) { | ||
message("Fetching records with missing main_date_str...") | ||
na_results <- tryCatch({ | ||
search_finna( | ||
query = base_query, | ||
filters = c(base_filters, '-main_date_str:*'), | ||
limit = limit_per_query | ||
) | ||
}, error = function(e) { | ||
warning("Error fetching NA values: ", e$message) | ||
return(NULL) | ||
}) | ||
|
||
if (!is.null(na_results) && nrow(na_results) > 0) { | ||
all_results <- c(all_results, list(na_results)) | ||
total_fetched <- total_fetched + nrow(na_results) | ||
} | ||
|
||
message(sprintf("Fetched %d records with missing dates.", ifelse(is.null(na_results), 0, nrow(na_results)))) | ||
} | ||
|
||
# Combine all results into a single tibble | ||
combined_results <- bind_rows(all_results) %>% | ||
distinct() # Remove duplicates | ||
|
||
# Final message | ||
message(sprintf("Fetching completed. Total records fetched: %d", total_fetched)) | ||
return(combined_results) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
--- | ||
title: "Viola in Finna" | ||
output: rmarkdown::html_vignette | ||
vignette: > | ||
%\VignetteIndexEntry{Viola in Finna} | ||
%\VignetteEngine{knitr::rmarkdown} | ||
%\VignetteEncoding{UTF-8} | ||
--- | ||
This R Markdown document provides a comprehensive guide to querying and analyzing Viola collection data using the Finna API and the `fetch_viola_records` function. | ||
|
||
## Viola Collection in Finna | ||
|
||
The `fetch_viola_records` function allows batch processing of data from the Viola collection in Finna, across multiple year ranges, and handles records without date information. | ||
|
||
### Example Usage | ||
|
||
#### Fetch Data Across Year Ranges | ||
|
||
The following example fetches data from the Viola collection for the year ranges: 0–1699, 1700–1799, and 1800–1899. | ||
|
||
```{r message = FALSE, warning = FALSE} | ||
library(finna) | ||
# Fetch records | ||
results <- fetch_viola_records( | ||
base_query = "*", | ||
base_filters = c('collection:"VIO"'), # Filters for the Viola collection | ||
year_ranges = list(c(1700,1705)), # Year ranges to query | ||
include_na = TRUE, # Include records with missing dates | ||
limit_per_query = 100000, # Maximum records per query | ||
total_limit = Inf, # Overall record limit | ||
delay_after_query = 3 # Delay between API calls | ||
) | ||
# View the number of records fetched | ||
print(nrow(results)) | ||
head(results) | ||
``` | ||
|
||
#### Fetch Records Without Dates | ||
|
||
You can also fetch records missing the `main_date_str` field: | ||
|
||
```{r message = FALSE, warning = FALSE} | ||
# Fetch undated records | ||
undated_records <- search_finna( | ||
query = "*", | ||
filters = c('collection:"VIO"', '-main_date_str:*'), | ||
limit = Inf | ||
) | ||
# View undated records | ||
print(undated_records) | ||
``` | ||
|
||
--- | ||
|
||
## Data Visualization | ||
|
||
The Viola collection data can be visualized using metadata refinement and plotting functions. | ||
|
||
#### Author Distribution | ||
|
||
```{r message = FALSE, warning = FALSE} | ||
library(finna) | ||
# Refine metadata and visualize author distribution | ||
refined_data <- refine_metadata(results) | ||
visualize_author_distribution(refined_data) | ||
``` | ||
|
||
#### Word Cloud for Titles | ||
|
||
```{r message = FALSE, warning = FALSE} | ||
# Create a word cloud of titles | ||
visualize_word_cloud(refined_data, "Title") | ||
``` | ||
|
||
|
||
|
||
|