Skip to content

Commit

Permalink
update on viola and fennica
Browse files Browse the repository at this point in the history
  • Loading branch information
ake123 committed Dec 16, 2024
1 parent c8083fb commit b53214b
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 2 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(check_api_access)
export(enrich_author_name)
export(fetch_all_records)
export(fetch_finna_collection)
export(fetch_viola_records)
export(finna_cite)
export(finna_interactive)
export(get_finna_records)
Expand Down
113 changes: 113 additions & 0 deletions R/viola_records.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#' Fetch Records by Year Ranges from Finna API (Including NA Dates)
#'
#' This function fetches records from the Finna API in chunks divided by year ranges, handling missing date values.
#'
#' @param base_query The base query string, defaults to "*".
#' @param base_filters A character vector of filters for the search, e.g., `c('collection:"VIO"')`.
#' @param year_ranges A list of numeric vectors specifying year ranges, e.g., `list(c(2000, 2005), c(2006, 2010))`.
#' @param include_na Whether to include records with missing `main_date_str`. Default is `TRUE`.
#' @param limit_per_query Maximum number of records to fetch per query. Default is 100000.
#' @param total_limit Maximum number of records to fetch overall. Default is `Inf`.
#' @param delay_after_query Delay in seconds between queries. Default is 5.
#' @return A tibble containing all fetched records.
#' @export
fetch_viola_records <- function(base_query = "*",
base_filters = c('collection:"VIO"'),
year_ranges = list(c(0, as.numeric(format(Sys.Date(), "%Y")))),
include_na = TRUE,
limit_per_query = 100000,
total_limit = Inf,
delay_after_query = 5) {
library(dplyr)
library(purrr)

# Input validation
if (!all(sapply(year_ranges, function(x) length(x) == 2 && is.numeric(x) && x[1] <= x[2]))) {
stop("Each year range must be a numeric vector of length 2, where the first element <= the second.")
}

if (!is.numeric(limit_per_query) || limit_per_query <= 0) {
stop("limit_per_query must be a positive integer.")
}

if (!is.numeric(total_limit) || total_limit <= 0) {
stop("total_limit must be a positive number.")
}

message("Fetching records started...")

all_results <- list()
total_fetched <- 0

# Iterate over year ranges
for (range in year_ranges) {
if (total_fetched >= total_limit) {
message("Reached the total record limit.")
break
}

# Construct the range-specific filter
range_filter <- paste0('main_date_str:["', range[1], '" TO "', range[2], '"]')
filters <- c(base_filters, range_filter)

# Fetch results for the range
results <- tryCatch({
search_finna(
query = base_query,
filters = filters,
limit = limit_per_query
)
}, error = function(e) {
warning(sprintf("Error fetching data for range %s-%s: %s", range[1], range[2], e$message))
return(NULL)
})

# Append valid results
if (!is.null(results) && nrow(results) > 0) {
all_results <- c(all_results, list(results))
total_fetched <- total_fetched + nrow(results)
}

# Log progress
message(sprintf("Range %d-%d: Fetched %d records (Total: %d)", range[1], range[2], ifelse(is.null(results), 0, nrow(results)), total_fetched))

# Stop if total limit is reached
if (total_fetched >= total_limit) {
message("Reached the total record limit.")
break
}

# Add delay between queries
Sys.sleep(delay_after_query)
}

# Optionally fetch NA values
if (include_na && total_fetched < total_limit) {
message("Fetching records with missing main_date_str...")
na_results <- tryCatch({
search_finna(
query = base_query,
filters = c(base_filters, '-main_date_str:*'),
limit = limit_per_query
)
}, error = function(e) {
warning("Error fetching NA values: ", e$message)
return(NULL)
})

if (!is.null(na_results) && nrow(na_results) > 0) {
all_results <- c(all_results, list(na_results))
total_fetched <- total_fetched + nrow(na_results)
}

message(sprintf("Fetched %d records with missing dates.", ifelse(is.null(na_results), 0, nrow(na_results))))
}

# Combine all results into a single tibble
combined_results <- bind_rows(all_results) %>%
distinct() # Remove duplicates

# Final message
message(sprintf("Fetching completed. Total records fetched: %d", total_fetched))
return(combined_results)
}
37 changes: 37 additions & 0 deletions man/fetch_viola_records.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions vignettes/articles/Fennica.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ print(fennica)
In order to download the whole data we can add the parameter `limit = Inf`
as `search_finna("*",filters=c('collection:"FEN"'), limit = Inf)`

## Checking the total counts
## 19th century fennica data

search the whole data and it total search of counts in the the interval between
some years for example between the years 1809-1917 as follows:

```{r message = FALSE, warning = FALSE}
library(finna)
fennica <- search_finna("*",filters = c('collection:"FEN"', 'search_daterange_mv:"[1809 TO 1918]"'))
fennica <- search_finna("*",filters = c('collection:"FEN"', 'search_daterange_mv:"[1808 TO 1918]"'))
print(fennica)
```

Expand Down
81 changes: 81 additions & 0 deletions vignettes/articles/viola_collections.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
---
title: "Viola in Finna"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Viola in Finna}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
This R Markdown document provides a comprehensive guide to querying and analyzing Viola collection data using the Finna API and the `fetch_viola_records` function.

## Viola Collection in Finna

The `fetch_viola_records` function allows batch processing of data from the Viola collection in Finna, across multiple year ranges, and handles records without date information.

### Example Usage

#### Fetch Data Across Year Ranges

The following example fetches data from the Viola collection for the year ranges: 0–1699, 1700–1799, and 1800–1899.

```{r message = FALSE, warning = FALSE}
library(finna)
# Fetch records
results <- fetch_viola_records(
base_query = "*",
base_filters = c('collection:"VIO"'), # Filters for the Viola collection
year_ranges = list(c(1700,1705)), # Year ranges to query
include_na = TRUE, # Include records with missing dates
limit_per_query = 100000, # Maximum records per query
total_limit = Inf, # Overall record limit
delay_after_query = 3 # Delay between API calls
)
# View the number of records fetched
print(nrow(results))
head(results)
```

#### Fetch Records Without Dates

You can also fetch records missing the `main_date_str` field:

```{r message = FALSE, warning = FALSE}
# Fetch undated records
undated_records <- search_finna(
query = "*",
filters = c('collection:"VIO"', '-main_date_str:*'),
limit = Inf
)
# View undated records
print(undated_records)
```

---

## Data Visualization

The Viola collection data can be visualized using metadata refinement and plotting functions.

#### Author Distribution

```{r message = FALSE, warning = FALSE}
library(finna)
# Refine metadata and visualize author distribution
refined_data <- refine_metadata(results)
visualize_author_distribution(refined_data)
```

#### Word Cloud for Titles

```{r message = FALSE, warning = FALSE}
# Create a word cloud of titles
visualize_word_cloud(refined_data, "Title")
```




0 comments on commit b53214b

Please sign in to comment.