diff --git a/ChangeLog b/ChangeLog deleted file mode 100644 index ae9f1e1..0000000 --- a/ChangeLog +++ /dev/null @@ -1,15 +0,0 @@ -Date Version Comment -2015-12-09 0.0-1 First version on CRAN -2016-03-04 0.0-2 Minor improvements -2016-12-13 0.1-1 New function to pre-download Census data and other minor improvements -2017-03-03 0.1-2 Updated surname handling, enhanced demographics option, and improved error handling and documentation -2017-04-10 0.1-3 Allows Census data download at level user prefers (block, tract, or county) -2017-05-03 0.1-4 Fixed error in merge_surnames.R and updated relevant documentation -2017-06-08 0.1-5 Updated http to https to access U.S. Census API -2017-08-16 0.1-6 Removed extraneous documentation, renamed 2010 Surname List object, and added place as geography -2017-08-31 0.1-7 Added testthat functionality -2019-02-12 0.1-8 Updated to be compatible with U.S. Census API updates -2019-02-20 0.1-9 Fixed census_helper.R so that state field can be lower case in user data -2020-10-08 0.1-10 Fixed minor warning message -2020-04-22 0.1-11 Minor fix requested by CRAN -2020-05-17 0.1-12 Some Census API improvements (thanks to Silvia Kim) diff --git a/DESCRIPTION b/DESCRIPTION index c84e6a1..376bbee 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,13 +1,11 @@ Package: wru Version: 0.1-12 Date: 2021-05-17 -Title: Who are You? Bayesian Prediction of Racial Category Using Surname and - Geolocation +Title: Who are You? Bayesian Prediction of Racial Category Using + Surname and Geolocation Authors@R: c( person("Kabir", "Khanna", email = "kabirkhanna@gmail.com", role = c("aut", "cre")), - person("Kosuke", "Imai", email = "imai@harvard.edu", role = c("aut")), - person("Evan", "Rosenman", email = "etrrosenman@gmail.com", role = c("aut")), - person("Santiago", "Olivella", email = "olivella@unc.edu", role = c("aut")) + person("Kosuke", "Imai", email = "imai@harvard.edu", role = c("aut")) ) Description: Predicts individual race/ethnicity using surname, geolocation, and other attributes, such as gender and age. The method utilizes the Bayes' @@ -17,24 +15,13 @@ Description: Predicts individual race/ethnicity using surname, geolocation, Registration Records" Political Analysis . URL: https://github.com/kosukeimai/wru BugReports: https://github.com/kosukeimai/wru/issues -Depends: - R (>= 3.5.0), - utils -Imports: - devtools (>= 1.10.0), - PL94171, - stringr, - Rcpp, - dplyr (>= 1.0.0) -LinkingTo: - Rcpp, - RcppEigen, - RcppProgress -Suggests: - testthat +Depends: R (>= 3.5.0), utils +Imports: devtools (>= 1.10.0) +Suggests: testthat LazyLoad: yes LazyData: yes LazyDataCompression: xz License: GPL (>= 3) -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.0 Encoding: UTF-8 +NeedsCompilation: no diff --git a/NAMESPACE b/NAMESPACE index 542d5c9..f9efb29 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,21 +2,10 @@ export(census_geo_api) export(census_helper) -export(census_helper_new) -export(co_cluster) -export(format_legacy_data) export(get_census_api) export(get_census_api_2) export(get_census_data) -export(merge_names) export(merge_surnames) export(predict_race) -export(predict_race_new) export(vec_to_chunk) -import(PL94171) import(devtools) -import(stringr) -importFrom(Rcpp,evalCpp) -importFrom(dplyr,.data) -importFrom(dplyr,`%>%`) -useDynLib(wru, .registration=TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R deleted file mode 100644 index 96ed265..0000000 --- a/R/RcppExports.R +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by using Rcpp::compileAttributes() -> do not edit by hand -# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 - -#' Collapsed Gibbs sampler for keyWRU. Internal function -#' -#' @param data A list with the following elements -#' \itemize{ -#' \item{name_type_n}{Number of name types} -#' \item{race_n}{Number of races} -#' \item{geo_n}{Number of geolocations} -#' \item{geo_race_table}{Matrix of conditional probabilities Pr(Race | Geolocation), with geolocations in the rows} -#' \item{voters_per_geo}{Number of voterfile records per geolocation} -#' \item{race_inits}{Table of initial race assignments per voterfile record} -#' \item{name_data}{ -#' \itemize{ -#' \item{n_unique_names}{Number of unique names} -#' \item{record_name_id}{Name id corresponding to each voterfile record} -#' \item{keynames}{Integer matrix of name id's used as keynames for each race (race in the columns)} -#' \item{census_table}{Matrix of Pr(Name | Race), with races in the columns} -#' \item{beta_prior}{Scalar prior for name-race symmetric Dirichlet distribution} -#' \item{gamma_prior}{Vector prior shapes for keyname/non-keyname Beta mixture} -#' } -#' } -#' } -#' @param ctrl A list of control arguments; see \code{co_cluster} function for details. -#' -#' @keywords internal -keyWRU_fit <- function(data, ctrl) { - .Call(`_wru_keyWRU_fit`, data, ctrl) -} - diff --git a/R/census_helper_v2.R b/R/census_helper_v2.R deleted file mode 100644 index c3adf78..0000000 --- a/R/census_helper_v2.R +++ /dev/null @@ -1,137 +0,0 @@ -#' Census helper function. -#' -#' \code{census_helper_v2} links user-input dataset with Census geographic data. -#' -#' This function allows users to link their geocoded dataset (e.g., voter file) -#' with U.S. Census 2010 data. The function extracts Census Summary File data -#' at the county, tract, or block level using the 'UScensus2010' package. Census data -#' calculated are Pr(Geolocation | Race) where geolocation is county, tract, or block. -#' -#' @param key A required character object. Must contain user's Census API -#' key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. -#' @param voter.file An object of class \code{data.frame}. Must contain field(s) named -#' \code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}} -#' specifying geolocation. These should be character variables that match up with -#' U.S. Census categories. County should be three characters (e.g., "031" not "31"), -#' tract should be six characters, and block should be four characters. -#' Place should be five characters if it is included. -#' @param states A character vector specifying which states to extract -#' Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts -#' Census data for all states contained in user-input data. -#' @param geo A character object specifying what aggregation level to use. -#' Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -#' Warning: extracting block-level data takes very long. -#' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on -#' age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -#' If \code{TRUE}, function will return Pr(Geolocation, Age | Race). -#' If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race). -#' @param sex A \code{TRUE}/\code{FALSE} object indicating whether to condition on -#' sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -#' If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). -#' If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race). -#' @param census.data A optional census object of class \code{list} containing -#' pre-saved Census geographic data. Can be created using \code{get_census_data} function. -#' If \code{\var{census.data}} is provided, the \code{\var{age}} element must have the same value -#' as the \code{\var{age}} option specified in this function (i.e., \code{TRUE} in both or -#' \code{FALSE} in both). Similarly, the \code{\var{sex}} element in the object provided in -#' \code{\var{census.data}} must have the same value as the \code{\var{sex}} option here. -#' If \code{\var{census.data}} is missing, Census geographic data will be obtained via Census API. -#' @param retry The number of retries at the census website if network interruption occurs. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns of -#' Census data. -#' -#' @examples -#' \dontshow{data(voters)} -#' \dontrun{census_helper(key = "...", voter.file = voters, states = "nj", geo = "block")} -#' \dontrun{census_helper(key = "...", voter.file = voters, states = "all", geo = "tract", -#' age = TRUE, sex = TRUE)} -#' -#' @export -census_helper_new <- function(key, voter.file, states = "all", geo = "tract", age = FALSE, sex = FALSE, census.data = NA, retry = 0) { - - if (is.na(census.data) || (typeof(census.data) != "list")) { - toDownload = TRUE - } else { - toDownload = FALSE - } - - if (toDownload) { - if (missing(key)) { - stop('Must enter U.S. Census API key, which can be requested at https://api.census.gov/data/key_signup.html.') - } - } - - states <- toupper(states) - if (states == "ALL") { - states <- toupper(as.character(unique(voter.file$state))) - } - - df.out <- NULL - - for (s in 1:length(states)) { - - print(paste("State ", s, " of ", length(states), ": ", states[s], sep = "")) - state <- toupper(states[s]) - - if (geo == "place") { - geo.merge <- c("place") - if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$age != age) || (census.data[[state]]$sex != sex)) { - census <- census_geo_api(key, state, geo = "place", age, sex, retry) - } else { - census <- census.data[[toupper(state)]]$place - } - } - - if (geo == "county") { - geo.merge <- c("county") - if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$age != age) || (census.data[[state]]$sex != sex)) { - census <- census_geo_api(key, state, geo = "county", age, sex, retry) - } else { - census <- census.data[[toupper(state)]]$county - } - } - - if (geo == "tract") { - geo.merge <- c("county", "tract") - if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$age != age) || (census.data[[state]]$sex != sex)) { - census <- census_geo_api(key, state, geo = "tract", age, sex, retry) - } else { - census <- census.data[[toupper(state)]]$tract - } - } - - if (geo == "block") { - geo.merge <- c("county", "tract", "block") - if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$age != age) || (census.data[[state]]$sex != sex)) { - census <- census_geo_api(key, state, geo = "block", age, sex, retry) - } else { - census <- census.data[[toupper(state)]]$block - } - } - - census$state <- state - - if (age == F & sex == F) { - - ## Calculate Pr(Geolocation | Race) - geoPopulations <- rowSums(census[,grepl("P00", names(census))]) - census$r_whi <- census$P005003 / geoPopulations #Pr(White | Geo) - census$r_bla <- census$P005004 / geoPopulations #Pr(Black | Geo) - census$r_his <- census$P005010 / geoPopulations #Pr(Latino | Geo) - census$r_asi <- (census$P005006 + census$P005007) / geoPopulations #Pr(Asian or NH/PI | Geo) - census$r_oth <- (census$P005005 + census$P005008 + census$P005009) / geoPopulations #Pr(AI/AN, Other, or Mixed | Geo) - - drop <- c(grep("state", names(census)), grep("P005", names(census))) - voters.census <- merge(voter.file[toupper(voter.file$state) == toupper(states[s]), ], census[, -drop], by = geo.merge, all.x = T) - - } - - keep.vars <- c(names(voter.file)[names(voter.file) != "agecat"], - paste("r", c("whi", "bla", "his", "asi", "oth"), sep = "_")) - df.out <- as.data.frame(rbind(df.out, voters.census[keep.vars])) - - } - - return(df.out) -} diff --git a/R/co_cluster.R b/R/co_cluster.R deleted file mode 100644 index d43d642..0000000 --- a/R/co_cluster.R +++ /dev/null @@ -1,232 +0,0 @@ -#' Name-Race Co-clustering Using Keynames -#' -#' Estimate a Bayesian mixed-membership, keyword-assisted race classification model for geo-locations -#' in a voterfile. -#' -#' @param voter.file An object of class data.frame. Must contain a row for each individual being predicted, -#' as well as fields named "surname", and/or"first", and/or "middle", containing each -#' individual's corresponding names. It must also contain a field named state, which -#' contains the two-character lower-case abbreviation for each individual's state of residence (e.g., "nj" for New Jersey). -#' If using Census geographic data in race/ethnicity -#' predictions, voter.file must also contain at least one of the following fields: county, tract, block, -#' and/or place. These fields should contain character strings matching U.S. Census categories. County is three -#' characters (e.g., "031" not "31"), tract is six characters, and block is four characters. Place is -#' five characters. See \code{\link{predict_race}} for other optional fields. -#' @param name_types Character vector. Must have elements in "surname", "first", and "middle". -#' @param name_race_tables Named list, with as many elements as there are names in \code{name_types}, and names matching -#' elements in \code{name_types}. Each list element should be a data.frame of unique names (first column) by race (remaining columns), -#' with conditional probabilities p(Name|Race). -#' @param census.geo Required character vector. One of "county", "tract", "block" or "place". See \code{\link{predict_race}}. -#' @param ... Arguments passed to \code{\link{predict_race}}. -#' @param control List of control arguments, including -#' \itemize{ -#' \item{race_init}{ Initial race for each observation in voter.file. Must be an integer, with -#' 0=white, 1=black, 2=hispanic, 3=asian, and 4=other.} -#' \item{fit_insample}{ Boolean. Should model check in-sample fit of race prediction for each -#' record? If \code{TRUE}, \code{race_obs} cannot be \code{NULL}. Defaults to \code{FALSE}.} -#' \item{race_obs}{ Observed race for each record in \code{voter.file}. Must be an integer, with -#' 0=white, 1=black, 2=hispanic, 3=asian, and 4=other.} -#' \item{iter}{ Number of MCMC iterations. Defaults to 1000.} -#' \item{burnin}{ Number of iterations discarded as burnin. Defaults to half of \code{iter}.} -#' \item{thin}{ Thinning interval for MCMC. Defaults to 1.} -#' \item{log_post_interval}{ Interval for storing the log_posterior. Defaults to 10.} -#' \item{beta_prior}{ Parameter for symmetric Dirichlet prior over names for each race. Defaults to 5.} -#' \item{gamma_prior}{ Parameter for Beta prior over keyname/non-keyname mixture components. Defaults to c(5, 5).} -#' \item{verbose}{ Print progress information. Defaults to \code{TRUE}.} -#' \item{seed}{ RNG seed. If \code{NULL}, a seed is generated and stored for reproducibility.} -#' } -#' -#' -#' @return A named list: -#' \itemize{ -#' \item{name_by_race}{ Named list of predicted distributions of name by race for each name type.} -#' \item{race_by_record}{ A copy of \code{voter.file}, with additional columns of predicted -#' race probabilities, names \code{pred.}. } -#' \item{loglik}{ Values of log likelihood, evaluated every \code{log_post_interval}.} -#' \item{fit_insample}{ When \code{fit_insample=TRUE}, a probability of correct in_sample prediction -#' for each record in \code{vote.file}.} -#' } -#' -#' -#' @export -#' -co_cluster <- function(voter.file, - name_types, - name_race_tables, - census.geo, - ..., - control = NULL) -{ - ##Data quality checks - n_race <- ncol(name_race_tables[[1]])-1 - stopifnot(all(sapply(name_race_tables, ncol) == n_race+1), - all(name_types %in% c("surname","first", "middle")), - all(names(name_race_tables) %in% name_types), - name_types %in% names(voter.file), - census.geo %in% c("county","tract","block","place") - ) - - - voter.file$state <- toupper(voter.file$state) - - ## Form control list - ctrl <- list(iter = 1000, - thin = 1, - race_init = NULL, - log_post_interval = 10, - beta_prior = 0.01, - gamma_prior = c(1, 1), - verbose = TRUE, - fit_insample = FALSE, - race_obs = NULL, - max_keynames = 1000, - seed = sample(1:1000, 1)) - ctrl$burnin <- floor(ctrl$iter/2) - ctrl[names(control)] <- control - - ## Set RNG seed - set.seed(ctrl$seed) - - if(ctrl$fit_insample){ - if(is.null(ctrl$race_obs) | any(is.na(ctrl$race_obs))){ - stop("If `fit_insample' is TRUE, `race_obs' must be a complete vector of observed races for each record in the voter.file.") - } - } - - ## Initial race - race_pred_args <- list(census.surname = TRUE, - surname.only = FALSE, - surname.year = 2010, - census.geo = census.geo, - age = FALSE, - sex = FALSE, - retry = 0) - args_usr <- list(...) - ## level of geo aggregation - geo_id_names <- c("state", switch(race_pred_args$census.geo, - "county" = c("county"), - "tract" = c("county","tract"), - "block" = c("county","tract","block"), - "place" = c("place"), - "zip" = c("zip"))) - - race_pred_args[names(args_usr)] <- args_usr - if(is.null(race_pred_args$census.data)){ - if(is.null(race_pred_args$census.key)){ - stop("Geographic data is required. When `census.data' is NULL, you must provide a census API Key using `census.key' so I can download the required data.") - } - all_states <- unique(voter.file$state) - race_pred_args$census.data <- get_census_data(race_pred_args$census.key, - all_states, - race_pred_args$age, - race_pred_args$sex, - race_pred_args$census.geo, - race_pred_args$retry) - } - race.suff <- c("whi", "bla", "his", "asi", "oth") - if(is.null(ctrl$race_init)){ - race_pred_args$voter.file <- voter.file - race_pred <- do.call(predict_race, race_pred_args) - tmp <- base::merge(voter.file, race_pred, sort=FALSE) - ctrl$race_init <- apply(tmp[,paste0("pred.",race.suff)], 1, which.max) - 1 - } - if(any(is.na(ctrl$race_init))){ - stop("Some initial race values are NA. If you didn't provide initial values, check the results of calling predict_race() on the voter.file you want me to work on.") - } - geo_id <- do.call(paste, voter.file[,geo_id_names]) - ctrl$race_init <- split(ctrl$race_init, geo_id) - - - - ## P(race | geo) - g_r_t <- do.call(rbind, lapply(race_pred_args$census.data, - function(x){ - all_names <- names(x[[race_pred_args$census.geo]]) - tmp <- x[[race_pred_args$census.geo]][,c(geo_id_names, grep("P00", all_names, value=TRUE))] - tmp[,grep("P00", all_names, value=TRUE)] <- proportions(as.matrix(tmp[,grep("P00", all_names, value=TRUE)]), 1) - tmp$r_whi <- tmp$P005003 #Pr(White|Geo) - tmp$r_bla <- tmp$P005004 #Pr(Black|Geo) - tmp$r_his <- tmp$P005010 #Pr(Latino|Geo) - tmp$r_asi <- (tmp$P005006 + tmp$P005007) #Pr(Asian or NH/PI|Geo) - tmp$r_oth <- (tmp$P005005 + tmp$P005008 + tmp$P005009)#Pr(AI/AN, Other, or Mixed|Geo) - return(tmp) - })) - g_r_t_geo <- do.call(paste, g_r_t[,geo_id_names]) - ##Subset to geo's in vf - g_r_t <- g_r_t[g_r_t_geo %in% geo_id, ] - if(nrow(g_r_t) != length(unique(geo_id))){ - stop("Some records in voter.file have unique geographic locations that I wasn't able to find in the census.data. Some records in voter.file may have mis-matched geographic units that do not exist in the census.") - } - g_r_t_geo_new <- do.call(paste, g_r_t[,geo_id_names]) - geo_ord <- match(names(ctrl$race_init),g_r_t_geo_new) - geo_race_table <- as.matrix(g_r_t[geo_ord,grep("r_", names(g_r_t))]) - - - ##Name-specific data - name_data <- vector("list", length(name_types)) - names(name_data) <- name_types - for(ntype in name_types){ - str_names <- toupper(name_race_tables[[ntype]][,1]) - proc_names_str <- .name_preproc(voter.file[,ntype], c(str_names)) - u_obs_names <- unique(proc_names_str) - keynames_str_all <- str_names - kw_in_ind <- keynames_str_all %in% proc_names_str - keynames_str <- keynames_str_all[kw_in_ind] - u_kw <- unique(keynames_str) - n_u_kw <- length(u_kw) - reord <- order(match(u_obs_names, u_kw)) - u_obs_names <- u_obs_names[reord] - n_names <- length(u_obs_names) - w_names <- match(proc_names_str, u_obs_names) - 1 - keynames <- match(keynames_str, table = u_kw) - 1 - w_names_list <- split(w_names, geo_id) - phi_tilde <- array(0.0, c(n_u_kw, n_race)) - for(x in 1:n_race){ - phi_tilde[, x] <- proportions(name_race_tables[[ntype]][which(kw_in_ind),x+1]) - } - - colnames(phi_tilde) <- paste("p", race.suff, sep="_") - name_data[[ntype]] <- list(n_unique_names = n_names, - record_name_id = w_names_list, - largest_keyword = as.integer(max(keynames)), - census_table = phi_tilde, - beta_prior = ctrl$beta_prior, - gamma_prior = ctrl$gamma_prior, - u_obs_names = u_obs_names) - } - - ## Create data for keyWRU - data_list <- list(name_type_n = length(name_types), - race_n = ncol(name_race_tables[[1]][,-1]), - geo_n = length(unique(geo_id)), - geo_race_table = geo_race_table, - voters_per_geo = sapply(split(voter.file, geo_id), nrow), - race_inits = ctrl$race_init, - race_obs = if(ctrl$fit_insample){split(ctrl$race_obs, geo_id)} else {list()}, - name_data = name_data) - - full_res <- keyWRU_fit(data_list, ctrl) - pred_list <- full_res$phi - res <- lapply(name_types, - function(ntype){ - colnames(pred_list[[ntype]]) <- paste0("pred.",race.suff) - pred_list[[ntype]] <- as.data.frame(pred_list[[ntype]]) - pred_list[[ntype]] <- cbind(name_data[[ntype]]$u_obs_names, - pred_list[[ntype]]) - names(pred_list[[ntype]])[1] <- ntype - return(pred_list[[ntype]]) - }) - v.file.s <- split(voter.file, geo_id) - race_samp <- cbind(do.call(rbind, v.file.s), - proportions(as.matrix(do.call(rbind, full_res$predict_race)), 1)) - names(race_samp) <- c(names(voter.file), paste0("pred.",race.suff)) - names(res) <- names(pred_list) - ret_obj <- list() - ret_obj$name_by_race <- res - ret_obj$race_by_record <- race_samp - ret_obj$loglik <- full_res$ll - if(ctrl$fit_insample){ - ret_obj$fit_insample <- do.call(c,full_res$r_insample)/(ctrl$iter - ctrl$burnin) - } - return(ret_obj) -} \ No newline at end of file diff --git a/R/data.R b/R/data.R deleted file mode 100644 index 0b6034d..0000000 --- a/R/data.R +++ /dev/null @@ -1,59 +0,0 @@ -#' Dictionary of First Names -#' -#' An example dataset containing voter file information. -#' -#' @format A data frame with 1,043,742 rows and 6 columns: -#' \describe{ -#' \item{first_name}{Voter identifier (numeric)} -#' \item{p_whi_first}{Probability of white} -#' \item{p_bla_first}{Probability of black} -#' \item{p_his_first}{Probability of Hispanic} -#' \item{p_asi_first}{Probability of asian}, -#' \item{p_oth_first}{Probability of other} -#' } -#' -#' @keywords datasets -#' @name firstNameDict -#' @examples -#' data(firstNameDict) -NULL - -#' Dictionary of Last Names -#' -#' An example dataset containing voter file information. -#' -#' @format A data frame with 1,502,541 rows and 6 columns: -#' \describe{ -#' \item{last_name}{Voter identifier (numeric)} -#' \item{p_whi_last}{Probability of white} -#' \item{p_bla_last}{Probability of black} -#' \item{p_his_last}{Probability of Hispanic} -#' \item{p_asi_last}{Probability of asian}, -#' \item{p_oth_last}{Probability of other} -#' } -#' -#' @keywords datasets -#' @name lastNameDict -#' @examples -#' data(lastNameDict) -NULL - -#' Dictionary of Middle Names -#' -#' An example dataset containing voter file information. -#' -#' @format A data frame with 1,182,133 rows and 6 columns: -#' \describe{ -#' \item{middle_name}{Voter identifier (numeric)} -#' \item{p_whi_middle}{Probability of white} -#' \item{p_bla_middle}{Probability of black} -#' \item{p_his_middle}{Probability of Hispanic} -#' \item{p_asi_middle}{Probability of asian}, -#' \item{p_oth_middle}{Probability of other} -#' } -#' -#' @keywords datasets -#' @name middleNameDict -#' @examples -#' data(middleNameDict) -NULL diff --git a/R/format_legacy_data.R b/R/format_legacy_data.R deleted file mode 100644 index abbba84..0000000 --- a/R/format_legacy_data.R +++ /dev/null @@ -1,78 +0,0 @@ -#' Legacy data formatting function. -#' -#' \code{format_legacy_data} formats legacy data from the U.S. census to allow -#' for Bayesian name geocoding. -#' -#' This function allows users to construct datasets for analysis using the census legacy data format. -#' These data are available for the 2020 census at -#' https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. -#' This function returns data structured analogously to data from the Census API, which is not yet -#' available for the 2020 Census as of September 2021. -#' -#' @param legacyFilePath A character vector giving the location of a legacy census data folder, -#' sourced from https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. -#' These file names should end in ".pl". -#' @param outFile Optional character vector determining whether the formatted RData object should be saved. The -#' filepath should end in ".RData". -#' -#' @import PL94171 -#' @importFrom dplyr `%>%` .data -#' -#' @examples -#' \dontrun{ -#' gaCensusData <- format_legacy_data(PL94171::pl_url('GA', 2020)) -#' predict_race_new(ga.voter.file, namesToUse = 'last, first, mid', census.geo = 'block', -#' census.data = gaCensusData) -#' } -#' -#' @export -format_legacy_data <- function(legacyFilePath, outFile = NULL) { - - # aggregation levels to convert (county, tract, block group, and block) - summaryLevels <- c('050', '140', '150', '750') - - # read in the data - pl <- PL94171::pl_read(legacyFilePath) - pl <- PL94171::pl_select_standard(pl) - - # iterate through the levels - censusData.2020 <- lapply(summaryLevels, FUN = function(level) { - levelData <- PL94171::pl_subset(pl, level) - - # construct the base data frame - df <- levelData %>% - dplyr::select(GEOID = .data$GEOID, - state = toupper(.data$state), - county = .data$county, - P005003 = .data$pop_white, - P005004 = .data$pop_black, - P005010 = .data$pop_hisp, - P005006 = .data$pop_asian, - P005007 = .data$pop_nhpi, - P005005 = .data$pop_aian, - P005008 = .data$pop_other, - P005009 = .data$pop_two - ) - - # add geographic levels - if(level != '050') { - df <- df %>% dplyr::mutate(tract = substr(.data$GEOID, nchar(.data$GEOID) - 5, nchar(.data$GEOID))) - if(level != '140') { - df <- df %>% dplyr::mutate(blockGroup = substr(.data$GEOID, nchar(.data$GEOID), nchar(.data$GEOID))) - if(level != '150') { - df <- df %>% dplyr::mutate(block = substr(.data$GEOID, nchar(.data$GEOID) - 2, nchar(.data$GEOID))) - } - } - } - - df - }) - - # format and optionally save the file - names(censusData.2020) <- c('county', 'tract', 'blockGroup', 'block') - if(!is.null(outFile)) - save(censusData.2020, file = outFile) - - # return the object - return(censusData.2020) -} diff --git a/R/get_census_data.R b/R/get_census_data.R index 2c6d49f..e647539 100644 --- a/R/get_census_data.R +++ b/R/get_census_data.R @@ -56,4 +56,4 @@ get_census_data <- function(key, states, age = FALSE, sex = FALSE, census.geo = } } return(CensusObj) -} \ No newline at end of file +} diff --git a/R/merge_names.R b/R/merge_names.R deleted file mode 100644 index 2aed0a5..0000000 --- a/R/merge_names.R +++ /dev/null @@ -1,240 +0,0 @@ -#' Surname probability merging function. -#' -#' \code{merge_names} merges names in a user-input dataset with corresponding -#' race/ethnicity probabilities derived from both the U.S. Census Surname List -#' and Spanish Surname List and voter files from states in the Southern U.S. -#' -#' This function allows users to match names in their dataset with database entries -#' estimating P(name | ethnicity) for each of the five major racial groups for each -#' name. The database probabilities are derived from both the U.S. Census Surname List -#' and Spanish Surname List and voter files from states in the Southern U.S. -#' -#' By default, the function matches names as follows: -#' 1) Search raw surnames in the database; -#' 2) Remove any punctuation and search again; -#' 3) Remove any spaces and search again; -#' 4) Remove suffixes (e.g., "Jr") and search again (last names only) -#' 5) Split double-barreled names into two parts and search first part of name; -#' 6) Split double-barreled names into two parts and search second part of name; -#' -#' Each step only applies to names not matched in a previous step. -#' Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. -#' -#' Note: Any name appearing only on the Spanish Surname List is assigned a -#' probability of 1 for Hispanics/Latinos and 0 for all other racial groups. -#' -#' @param voter.file An object of class \code{data.frame}. Must contain a row for each individual being predicted, -#' as well as a field named \code{\var{last}} containing each individual's surname. -#' If first name is also being used for prediction, the file must also contain a field -#' named \code{\var{first}}. If middle name is also being used for prediction, the field -#' must also contain a field named \code{\var{middle}}. -#' @param namesToUse A character vector identifying which names to use for the prediction. -#' The default value is \code{"last"}, indicating that only the last name will be used. -#' Other options are \code{"last, first"}, indicating that both last and first names will be -#' used, and \code{"last, first, middle"}, indicating that last, first, and middle names will all -#' be used. -#' @param clean.names A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -#' any surnames in \code{\var{voter.file}} that cannot initially be matched -#' to the database will be cleaned, according to U.S. Census specifications, -#' in order to increase the chance of finding a match. Default is \code{TRUE}. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns that -#' specify the part of the name matched with Census data (\code{\var{surname.match}}), -#' and the probabilities Pr(Race | Surname) for each racial group -#' (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, -#' \code{\var{p_his}} for Hispanic/Latino, -#' \code{\var{p_asi}} for Asian and Pacific Islander, and -#' \code{\var{p_oth}} for Other/Mixed). -#' -#' @import devtools -#' @import stringr -#' -#' @examples -#' \donttest{ -#' data(voters) -#' merge_names(voters, 'last, first') -#'} -#' @export -merge_names <- function(voter.file, namesToUse, clean.names = TRUE) { - - # check the names - if(namesToUse == 'last') { - if(!("last" %in% names(voter.file))) - stop("Voter data frame needs to have a column named 'last'.") - - } else if(namesToUse == 'last, first') { - if(!("last" %in% names(voter.file)) || !("first" %in% names(voter.file))) - stop("Voter data frame needs to have a column named 'last' and a column called 'first'.") - - } else if(namesToUse == 'last, first, middle') { - if(!("last" %in% names(voter.file)) || !("first" %in% names(voter.file)) - || !("middle" %in% names(voter.file))) - stop("Voter data frame needs to have a column named 'last', a column called 'first', and a column called 'middle'.") - } - - # read in the name files and cast NA to the null string - firstNameDict[is.na(firstNameDict$first_name),]$first_name <- '' - middleNameDict[is.na(middleNameDict$middle_name),]$middle_name <- '' - lastNameDict[is.na(lastNameDict$last_name),]$last_name <- '' - - nameDict <- list('first' = firstNameDict, - 'middle' = middleNameDict, - 'last' = lastNameDict) - - ## Convert names in voter file to upper case - p_eth <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") - df <- voter.file - df$caseid <- 1:nrow(df) - - df$lastname.match <- df$lastname.upper <- toupper(as.character(df$last)) - if(grepl('first', namesToUse)) - df$firstname.match <- df$firstname.upper <- toupper(as.character(df$first)) - if(grepl('middle', namesToUse)) { - df$middlename.match <- df$middlename.upper <- toupper(as.character(df$middle)) - df$middlename.match[is.na(df$middlename.match)] <- '' - } - - ## Merge Surnames with Census List (No Cleaning Yet) - df <- merge(df, lastNameDict, by.x = "lastname.match", by.y = "last_name", all.x = TRUE) - if(grepl('first', namesToUse)) - df <- merge(df, firstNameDict, by.x = "firstname.match", by.y = "first_name", all.x = TRUE) - if(grepl('middle', namesToUse)) { - df <- merge(df, middleNameDict, by.x = "middlename.match", by.y = "middle_name", all.x = TRUE) - } - - if(namesToUse == 'last' && sum(!(df$lastname.upper %in% lastNameDict$last_name)) == 0) - return(df[order(df$caseid), c(names(voter.file), "lastname.match", p_eth)]) - if(namesToUse == 'last, first' && sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && - sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0) - return(df[order(df$caseid), c(names(voter.file), "lastname.match", "firstname.match", p_eth)]) - if(namesToUse == 'last, first, middle' && sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && - sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0 && sum(!(df$middlename.upper %in% middleNameDict$middle_name)) == 0) - return(df[order(df$caseid), c(names(voter.file), "lastname.match", "firstname.match", "middlename.match", p_eth)]) - - ## Clean names (if specified by user) - if(clean.names) { - - for(nameType in str_split(namesToUse, ', ')[[1]]) { - - df1 <- df[!is.na(df[,paste('p_whi_', nameType, sep = '')]), ] #Matched names - df2 <- df[is.na(df[,paste('p_whi_', nameType, sep = '')]), ] #Unmatched names - - ## Remove All Punctuation and Try Merge Again - if(nrow(df2) > 0) { - df2[,paste(nameType, "name.match", sep = "")] <- gsub("[^[:alnum:] ]", "", df2[,paste(nameType, "name.upper", sep = "")]) - - df2 <- merge(df2[,!grepl(paste('_', nameType, sep = ''), names(df2))], nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = '_')) - df2 <- df2[,names(df1)] # reorder the columns - - if (sum(!is.na(df2[,paste('p_whi_', nameType, sep = ''),])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[,paste('p_whi_', nameType, sep = ''),]), ]) - df2 <- df2[is.na(df2[,paste('p_whi_', nameType, sep = '')]), ] - } - } - - ## Remove All Spaces and Try Merge Again - if(nrow(df2) > 0) { - df2[,paste(nameType, "name.match", sep = "")] <- gsub(" ", "", df2[,paste(nameType, "name.match", sep = "")]) - df2 <- merge(df2[,!grepl(paste('_', nameType, sep = ''), names(df2))], nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = '_')) - df2 <- df2[,names(df1)] # reorder the columns - - if (sum(!is.na(df2[,paste('p_whi_', nameType, sep = ''),])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[,paste('p_whi_', nameType, sep = ''),]), ]) - df2 <- df2[is.na(df2[,paste('p_whi_', nameType, sep = '')]), ] - } - } - - # Edits specific to common issues with last names - if(nameType == 'last' & nrow(df2) > 0) { - - ## Remove Jr/Sr/III Suffixes for last names - suffix <- c("JUNIOR", "SENIOR", "THIRD", "III", "JR", " II", " J R", " S R", " IV") - for (i in 1:length(suffix)) { - df2$lastname.match <- ifelse(substr(df2$lastname.match, nchar(df2$lastname.match) - (nchar(suffix)[i] - 1), nchar(df2$lastname.match)) == suffix[i], - substr(df2$lastname.match, 1, nchar(df2$lastname.match) - nchar(suffix)[i]), - df2$lastname.match) - } - df2$lastname.match <- ifelse(nchar(df2$lastname.match) >= 7, - ifelse(substr(df2$lastname.match, nchar(df2$lastname.match) - 1, nchar(df2$lastname.match)) == "SR", - substr(df2$lastname.match, 1, nchar(df2$lastname.match) - 2), - df2$lastname.match), - df2$lastname.match) #Remove "SR" only if name has at least 7 characters - - df2 <- merge(df2[,!grepl(paste('_', nameType, sep = ''), names(df2))], lastNameDict, by.x = "lastname.match", by.y = "last_name", all.x = TRUE) - df2 <- df2[,names(df1)] # reorder the columns - - if (sum(!is.na(df2[,paste('p_whi_', nameType, sep = ''),])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[,paste('p_whi_', nameType, sep = ''),]), ]) - df2 <- df2[is.na(df2[,paste('p_whi_', nameType, sep = '')]), ] - } - } - - - ## Names with Hyphens or Spaces, e.g. Double-Barreled Names - if(nrow(df2) > 0) { - df2$name2 <- df2$name1 <- NA - df2$name1[grep("-", df2[,paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep("-", df2[,paste(nameType, "name.upper", sep = "")], value = T), "-"), "[", 1) - df2$name2[grep("-", df2[,paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep("-", df2[,paste(nameType, "name.upper", sep = "")], value = T), "-"), "[", 2) - df2$name1[grep(" ", df2[,paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep(" ", df2[,paste(nameType, "name.upper", sep = "")], value = T), " "), "[", 1) - df2$name2[grep(" ", df2[,paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep(" ", df2[,paste(nameType, "name.upper", sep = "")], value = T), " "), "[", 2) - - ## Use first half of name to merge in priors - df2[,paste(nameType, "name.match", sep = "")] <- as.character(df2$name1) - df2 <- merge(df2[,!grepl(paste('_', nameType, sep = ''), names(df2))], nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = '_')) - df2 <- df2[,c(names(df1), "name1", "name2")] # reorder the columns - - if (sum(!is.na(df2[,paste('p_whi_', nameType, sep = ''),])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[,paste('p_whi_', nameType, sep = '')]), !(names(df2) %in% c("name1", "name2"))]) - df2 <- df2[is.na(df2[,paste('p_whi_', nameType, sep = '')]), ] - } - } - - ## Use second half of name to merge in priors for rest - if(nrow(df2) > 0) { - df2[,paste(nameType, "name.match", sep = "")] <- as.character(df2$name2) - df2 <- merge(df2[,!grepl(paste('_', nameType, sep = ''), names(df2))], nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = '_')) - df2 <- df2[,c(names(df1), "name1", "name2")] # reorder the columns - - if (sum(!is.na(df2[,paste('p_whi_', nameType, sep = ''),])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[,paste('p_whi_', nameType, sep = '')]), !(names(df2) %in% c("name1", "name2"))]) - df2 <- df2[is.na(df2[,paste('p_whi_', nameType, sep = '')]), ] - } - } - - if(nrow(df2) > 0) - df <- rbind(df1, df2[, !(names(df2) %in% c("name1", "name2"))]) - else - df <- df1 - df <- df[order(df$caseid),] - } - } - - - ## For unmatched names, just fill with a 1 - warning(paste(paste(sum(is.na(df$p_whi_last)), " (", round(100*mean(is.na(df$p_whi_last)), 1), "%) indivduals' last names were not matched.", sep = ""))) - if(grepl('first', namesToUse)) { - warning(paste(paste(sum(is.na(df$p_whi_first)), " (", round(100*mean(is.na(df$p_whi_first)), 1), "%) indivduals' first names were not matched.", sep = ""))) - } - if(grepl('middle', namesToUse)) { - warning(paste(paste(sum(is.na(df$p_whi_middle)), " (", round(100*mean(is.na(df$p_whi_middle)), 1), "%) indivduals' middle names were not matched.", sep = ""))) - } - - for(i in grep("p_", names(df))) { - df[,i] <- dplyr::coalesce(df[,i], 1) - } - - # return the data - if(namesToUse == 'last') - return(df[order(df$caseid), c(names(voter.file), "lastname.match", paste(p_eth, "last", sep = "_"))]) - else if(namesToUse == 'last, first') - return(df[order(df$caseid), c(names(voter.file), "lastname.match", "firstname.match", - paste(p_eth, "last", sep = "_"), paste(p_eth, "first", sep = "_"))]) - else if(namesToUse == 'last, first, middle') - return(df[order(df$caseid), c(names(voter.file), "lastname.match", "firstname.match", "middlename.match", - paste(p_eth, "last", sep = "_"), paste(p_eth, "first", sep = "_"), paste(p_eth, "middle", sep = "_"))]) - -} diff --git a/R/merge_surnames.R b/R/merge_surnames.R index e71dc29..cd3c5d6 100644 --- a/R/merge_surnames.R +++ b/R/merge_surnames.R @@ -1,48 +1,48 @@ #' Surname probability merging function. #' -#' \code{merge_surnames} merges surnames in user-input dataset with corresponding +#' \code{merge_surnames} merges surnames in user-input dataset with corresponding #' race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. #' -#' This function allows users to match surnames in their dataset with the U.S. -#' Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain +#' This function allows users to match surnames in their dataset with the U.S. +#' Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain #' Pr(Race | Surname) for each of the five major racial groups. -#' -#' By default, the function matches surnames to the Census list as follows: -#' 1) Search raw surnames in Census surname list; -#' 2) Remove any punctuation and search again; -#' 3) Remove any spaces and search again; -#' 4) Remove suffixes (e.g., Jr) and search again; -#' 5) Split double-barreled surnames into two parts and search first part of name; -#' 6) Split double-barreled surnames into two parts and search second part of name; -#' 7) For any remaining names, impute probabilities using distribution +#' +#' By default, the function matches surnames to the Census list as follows: +#' 1) Search raw surnames in Census surname list; +#' 2) Remove any punctuation and search again; +#' 3) Remove any spaces and search again; +#' 4) Remove suffixes (e.g., Jr) and search again; +#' 5) Split double-barreled surnames into two parts and search first part of name; +#' 6) Split double-barreled surnames into two parts and search second part of name; +#' 7) For any remaining names, impute probabilities using distribution #' for all names not appearing on Census list. -#' -#' Each step only applies to surnames not matched in a previous ste. +#' +#' Each step only applies to surnames not matched in a previous ste. #' Steps 2 through 7 are not applied if \code{clean.surname} is FALSE. -#' -#' Note: Any name appearing only on the Spanish Surname List is assigned a +#' +#' Note: Any name appearing only on the Spanish Surname List is assigned a #' probability of 1 for Hispanics/Latinos and 0 for all other racial groups. #' -#' @param voter.file An object of class \code{data.frame}. Must contain a field +#' @param voter.file An object of class \code{data.frame}. Must contain a field #' named 'surname' containing list of surnames to be merged with Census lists. -#' @param surname.year An object of class \code{numeric} indicating which year -#' Census Surname List is from. Accepted values are \code{2010} and \code{2000}. +#' @param surname.year An object of class \code{numeric} indicating which year +#' Census Surname List is from. Accepted values are \code{2010} and \code{2000}. #' Default is \code{2010}. -#' @param clean.surname A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -#' any surnames in \code{\var{voter.file}} that cannot initially be matched -#' to surname lists will be cleaned, according to U.S. Census specifications, +#' @param clean.surname A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +#' any surnames in \code{\var{voter.file}} that cannot initially be matched +#' to surname lists will be cleaned, according to U.S. Census specifications, #' in order to increase the chance of finding a match. Default is \code{TRUE}. -#' @param impute.missing A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -#' race/ethnicity probabilities will be imputed for unmatched names using +#' @param impute.missing A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +#' race/ethnicity probabilities will be imputed for unmatched names using #' race/ethnicity distribution for all other names (i.e., not on Census List). #' Default is \code{TRUE}. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns that -#' specify the part of the name matched with Census data (\code{\var{surname.match}}), -#' and the probabilities Pr(Race | Surname) for each racial group -#' (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, -#' \code{\var{p_his}} for Hispanic/Latino, -#' \code{\var{p_asi}} for Asian and Pacific Islander, and +#' @return Output will be an object of class \code{data.frame}. It will +#' consist of the original user-input data with additional columns that +#' specify the part of the name matched with Census data (\code{\var{surname.match}}), +#' and the probabilities Pr(Race | Surname) for each racial group +#' (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, +#' \code{\var{p_his}} for Hispanic/Latino, +#' \code{\var{p_asi}} for Asian and Pacific Islander, and #' \code{\var{p_oth}} for Other/Mixed). #' #' @import devtools @@ -57,7 +57,7 @@ merge_surnames <- function(voter.file, surname.year = 2010, clean.surname = T, i if ("surname" %in% names(voter.file) == F) { stop('Data does not contain surname field.') } - + ## Census Surname List if (surname.year == 2000) { surnames2000$surname <- as.character(surnames2000$surname) @@ -66,10 +66,10 @@ merge_surnames <- function(voter.file, surname.year = 2010, clean.surname = T, i surnames2010$surname <- as.character(surnames2010$surname) surnames <- surnames2010 } - + p_eth <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") - - ## Convert Surnames in Voter File to Upper Case + + ## Convert Surnames in Voter File to Upper Case df <- voter.file df$caseid <- 1:nrow(df) df$surname.match <- df$surname.upper <- toupper(as.character(df$surname)) @@ -82,13 +82,13 @@ merge_surnames <- function(voter.file, surname.year = 2010, clean.surname = T, i } df[df$surname.upper %in% surnames$surname == F, ]$surname.match <- "" - + df1 <- df[df$surname.upper %in% surnames$surname, ] #Matched surnames df2 <- df[df$surname.upper %in% surnames$surname == F, ] #Unmatched surnames - + ## Clean Surnames (if Specified by User) if (clean.surname) { - + ## Remove All Punctuation and Try Merge Again df2$surname.match <- gsub("[^[:alnum:] ]", "", df2$surname.upper) df2 <- merge(df2[names(df2) %in% p_eth == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) @@ -110,14 +110,14 @@ merge_surnames <- function(voter.file, surname.year = 2010, clean.surname = T, i ## Remove Jr/Sr/III Suffixes suffix <- c("JUNIOR", "SENIOR", "THIRD", "III", "JR", " II", " J R", " S R", " IV") for (i in 1:length(suffix)) { - df2$surname.match <- ifelse(substr(df2$surname.match, nchar(df2$surname.match) - (nchar(suffix)[i] - 1), nchar(df2$surname.match)) == suffix[i], - substr(df2$surname.match, 1, nchar(df2$surname.match) - nchar(suffix)[i]), + df2$surname.match <- ifelse(substr(df2$surname.match, nchar(df2$surname.match) - (nchar(suffix)[i] - 1), nchar(df2$surname.match)) == suffix[i], + substr(df2$surname.match, 1, nchar(df2$surname.match) - nchar(suffix)[i]), df2$surname.match) } - df2$surname.match <- ifelse(nchar(df2$surname.match) >= 7, - ifelse(substr(df2$surname.match, nchar(df2$surname.match) - 1, nchar(df2$surname.match)) == "SR", - substr(df2$surname.match, 1, nchar(df2$surname.match) - 2), - df2$surname.match), + df2$surname.match <- ifelse(nchar(df2$surname.match) >= 7, + ifelse(substr(df2$surname.match, nchar(df2$surname.match) - 1, nchar(df2$surname.match)) == "SR", + substr(df2$surname.match, 1, nchar(df2$surname.match) - 2), + df2$surname.match), df2$surname.match) #Remove "SR" only if name has at least 7 characters df2 <- merge(df2[names(df2) %in% p_eth == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE) if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { @@ -127,30 +127,28 @@ merge_surnames <- function(voter.file, surname.year = 2010, clean.surname = T, i } ## Names with Hyphens or Spaces, e.g. Double-Barreled Names - if (nrow(df2) > 0) { - df2$surname2 <- df2$surname1 <- NA - df2$surname1[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = T), "-"), "[", 1) - df2$surname2[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = T), "-"), "[", 2) - df2$surname1[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = T), " "), "[", 1) - df2$surname2[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = T), " "), "[", 2) - - ## Use first half of name to merge in priors - df2$surname.match <- as.character(df2$surname1) - df2 <- merge(df2[names(df2) %in% c(p_eth) == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2)] - if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { - df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)]) - df2 <- df2[df2$surname.match %in% surnames$surname == F, ] - if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} - } - - ## Use second half of name to merge in priors for rest - df2$surname.match <- as.character(df2$surname2) - df2 <- merge(df2[names(df2) %in% c(p_eth, "surname1", "surname2") == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2) %in% c("surname1", "surname2") == F] - if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { - df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)]) - df2 <- df2[df2$surname.match %in% surnames$surname == F, ] - if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} - } + df2$surname2 <- df2$surname1 <- NA + df2$surname1[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = T), "-"), "[", 1) + df2$surname2[grep("-", df2$surname.upper)] <- sapply(strsplit(grep("-", df2$surname.upper, value = T), "-"), "[", 2) + df2$surname1[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = T), " "), "[", 1) + df2$surname2[grep(" ", df2$surname.upper)] <- sapply(strsplit(grep(" ", df2$surname.upper, value = T), " "), "[", 2) + + ## Use first half of name to merge in priors + df2$surname.match <- as.character(df2$surname1) + df2 <- merge(df2[names(df2) %in% c(p_eth) == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2)] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { + df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)]) + df2 <- df2[df2$surname.match %in% surnames$surname == F, ] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} + } + + ## Use second half of name to merge in priors for rest + df2$surname.match <- as.character(df2$surname2) + df2 <- merge(df2[names(df2) %in% c(p_eth, "surname1", "surname2") == F], surnames[c("surname", p_eth)], by.x = "surname.match", by.y = "surname", all.x = TRUE)[names(df2) %in% c("surname1", "surname2") == F] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) { + df1 <- rbind(df1, df2[df2$surname.match %in% surnames$surname, names(df2) %in% names(df1)]) + df2 <- df2[df2$surname.match %in% surnames$surname == F, ] + if (nrow(df2[df2$surname.match %in% surnames$surname, ]) > 0) {df2$surname.match <- ""} } } @@ -162,7 +160,7 @@ merge_surnames <- function(voter.file, surname.year = 2010, clean.surname = T, i warning(paste("Probabilities were imputed for", nrow(df2), ifelse(nrow(df2) == 1, "surname", "surnames"), "that could not be matched to Census list.")) } } else warning(paste(nrow(df2), ifelse(nrow(df2) == 1, "surname was", "surnames were"), "not matched.")) - + df <- rbind(df1, df2) return(df[order(df$caseid), c(names(voter.file), "surname.match", p_eth)]) } diff --git a/R/name_preproc.R b/R/name_preproc.R deleted file mode 100644 index 5ceef62..0000000 --- a/R/name_preproc.R +++ /dev/null @@ -1,60 +0,0 @@ -#' Pre-process vector of names to match census style. Internal function -#' -#' @param voter_names Character vector to be pre-processed. -#' @param target_names Character vector of census names to be matched. -#' -#' @return A character vector of pre-processed named -#' @keywords internal -#' - -.name_preproc <- function(voter_names, target_names){ - post_names <- as.character(voter_names) - post_names <- toupper(post_names) - ## 1) Raw match - match_tmp <- post_names %in% target_names - ## 2) remove punctuation - post_names[!match_tmp] <- gsub("[^[:alnum:] ]", "", - post_names[!match_tmp]) - match_tmp <- post_names %in% target_names - ## 3) Remove spaces - post_names[!match_tmp] <- gsub(" ", "", - post_names[!match_tmp]) - match_tmp <- post_names %in% target_names - ## 4) Remove suffixes - suffix <- c("JUNIOR", "SENIOR", "THIRD", "III", "JR", " II", " J R", " S R", " IV") - for (i in 1:length(suffix)) { - post_names[!match_tmp] <- ifelse(substr(post_names[!match_tmp], - nchar(post_names[!match_tmp]) - (nchar(suffix)[i] - 1), - nchar(post_names[!match_tmp])) == suffix[i], - substr(post_names[!match_tmp], 1, nchar(post_names[!match_tmp]) - nchar(suffix)[i]), - post_names[!match_tmp]) - } - post_names[!match_tmp] <- ifelse(nchar(post_names[!match_tmp]) >= 7, - ifelse(substr(post_names[!match_tmp], - nchar(post_names[!match_tmp]) - 1, - nchar(post_names[!match_tmp])) == "SR", - substr(post_names[!match_tmp], 1, nchar(post_names[!match_tmp]) - 2), - post_names[!match_tmp]), - post_names[!match_tmp]) #Remove "SR" only if name has at least 7 characters - match_tmp <- post_names %in% target_names - ## 5) Split double-barreled names, match on first part - post_names_tmp <- post_names - post_names_tmp[(!match_tmp) & grep("[-, ]", post_names[(!match_tmp)])] <- sapply(strsplit(grep("[-, ]", - post_names[(!match_tmp)], - value = TRUE), - "[-, ]"), - "[", 1) - match_tmp_2 <- post_names_tmp %in% target_names - post_names[(match_tmp_2 == TRUE) & (match_tmp == FALSE)] <- post_names_tmp[(match_tmp_2 == TRUE) & (match_tmp == FALSE)] - match_tmp <- post_names %in% target_names - ## 6) Split double-barreled names, match on second part - post_names_tmp <- post_names - post_names_tmp[(!match_tmp) & grep("[-, ]", post_names[(!match_tmp)])] <- sapply(strsplit(grep("[-, ]", - post_names[(!match_tmp)], - value = TRUE), - "[-, ]"), - "[", 2) - match_tmp_2 <- post_names_tmp %in% target_names - post_names[(match_tmp_2 == TRUE) & (match_tmp == FALSE)] <- post_names_tmp[(match_tmp_2 == TRUE) & (match_tmp == FALSE)] - return(unlist(post_names)) -} \ No newline at end of file diff --git a/R/predict_race.R b/R/predict_race.R index 30f127b..c44fef6 100644 --- a/R/predict_race.R +++ b/R/predict_race.R @@ -70,7 +70,6 @@ #' Whatever the name of the party registration field in \code{\var{voter.file}}, #' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other. #' @param retry The number of retries at the census website if network interruption occurs. -#' @param impute.missing See \code{\link[wru]{merge_surnames}}. #' @return Output will be an object of class \code{data.frame}. It will #' consist of the original user-input data with additional columns with #' predicted probabilities for each of the five major racial categories: @@ -98,7 +97,7 @@ ## Race Prediction Function predict_race <- function(voter.file, census.surname = TRUE, surname.only = FALSE, surname.year = 2010, - census.geo, census.key, census.data = NA, age = FALSE, sex = FALSE, party, impute.missing = TRUE, retry = 0) { + census.geo, census.key, census.data = NA, age = FALSE, sex = FALSE, party, retry = 0) { if (!missing(census.geo) && (census.geo == "precinct")) { # geo <- "precinct" @@ -146,17 +145,16 @@ predict_race <- function(voter.file, ## Merge in Pr(Race | Surname) if necessary if (census.surname) { if (surname.year == 2010) { - voter.file <- merge_surnames(voter.file, impute.missing = impute.missing) + voter.file <- merge_surnames(voter.file) } else { if (surname.year == 2000) { - voter.file <- merge_surnames(voter.file, surname.year = surname.year, - impute.missing = impute.missing) + voter.file <- merge_surnames(voter.file, surname.year = surname.year) } else { stop(paste(surname.year, "is not a valid surname.year. It should be either 2000 or 2010 (default).")) } } } else { - # Check if voter.file has the necessary data + # Check if voter.file has the nessary data for (k in 1:length(eth)) { if (paste("p", eth[k], sep = "_") %in% names(voter.file) == F) { stop(paste("voter.file object needs to have columns named ", paste(paste("p", eth, sep = "_"), collapse = " and "), ".", sep = "")) diff --git a/R/predict_race_v2.R b/R/predict_race_v2.R deleted file mode 100644 index e4db216..0000000 --- a/R/predict_race_v2.R +++ /dev/null @@ -1,194 +0,0 @@ -#' Race prediction function. -#' -#' \code{predict_race_new} makes probabilistic estimates of individual-level race/ethnicity. -#' -#' This function implements the Bayesian race prediction methods outlined in -#' Imai, Olivella, and Rosenman (2021). The function produces probabilistic estimates of -#' individual-level race/ethnicity, based on name and geolocation. -#' @param voter.file An object of class \code{data.frame}. -#' Must contain a row for each individual being predicted, -#' as well as a field named \code{\var{last}} containing each individual's surname. -#' If first name is also being used for prediction, the file must also contain a field -#' named \code{\var{first}}. If middle name is also being used for prediction, the field -#' must also contain a field named \code{\var{middle}}. -#' Moreover, \code{\var{voter.file}} must contain a field named -#' \code{\var{state}}, which contains the two-character abbreviation for each individual's -#' state of residence (e.g., \code{"nj"} for New Jersey). -#' If using Census geographic data in race/ethnicity predictions, -#' \code{\var{voter.file}} must also contain at least one of the following fields: -#' \code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}}. -#' These fields should contain character strings matching U.S. Census categories. -#' County is three characters (e.g., \code{"031"} not \code{"31"}), -#' tract is six characters, and block is four characters. Place is five characters. -#' See below for other optional fields. -#' @param namesToUse A character vector identifying which names to use for the prediction. -#' The default value is \code{"last"}, indicating that only the last name will be used. -#' Other options are \code{"last, first"}, indicating that both last and first names will be -#' used, and \code{"last, first, middle"}, indicating that last, first, and middle names will all -#' be used. -#' @param census.geo An optional character vector specifying what level of -#' geography to use to merge in U.S. Census 2010 geographic data. Currently -#' \code{"county"}, \code{"tract"}, \code{"block"}, and \code{"place"} are supported. -#' Note: sufficient information must be in user-defined \code{\var{voter.file}} object. -#' If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} -#' must have column named \code{county}. -#' If \code{\var{census.geo} = "tract"}, then \code{\var{voter.file}} -#' must have columns named \code{county} and \code{tract}. -#' And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} -#' must have columns named \code{county}, \code{tract}, and \code{block}. -#' If \code{\var{census.geo} = "place"}, then \code{\var{voter.file}} -#' must have column named \code{place}. -#' Specifying \code{\var{census.geo}} will call \code{census_helper} function -#' to merge Census geographic data at specified level of geography. -#' @param census.key A character object specifying user's Census API -#' key. Required if \code{\var{census.geo}} is specified, because -#' a valid Census API key is required to download Census geographic data. -#' @param census.data A list indexed by two-letter state abbreviations, -#' which contains pre-saved Census geographic data. -#' Can be generated using \code{get_census_data} function. -#' @param retry The number of retries at the census website if network interruption occurs. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns with -#' predicted probabilities for each of the five major racial categories: -#' \code{\var{pred.whi}} for White, -#' \code{\var{pred.bla}} for Black, -#' \code{\var{pred.his}} for Hispanic/Latino, -#' \code{\var{pred.asi}} for Asian/Pacific Islander, and -#' \code{\var{pred.oth}} for Other/Mixed. -#' -#' @export -#' -#' @examples -#' data(voters) -#' predict_race(voters, surname.only = TRUE) -#' predict_race(voter.file = voters, surname.only = TRUE) -#' \dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...")} -#' \dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...", age = T)} -#' \dontrun{predict_race(voter.file = voters, census.geo = "place", census.key = "...", sex = T)} -#' \dontrun{CensusObj <- get_census_data("...", state = c("NY", "DC", "NJ")); -#' predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj, party = "PID")} -#' \dontrun{CensusObj2 <- get_census_data(key = "...", state = c("NY", "DC", "NJ"), age = T, sex = T); -#' predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj2, age = T, sex = T)} -#' \dontrun{CensusObj3 <- get_census_data(key = "...", state = c("NY", "DC", "NJ"), census.geo = "place"); -#' predict_race(voter.file = voters, census.geo = "place", census.data = CensusObj3)} -predict_race_new <- function(voter.file, namesToUse = 'last', census.geo, census.key, - census.data = NA, retry = 0) { - - # check the geography - if (!missing(census.geo) && (census.geo == "precinct")) { - stop('Error: census_helper function does not currently support merging precinct-level data.') - } - - vars.orig <- names(voter.file) - - # check the names - if(namesToUse == 'last') { - print("Proceeding with last name-only predictions...") - if(!("last" %in% names(voter.file))) - stop("Voter data frame needs to have a column named 'last'.") - - } else if(namesToUse == 'last, first') { - print("Proceeding with first and last name-only predictions...") - if(!("last" %in% names(voter.file)) || !("first" %in% names(voter.file))) - stop("Voter data frame needs to have a column named 'last' and a column called 'first'.") - - } else if(namesToUse == 'last, first, middle') { - print("Proceeding with first, last, and middle name predictions...") - if(!("last" %in% names(voter.file)) || !("first" %in% names(voter.file)) - || !("middle" %in% names(voter.file))) - stop("Voter data frame needs to have a column named 'last', a column called 'first', and a column called 'middle'.") - } - - # check the geographies - if (missing(census.geo) || is.null(census.geo) || is.na(census.geo) || census.geo %in% c("county", "tract", "block", "place") == F) { - stop("census.geo must be either 'county', 'tract', 'block', or 'place'") - } else { - print(paste("Proceeding with Census geographic data at", census.geo, "level...")) - } - if (missing(census.data) || is.null(census.data) || is.na(census.data)) { - if (missing(census.key) || is.null(census.key) || is.na(census.key)) { - stop("Please provide a valid Census API key using census.key option.") - } else { - print("Downloading Census geographic data using provided API key...") - } - } else { - if (!("state" %in% names(voter.file))) { - stop("voter.file object needs to have a column named state.") - } - if (sum(toupper(unique(as.character(voter.file$state))) %in% toupper(names(census.data)) == FALSE) > 0) { - print("census.data object does not include all states in voter.file object.") - if (missing(census.key) || is.null(census.key) || is.na(census.key)) { - stop("Please provide either a valid Census API key or valid census.data object that covers all states in voter.file object.") - } else { - print("Downloading Census geographic data for states not included in census.data object...") - } - } else { - print("Using Census geographic data from provided census.data object...") - } - } - - eth <- c("whi", "bla", "his", "asi", "oth") - - ## Merge in Pr(Name | Race) - voter.file <- merge_names(voter.file, namesToUse) - - if (census.geo == "place") { - if (!("place" %in% names(voter.file))) { - stop("voter.file object needs to have a column named place.") - } - voter.file <- census_helper_new(key = census.key, - voter.file = voter.file, - states = "all", - geo = "place", - census.data = census.data, retry = retry) - } - - if (census.geo == "block") { - if (!("tract" %in% names(voter.file)) || !("county" %in% names(voter.file)) || !("block" %in% names(voter.file))) { - stop("voter.file object needs to have columns named block, tract, and county.") - } - voter.file <- census_helper_new(key = census.key, - voter.file = voter.file, - states = "all", - geo = "block", - census.data = census.data, retry = retry) - } - - if (census.geo == "precinct") { - geo <- "precinct" - stop('Error: census_helper function does not currently support precinct-level data.') - } - - if (census.geo == "tract") { - if (!("tract" %in% names(voter.file)) || !("county" %in% names(voter.file))) { - stop("voter.file object needs to have columns named tract and county.") - } - voter.file <- census_helper_new(key = census.key, - voter.file = voter.file, - states = "all", - geo = "tract", - census.data = census.data, retry = retry) - } - - if (census.geo == "county") { - if (!("county" %in% names(voter.file))) { - stop("voter.file object needs to have a column named county.") - } - voter.file <- census_helper_new(key = census.key, - voter.file = voter.file, - states = "all", - geo = "county", - census.data = census.data, retry = retry) - } - - # Pr(Race | Surname, Geolocation) - preds <- voter.file[,grep("_last", names(voter.file))]*voter.file[,grep("r_", names(voter.file))] - if(grepl('first', namesToUse)) - preds <- preds*voter.file[,grep("_first", names(voter.file))] - if(grepl('middle', namesToUse)) - preds <- preds*voter.file[,grep("_middle", names(voter.file))] - preds <- apply(preds, 2, FUN = function(x) {x/rowSums(preds)}) - colnames(preds) <- paste("pred", eth, sep = "_") - - return(data.frame(cbind(voter.file[c(vars.orig)], preds))) -} diff --git a/R/voters.R b/R/voters.R index b71ef63..c8189bd 100644 --- a/R/voters.R +++ b/R/voters.R @@ -17,12 +17,10 @@ #' \item{sex}{0=male, 1=female} #' \item{party}{Party registration (character)} #' \item{PID}{Party registration (numeric)} -#' \item{first}{First name} -#' \item{last}{Last name} -#' } -#' +#' #' } +#' @docType data #' @keywords datasets #' @name voters #' @examples #' data(voters) -NULL +"voters" diff --git a/R/wru.R b/R/wru.R deleted file mode 100644 index a1f2b0c..0000000 --- a/R/wru.R +++ /dev/null @@ -1,3 +0,0 @@ -#' @useDynLib wru, .registration=TRUE -#' @importFrom Rcpp evalCpp -NULL diff --git a/R/zzz.R b/R/zzz.R deleted file mode 100644 index 9ca9a17..0000000 --- a/R/zzz.R +++ /dev/null @@ -1,3 +0,0 @@ -.onUnload <- function (libpath) { - library.dynam.unload("wru", libpath) -} \ No newline at end of file diff --git a/README.md b/README.md index 8f139f9..122b134 100644 --- a/README.md +++ b/README.md @@ -118,4 +118,4 @@ predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = cen predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = censusObj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) ``` ### A related song -Watch [this](https://www.youtube.com/watch?v=r5kmCgVhADY)! +Watch [this](https://www.youtube.com/watch?v=LYb_nqU_43w)! diff --git a/data/firstNameDict.rda b/data/firstNameDict.rda deleted file mode 100644 index 20ea91b..0000000 Binary files a/data/firstNameDict.rda and /dev/null differ diff --git a/data/lastNameDict.rda b/data/lastNameDict.rda deleted file mode 100644 index 6f8ad11..0000000 Binary files a/data/lastNameDict.rda and /dev/null differ diff --git a/data/middleNameDict.rda b/data/middleNameDict.rda deleted file mode 100644 index 56d1aa8..0000000 Binary files a/data/middleNameDict.rda and /dev/null differ diff --git a/data/surnames2000.RData b/data/surnames2000.RData new file mode 100644 index 0000000..8029711 Binary files /dev/null and b/data/surnames2000.RData differ diff --git a/data/surnames2000.rda b/data/surnames2000.rda deleted file mode 100644 index 431e1b1..0000000 Binary files a/data/surnames2000.rda and /dev/null differ diff --git a/data/surnames2010.RData b/data/surnames2010.RData new file mode 100644 index 0000000..61e6886 Binary files /dev/null and b/data/surnames2010.RData differ diff --git a/data/surnames2010.rda b/data/surnames2010.rda deleted file mode 100644 index 6f5b591..0000000 Binary files a/data/surnames2010.rda and /dev/null differ diff --git a/data/voters.RData b/data/voters.RData new file mode 100644 index 0000000..e7cfd64 Binary files /dev/null and b/data/voters.RData differ diff --git a/data/voters.rda b/data/voters.rda deleted file mode 100644 index 560fb20..0000000 Binary files a/data/voters.rda and /dev/null differ diff --git a/man/census_helper_new.Rd b/man/census_helper_new.Rd deleted file mode 100644 index db79d9f..0000000 --- a/man/census_helper_new.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/census_helper_v2.R -\name{census_helper_new} -\alias{census_helper_new} -\title{Census helper function.} -\usage{ -census_helper_new( - key, - voter.file, - states = "all", - geo = "tract", - age = FALSE, - sex = FALSE, - census.data = NA, - retry = 0 -) -} -\arguments{ -\item{key}{A required character object. Must contain user's Census API -key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} - -\item{voter.file}{An object of class \code{data.frame}. Must contain field(s) named -\code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}} -specifying geolocation. These should be character variables that match up with -U.S. Census categories. County should be three characters (e.g., "031" not "31"), -tract should be six characters, and block should be four characters. -Place should be five characters if it is included.} - -\item{states}{A character vector specifying which states to extract -Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts -Census data for all states contained in user-input data.} - -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"}, or \code{"block"}. Default is \code{"tract"}. -Warning: extracting block-level data takes very long.} - -\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Age | Race). -If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} - -\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). -If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} - -\item{census.data}{A optional census object of class \code{list} containing -pre-saved Census geographic data. Can be created using \code{get_census_data} function. -If \code{\var{census.data}} is provided, the \code{\var{age}} element must have the same value -as the \code{\var{age}} option specified in this function (i.e., \code{TRUE} in both or -\code{FALSE} in both). Similarly, the \code{\var{sex}} element in the object provided in -\code{\var{census.data}} must have the same value as the \code{\var{sex}} option here. -If \code{\var{census.data}} is missing, Census geographic data will be obtained via Census API.} - -\item{retry}{The number of retries at the census website if network interruption occurs.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns of - Census data. -} -\description{ -\code{census_helper_v2} links user-input dataset with Census geographic data. -} -\details{ -This function allows users to link their geocoded dataset (e.g., voter file) -with U.S. Census 2010 data. The function extracts Census Summary File data -at the county, tract, or block level using the 'UScensus2010' package. Census data -calculated are Pr(Geolocation | Race) where geolocation is county, tract, or block. -} -\examples{ -\dontshow{data(voters)} -\dontrun{census_helper(key = "...", voter.file = voters, states = "nj", geo = "block")} -\dontrun{census_helper(key = "...", voter.file = voters, states = "all", geo = "tract", -age = TRUE, sex = TRUE)} - -} diff --git a/man/co_cluster.Rd b/man/co_cluster.Rd deleted file mode 100644 index 8bcbccc..0000000 --- a/man/co_cluster.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/co_cluster.R -\name{co_cluster} -\alias{co_cluster} -\title{Name-Race Co-clustering Using Keynames} -\usage{ -co_cluster( - voter.file, - name_types, - name_race_tables, - census.geo, - ..., - control = NULL -) -} -\arguments{ -\item{voter.file}{An object of class data.frame. Must contain a row for each individual being predicted, -as well as fields named "surname", and/or"first", and/or "middle", containing each -individual's corresponding names. It must also contain a field named state, which -contains the two-character lower-case abbreviation for each individual's state of residence (e.g., "nj" for New Jersey). -If using Census geographic data in race/ethnicity -predictions, voter.file must also contain at least one of the following fields: county, tract, block, -and/or place. These fields should contain character strings matching U.S. Census categories. County is three -characters (e.g., "031" not "31"), tract is six characters, and block is four characters. Place is -five characters. See \code{\link{predict_race}} for other optional fields.} - -\item{name_types}{Character vector. Must have elements in "surname", "first", and "middle".} - -\item{name_race_tables}{Named list, with as many elements as there are names in \code{name_types}, and names matching -elements in \code{name_types}. Each list element should be a data.frame of unique names (first column) by race (remaining columns), -with conditional probabilities p(Name|Race).} - -\item{census.geo}{Required character vector. One of "county", "tract", "block" or "place". See \code{\link{predict_race}}.} - -\item{...}{Arguments passed to \code{\link{predict_race}}.} - -\item{control}{List of control arguments, including -\itemize{ -\item{race_init}{ Initial race for each observation in voter.file. Must be an integer, with - 0=white, 1=black, 2=hispanic, 3=asian, and 4=other.} - \item{fit_insample}{ Boolean. Should model check in-sample fit of race prediction for each - record? If \code{TRUE}, \code{race_obs} cannot be \code{NULL}. Defaults to \code{FALSE}.} - \item{race_obs}{ Observed race for each record in \code{voter.file}. Must be an integer, with - 0=white, 1=black, 2=hispanic, 3=asian, and 4=other.} - \item{iter}{ Number of MCMC iterations. Defaults to 1000.} - \item{burnin}{ Number of iterations discarded as burnin. Defaults to half of \code{iter}.} - \item{thin}{ Thinning interval for MCMC. Defaults to 1.} - \item{log_post_interval}{ Interval for storing the log_posterior. Defaults to 10.} - \item{beta_prior}{ Parameter for symmetric Dirichlet prior over names for each race. Defaults to 5.} - \item{gamma_prior}{ Parameter for Beta prior over keyname/non-keyname mixture components. Defaults to c(5, 5).} - \item{verbose}{ Print progress information. Defaults to \code{TRUE}.} - \item{seed}{ RNG seed. If \code{NULL}, a seed is generated and stored for reproducibility.} -}} -} -\value{ -A named list: -\itemize{ -\item{name_by_race}{ Named list of predicted distributions of name by race for each name type.} -\item{race_by_record}{ A copy of \code{voter.file}, with additional columns of predicted - race probabilities, names \code{pred.}. } -\item{loglik}{ Values of log likelihood, evaluated every \code{log_post_interval}.} -\item{fit_insample}{ When \code{fit_insample=TRUE}, a probability of correct in_sample prediction - for each record in \code{vote.file}.} -} -} -\description{ -Estimate a Bayesian mixed-membership, keyword-assisted race classification model for geo-locations -in a voterfile. -} diff --git a/man/dot-name_preproc.Rd b/man/dot-name_preproc.Rd deleted file mode 100644 index 93d20b1..0000000 --- a/man/dot-name_preproc.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/name_preproc.R -\name{.name_preproc} -\alias{.name_preproc} -\title{Pre-process vector of names to match census style. Internal function} -\usage{ -.name_preproc(voter_names, target_names) -} -\arguments{ -\item{voter_names}{Character vector to be pre-processed.} - -\item{target_names}{Character vector of census names to be matched.} -} -\value{ -A character vector of pre-processed named -} -\description{ -Pre-process vector of names to match census style. Internal function -} -\keyword{internal} diff --git a/man/firstNameDict.Rd b/man/firstNameDict.Rd deleted file mode 100644 index 9b4927e..0000000 --- a/man/firstNameDict.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\name{firstNameDict} -\alias{firstNameDict} -\title{Dictionary of First Names} -\format{ -A data frame with 1,043,742 rows and 6 columns: -\describe{ - \item{first_name}{Voter identifier (numeric)} - \item{p_whi_first}{Probability of white} - \item{p_bla_first}{Probability of black} - \item{p_his_first}{Probability of Hispanic} - \item{p_asi_first}{Probability of asian}, - \item{p_oth_first}{Probability of other} - } -} -\description{ -An example dataset containing voter file information. -} -\examples{ -data(firstNameDict) -} -\keyword{datasets} diff --git a/man/format_legacy_data.Rd b/man/format_legacy_data.Rd deleted file mode 100644 index 8cd2266..0000000 --- a/man/format_legacy_data.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/format_legacy_data.R -\name{format_legacy_data} -\alias{format_legacy_data} -\title{Legacy data formatting function.} -\usage{ -format_legacy_data(legacyFilePath, outFile = NULL) -} -\arguments{ -\item{legacyFilePath}{A character vector giving the location of a legacy census data folder, -sourced from https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. -These file names should end in ".pl".} - -\item{outFile}{Optional character vector determining whether the formatted RData object should be saved. The -filepath should end in ".RData".} -} -\description{ -\code{format_legacy_data} formats legacy data from the U.S. census to allow -for Bayesian name geocoding. -} -\details{ -This function allows users to construct datasets for analysis using the census legacy data format. -These data are available for the 2020 census at -https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. -This function returns data structured analogously to data from the Census API, which is not yet -available for the 2020 Census as of September 2021. -} -\examples{ -\dontrun{ -gaCensusData <- format_legacy_data(PL94171::pl_url('GA', 2020)) -predict_race_new(ga.voter.file, namesToUse = 'last, first, mid', census.geo = 'block', - census.data = gaCensusData) -} - -} diff --git a/man/keyWRU_fit.Rd b/man/keyWRU_fit.Rd deleted file mode 100644 index 2edb983..0000000 --- a/man/keyWRU_fit.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{keyWRU_fit} -\alias{keyWRU_fit} -\title{Collapsed Gibbs sampler for keyWRU. Internal function} -\usage{ -keyWRU_fit(data, ctrl) -} -\arguments{ -\item{data}{A list with the following elements -\itemize{ - \item{name_type_n}{Number of name types} - \item{race_n}{Number of races} - \item{geo_n}{Number of geolocations} - \item{geo_race_table}{Matrix of conditional probabilities Pr(Race | Geolocation), with geolocations in the rows} - \item{voters_per_geo}{Number of voterfile records per geolocation} - \item{race_inits}{Table of initial race assignments per voterfile record} - \item{name_data}{ - \itemize{ - \item{n_unique_names}{Number of unique names} - \item{record_name_id}{Name id corresponding to each voterfile record} - \item{keynames}{Integer matrix of name id's used as keynames for each race (race in the columns)} - \item{census_table}{Matrix of Pr(Name | Race), with races in the columns} - \item{beta_prior}{Scalar prior for name-race symmetric Dirichlet distribution} - \item{gamma_prior}{Vector prior shapes for keyname/non-keyname Beta mixture} - } - } -}} - -\item{ctrl}{A list of control arguments; see \code{co_cluster} function for details.} -} -\description{ -Collapsed Gibbs sampler for keyWRU. Internal function -} -\keyword{internal} diff --git a/man/lastNameDict.Rd b/man/lastNameDict.Rd deleted file mode 100644 index b267232..0000000 --- a/man/lastNameDict.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\name{lastNameDict} -\alias{lastNameDict} -\title{Dictionary of Last Names} -\format{ -A data frame with 1,502,541 rows and 6 columns: -\describe{ - \item{last_name}{Voter identifier (numeric)} - \item{p_whi_last}{Probability of white} - \item{p_bla_last}{Probability of black} - \item{p_his_last}{Probability of Hispanic} - \item{p_asi_last}{Probability of asian}, - \item{p_oth_last}{Probability of other} - } -} -\description{ -An example dataset containing voter file information. -} -\examples{ -data(lastNameDict) -} -\keyword{datasets} diff --git a/man/merge_names.Rd b/man/merge_names.Rd deleted file mode 100644 index 727c666..0000000 --- a/man/merge_names.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/merge_names.R -\name{merge_names} -\alias{merge_names} -\title{Surname probability merging function.} -\usage{ -merge_names(voter.file, namesToUse, clean.names = TRUE) -} -\arguments{ -\item{voter.file}{An object of class \code{data.frame}. Must contain a row for each individual being predicted, -as well as a field named \code{\var{last}} containing each individual's surname. -If first name is also being used for prediction, the file must also contain a field -named \code{\var{first}}. If middle name is also being used for prediction, the field -must also contain a field named \code{\var{middle}}.} - -\item{namesToUse}{A character vector identifying which names to use for the prediction. -The default value is \code{"last"}, indicating that only the last name will be used. -Other options are \code{"last, first"}, indicating that both last and first names will be -used, and \code{"last, first, middle"}, indicating that last, first, and middle names will all -be used.} - -\item{clean.names}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -any surnames in \code{\var{voter.file}} that cannot initially be matched -to the database will be cleaned, according to U.S. Census specifications, -in order to increase the chance of finding a match. Default is \code{TRUE}.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns that - specify the part of the name matched with Census data (\code{\var{surname.match}}), - and the probabilities Pr(Race | Surname) for each racial group - (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, - \code{\var{p_his}} for Hispanic/Latino, - \code{\var{p_asi}} for Asian and Pacific Islander, and - \code{\var{p_oth}} for Other/Mixed). -} -\description{ -\code{merge_names} merges names in a user-input dataset with corresponding - race/ethnicity probabilities derived from both the U.S. Census Surname List - and Spanish Surname List and voter files from states in the Southern U.S. -} -\details{ -This function allows users to match names in their dataset with database entries - estimating P(name | ethnicity) for each of the five major racial groups for each - name. The database probabilities are derived from both the U.S. Census Surname List - and Spanish Surname List and voter files from states in the Southern U.S. - - By default, the function matches names as follows: - 1) Search raw surnames in the database; - 2) Remove any punctuation and search again; - 3) Remove any spaces and search again; - 4) Remove suffixes (e.g., "Jr") and search again (last names only) - 5) Split double-barreled names into two parts and search first part of name; - 6) Split double-barreled names into two parts and search second part of name; - - Each step only applies to names not matched in a previous step. - Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. - - Note: Any name appearing only on the Spanish Surname List is assigned a - probability of 1 for Hispanics/Latinos and 0 for all other racial groups. -} -\examples{ -\donttest{ -data(voters) -merge_names(voters, 'last, first') -} -} diff --git a/man/merge_surnames.Rd b/man/merge_surnames.Rd index e87b9fa..cb8f4b0 100644 --- a/man/merge_surnames.Rd +++ b/man/merge_surnames.Rd @@ -12,56 +12,56 @@ merge_surnames( ) } \arguments{ -\item{voter.file}{An object of class \code{data.frame}. Must contain a field +\item{voter.file}{An object of class \code{data.frame}. Must contain a field named 'surname' containing list of surnames to be merged with Census lists.} -\item{surname.year}{An object of class \code{numeric} indicating which year -Census Surname List is from. Accepted values are \code{2010} and \code{2000}. +\item{surname.year}{An object of class \code{numeric} indicating which year +Census Surname List is from. Accepted values are \code{2010} and \code{2000}. Default is \code{2010}.} -\item{clean.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -any surnames in \code{\var{voter.file}} that cannot initially be matched -to surname lists will be cleaned, according to U.S. Census specifications, +\item{clean.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +any surnames in \code{\var{voter.file}} that cannot initially be matched +to surname lists will be cleaned, according to U.S. Census specifications, in order to increase the chance of finding a match. Default is \code{TRUE}.} -\item{impute.missing}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -race/ethnicity probabilities will be imputed for unmatched names using +\item{impute.missing}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +race/ethnicity probabilities will be imputed for unmatched names using race/ethnicity distribution for all other names (i.e., not on Census List). Default is \code{TRUE}.} } \value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns that - specify the part of the name matched with Census data (\code{\var{surname.match}}), - and the probabilities Pr(Race | Surname) for each racial group - (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, - \code{\var{p_his}} for Hispanic/Latino, - \code{\var{p_asi}} for Asian and Pacific Islander, and +Output will be an object of class \code{data.frame}. It will + consist of the original user-input data with additional columns that + specify the part of the name matched with Census data (\code{\var{surname.match}}), + and the probabilities Pr(Race | Surname) for each racial group + (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, + \code{\var{p_his}} for Hispanic/Latino, + \code{\var{p_asi}} for Asian and Pacific Islander, and \code{\var{p_oth}} for Other/Mixed). } \description{ -\code{merge_surnames} merges surnames in user-input dataset with corresponding +\code{merge_surnames} merges surnames in user-input dataset with corresponding race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. } \details{ -This function allows users to match surnames in their dataset with the U.S. - Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain +This function allows users to match surnames in their dataset with the U.S. + Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain Pr(Race | Surname) for each of the five major racial groups. - - By default, the function matches surnames to the Census list as follows: - 1) Search raw surnames in Census surname list; - 2) Remove any punctuation and search again; - 3) Remove any spaces and search again; - 4) Remove suffixes (e.g., Jr) and search again; - 5) Split double-barreled surnames into two parts and search first part of name; - 6) Split double-barreled surnames into two parts and search second part of name; - 7) For any remaining names, impute probabilities using distribution + + By default, the function matches surnames to the Census list as follows: + 1) Search raw surnames in Census surname list; + 2) Remove any punctuation and search again; + 3) Remove any spaces and search again; + 4) Remove suffixes (e.g., Jr) and search again; + 5) Split double-barreled surnames into two parts and search first part of name; + 6) Split double-barreled surnames into two parts and search second part of name; + 7) For any remaining names, impute probabilities using distribution for all names not appearing on Census list. - - Each step only applies to surnames not matched in a previous ste. + + Each step only applies to surnames not matched in a previous ste. Steps 2 through 7 are not applied if \code{clean.surname} is FALSE. - - Note: Any name appearing only on the Spanish Surname List is assigned a + + Note: Any name appearing only on the Spanish Surname List is assigned a probability of 1 for Hispanics/Latinos and 0 for all other racial groups. } \examples{ diff --git a/man/middleNameDict.Rd b/man/middleNameDict.Rd deleted file mode 100644 index d7efef8..0000000 --- a/man/middleNameDict.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\name{middleNameDict} -\alias{middleNameDict} -\title{Dictionary of Middle Names} -\format{ -A data frame with 1,182,133 rows and 6 columns: -\describe{ - \item{middle_name}{Voter identifier (numeric)} - \item{p_whi_middle}{Probability of white} - \item{p_bla_middle}{Probability of black} - \item{p_his_middle}{Probability of Hispanic} - \item{p_asi_middle}{Probability of asian}, - \item{p_oth_middle}{Probability of other} - } -} -\description{ -An example dataset containing voter file information. -} -\examples{ -data(middleNameDict) -} -\keyword{datasets} diff --git a/man/predict_race.Rd b/man/predict_race.Rd index d5080ca..67b1b6d 100644 --- a/man/predict_race.Rd +++ b/man/predict_race.Rd @@ -15,7 +15,6 @@ predict_race( age = FALSE, sex = FALSE, party, - impute.missing = TRUE, retry = 0 ) } @@ -94,8 +93,6 @@ on individual's party registration (in addition to geolocation). Whatever the name of the party registration field in \code{\var{voter.file}}, it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.} -\item{impute.missing}{See \code{\link[wru]{merge_surnames}}.} - \item{retry}{The number of retries at the census website if network interruption occurs.} } \value{ diff --git a/man/predict_race_new.Rd b/man/predict_race_new.Rd deleted file mode 100644 index d607d26..0000000 --- a/man/predict_race_new.Rd +++ /dev/null @@ -1,96 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict_race_v2.R -\name{predict_race_new} -\alias{predict_race_new} -\title{Race prediction function.} -\usage{ -predict_race_new( - voter.file, - namesToUse = "last", - census.geo, - census.key, - census.data = NA, - retry = 0 -) -} -\arguments{ -\item{voter.file}{An object of class \code{data.frame}. -Must contain a row for each individual being predicted, -as well as a field named \code{\var{last}} containing each individual's surname. -If first name is also being used for prediction, the file must also contain a field -named \code{\var{first}}. If middle name is also being used for prediction, the field -must also contain a field named \code{\var{middle}}. -Moreover, \code{\var{voter.file}} must contain a field named -\code{\var{state}}, which contains the two-character abbreviation for each individual's -state of residence (e.g., \code{"nj"} for New Jersey). -If using Census geographic data in race/ethnicity predictions, -\code{\var{voter.file}} must also contain at least one of the following fields: -\code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}}. -These fields should contain character strings matching U.S. Census categories. -County is three characters (e.g., \code{"031"} not \code{"31"}), -tract is six characters, and block is four characters. Place is five characters. -See below for other optional fields.} - -\item{namesToUse}{A character vector identifying which names to use for the prediction. -The default value is \code{"last"}, indicating that only the last name will be used. -Other options are \code{"last, first"}, indicating that both last and first names will be -used, and \code{"last, first, middle"}, indicating that last, first, and middle names will all -be used.} - -\item{census.geo}{An optional character vector specifying what level of -geography to use to merge in U.S. Census 2010 geographic data. Currently -\code{"county"}, \code{"tract"}, \code{"block"}, and \code{"place"} are supported. -Note: sufficient information must be in user-defined \code{\var{voter.file}} object. -If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} -must have column named \code{county}. -If \code{\var{census.geo} = "tract"}, then \code{\var{voter.file}} -must have columns named \code{county} and \code{tract}. -And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} -must have columns named \code{county}, \code{tract}, and \code{block}. -If \code{\var{census.geo} = "place"}, then \code{\var{voter.file}} -must have column named \code{place}. -Specifying \code{\var{census.geo}} will call \code{census_helper} function -to merge Census geographic data at specified level of geography.} - -\item{census.key}{A character object specifying user's Census API -key. Required if \code{\var{census.geo}} is specified, because -a valid Census API key is required to download Census geographic data.} - -\item{census.data}{A list indexed by two-letter state abbreviations, -which contains pre-saved Census geographic data. -Can be generated using \code{get_census_data} function.} - -\item{retry}{The number of retries at the census website if network interruption occurs.} -} -\value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns with - predicted probabilities for each of the five major racial categories: - \code{\var{pred.whi}} for White, - \code{\var{pred.bla}} for Black, - \code{\var{pred.his}} for Hispanic/Latino, - \code{\var{pred.asi}} for Asian/Pacific Islander, and - \code{\var{pred.oth}} for Other/Mixed. -} -\description{ -\code{predict_race_new} makes probabilistic estimates of individual-level race/ethnicity. -} -\details{ -This function implements the Bayesian race prediction methods outlined in -Imai, Olivella, and Rosenman (2021). The function produces probabilistic estimates of -individual-level race/ethnicity, based on name and geolocation. -} -\examples{ -data(voters) -predict_race(voters, surname.only = TRUE) -predict_race(voter.file = voters, surname.only = TRUE) -\dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...")} -\dontrun{predict_race(voter.file = voters, census.geo = "tract", census.key = "...", age = T)} -\dontrun{predict_race(voter.file = voters, census.geo = "place", census.key = "...", sex = T)} -\dontrun{CensusObj <- get_census_data("...", state = c("NY", "DC", "NJ")); -predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj, party = "PID")} -\dontrun{CensusObj2 <- get_census_data(key = "...", state = c("NY", "DC", "NJ"), age = T, sex = T); -predict_race(voter.file = voters, census.geo = "tract", census.data = CensusObj2, age = T, sex = T)} -\dontrun{CensusObj3 <- get_census_data(key = "...", state = c("NY", "DC", "NJ"), census.geo = "place"); -predict_race(voter.file = voters, census.geo = "place", census.data = CensusObj3)} -} diff --git a/man/voters.Rd b/man/voters.Rd index aeb352c..7a58eff 100644 --- a/man/voters.Rd +++ b/man/voters.Rd @@ -1,5 +1,6 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/voters.R +\docType{data} \name{voters} \alias{voters} \title{Example voter file.} @@ -19,9 +20,10 @@ A data frame with 10 rows and 12 variables: \item{sex}{0=male, 1=female} \item{party}{Party registration (character)} \item{PID}{Party registration (numeric)} - \item{first}{First name} - \item{last}{Last name} - } + #' } +} +\usage{ +voters } \description{ An example dataset containing voter file information. diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp deleted file mode 100644 index 00bded1..0000000 --- a/src/RcppExports.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// Generated by using Rcpp::compileAttributes() -> do not edit by hand -// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 - -#include -#include - -using namespace Rcpp; - -#ifdef RCPP_USE_GLOBAL_ROSTREAM -Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); -Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); -#endif - -// keyWRU_fit -Rcpp::List keyWRU_fit(Rcpp::List data, Rcpp::List ctrl); -RcppExport SEXP _wru_keyWRU_fit(SEXP dataSEXP, SEXP ctrlSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::List >::type data(dataSEXP); - Rcpp::traits::input_parameter< Rcpp::List >::type ctrl(ctrlSEXP); - rcpp_result_gen = Rcpp::wrap(keyWRU_fit(data, ctrl)); - return rcpp_result_gen; -END_RCPP -} - -static const R_CallMethodDef CallEntries[] = { - {"_wru_keyWRU_fit", (DL_FUNC) &_wru_keyWRU_fit, 2}, - {NULL, NULL, 0} -}; - -RcppExport void R_init_wru(DllInfo *dll) { - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); -} diff --git a/src/XName.cpp b/src/XName.cpp deleted file mode 100644 index 9c8c36e..0000000 --- a/src/XName.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "XName.h" - -using namespace Eigen; -using namespace Rcpp; -using namespace std; - - -XName::XName(const List& data, - const List& ctrl, - const int G_, - const int R_, - const VectorXd& n_r, - const std::vector& Races, - const String type - -) : - W(as< std::vector >(data["record_name_id"])), - phi_tilde(as(data["census_table"])), - gamma_prior(as(ctrl["gamma_prior"])), - beta_w(as(ctrl["beta_prior"])), - N_(as(data["n_unique_names"])), - R_(R_), - n_r(n_r), - type(type) -{ - //Initialize containers - max_kw = as(data["largest_keyword"]); - n_rc.resize(R_, 2); - n_rc.setZero(); - n_wr.resize(N_, R_); - n_wr.setZero(); - int geo_size = 0, r_ = 0, w_ = 0; - - - for(int ii = 0; ii < G_ ; ++ii){ //iterate over geos - geo_size = W[ii].size(); - C.push_back(VectorXi(geo_size)); - C[ii].setZero(geo_size); - } - - - e_phi.resize(N_, R_); - e_phi.setZero(); - - //Initialize placeholders - c1_prob = 1.0; numerator = 0.0; - denominator = 1.0; c0_prob = 1.0; - sum_c = 1.0; pi_0 = 1.0; pi_1 = 1.0; - c = 0; new_c = 0; r_ = 0; w_ = 0; - -} - -void XName::sample_c(int r, - int voter, - int geo_id) -{ - //Get unique names... - w_ = W[geo_id][voter]; - //(Check if name is in keyword list, and skip update if not) -//if(!found_in(keywords[r], w_)){ - if(w_ >= max_kw){ - return; - } - - // ... and current mixture component - c = C[geo_id][voter]; - - // remove data - if(c == 0){ - n_wr(w_, r)--; - } - n_rc(r, c)--; - - // newprob_c0 - c1_prob = phi_tilde(w_, r) * (n_rc(r, 1) + gamma_prior[0]); - - // newprob_c0 - numerator = (n_wr(w_, r) + beta_w) - * (n_rc(r, 0) + gamma_prior[1]); - denominator = n_rc(r, 0) + ((double)N_ * beta_w); - c0_prob = numerator / denominator; - - - // Normalize - sum_c = c0_prob + c1_prob; - - c1_prob = c1_prob / sum_c; - new_c = (R::runif(0,1) <= c1_prob); //new_c = Bern(c1_prob); - - // add back data counts - if (new_c == 0) { - n_wr(w_, r)++; - } - n_rc(r, new_c)++; - - //Update mixture component - C[geo_id][voter] = new_c; -} - -void XName::phihat_store(){ - // keyword component - for(int r = 0; r < R_; ++r){ - denominator = n_rc(r, 1) + n_rc(r, 0) + gamma_prior.sum(); - denominator_phi = n_r(r) + ((double)N_ * beta_w); - pi_0 = (n_rc(r, 1) + gamma_prior[0]) / denominator; - pi_1 = (n_rc(r, 0) + gamma_prior[1]) / denominator; - for(int w = 0; w < N_; ++w){ - if(w < max_kw){ - e_phi(w, r) += (pi_0 * phi_tilde(w, r) - + pi_1 * ((n_wr(w, r) + beta_w) / denominator_phi)); - } else { - e_phi(w, r) += ((n_wr(w, r) + beta_w) / denominator_phi); - } - } - } -} - -MatrixXd XName::getPhiHat(){ - return e_phi; -} - - - diff --git a/src/XName.h b/src/XName.h deleted file mode 100644 index a3d4d0b..0000000 --- a/src/XName.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __XName__ -#define __XName__ - -#include -#include -#include "aux_funs.h" - -using namespace Eigen; -using namespace Rcpp; - -class XName -{ -public: - //Constructor - XName(const List& data, - const List& ctrl, - const int G_, - const int R_, - const VectorXd& n_r, - const std::vector& Races, - const String type - ); - - //getters - MatrixXd getPhiHat(); - -private: - //Data - const std::vector W; // List of vectors of unique name id (row 0) and counts (row 1), for each geo - const std::vector keywords; //List of vectors of keyword name id (rows) by race (columns). - const MatrixXd phi_tilde; // census table, name by race - const VectorXd gamma_prior; - const double beta_w; - const int N_, - R_ - ; - const VectorXd& n_r; - const String type; - - int max_kw; - - std::vector C; // Vector of vectors of mixture component for each record, in each geo - MatrixXd n_rc, // Suff. stat, race by mixture component - e_phi, //Expected distribution over races for each name - n_wr; //Suff stat, name by race for names in non-keyword components - - - double c1_prob, - numerator, - denominator, - denominator_phi, - c0_prob, - sum_c, - pi_0, pi_1; - - int c, - new_c, - r_, - w_; - //Methods - void sample_c(int r, - int voter, - int geo_id); - void phihat_store(); - - - - friend class keyWRU; -}; -#endif diff --git a/src/aux_funs.cpp b/src/aux_funs.cpp deleted file mode 100644 index 33790fc..0000000 --- a/src/aux_funs.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "aux_funs.h" - -inline int rand_wrapper(const int n) - { - return floor(R::unif_rand() * n); - } - - -int rcat_without_normalize(Eigen::VectorXd &prob, - const double total, - const int size) - { - // Draw from a categorical distribution - // This function does not require a normalized probability vector. - double u = R::unif_rand() * total; - double temp = 0.0; - int index = 0; - for (int ii = 0; ii < size; ii++) { - temp += prob(ii); - if (u < temp) { - index = ii; - break; - } - } - return index; - } - - -double mylgamma(const double x){ - // gammaln_val = 0.0; - // gammaln_val = lgamma(x); - - // Good approximation when x > 1 - // x > 1: max abs err: 2.272e-03 - // x > 0.5: 0.012 - // x > 0.6: 0.008 - // Abramowitz and Stegun p.257 - - if(x < 0.6) - return (lgamma(x)); - else - return ((x-0.5)*log(x) - x + 0.91893853320467 + 1/(12*x)); - }; diff --git a/src/aux_funs.h b/src/aux_funs.h deleted file mode 100644 index 2786951..0000000 --- a/src/aux_funs.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __aux_funs__ -#define __aux_funs__ - -#include -inline int rand_wrapper(const int n); -int rcat_without_normalize(Eigen::VectorXd& prob, - const double total, - const int size); - -double mylgamma(const double x); - - -#endif diff --git a/src/fit.cpp b/src/fit.cpp deleted file mode 100644 index 268890f..0000000 --- a/src/fit.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include "XName.h" -#include "keyWRU.h" - -// [[Rcpp::plugins(cpp11)]] -// [[Rcpp::depends(RcppEigen)]] -// [[Rcpp::depends(RcppProgress)]] - -//' Collapsed Gibbs sampler for keyWRU. Internal function -//' -//' @param data A list with the following elements -//' \itemize{ -//' \item{name_type_n}{Number of name types} -//' \item{race_n}{Number of races} -//' \item{geo_n}{Number of geolocations} -//' \item{geo_race_table}{Matrix of conditional probabilities Pr(Race | Geolocation), with geolocations in the rows} -//' \item{voters_per_geo}{Number of voterfile records per geolocation} -//' \item{race_inits}{Table of initial race assignments per voterfile record} -//' \item{name_data}{ -//' \itemize{ -//' \item{n_unique_names}{Number of unique names} -//' \item{record_name_id}{Name id corresponding to each voterfile record} -//' \item{keynames}{Integer matrix of name id's used as keynames for each race (race in the columns)} -//' \item{census_table}{Matrix of Pr(Name | Race), with races in the columns} -//' \item{beta_prior}{Scalar prior for name-race symmetric Dirichlet distribution} -//' \item{gamma_prior}{Vector prior shapes for keyname/non-keyname Beta mixture} -//' } -//' } -//' } -//' @param ctrl A list of control arguments; see \code{co_cluster} function for details. -//' -//' @keywords internal -// [[Rcpp::export]] -Rcpp::List keyWRU_fit(Rcpp::List data, - Rcpp::List ctrl) -{ - keyWRU model(data, ctrl); - model.sample(); - Rcpp::List res = model.return_obj(); - return res; -} - diff --git a/src/keyWRU.cpp b/src/keyWRU.cpp deleted file mode 100644 index f12efa2..0000000 --- a/src/keyWRU.cpp +++ /dev/null @@ -1,267 +0,0 @@ -#include "keyWRU.h" - -using namespace Eigen; -using namespace Rcpp; - -keyWRU::keyWRU(const List data, - const List ctrl) : - M_(as(data["name_type_n"])), - R_(as(data["race_n"])), - G_(as(data["geo_n"])), - max_iter(as(ctrl["iter"])), - thin(as(ctrl["thin"])), - burnin(as(ctrl["burnin"])), - llk_per(as(ctrl["log_post_interval"])), - verbose(as(ctrl["verbose"])), - theta(as(data["geo_race_table"])), - geo_each_size(as(data["voters_per_geo"])), - Races(as< std::vector >(data["race_inits"])), - check_in_sample(as(ctrl["fit_insample"])) -{ - - //Initialize suff. stat for race count and race sample storage - n_r.resize(R_); - n_r.setZero(); - RaceSamples.resize(G_); - for (int ii = 0; ii < G_; ++ii) { //iterate over geos - (RaceSamples[ii]).resize(geo_each_size[ii], R_); - (RaceSamples[ii]).setZero(); - for (int jj = 0; jj < geo_each_size[ii]; ++jj) { //iterate over names - n_r(Races[ii][jj])++; - } - } - //Construct list of name objects - const List& all_name_data = as(data["name_data"]); - CharacterVector all_types = all_name_data.names(); - //XName name; - for(int m = 0; m < M_; ++m){ - //name = XName(all_name_data[m], ctrl, G_, R_, n_r, Races, all_types[m]); - names.push_back(XName(all_name_data[m], ctrl, G_, R_, n_r, Races, all_types[m])); - } - - int c_val = 0, r_, w_; - for (int ii = 0; ii < G_; ++ii) { - geo_id_ = ii; - geo_size = geo_each_size[geo_id_]; - // Iterate over each record in the geographic loc. - for (int jj = 0; jj < geo_size; ++jj) { - Rcpp::checkUserInterrupt(); - voter_ = jj; - //Init name-specific suff stats - for(int m = 0; m < M_; ++m){ - r_ = Races[ii][jj]; w_ = names[m].W[ii][jj]; - //initialize C (0 for non-keyword names, bern(0.7) otherwise) - if(w_ < names[m].max_kw){ - c_val = R::rbinom(1, 0.7); - names[m].C[ii][jj] = c_val; - } - //Init n_rc suff. stat matrix - names[m].n_rc(r_, c_val)++; - //Init n_wr suff. stat matrix - if(!c_val){ - names[m].n_wr(w_, r_)++; - } - } - } - } - - - - // Initialize all placeholders - geo_id_ = 0; geo_size = 0; w = 0; r = 0; - c = 0; new_r = 0; voter_ = 0; N_ = 0; n_samp = 0; - numerator = 0.0; denominator = 1.0; - sum_r = 1.0; n_rc = 0.0; - r_prob_vec.resize(R_); - - - - - //If testing in sample fit - if(check_in_sample){ - race_match.resize(G_); - for (int ii = 0; ii < G_; ++ii) { //iterate over geos - (race_match[ii]).resize(geo_each_size[ii]); - for (int jj = 0; jj < geo_each_size[ii]; ++jj) { //iterate over names - race_match[ii][jj] = 0; - } - } - obs_race = as< std::vector >(data["race_obs"]); - } - -} - -int keyWRU::sample_r(int voter, - int geo_id) -{ - // Extract current race - r = Races[geo_id][voter]; - - //remove data - n_r(r)--; - for(int m = 0; m < M_; ++m){ - names[m].n_rc(r, (names[m].C[geo_id])[voter])--; - if(!(names[m].C[geo_id][voter])){ - names[m].n_wr((names[m].W[geo_id])[voter], r)--; - } - } - - for(int k = 0; k < R_; ++k){ - numerator = 0.0; - denominator = 0.0; - for(int m = 0; m < M_; ++m){ - const XName& name = names[m]; - w = (name.W[geo_id])[voter]; - c = (name.C[geo_id])[voter]; - n_rc = name.n_rc(k, c); - numerator += log(n_rc + name.gamma_prior[c]) - + (c ? log(name.phi_tilde(w, k)) : log(name.n_wr(w, k) + name.beta_w)); - denominator += log(n_r(k) + name.gamma_prior.sum()) - + (c ? 0.0 : log(n_rc + ((double)(name.N_) * name.beta_w))); - } - r_prob_vec(k) = theta(geo_id, k) * exp(numerator - denominator); - } - sum_r = r_prob_vec.sum(); - new_r = rcat_without_normalize(r_prob_vec, - sum_r, - R_); // Cat(r_prob_vec/sum_r) - - - //Add counts back in - n_r(new_r)++ ; - for(int m = 0; m < M_; ++m){ - names[m].n_rc(new_r, (names[m].C[geo_id])[voter])++ ; - if(!(names[m].C[geo_id][voter])){ - names[m].n_wr((names[m].W[geo_id])[voter], new_r)++ ; - } - } - - - return new_r; -} - -void keyWRU::iteration_single(int it) -{ - //geo_indeces = shuffle_indeces(G_); // shuffle geo locs - for (int ii = 0; ii < G_; ++ii) { - geo_id_ = ii;//geo_indeces[ii]; - geo_size = geo_each_size[geo_id_]; - //record_indeces = shuffle_indeces(geo_size); //shuffle records - // Iterate over each record in the geographic loc. - for (int jj = 0; jj < geo_size; ++jj) { - Rcpp::checkUserInterrupt(); - voter_ = jj;//record_indeces[jj]; - //Sample race - Races[geo_id_][voter_] = sample_r(voter_, geo_id_); - if((it+1) > burnin){ - //Store sampled race - RaceSamples[geo_id_](voter_, Races[geo_id_][voter_])++; - if(check_in_sample){ - race_match[geo_id_][voter_] += (Races[geo_id_][voter_] == obs_race[geo_id_][voter_]); - } - } - //Sample mixture component - for(int m = 0; m < M_; ++m){ - names[m].sample_c(Races[geo_id_][voter_], voter_, geo_id_); - } - } - } -} - - -List keyWRU::return_obj() -{ - List phi_hat; - MatrixXd phi_mat; - for(int m = 0; m < M_; ++m){ - phi_mat = names[m].getPhiHat(); - phi_mat /= n_samp; - phi_hat.push_back(phi_mat, names[m].type); - } - List res; - res["phi"] = phi_hat; - res["ll"] = model_fit; - if(check_in_sample){ - res["r_insample"] = race_match; - } - res["predict_race"] = getRHat(); - return res; -} - -void keyWRU::sample() -{ - // Set progress bar up - Progress progress_bar(max_iter, verbose); - - for (int iter = 0; iter < max_iter; ++iter) { - // Run iteration - iteration_single(iter); - - // Store samples and measures of model fit - int r_index = iter + 1; - if(r_index > burnin){ - if ((r_index % llk_per) == 0 || r_index == 1 || r_index == max_iter) { - mfit_store(); - } - if ((r_index % thin) == 0 || r_index == 1 || r_index == max_iter) { - phihat_store(); - n_samp++; - } - } - - // Progress bar - progress_bar.increment(); - - // Check keybord interruption to cancel the iteration - checkUserInterrupt(); - } -} - -double keyWRU::loglik_total() -{ - double loglik = 0.0, beta_w; - int N_; - for (int r = 0; r < R_; ++r) { - for( int m = 0; m < M_; ++m){ - loglik += mylgamma(names[m].n_rc(r, 1) + names[m].gamma_prior[0]) - + mylgamma(names[m].n_rc(r, 0) + names[m].gamma_prior[1]); - loglik -= mylgamma(n_r[r] + names[m].gamma_prior.sum()); - N_ = names[m].N_; - beta_w = names[m].beta_w; - // Possible omp pragma here - for(int w = 0; w < N_; ++w){ - loglik+=mylgamma(names[m].n_wr(w, r) + beta_w); - } - loglik -= mylgamma(names[m].n_rc(r, 0) - + ((double)(names[m].N_) * beta_w)); - } - } - return loglik; -} - - - -void keyWRU::mfit_store() -{ - // Store likelihood during the sampling - double loglik = loglik_total(); - model_fit.push_back(loglik); -} - -void keyWRU::phihat_store() -{ - // Store expected distribution over races for given name - for(int m = 0; m < M_; ++m){ - names[m].phihat_store(); - } -} - -List keyWRU::getRHat() -{ - return(wrap(RaceSamples)); -} - - - - - diff --git a/src/keyWRU.h b/src/keyWRU.h deleted file mode 100644 index 00aab12..0000000 --- a/src/keyWRU.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef __keyWRU__ -#define __keyWRU__ - -#include -#include -#include -#include -#include "XName.h" -#include "aux_funs.h" - - -using namespace Eigen; -using namespace Rcpp; - -class keyWRU -{ - // This is a header file for "WRU" class. - // keyWRU class includes variables and functions - // that appear throughout the keyWRU models. - // Potential extensions inherit keyWRU and add model specific - // functions. - // - // keyWRU uses virtual inheritance. - // keyWRU is friends with the XName class, - // which contains First-, Last-, etc. name - // specific data and sampling methods - -public: - - //Constructor/destructor - keyWRU(const List data, - const List ctrl - ); - - //Methods - void sample(); - List return_obj(); - -private: - //Data - - const int M_ // number of name types - , R_ // number of races - , G_ // number of locations - , max_iter // number of MCMC iterations - , thin // thinning interval - , burnin - , llk_per // log_posterior store interval - ; - - const bool verbose; - const MatrixXd theta; - const VectorXi geo_each_size; // - std::vector Races; - std::vector RaceSamples; - - bool check_in_sample; - - Rcpp::NumericVector model_fit; - - //Suff. Stat, race count - VectorXd n_r; - - - //All name components - std::vector names; - - int geo_id_ // location id placeholder - , geo_size // Nr. of voters in location placeholder - , w // name placeholder - , r //race placeholder - , c // mixture component placeholder - , new_r // new race placeholder - , voter_ // voter id placeholder - , N_ // number of unique names placeholder - , n_samp - ; - - //Intermediate computation placeholders - double numerator - , denominator - , sum_r - , n_rc // race-component suff. stat. placeholder - ; - VectorXd r_prob_vec; - - - - std::vector geo_indeces // vector of shuffled locations - , record_indeces // vector of shuffl - ; - - std::vector< VectorXi > race_match; - std::vector obs_race; - // Methods - int sample_r(int voter, - int geo_id); - void iteration_single(int it); - double loglik_total(); - void mfit_store(); - void phihat_store(); - List getRHat(); - -}; - - -#endif - diff --git a/tests/testthat/test-all.R b/tests/testthat/test-all.R index 4522132..77e1d73 100644 --- a/tests/testthat/test-all.R +++ b/tests/testthat/test-all.R @@ -8,11 +8,11 @@ data(voters) test_that("Tests surname only predictions", { set.seed(12345) - + # Prediction using surname only x <- suppressWarnings(predict_race(voter.file = voters, surname.only = TRUE)) # Test and confirm prediction output is as expected - expect_equal(dim(x), c(10, 20)) + expect_equal(dim(x), c(10, 18)) expect_equal(sum(is.na(x)), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0676) expect_equal(round(x[x$surname == "Johnson", "pred.his"], 4), 0.0236) @@ -25,10 +25,10 @@ test_that("Tests predictions using the Census object", { if (!is.null(k)) { # Remove two NY cases from dataset to reduce the amount of the computation in the following test voters.dc.nj <- voters[c(-3, -7), ] - + # Create Census data object covering DC and NJ census.dc.nj <- get_census_data(key = k, state = c("DC", "NJ"), census.geo = "tract", age = TRUE, sex = FALSE) - + # Prediction using the Census object created in the previous step; tract-level statistics used in prediction x = predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") # test and comfirm the prediction output as expected @@ -55,7 +55,7 @@ test_that("Tests predictions using the Census object", { expect_equal(sum(x$surname == "Johnson"), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0441) expect_equal(round(x[x$surname == "Morse", "pred.his"], 4), 0.0163) - + # Prediction using the Census object built in the previous step; tract-level statistics used in prediction x = predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = censusObj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) # Test and confirm prediction output is as expected @@ -79,7 +79,7 @@ test_that("Tests predictions using Census API key", { expect_equal(sum(x$surname == "Johnson"), 1) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0819) expect_equal(round(x[x$surname == "Morse", "pred.his"], 4), 0.0034) - + # Prediction using a valid Census API key; place-level statistics used x = predict_race(voter.file = voters, census.geo = "place", census.key = k, sex = TRUE) # Test and confirm prediction output is as expected @@ -103,7 +103,7 @@ test_that("Tests predictions using Census API key", { expect_equal(round(x[x$surname == "Morse", "pred.his"], 4), 0.0123) } }) - + test_that("Tests predictions using Census data from a different year", { set.seed(12345) @@ -118,4 +118,3 @@ test_that("Tests predictions using Census data from a different year", { expect_equal(round(x[x$surname == "Morse", "pred.his"], 4), 0.0023) } }) -