-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_prepare_data.R
executable file
·63 lines (57 loc) · 2.78 KB
/
1_prepare_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env Rscript
library(tidyverse)
# Read each raw long-format CSV into its own tibble
# (one file per age group and question kind)
child_has <- read_csv(file = "data/raw_child_has_long.csv")
adult_has <- read_csv(file = "data/raw_adult_has_long.csv")
child_sim <- read_csv(file = "data/raw_child_sim_long.csv")
adult_sim <- read_csv(file = "data/raw_adult_sim_long.csv")
# Double-check that Document_name works well as a unique ID across the two
# data files of one age group.
#
# Arguments:
#   d_has     - responses to the first question; must contain the columns
#               Document_name, Age, Gender and Community_ID
#   d_similar - responses to the second question; same required columns
#
# Returns TRUE when every check passes. Stops with an error when the files
# differ in row count, in their set of Document_name values, or in the
# demographics recorded for any Document_name.
sanity_check <- function(d_has, d_similar) {
  stopifnot(nrow(d_has) == nrow(d_similar))
  stopifnot(length(unique(d_has$Document_name)) ==
              length(unique(d_similar$Document_name)))
  # Same IDs, not merely the same number of IDs. Checked explicitly so that an
  # ID present in only one file cannot hide behind missing demographics below.
  stopifnot(setequal(d_has$Document_name, d_similar$Document_name))

  demos_has <- d_has %>%
    select(Document_name, Age, Gender, Community_ID) %>%
    unique()
  demos_sim <- d_similar %>%
    select(Document_name, Age, Gender, Community_ID) %>%
    unique()
  # Shared non-key columns come back from the join suffixed .x (d_has side)
  # and .y (d_similar side)
  demos_joined <- full_join(demos_has, demos_sim, by = "Document_name")

  for (demo in c("Age", "Gender", "Community_ID")) {
    in_has <- demos_joined[[paste0(demo, ".x")]]
    in_sim <- demos_joined[[paste0(demo, ".y")]]
    # A value missing in both files counts as agreement. A bare `==` would
    # give NA there and fail the check although the files are consistent.
    both_missing <- is.na(in_has) & is.na(in_sim)
    both_present <- !is.na(in_has) & !is.na(in_sim)
    agrees <- both_missing | (both_present & in_has == in_sim)
    if (!all(agrees)) {
      stop(
        demo, " differs between the two files for ", sum(!agrees),
        " joined demographic row(s)",
        call. = FALSE
      )
    }
  }
  TRUE
}
# Verify both age groups before combining; a failed check stops the script
sanity_check(d_has = child_has, d_similar = child_sim)
sanity_check(d_has = adult_has, d_similar = adult_sim)
# Combine responses for the two kinds of question into one tibble.
#
# Arguments:
#   d_has - long-format data with columns Document_name, type_TF and
#           response_TF; all other columns are carried through unchanged
#   d_sim - long-format data with columns Document_name, type_similar and
#           response_similar
#
# Returns d_has with type_TF replaced by a shared `type` key, response_TF
# renamed to response_has, response_similar joined on by
# (Document_name, type), and Document_name renamed to Participant_ID.
# Rows of d_has without a match in d_sim get NA for response_similar.
combine_responses <- function(d_has, d_sim) {
  # Keep type_TF from its 6th character onwards so it can act as a join key
  # (presumably this strips a fixed question prefix - confirm against the
  # raw files)
  has_keyed <- d_has %>%
    mutate(type = str_sub(type_TF, 6, -1)) %>%
    rename(response_has = response_TF) %>%
    select(-type_TF)

  # Likewise keep type_similar from its 9th character onwards, and only the
  # columns needed for the join (presumably a longer prefix - confirm)
  sim_keyed <- d_sim %>%
    mutate(type = str_sub(type_similar, 9, -1)) %>%
    select(Document_name, type, response_similar)

  # Return the join directly rather than assigning it to an unused local,
  # so the result is visible when the function is called interactively
  left_join(has_keyed, sim_keyed, by = c("Document_name", "type")) %>%
    rename(Participant_ID = Document_name)
}
child_comb <- combine_responses(child_has, child_sim)
adult_comb <- combine_responses(adult_has, adult_sim)

# Merge children and adults into one tibble, with a column for easy filtering
child_comb$AgeGroup <- "children"
adult_comb$AgeGroup <- "adults"

# bind_rows() is the verb for stacking two data frames; add_row() is meant
# for appending rows given as name-value pairs
combined <- bind_rows(child_comb, adult_comb) %>%
  mutate(
    # Unique numeric PIDs.
    # NOTE(review): IDs are derived from the Document_name string alone, so a
    # name occurring in both the child and the adult file would receive the
    # same ID - confirm names never overlap across age groups.
    Participant_ID = as.integer(as.factor(Participant_ID)),
    # Force NA for second question if answer to first is No. replace() keeps
    # the column's own type, so no typed NA constant is needed.
    response_similar = replace(
      response_similar,
      !is.na(response_has) & response_has == "No",
      NA
    ),
    # Recode "no specification" gender as NA, after clarifying this is a
    # non-response
    Gender = na_if(Gender, "no specification"),
    # Rename "Galapagos_I" (not a country) to "Ecuador"
    Country_full = if_else(
      Country_full == "Galapagos_I", "Ecuador", Country_full
    ),
    Community_ID = case_when(
      Community_ID == "Galapagos_urban" ~ "Ecuador_urban",
      Community_ID == "Galapagos_rural" ~ "Ecuador_rural",
      TRUE ~ Community_ID
    )
  ) %>%
  # Just shuffling column order around to look "nice"; columns not listed
  # here are dropped
  select(
    Participant_ID, AgeGroup, Age, Gender,
    Urb_rural, Country_full, Community_ID,
    type, response_has, response_similar
  )

# Save this
write_csv(combined, "data/combined_long_data.csv")