diff --git a/hw/2014-10-22-hw3-greg-werbin.html b/hw/2014-10-22-hw3-greg-werbin.html new file mode 100644 index 0000000..e8b1308 --- /dev/null +++ b/hw/2014-10-22-hw3-greg-werbin.html @@ -0,0 +1,422 @@ + + + + +
+ + + + + + + + +I’ll be comparing Question 5 of the 1999 and 2008 European Values Surveys – Great Britain, which has the same exact wording in both versions:
+++Please look carefully at the following list of voluntary organisations and activities and say … a) which, if any, do you belong to? (Code all mentioned under (a) as ‘1’) b) which, if any, are you currently doing unpaid voluntary work for? (Code all mentioned under (b) as ‘1’)
+
with options:
++++
+- Social welfare services for elderly, handicapped or deprived people
+- Religious or church organisations
+- Education, arts, music or cultural activities
+- Trade unions
+- Political parties or groups
+- Local community action on issues like poverty, employment, housing, racial equality
+- Third world development or human rights
+- Conservation, the environment, ecology, animal rights
+- Professional associations
+- Youth work (e.g. scouts, guides, youth clubs etc.)
+- Sports or recreation
+- Women’s groups
+- Peace movement
+- Voluntary organisations concerned with health
+- Other groups
+- None
+
(Source: p. 5 of each Field Questionnaire)
I’m going to graph the log-base-10 proportion of respondents who answer “yes” to each question. There will be two panels, one for each question. In each panel, I’ll draw a horizontal dot chart with the 1999 and 2008 values plotted on the same row, distinguished by the plotting character. If it helps clarity, I’ll draw straight lines to connect points from the same year. The goal is to compare the popularity of volunteer activities between 1999 and 2008. I’m using a log scale because I’m expecting that some groups will be more popular than others.
+I’ll use ggplot2
in R, so I’ll need a data.frame that looks like:
Category | +Question | +Year | +Proportion | +
---|---|---|---|
Welfare | +A | +1999 | +0.15 | +
library(foreign)
+library(reshape2)
+library(memisc)
+## Loading required package: lattice
+## Loading required package: MASS
+##
+## Attaching package: 'memisc'
+##
+## The following objects are masked from 'package:stats':
+##
+## contr.sum, contr.treatment, contrasts
+##
+## The following object is masked from 'package:base':
+##
+## as.array
+pdt <- data.table:::print.data.table
+
+d1999 <- read.dta("ZA3777_v3-0-1.dta")
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+d2008 <- read.dta("ZA4752_v1-0-0.dta")
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
+## else paste0(labels, : duplicated levels in factors are deprecated
+## Warning in read.dta("ZA4752_v1-0-0.dta"): value labels ('GB15') for 'GB15'
+## are missing
+## Checking out the structure of the file
+
+dim(d1999)
+## [1] 1000 417
+dim(d2008)
+## [1] 1561 456
+head(names(d1999), 20)
+## [1] "studyno" "version" "id_cocas" "caseno" "intno" "intno2gb"
+## [7] "year" "weight" "v1" "v2" "v3" "v4"
+## [13] "v5" "v6" "v6a_gb" "v7" "v8" "v9"
+## [19] "v10" "v11"
+head(names(d2008), 20)
+## [1] "StudyNo" "Version" "id_cocas" "caseno" "intno" "wave"
+## [7] "year" "country" "country1" "c_abrv" "c_abrv1" "weight"
+## [13] "cntry_y" "cntry1_y" "v1" "v2" "v3" "v4"
+## [19] "v5" "v6"
+pdt(d1999[1:5, 1:8])
+## studyno version id_cocas caseno intno intno2gb year
+## 1: 3777 3.0.1 (2012-12-28) 199982600001 1 2100 8 1999
+## 2: 3777 3.0.1 (2012-12-28) 199982600002 2 2100 9 1999
+## 3: 3777 3.0.1 (2012-12-28) 199982600003 3 2100 10 1999
+## 4: 3777 3.0.1 (2012-12-28) 199982600004 4 2100 4 1999
+## 5: 3777 3.0.1 (2012-12-28) 199982600005 5 2100 3 1999
+## weight
+## 1: 1.647713
+## 2: 1.210022
+## 3: 1.585010
+## 4: 1.585010
+## 5: 1.294967
+pdt(d2008[1:5, 1:14])
+## StudyNo Version id_cocas caseno intno wave year
+## 1: 4752 1.0.0 "2010-11-30" 200882610080 397 1899 wave 2008 2009
+## 2: 4752 1.0.0 "2010-11-30" 200882610093 475 3383 wave 2008 2009
+## 3: 4752 1.0.0 "2010-11-30" 200882610284 1042 2410 wave 2008 2009
+## 4: 4752 1.0.0 "2010-11-30" 200882611058 4401 2408 wave 2008 2009
+## 5: 4752 1.0.0 "2010-11-30" 200882611516 6998 3032 wave 2008 2009
+## country country1 c_abrv c_abrv1 weight
+## 1: Great Britain Great Britain GB GB 0
+## 2: Great Britain Great Britain GB GB 0
+## 3: Great Britain Great Britain GB GB 0
+## 4: Great Britain Great Britain GB GB 0
+## 5: Great Britain Great Britain GB GB 0
+## cntry_y cntry1_y
+## 1: Great Britain (2009) Great Britain (2009) [with split ups]
+## 2: Great Britain (2009) Great Britain (2009) [with split ups]
+## 3: Great Britain (2009) Great Britain (2009) [with split ups]
+## 4: Great Britain (2009) Great Britain (2009) [with split ups]
+## 5: Great Britain (2009) Great Britain (2009) [with split ups]
+unique(d2008$year) # just checking, 2008/2009 difference looked suspicious
+## [1] 2009
+## Levels: 2008 2009
+## Saving the columns I want and dumping the rest
+
+# Build survey variable names ("v12", "v13", ...) from one or more vectors of
+# item numbers.
+#
+# ...: numeric vectors of item numbers; they are concatenated in the order given.
+# Returns: a character vector with "v" prepended to every item number.
+#
+# The arguments are evaluated the ordinary way (in the caller's scope), so the
+# helper also works when it is called from inside another function with local
+# variables as arguments.
+select <- function(...) paste0("v", unlist(list(...)))
+# Item columns for the sixteen answer options of the two sub-questions
+# ("a" and "b") in each wave; the variable numbers differ between the files.
+a99 <- select(12:27)
+b99 <- select(30:45)
+a08 <- select(10:25)
+b08 <- select(28:43)
+# c(length(a99), length(b99), length(a08), length(b08))
+# 16 categories
+
+# Keep the identifier, the year and the item columns; the 2008 file also
+# keeps its two consistency flags.
+d1999 <- d1999[c("id_cocas", "year", a99, b99)]
+d2008 <- d2008[c("id_cocas", "year", a08, b08, "f25", "f43")]
+Variables f25
and f43
are for flagging inconsistencies in the 2008 survey. Unfortunately there aren’t any flags for the 1999 survey. The inconsistency codes for f43
are:
+++
+- Inconsistent 1: If respondent mentiones at least one organisation and “none”. if v43=1 and any of v28 to v42=1 then f43=1
+- Inconsistent 2: If respondent does not know for at least one organization whether s/he works for it and mentiones “none”. if v43=1 and none of v28 to v42=1 and any of v28 to v42=8 then f43=2
+- Inconsistent 3: If respondent does not know for at least one organization whether s/he works for it and does not mention “none”. if v43=2 and none of v28 to v42=1 and any of v28 to v42=8 then f43=3
+- Inconsistent 4: If respondent does not mention any organisation and does not mention “none”. if v43=2 and all of v28 to v42=2 then f43=4
+- Inconsistent 5: If respondent mentions at least one organization and does not know whether s/he works for “none”. if v43=8 and any of v28 to v42=1 then f43=5
+- Inconsistent 6: If respondent does not mention any organization and does not know whether s/he works for “none”. if v43=8 and all of v28 to v42=2 then f43=6
+- Inconsistent 7: If respondent mentions at least one organization and does not answer whether s/he works for “none”. if v43=9 and any of v28 to v42=1 then f43=7
+- Inconsistent 8: If respondent does not mention any organization and does not answer whether s/he works for “none”. if v43=9 and all of v28 to v42=2 then f43=8
+
(Source: p. 57 of the 2008 Variable Report)
knitr::kable(rbind("belong to" = table(d2008$f25),"work for" = table(d2008$f43)))
++ | consistent | +inconsistent 1 | +inconsistent 2 | +inconsistent 3 | +inconsistent 4 | +inconsistent 5 | +inconsistent 6 | +inconsistent 7 | +inconsistent 8 | +
---|---|---|---|---|---|---|---|---|---|
belong to | +1553 | +5 | +0 | +0 | +2 | +0 | +0 | +1 | +0 | +
work for | +1553 | +1 | +0 | +0 | +7 | +0 | +0 | +0 | +0 | +
There are so few inconsistent responses in the 2008 survey that in my opinion it’s not even worth deleting them. Hopefully the 1999 survey is equally clean. In principle, I should reconstruct the consistency checks and apply them to both questions in both surveys. Then I could decide what to do with each type of inconsistency and recode accordingly.
+# for fixing stuff in case I mess up
+#
+# Re-reads both raw .dta files and repeats the column selection. The results
+# are assigned with `<<-`, so the d1999 / d2008 objects outside this function
+# are replaced. Relies on a99, b99, a08 and b08 already being defined.
+reset <- function() {
+  d1999 <<- read.dta("ZA3777_v3-0-1.dta")
+  d2008 <<- read.dta("ZA4752_v1-0-0.dta")
+
+  keep99 <- c("id_cocas", "year", a99, b99)
+  d1999 <<- d1999[, keep99]
+
+  keep08 <- c("id_cocas", "year", a08, b08, "f25", "f43")
+  d2008 <<- d2008[, keep08]
+}
+
+# Short labels for the sixteen answer options, in questionnaire order.
+categories <- c(
+  "Social welfare", "Religious", "Education, arts, music or cultural",
+  "Trade unions", "Political", "Local community action",
+  "Third world development or human rights",
+  "Conservation, the environment, ecology, animal rights",
+  "Professional associations", "Youth work", "Sports or recreation",
+  "Women's groups", "Peace movement", "Organization concerned with health",
+  "Other groups", "None"
+)
+# One column name per (category, question) pair in the form "<category>_<A|B>":
+# all sixteen "A" names first, then all sixteen "B" names.
+varnames <- paste(
+  rep(categories, times = 2),
+  rep(c("A", "B"), each = length(categories)),
+  sep = "_"
+)
+
+# Columns 1-2 are id_cocas and year; the item columns that follow get the
+# readable "<category>_<A|B>" names, matched purely by position.
+names(d1999)[2 + seq_along(varnames)] <- varnames
+
+names(d2008)[2 + seq_along(varnames)] <- varnames
+# The consistency flags are not used past this point.
+d2008[c("f25", "f43")] <- NULL
+# Every 2008-wave row is labelled 2008, replacing the value read from the file.
+d2008[["year"]] <- 2008
+
+# Share of "mentioned" answers among the valid answers to one survey item.
+#
+# x: vector (factor or character) of response labels.
+# Returns: a single number, the fraction of "mentioned" among the responses
+#   that are either "mentioned" or "not mentioned". Any other label, and NA,
+#   is left out of the denominator. NaN when there is no valid response.
+calc_proportions <- function(x) {
+  x <- as.character(x)
+  # %in% never returns NA, so missing values simply count as not valid
+  valid <- x %in% c("mentioned", "not mentioned")
+  mean(x[valid] == "mentioned")
+}
+
+# Reshape a wide data frame (one column per "<category>_<question>" item plus
+# a `year` column) into long form and split the item name into its two parts.
+#
+# DF: data frame with a `year` column; every other column is treated as an
+#   item whose name ends in "_<question>".
+# Returns: long data frame with columns year, category, proportion, question
+#   (category and question are character).
+melt_and_split <- function(DF) {
+  DF <- melt(DF, id.vars = "year",
+             variable.name = "category", value.name = "proportion")
+  label <- as.character(DF$category)
+  # Split at the LAST underscore only, so a category label that itself
+  # contains "_" is kept intact instead of producing ragged pieces.
+  DF$category <- sub("_[^_]*$", "", label)
+  DF$question <- sub("^.*_", "", label)
+  DF
+}
+
+# Turn one survey's item columns into a long table of per-item proportions.
+#
+# DF: data frame whose first two columns are skipped (DF[-(1:2)]) and whose
+#   remaining columns are passed one by one to calc_proportions(); the first
+#   value of DF$year labels the whole result.
+# Returns: the output of melt_and_split() with the question codes "A" / "B"
+#   recoded to "member" / "volunteer".
+calc_melt_split <- function(DF) {
+  props <- lapply(DF[-(1:2)], calc_proportions)
+  one_row <- data.frame(c(year = as.character(DF$year[1]), props),
+                        check.names = FALSE)
+  long <- melt_and_split(one_row)
+  long$question <- recode(long$question, "member" <- "A", "volunteer" <- "B")
+  long
+}
+
+d <- rbind(calc_melt_split(d1999), calc_melt_split(d2008))
+pdt(d, 5)
+## year category proportion question
+## 1: 1999 Social welfare 0.068000000 member
+## 2: 1999 Religious 0.048000000 member
+## 3: 1999 Education, arts, music or cultural 0.104000000 member
+## 4: 1999 Trade unions 0.073000000 member
+## 5: 1999 Political 0.026000000 member
+## ---
+## 60: 2008 Women's groups 0.011553273 volunteer
+## 61: 2008 Peace movement 0.003851091 volunteer
+## 62: 2008 Organization concerned with health 0.031450578 volunteer
+## 63: 2008 Other groups 0.046885035 volunteer
+## 64: 2008 None 0.779345734 volunteer
+library(grid)
+library(ggplot2)
+
+# year is mapped to the point shape in the plot below, so make it a factor
+d$year <- factor(d$year)
+
+# Permutation that sorts the 2008 "member" proportions in ascending order
+ord <- order(d[d$year == "2008" & d$question == "member", "proportion"])
+# Reorder the category levels with that permutation so the chart rows follow it.
+# NOTE(review): this assumes the 2008/member rows appear in the same order as
+# unique(d$category) -- confirm if the row order of d ever changes
+d$category <- factor(d$category, levels = unique(d$category)[ord])
+
+# Horizontal dot chart: proportion on x, category on y, one panel per question
+g <- ggplot(d, aes(x = proportion, y = category)) +
+# NOTE(review): this first point layer has color = NA (nothing visible);
+# presumably it is there so the discrete y scale exists before geom_hline
+# converts category to numbers -- confirm before removing
+ geom_point(aes(shape = year), color = NA) +
+# one light-gray guide line per category row
+ geom_hline(aes(yintercept = as.numeric(category)), color = "lightgray") +
+# the visible points, drawn after (on top of) the guide lines
+ geom_point(aes(shape = year), size = 3) +
+ scale_x_log10() +
+# plotting characters 1 and 16, assigned to the year levels in order
+ scale_shape_manual(values = c(1, 16)) +
+ facet_grid(~ question) +
+ theme_classic() + theme(
+ axis.line = element_line(color = NA),
+ legend.position = "top",
+ panel.border = element_rect(fill = NA),
+ plot.title = element_text(size = 11, face = "bold")
+ ) +
+ ylab("") + xlab("log10 proportion") +
+ ggtitle("Proportion of EVS 1999 and 2008 respondents\nwho belong to or volunteer in each of sixteen organizations")
+
+## Draw the graph with the title centered properly
+# from http://stackoverflow.com/a/10976398/2954547
+gt <- ggplot_gtable(ggplot_build(g))
+## Warning: Removed 1 rows containing missing values (geom_point).
+## Warning: Removed 1 rows containing missing values (geom_point).
+gt$layout[which(gt$layout$name == "title"), c("l", "r")] <- c(1, max(gt$layout$r))
+plot.new()
+grid.draw(gt)
+I think this is plenty encapsulated as-is.
++ Here's some text. +
+ ++ Here's a plot: +
++ isn't it cool? +
+