-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRCrash_2_Visuals.R
340 lines (253 loc) · 13.4 KB
/
RCrash_2_Visuals.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
#### Data I/O and Packages ####
# In this lecture, we are going to write our fake school data to a .csv file
# and then read the data back in to another R object.
# Let's read in some data (first make sure your working directory is set to the
# folder where the data are stored):
# The easiest way to set your working directory is to go to:
# Session -> Set Working Directory -> Choose Directory... and select the folder
# where you would like to set it.
# You can print your current working directory at any time with getwd().
# If I wanted to save some data using the save() function, the working
# directory is where it would go:
my_vec <- 1:1000
save(my_vec, file = "my_vec.RData")
# Your working directory is also the place where R goes to look for data when
# you try to load it in. If I now remove the object from my environment and
# then try to load in the data, we see that it works.
# Only `my_vec` is removed: rm(list = ls()) would also delete every other
# object in the workspace of whoever runs this script.
rm(my_vec)
load("my_vec.RData")
# Works! `my_vec` is back in the environment.
### Working With .csv Files Using Base R ###
# Create some fake data, one vector per future column:
student_id <- 1:10
grades <- c("A", "B", "C", "A", "C", "F", "D", "B", "B", "A")
class <- rep(c(0, 1), each = 5)
free_lunch <- rep(TRUE, times = 10)
# Put the vectors in a data.frame and give the columns their final names in
# the same step:
my_data <- data.frame(
  Student_ID = student_id,
  Grades = grades,
  Class = class,
  Free_Lunch = free_lunch,
  stringsAsFactors = FALSE
)
# We make use of the 'write.csv()' function here. Make sure you do not write
# row names: they add an additional, unnamed column to the file, which is
# generally confusing when the data are read back in.
write.csv(my_data, file = "school_data.csv", row.names = FALSE)
# Now we are going to read the data back in from the .csv file we just created.
# You should make sure that you specify the correct separator (the
# 'write.csv()' function defaults to using comma separation). I also always
# specify 'stringsAsFactors = FALSE' to preserve any genuine string variables
# I read in.
school_data <- read.csv(
  file = "school_data.csv",
  sep = ",",
  stringsAsFactors = FALSE # Always!!!
)
## Other Data Formats ##
# Base R cannot read Excel files, so we load a package that extends R to
# handle .xlsx files, among other formats.
# First we need to download the 'rio' package, either manually or with the
# package manager in base R. You can check this package out by visiting the
# development Github page: https://github.com/leeper/rio. Make sure you select
# 'dependencies = TRUE' so that you also download the packages that 'rio'
# depends on, otherwise it will not work! Here is the manual way of installing
# an R package (run it once, then leave it commented out):
# install.packages("rio", dependencies = TRUE)
# Installing a package does not make it available yet; we still have to load
# it in every session with the library() command:
library(rio)
# Write our school children data to an .xlsx file:
export(x = my_data, file = "school_data.xlsx")
# Now we can read our data back in from the excel file:
excel_school_data <- import(file = "school_data.xlsx")
# We can do the same thing for Stata .dta files.
# Write the data to a .dta file:
export(x = my_data, file = "school_data.dta")
# Then read it back in:
stata_school_data <- import(file = "school_data.dta")
## RData files ##
# Finally, we may want to read and write our data to an .RData file. Such a
# file can hold everything in our workspace, or just a single variable.
# Save the data (here only the object called 'my_data'):
save(list = "my_data", file = "Object.RData")
# Load the data; the object comes back under the name it was saved with:
load("Object.RData")
#### Data Visualization ####
# In this example, we are going to create four different types of plots using
# the ggplot function. ggplot2 is an R package dedicated to data visualization.
# It can improve both the quality and aesthetics of your graphics.
# Here's the ggplot2 cheat sheet:
# https://github.com/rstudio/cheatsheets/blob/main/data-visualization-2.1.pdf
# Folder holding the example data on the author's machine. The working
# directory is only switched when that folder exists, so the script no longer
# stops with an error on other computers. There, set the working directory to
# your own data folder first (Session -> Set Working Directory).
data_dir <- "/Users/yang/Downloads/drive-download-20220226T181306Z-001"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# First we need to download and load the 'ggplot2' package.
# install.packages('ggplot2')
library(ggplot2)
# Import a csv-format file to create plots:
dat <- read.csv("data_ggplot_1.csv", stringsAsFactors = FALSE, header = TRUE)
# When we read in the data, the first thing we should do is check whether it
# was properly imported. To this end, we can either browse the whole dataset
# in a separate window or have a look at the first several rows of it.
# Take a look at the whole dataset in a separate window. View() needs an
# interactive session (e.g. RStudio), so it is skipped when the script is run
# non-interactively.
if (interactive()) {
  View(dat)
}
# Another easy way is to go to Environment -> click 'dat' (the name of the
# imported data).
# Take a look at the first 6 rows with a header. If there is a specific number
# of rows you want to see, pass that number as the second argument of head().
head(dat)
head(dat, 10)
# Variables in the dataset:
# ctryname Country name
# year
# cowcode Country numeric code
# illtreatment Sum of reported illtreatment cases (ITT)
# RstrctAccess Amnesty reports restricted access to detainees (ITT)
# wdi_aid Net aid amounts received (Constant USD) (WDI)
# wdi_gr Government Revenue (% of GDP) (WDI)
# wdi_pop Population (WDI)
# van_index Vanhanen's indicators of democratization
# dumAsia
# dumAfrica
# dumEurope
# dumNAmerica
# dumLAmerica Regional dummies
# lngdp Logged GDP (WDI)
# polity2 Political regimes (Autocracy, Anocracy, democracy)
# polity.category Autocracy, Anocracy, democracy
# illtrmt.category low, middle, high
### Scatter plots ###
# We create scatter plots when we want to explore the relationship between two
# variables, i.e., is there a positive, negative, or no relationship between
# x and y?
# In this example, one might expect that the more aid a country receives, the
# more violence its government uses against its citizens. The reasoning is
# that states depend less on tax revenue as they are given more aid. As a
# result, they may no longer need to be nice to their citizens and thus use
# more violence against them. To examine this, we can create a scatter plot.
# To create a scatter plot, add + geom_point(), which plots one dot per
# observation. We display the (logged) aid amounts on the x-axis and the number
# of illtreatment cases by governments on the y-axis. NAs will be dropped when
# the plot is created.
ggplot(data = dat, mapping = aes(x = log(wdi_aid), y = illtreatment)) +
  geom_point()
# Add + geom_smooth(method = lm) to overlay a fitted linear trend and see the
# relationship between x and y. You can see that there's a slight positive
# relationship between aid and illtreatment, as expected:
ggplot(data = dat, mapping = aes(x = log(wdi_aid), y = illtreatment)) +
  geom_point() +
  geom_smooth(method = lm)
### Histograms ###
# We can create histograms when we want to see the distribution of a variable.
# In this example, we want to see the distribution of illtreatment cases in
# the sample countries in our dataset. As mentioned earlier, ggplot()
# automatically drops NAs when plotting, but you can also exclude missing
# observations manually with the subset() function. is.na() tells us whether
# a given observation is NA; !is.na() therefore keeps only the non-NA
# (= non-missing) values when creating our graph.
# To create a histogram, add + geom_histogram().
ggplot(
  data = subset(dat, subset = !is.na(illtreatment)),
  mapping = aes(x = illtreatment)
) +
  geom_histogram()
# One good way to pull out more meaningful information is to display
# distributions by group. In our dataset, we have a variable named
# 'polity.category' that captures governments' regime types. We are going to
# create three histograms of the illtreatment variable, one each for
# democracy, anocracy, and autocracy, and compare their distributions.
# Rows missing either variable are excluded; facet_wrap() draws one panel per
# regime type.
ggplot(
  data = subset(dat, subset = !(is.na(illtreatment) | is.na(polity.category))),
  mapping = aes(x = illtreatment)
) +
  geom_histogram(mapping = aes(group = polity.category)) +
  facet_wrap(~ polity.category)
# Interestingly, we can see that not only autocracies but also democracies
# present a large number of government illtreatments against their citizens.
# One possible explanation is that higher transparency in democracies leads to
# more illtreatment cases being reported.
# To make the graph more informative, we can set labels for x and y and a
# title.
# NOTE(review): ylim() drops bars that fall outside the limits instead of
# clipping them - confirm that no bin count exceeds 200.
ggplot(
  data = subset(dat, subset = !(is.na(illtreatment) | is.na(polity.category))),
  mapping = aes(x = illtreatment)
) +
  geom_histogram(mapping = aes(group = polity.category)) +
  facet_wrap(~ polity.category) +
  ylim(0, 200) + # Limit the range of the y-axis to 0-200
  labs(
    x = "Number of Illtreatment Events", # Label for the x-axis
    y = "Count", # Label for the y-axis
    title = "The Number of Illtreatment Events by Political Regimes" # Plot title
  )
### Bar Charts ###
# We can create bar charts when we want to present the frequency of the values
# of a categorical variable. In this example, we use the polity.category
# variable, which is composed of three regime categories (democracy, anocracy,
# and autocracy), and check out each category's frequency.
# fill = polity.category: fills each bar with a colour of its own category.
# theme_bw(): sets the background colour to white.
# labs(x = NULL): removes the x-axis label.
ggplot(
  data = subset(dat, subset = !is.na(polity.category)),
  mapping = aes(x = polity.category)
) +
  geom_bar(mapping = aes(fill = polity.category), position = "dodge") +
  theme_bw() +
  labs(x = NULL)
### Box Plots ###
# Let's import another dataset in CSV format. Note that this overwrites the
# 'dat' object used for the previous plots.
dat <- read.csv("data_ggplot_2.csv", stringsAsFactors = FALSE, header = TRUE)
# Check the first six rows of the data loaded.
head(dat)
# We can create box plots when we want to get a sense of how a variable is
# distributed: e.g., to see the median, the quartiles, the range, and any
# outliers. (A box plot does not show the mean.)
# In this example, we are going to explore the number of civilian deaths inside
# the DR Congo by province. There are 11 provinces in the DR Congo.
# Below, you can see the information on the variables in the dataset.
# Variables in the dataset:
# identifier A unique identifier
# province Province names, DRC
# rebcompetition A level of competition among rebel groups in a province
# civiliandeaths The number of civilian deaths by rebel attacks
# totalaid The amount of aid given to a province
# NOTE(review): the plotting code below refers to the deaths column as
# 'civiliansdeaths', while the list above spells it 'civiliandeaths' - confirm
# the actual column name in data_ggplot_2.csv.
# Rows with a missing province or death count are excluded before plotting.
# ylim(0, 5000) removes observations above 5000 before the box statistics are
# computed; use coord_cartesian(ylim = c(0, 5000)) instead if you only want to
# zoom in without dropping data.
ggplot(
  subset(dat, !is.na(province) & !is.na(civiliansdeaths)),
  aes(province, civiliansdeaths)
) +
  geom_boxplot(notch = FALSE, aes(color = province)) +
  ylim(0, 5000) +
  ggtitle("Civilian Deaths by Province, DRC") +
  # Rotate the province names so that they do not overlap on the x-axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # colour = NULL removes the legend title.
  labs(colour = NULL, x = "Province", y = "The Number of Civilian Deaths")
# From the figure, we can find that Katanga, North and South Kivu, and
# Orientale provinces show high numbers of civilian deaths, judging by the
# location of the maximum values.
# Dots indicate outliers, i.e., an extremely large number of deaths.
## Leaflet(): Creating an interactive map to visualize geo-referenced data ##
# Install 'tidyverse' only when it is missing, so that the script does not
# re-download it on every run. The other packages loaded below ('leaflet',
# 'dplyr') must already be installed as well.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
# Packages to load
library(leaflet)
library(dplyr)
library(tidyverse)
# Read the geo-referenced conflict events. This overwrites the 'dat' object
# used for the previous plots.
dat <- read.csv("UCDP_georeferenced.csv", stringsAsFactors = FALSE,
                sep = ",")
# Check the first six rows and the dimensions (rows, columns) of the data.
head(dat)
dim(dat)
# Let's begin with a simple example: a base map with a single marker.
# The popup text is HTML. Its continuation lines are kept flush left because
# every character inside the quotes, including line breaks, is part of the
# string.
newmap <- leaflet() %>%
  addTiles() %>% ## the default base map
  setView(lng = 30, lat = 1.5, zoom = 7) %>% ## centre the view on this lon/lat
  addMarkers(
    lng = 30.24842,
    lat = 1.5625,
    popup = "This is Bunia town in Ituri province.
There was an armed conflict between Hema and Lendu in 01/01/2000-01/31/2000.
No civilian deaths captured. You can read more about the Ituri conflict
<a href='https://en.wikipedia.org/wiki/Ituri_conflict'>HERE</a>."
  )
# Print the object to display the map.
newmap
# We are going to differentiate the colour of the violence dots by region by
# manually creating a colour palette.
# First list how many events fall in each first-level administrative unit:
data.frame(table(dat$adm_1))
# Province names as they are spelled in the palette domain, and the colour
# assigned to each. Both vectors are defined once and shared by the markers
# and the legend, so the legend can no longer drift away from the colours
# drawn on the map (it previously showed "grey" where the markers used
# "darkgrey").
province_names <- c("Bas Congo province", "Equateur province",
                    "Ituri province", "Kasai Occidental province",
                    "Kasai Oriental province", "Katanga province",
                    "Maniema province", "Nord Kivu province",
                    "Orientale province", "Sud Kivu province")
province_colors <- c("blue", "red", "darkgreen", "darkgrey",
                     "purple", "yellow", "orange", "brown", "pink",
                     "black")
colorpal <- colorFactor(palette = province_colors, domain = province_names)
drcmap <- leaflet(dat) %>%
  addProviderTiles(providers$CartoDB.Positron) %>% ## to specify the basemap
  setView(25, -3, zoom = 5) %>% ## centre the view on this lon/lat
  # One filled circle per event, coloured by province; clicking a circle
  # shows the event's 'side_a' value.
  addCircleMarkers(~longitude, ~latitude, popup = ~side_a, weight = 3,
                   radius = 5, color = ~colorpal(adm_1), stroke = FALSE,
                   fillOpacity = 0.8) %>%
  # The legend pairs colours and labels by position. The labels are the
  # province names without the trailing " province".
  addLegend("bottomright",
            colors = province_colors,
            labels = sub(" province$", "", province_names),
            title = "Provinces", opacity = 0.8)
# Print the object to display the map.
drcmap
# To export the figure
# Option 1 (by hand): Click on `Export` and then `Save as a Web Page`. Put in
# a name, but make sure that it ends with .html. That file can then be loaded
# on to a server and will retain all of its interactive properties online.
# Option 2 (in code): use saveWidget() from the 'htmlwidgets' package.
# Install 'htmlwidgets' only when it is missing, so that the script does not
# re-download it on every run.
if (!requireNamespace("htmlwidgets", quietly = TRUE)) {
  install.packages("htmlwidgets")
}
library(htmlwidgets)
# selfcontained = TRUE asks for a single stand-alone .html file.
saveWidget(widget = drcmap,
           file = "drc_violence_map_UCDP.html",
           selfcontained = TRUE)