-
Notifications
You must be signed in to change notification settings - Fork 2
/
2_data_wrangling_and_vis.R
405 lines (320 loc) · 15.4 KB
/
2_data_wrangling_and_vis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
# ---
# title: 2. Data Wrangling and Visualisation
# author: Justin Chun-ting Ho
# last updated: 12 Jun 2021
# ---
####################################################################################
## Data Manipulation using dplyr ##
####################################################################################
# We're going to learn some of the most common `dplyr` functions:
# - `select()`: subset columns
# - `filter()`: subset rows on conditions
# - `mutate()`: create new columns by using information from other columns
# - `group_by()` and `summarize()`: create summary statistics on grouped data
# - `arrange()`: sort results
# Set up: attach the tidyverse, which supplies `read_csv()` and the dplyr verbs
library(tidyverse)
# Read the data set (path is relative to the current working directory)
scotelection <- read_csv(file = "scotelection2021.csv")
# Open the data in the viewer for a quick preview
View(scotelection)
####################################################################################
## Selecting columns and filtering rows ##
####################################################################################
# `select()` keeps a subset of the columns of a data frame.
# The data frame comes first, followed by the columns to keep.
select(.data = scotelection, likes, comments, shares)
# `filter()` keeps only the rows that meet a condition:
filter(.data = scotelection, snsname == "Scottish National Party (SNP)")
####################################################################################
## Pipes ##
####################################################################################
# What if you want to select and filter at the same time?
# There are three ways to do this: intermediate steps, nested functions, or pipes.

# 1. Intermediate steps: store each result, then pass it to the next call
scotelection2 <- filter(
  scotelection,
  snsname == "Scottish National Party (SNP)"
)
scotelection_snp <- select(scotelection2, likes, comments, shares)

# 2. Nested functions (one function inside another; read from the inside out)
scotelection_snp <- select(
  filter(scotelection, snsname == "Scottish National Party (SNP)"),
  likes, comments, shares
)

# 3. Pipes
# - `%>%` takes the output of one function and sends it directly to the next
# - the operator is defined in the `magrittr` package
# - type it with 'Ctrl' + 'Shift' + 'M' ('Cmd' + 'Shift' + 'M' on a Mac)
scotelection %>%
  filter(snsname == "Scottish National Party (SNP)") %>%
  select(likes, comments, shares)

# To keep this smaller version of the data, assign the whole pipeline to a
# new name:
scotelection_snp <-
  scotelection %>%
  filter(snsname == "Scottish National Party (SNP)") %>%
  select(likes, comments, shares)
# Print the new object
scotelection_snp
########## Exercise ##########
# Using pipes, subset the `scotelection` data to keep only the data points
# whose post type is photo (`Photo`),
# and retain only the columns `likes` and `postlink`.
##############################
####################################################################################
## Mutate ##
####################################################################################
# `mutate()` adds new columns computed from the values of existing columns.
# For example, shares divided by likes:
scotelection %>%
  mutate(like_share_ratio = shares / likes)
# Or the total engagement (shares, likes and comments added together):
scotelection %>%
  mutate(total_engagement = shares + likes + comments)
####################################################################################
## Split-apply-combine data analysis and the summarize() function ##
####################################################################################
# Many data analysis tasks can be approached using the *split-apply-combine* paradigm:
# 1. split the data into groups
# 2. apply some analysis to each group
# 3. combine the results.
# `group_by()` is often used together with `summarize()`, which collapses each
# group into a single-row summary of that group. `group_by()` takes as arguments
# the column names that contain the categorical variables for which you want
# to calculate the summary statistics.
# Average number of likes for each party:
scotelection %>%
  group_by(snsname) %>%
  summarize(mean_likes = mean(likes))

# Grouping by several columns gives one row per combination of their values
# (here: party and post type):
scotelection %>%
  group_by(snsname, type) %>%
  summarize(mean_likes = mean(likes))

# Several summaries can be computed in a single `summarize()` call:
scotelection %>%
  group_by(snsname, type) %>%
  summarize(
    mean_likes = mean(likes),
    mean_comments = mean(comments)
  )

# `arrange()` sorts the result so the values are easier to inspect:
scotelection %>%
  group_by(snsname, type) %>%
  summarize(
    mean_likes = mean(likes),
    mean_comments = mean(comments)
  ) %>%
  arrange(mean_likes)

# Wrap the column in `desc()` to sort in descending order:
scotelection %>%
  group_by(snsname, type) %>%
  summarize(
    mean_likes = mean(likes),
    mean_comments = mean(comments)
  ) %>%
  arrange(desc(mean_likes))
####################################################################################
## Counting ##
####################################################################################
# We often want the number of observations for each value of a variable
# (or for each combination of several variables); `count()` does exactly that.
scotelection %>%
  count(type)
# `count()` also has a `sort` argument; `sort = TRUE` orders the result by count
scotelection %>%
  count(type, sort = TRUE)
########## Exercise ##########
# 1. Use `group_by()` and `summarize()` to find the mean, min, and max
# number of shares for each party. Also add the number of
# observations (hint: see `?n`).
# 2. What was the highest number of likes in each month?
# Tips: Attach the lubridate library using 'library(lubridate)' and
# use the following code: mutate(month = month(datetime))
##############################
####################################################################################
## Data visualisation with ggplot2 ##
####################################################################################
# Load the plotting package.
library(ggplot2)

# A **`ggplot2`** plot is usually built up step by step.
# Step 1: tell ggplot which dataset to use
ggplot(data = scotelection)
# Step 2: lay out the axis(es), here the comments count ('comments') on x
ggplot(data = scotelection, mapping = aes(x = comments))
# Step 3: choose a geom to draw the data
ggplot(data = scotelection, mapping = aes(x = comments)) +
  geom_histogram()

# Arguments of the geom change its appearance, e.g. the bin width
ggplot(data = scotelection, mapping = aes(x = comments)) +
  geom_histogram(binwidth = 100)

# Overall means of the three engagement counts;
# `mean_comments` is drawn as a reference line in the next plot
mean_likes <- mean(scotelection$likes)
mean_comments <- mean(scotelection$comments)
mean_shares <- mean(scotelection$shares)
# A second geom adds a dashed red vertical line at the mean
ggplot(data = scotelection, mapping = aes(x = comments)) +
  geom_histogram(binwidth = 100) +
  geom_vline(
    xintercept = mean_comments,
    color = "red",
    linetype = "dashed"
  )

# Colours are arguments too (in ggplot2, `fill` is the colour of the filling)
ggplot(data = scotelection, mapping = aes(x = comments)) +
  geom_histogram(binwidth = 100, fill = "red")
# `alpha` makes the bars partly transparent
ggplot(data = scotelection, mapping = aes(x = comments)) +
  geom_histogram(binwidth = 100, fill = "red", alpha = 0.8)
########## Exercise 4 ##########
# Using the code above, create a histogram for the Likes count ('likes')
##############################
####################################################################################
## Visualising two categorical variables ##
####################################################################################
# A bar plot with one bar per party; `coord_flip()` swaps the axes so the
# party names are laid out along the vertical axis
ggplot(data = scotelection, mapping = aes(x = snsname)) +
  geom_bar() +
  coord_flip()

# Two categorical variables in one plot:
# each bar is filled according to the post type (`type`)
ggplot(data = scotelection, mapping = aes(x = snsname, fill = type)) +
  geom_bar() +
  coord_flip()

# The `position` argument controls how the bars are placed;
# 'position = "dodge"' puts them side by side
ggplot(data = scotelection, mapping = aes(x = snsname, fill = type)) +
  geom_bar(position = "dodge") +
  coord_flip()

# With 'position = "fill"' every bar is stretched to fill the whole y axis,
# so the y axis becomes a proportion (and is labelled accordingly)
ggplot(data = scotelection, mapping = aes(x = snsname, fill = type)) +
  geom_bar(position = "fill") +
  coord_flip() +
  labs(y = "proportion")
####################################################################################
## Visualising two continuous variables ##
####################################################################################
# Set up a plot for two continuous variables: comments on x, likes on y
ggplot(data = scotelection, mapping = aes(x = comments, y = likes))
# `geom_point()` turns it into a scatter plot
ggplot(data = scotelection, mapping = aes(x = comments, y = likes)) +
  geom_point()
# Again, you can add the mean values as reference lines: a vertical line at
# the mean number of comments and a horizontal line at the mean number of likes.
# `mean_comments` and `mean_likes` are the objects computed earlier in this
# script with `mean(scotelection$comments)` and `mean(scotelection$likes)`.
ggplot(data = scotelection, mapping = aes(x = comments, y = likes)) +
  geom_point() +
  geom_vline(xintercept = mean_comments) +
  geom_hline(yintercept = mean_likes)
# You can even add the same geom twice, with different values on the x and y
# axes: the second `geom_point()` marks the overall mean with a large red dot
ggplot(data = scotelection, mapping = aes(x = comments, y = likes)) +
  geom_point() +
  geom_point(
    mapping = aes(x = mean_comments, y = mean_likes),
    color = "red",
    size = 6
  )
# Passing 'color = "red"' to the geom draws every dot in red
ggplot(data = scotelection, mapping = aes(x = comments, y = likes)) +
  geom_point(color = "red")
# Mapping color inside `aes()` colors the dots by party instead
ggplot(data = scotelection, mapping = aes(x = comments, y = likes)) +
  geom_point(mapping = aes(color = snsname))

# There are many things you can do by adding layers to the ggplot.
# For example, put both axes on a log scale.
# PS: You will see a warning message; don't worry, it is just because our
# data contain zeros
ggplot(
  data = scotelection,
  mapping = aes(x = comments, y = likes, color = snsname)
) +
  geom_point(alpha = 0.5) + # points are made transparent for visibility
  scale_x_log10() +
  scale_y_log10()

# Or fit a line
ggplot(
  data = scotelection,
  mapping = aes(x = comments, y = likes, color = snsname)
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE)

# And add axis labels, a title and a subtitle
ggplot(
  data = scotelection,
  mapping = aes(x = comments, y = likes, color = snsname)
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Comments Count",
    y = "Likes Count",
    title = "Comments and Likes",
    subtitle = "One post per dot"
  )

# Change the theme by adding theme_bw()
ggplot(
  data = scotelection,
  mapping = aes(x = comments, y = likes, color = snsname)
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Comments Count",
    y = "Likes Count",
    title = "Comments and Likes",
    subtitle = "One post per dot"
  ) +
  theme_bw()
# To save a graph, store it in an object first, then hand that object to
# `ggsave()` together with the name of the output file:
myplot <- ggplot(
  data = scotelection,
  mapping = aes(x = comments, y = likes, color = snsname)
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Comments Count",
    y = "Likes Count",
    title = "Comments and Likes",
    subtitle = "One post per dot"
  )
ggsave(filename = "my_plot.png", plot = myplot)
# Facets make small multiples: `facet_wrap()` draws one panel per party
ggplot(data = scotelection, mapping = aes(x = comments, y = likes)) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(facets = vars(snsname))
########## Exercise 5 ##########
# Make a scatter plot of shares by comments count, log both axes,
# color them by party, change shape by post type (adding 'shape = type' in aes())
##############################
####################################################################################
## It's about time ##
####################################################################################
# These packages are needed (uncomment the install lines if they are missing)
# install.packages("scales")
# install.packages("lubridate")
library(scales)
library(lubridate)

# To plot a time series, the data first have to be in a time/date format.
# There are many formats for time and date, but the simplest way is probably
# to define a new column called 'date' with `as.Date()`:
scotelection$date <- as.Date(scotelection$datetime)

# The same scatter plot as before, now with 'date' on the x axis
ggplot(
  data = scotelection,
  mapping = aes(x = date, y = likes, color = snsname)
) +
  geom_point()

# `scale_x_date()` changes the x scale: month/year labels, one break per month
ggplot(
  data = scotelection,
  mapping = aes(x = date, y = likes, color = snsname)
) +
  geom_point() +
  scale_x_date(labels = date_format("%m/%y"), date_breaks = "1 month")

# Lines instead of points
ggplot(
  data = scotelection,
  mapping = aes(x = date, y = likes, color = snsname)
) +
  geom_line() +
  scale_x_date(labels = date_format("%m/%y"), date_breaks = "1 month")

# Or use `geom_smooth()` to fit a local regression line
ggplot(
  data = scotelection,
  mapping = aes(x = date, y = likes, color = snsname)
) +
  geom_smooth(method = "loess") +
  scale_x_date(labels = date_format("%m/%y"), date_breaks = "1 month")

# Two geoms can be drawn on top of each other
ggplot(
  data = scotelection,
  mapping = aes(x = date, y = likes, color = snsname)
) +
  geom_smooth(method = "loess") +
  geom_point(alpha = 0.3) +
  scale_x_date(labels = date_format("%m/%y"), date_breaks = "1 month")

# You could also truncate the y axis (use with caution!)
ggplot(
  data = scotelection,
  mapping = aes(x = date, y = likes, color = snsname)
) +
  geom_smooth(method = "loess") +
  geom_point(alpha = 0.3) +
  scale_x_date(labels = date_format("%m/%y"), date_breaks = "1 month") +
  ylim(0, 3000)
# For some plots, it is easier to transform the data in advance.
# A reminder of the tools used here:
# - '%>%' is the piping operator; it acts as a pipeline: the output on its
#   left is fed into the function on its right as the input
# - group_by() splits the data into groups
# - summarise() makes a calculation per group and puts the result into a
#   new variable
library(dplyr)
# Total likes per party and month, stored in the new object 'plot_data'.
# The target name is written first (`<-`) so it is visible at a glance.
plot_data <- scotelection %>% # take 'scotelection', put it into the pipe
  # split the data by party and by month (dates are floored to the month)
  group_by(snsname, date = floor_date(date, "month")) %>%
  # total likes per group; `.groups = "drop"` returns an ungrouped result
  # and avoids the message summarise() prints about the remaining grouping
  summarise(total_likes = sum(likes), .groups = "drop")
# Plot the transformed data: one line per party
ggplot(
  data = plot_data,
  mapping = aes(x = date, y = total_likes, color = snsname)
) +
  geom_line()
########## Exercise 6 ##########
# Using the code above, aggregate comment counts by month.
# Plot an area plot (select a sensible position)
# TIPS:
# plot_data <- scotelection %>%
# group_by(type, date=floor_date(date, "month")) %>%
# summarise(###### = sum(#######)) # Replace the ###### placeholders with the variable names
##############################
####################################################################################
## What's next? ##
####################################################################################
# You won't be able to learn everything about R in three hours,
# but I hope this workshop gives you a head start in your programming journey.
# If you would like to learn more, there are plenty of (free) resources online:
# https://www.tidyverse.org/
# http://www.cookbook-r.com/
# https://socviz.co/