-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChicago_community_analysis.Rmd
166 lines (135 loc) · 5.5 KB
/
Chicago_community_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
---
title: "Chicago_community_analysis"
author: "Ben Ellman"
date: "2024-07-02"
output: html_document
---
#load required packages
library(tidyverse)
library(ggplot2)
library(rvest)
library(data.table)
library(readxl)
library(datawizard)
library(gplots)
# This document analyzes the data created and cleaned in the Chicago_businesses (and other) workbooks; for the source datasets and the cleaning/manipulation steps, see those workbooks.
# Main data source: chi_community, from "Chicago_businesses.Rmd"
# Active business licenses by area ----

# Distribution: collapse to one median value per region-year, then draw one
# box per region from those yearly medians.
chi_community %>%
  group_by(region, year) %>%
  summarise(bus_per_person = median(bus_per_person, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = region, y = bus_per_person)) +
  geom_boxplot() +
  ggtitle("Distribution of businesses across Chicago regions") +
  xlab("Region") +
  ylab("Active business licenses per person")

# Group means (with the intervals gplots draws by default) computed from the
# unsummarized rows, to show heterogeneity across regions.
plotmeans(
  formula = bus_per_person ~ region,
  data = chi_community,
  main = "Heterogeneity of businesses across regions"
)
# Relationship between active business licenses in a community and
# unemployment, crime, and other possibly dependent demographic factors ----

# Active business licenses and unemployment (LOOP excluded as an outlier).
# No grouping is needed: the filter is row-wise and ggplot ignores dplyr
# groups, so the former group_by(community, year) had no effect.
chi_community %>%
  filter(community != "LOOP") %>%
  ggplot(aes(x = bus_per_person, y = unemp_rate)) +
  geom_point(aes(color = year)) +
  geom_smooth() +
  labs(
    title = "Unemployment and regional businesses per person seem inversely related",
    x = "Active business licenses per person",
    y = "Unemployment rate"
  )

# Same relationship with the unemployment rate on a log scale.
chi_community %>%
  filter(community != "LOOP") %>%
  mutate(log_unemp = log(unemp_rate)) %>%
  ggplot(aes(x = bus_per_person, y = log_unemp)) +
  geom_point(aes(color = year)) +
  geom_smooth() +
  labs(
    title = "Unemployment and regional businesses per person seem inversely related",
    x = "Active business licenses per person",
    y = "Logged unemployment rate"
  )
# Active business licenses and crime (LOOP excluded as an outlier).
# The row-wise filter and the plot do not depend on grouping, so the data is
# left ungrouped.
chi_community %>%
  filter(community != "LOOP") %>%
  ggplot(aes(x = bus_per_person, y = crime_rate)) +
  geom_point(aes(color = year)) +
  geom_smooth() +
  labs(
    title = "Crime and regional businesses per person seem unrelated",
    x = "Active business licenses per person",
    y = "Crime rate (per 100k)"
  )
# Active business licenses and median income (LOOP excluded as an outlier).
# Income is plotted on a log scale, so the y-axis label says so (matching the
# "Logged unemployment rate" label used for the logged unemployment plot).
chi_community %>%
  filter(community != "LOOP") %>%
  mutate(log_medinc = log(medinc)) %>%
  ggplot(aes(x = bus_per_person, y = log_medinc)) +
  geom_point(aes(color = year)) +
  geom_smooth() +
  labs(
    title = "Income and regional businesses per person seem positively related",
    x = "Active business licenses per person",
    y = "Logged median income"
  )
# Clustering neighborhoods to identify similar areas based on socioeconomic
# factors ----

# Drop the 2015 rows (no medinc data) and the columns with missing values.
comm_cluster <- chi_community %>%
  filter(year != 2015) %>%
  select(-c(med_ha, ind, indperc, mix, mixperc))

# Show any rows that still contain missing values.
comm_cluster[!complete.cases(comm_cluster), ]

# Initial variables of interest for the clustering.
cluster_vars <- c(
  "year", "unemp_rate", "lbfrc_rate", "tot_pop", "crime_rate",
  "violent", "med_age", "medinc", "med_hv", "bus_per_person"
)

# Standardize only the variables that are actually clustered. scale() works
# column by column, so this yields the same values as scaling every numeric
# column first and subsetting afterwards.
num_cluster <- comm_cluster %>%
  select(all_of(cluster_vars)) %>%
  scale()

# kmeans() cannot handle missing values; fail here with a clear message
# instead of a low-level error later on.
if (anyNA(num_cluster)) {
  stop(
    "Clustering variables contain missing (or zero-variance) values; ",
    "inspect the incomplete rows printed above.",
    call. = FALSE
  )
}
# Identify a suitable k for k-means via the elbow method: total within-cluster
# sum of squares (WCSS) for k = 1..10.
# The seed makes the curve reproducible, and nstart = 30 (the same value used
# for the final model) keeps a single poor random start from distorting the
# WCSS for any given k.
set.seed(10)
k_values <- 1:10
wcss <- vapply(
  k_values,
  function(k) kmeans(num_cluster, centers = k, nstart = 30)$tot.withinss,
  numeric(1)
)
plot(k_values, wcss, type = "b", xlab = "Number of Clusters (k)", ylab = "WCSS")
# Final model: k-means with three clusters, keeping the best of 30 random
# starts. The seed is fixed so the cluster assignment is reproducible.
set.seed(10)
km_three <- kmeans(x = num_cluster, centers = 3, nstart = 30)
print(km_three)
# Attach the k-means cluster label of each row to comm_cluster and place it
# right after the first three columns.
# The column is added and positioned by name, so this no longer depends on
# comm_cluster having exactly 149 columns and can be re-run safely
# (re-running simply overwrites the existing `cluster` column).
comm_cluster <- comm_cluster %>%
  mutate(cluster = km_three$cluster) %>%
  relocate(cluster, .after = 3)
# Summarize the clusters against demographic variables to see the spread
# within and between centroids: cluster size plus the medians of
# unemployment, population, income and home value.
comm_cluster %>%
  group_by(cluster) %>%
  summarise(
    n = n(),
    med_unemp = median(unemp_rate),
    med_pop = median(tot_pop),
    med_inc = median(medinc),
    med_hv = median(med_hv)
  )
# Boxplots of key variables by cluster. The boxes are defined by the
# `group = cluster` aesthetic, so no dplyr grouping is needed (the former
# group_by(community, year) had no effect on the plots).

# Unemployment rate by cluster
comm_cluster %>%
  ggplot(aes(x = cluster, y = unemp_rate)) +
  geom_boxplot(aes(group = cluster))

# Violent crime by cluster
comm_cluster %>%
  ggplot(aes(x = cluster, y = violent)) +
  geom_boxplot(aes(group = cluster))

# Active business licenses per person by cluster
comm_cluster %>%
  ggplot(aes(x = cluster, y = bus_per_person)) +
  geom_boxplot(aes(group = cluster))

# Median income by cluster
comm_cluster %>%
  ggplot(aes(x = cluster, y = medinc)) +
  geom_boxplot(aes(group = cluster))
# Analyze business growth to isolate the effect of changes in business
# licenses and remove between-neighborhood effects (simulating a
# first-differences approach); the effect of business growth on median income
# is too small.
#
# The year-over-year change in median income is computed within each
# community, ordered by year, and BEFORE dropping the early years. Taking the
# lag over the whole filtered table would subtract the previous row whatever
# community it belongs to, and the first retained year would have no valid
# predecessor.
# NOTE(review): assumes one row per community-year with consecutive years --
# confirm against Chicago_businesses.Rmd.
chi_community %>%
  arrange(community, year) %>%
  group_by(community) %>%
  mutate(medinc_change = medinc - dplyr::lag(medinc, n = 1L)) %>%
  ungroup() %>%
  filter(year > 2016) %>%
  ggplot(aes(x = bus_growth, y = medinc_change)) +
  geom_point() +
  geom_smooth() +
  labs(y = "Year-over-year change in median income")
# Business growth against violent crime for years after 2016; little to no
# relationship is visible.
chi_community %>%
  filter(year > 2016) %>%
  ggplot(mapping = aes(x = bus_growth, y = violent)) +
  geom_point() +
  geom_smooth()
# It's possible that the rate of change in business licenses doesn't capture the true growth of businesses and business activity in a neighborhood in any given year. When analyzing the effects of businesses on income, crime, etc., there is likely to be a disconnect (for example, a time lag) between business activity and its residual effects on the neighborhood. Alternatively, increased business activity could itself be a sign of already rising income, decreasing crime, etc., potentially reversing the direction of the relationship.