# PPHA 30537
# Spring 2024
# Homework 2
# Uchenna Offorjebe
# MadeByUche
# Due date: Sunday April 21st before midnight
# Write your answers in the space between the questions, and commit/push only
# this file to your repo. Note that there can be a difference between giving a
# "minimally" right answer, and a really good answer, so it can pay to put
# thought into your work. Using functions for organization will be rewarded.
##################
# To answer these questions, you will use the csv document included in
# your repo. In nst-est2022-alldata.csv: SUMLEV is the level of aggregation,
# where 10 is the whole US, and other values represent smaller geographies.
# REGION is the fips code for the US region. STATE is the fips code for the
# US state. The other values are as per the data dictionary at:
# https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2020-2022/NST-EST2022-ALLDATA.pdf
# Note that each question will build on the modified dataframe from the
# question before. Make sure the SettingWithCopyWarning is not raised.
# PART 1: Macro Data Exploration
import os

import numpy as np  # used in 2.3 for the random VISITED fill
import pandas as pd
import us  # fips <-> abbreviation crosswalk, used from 1.2 onward
# Question 1.1: Load the population estimates file into a dataframe. Specify
# an absolute path using the Python os library to join filenames, so that
# anyone who clones your homework repo only needs to update one for all
# loading to work.
base_path = r'/Users/uchennaofforjebe/Documents/GitHub/homework-2-MadeByUche'  # absolute path to the cloned repo: the one line anyone needs to update
path = os.path.join(base_path, 'NST-EST2022-ALLDATA.csv')
nst = pd.read_csv(path)
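# Alternative sketch (assumes this script sits in the repo root next to the
# csv): derive base_path from the script's own location, so nothing at all
# needs editing after a clone.
# base_path = os.path.dirname(os.path.abspath(__file__))
# path = os.path.join(base_path, 'NST-EST2022-ALLDATA.csv')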
# Question 1.2: Your data only includes fips codes for states (STATE). Use
# the us library to crosswalk fips codes to state abbreviations. Drop the
# fips codes.
# Support on question:
# Code here inspired by: PyCharm - Python : https://plugins.jetbrains.com/plugin/16630-introduction-to-python
# Code here inspired by: https://saturncloud.io/blog/how-to-remove-rows-from-pandas-data-frame-that-contains-any-string-in-a-particular-column/
nst['state_name'] = nst['STATE'].astype(str).str.zfill(2)  # zero-pad to two-digit fips
fips_swap = us.states.mapping('fips', 'abbr')
nst['state_name'] = nst['state_name'].replace(fips_swap)
# The us library leaves DC out of its mappings unless DC_STATEHOOD is set,
# so map fips 11 to DC explicitly (a no-op if it was already mapped)
nst.loc[nst['state_name'] == '11', 'state_name'] = 'DC'
nst = nst[nst['state_name'] != '00']  # drop national/regional aggregate rows (fips 00)
nst = nst.drop('STATE', axis=1)
print(nst)
# Question 1.3: Then show code doing some basic exploration of the
# dataframe; imagine you are an intern and are handed a dataset that your
# boss isn't familiar with, and asks you to summarize for them. Do not
# create plots or use groupby; we will do that in future homeworks.
# Show the relevant exploration output with print() statements.
print(nst.shape)
print(nst.head(20))
print(nst.tail(20))
print(nst.dtypes)
print(nst.describe())
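# A couple more quick summaries (a sketch; column names as in the census
# file): missing values per column, and the full column list.
print(nst.isna().sum().sort_values(ascending=False).head(10))
print(nst.columns.tolist())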
# Question 1.4: Subset the data so that only observations for individual
# US states remain, and only state abbreviations and data for the population
# estimates in 2020-2022 remain. The dataframe should now have 4 columns.
popest_2020_2022 = nst[["state_name", "POPESTIMATE2020", "POPESTIMATE2021", "POPESTIMATE2022"]].copy()  # .copy() so the column assignment in 1.6 does not raise SettingWithCopyWarning
print(popest_2020_2022)
# Question 1.5: Show only the 10 largest states by 2021 population estimates,
# in descending order.
top_10_population = popest_2020_2022.sort_values('POPESTIMATE2021', ascending=False).head(10)
print(top_10_population)
# Question 1.6: Create a new column, POPCHANGE, that is equal to the change in
# population from 2020 to 2022. How many states gained and how many lost
# population between these estimates?
# 20 states had a drop in their population between these estimates.
popest_2020_2022['POPCHANGE'] = popest_2020_2022['POPESTIMATE2022'] - popest_2020_2022['POPESTIMATE2020']
print(popest_2020_2022)
population_gain = (popest_2020_2022['POPCHANGE'] > 0).sum()
population_drop = (popest_2020_2022['POPCHANGE'] < 0).sum()
print("Gained:", population_gain, "Lost:", population_drop)
# Question 1.7: Show all the states that had an estimated change in either
# direction of smaller than 1000 people.
# code inspired by: https://www.w3schools.com/python/ref_func_abs.asp
state_popchange_1000 = popest_2020_2022[popest_2020_2022['POPCHANGE'].abs() < 1000]
state_popchange_1000_by_states = state_popchange_1000[['state_name', 'POPCHANGE']].set_index('state_name')
print(state_popchange_1000_by_states)
# Question 1.8: Show the states that had a population growth or loss of
# greater than one standard deviation. Do not create a new column in your
# dataframe. Sort the result by descending order of the magnitude of
# POPCHANGE.
# Code inspired by: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.std.html
POPCHANGE_std = popest_2020_2022["POPCHANGE"].std()
print(POPCHANGE_std)
popchange_gtlt_std = popest_2020_2022[popest_2020_2022['POPCHANGE'].abs() > POPCHANGE_std]
popchange_std_state = popchange_gtlt_std[['state_name', 'POPCHANGE']].sort_values(by="POPCHANGE", key=lambda s: s.abs(), ascending=False)  # sort by magnitude
print(popchange_std_state)
# PART 2: Data manipulation
# Question 2.1: Reshape the data from wide to long, using the wide_to_long function,
# making sure you reset the index to the default values if any of your data is located
# in the index. What happened to the POPCHANGE column, and why should it be dropped?
# Explain in a brief (1-2 line) comment.
# POPCHANGE has no year suffix, so wide_to_long treats it as an id-level
# variable and repeats the same value on every year's row; that duplication
# is misleading in long format, so the column should be dropped.
popest_2020_2022 = popest_2020_2022.reset_index(drop=True)
popest_2020_2022.rename(columns={'state_name': 'STATE'}, inplace=True)
popest_2020_2022_long = pd.wide_to_long(popest_2020_2022, stubnames="POPESTIMATE", i="STATE", j="year")
print(popest_2020_2022_long)
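# Quick sanity check (a sketch): one row per state-year means the long frame
# should be exactly three times the length of the wide frame.
assert len(popest_2020_2022_long) == 3 * len(popest_2020_2022)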
# Question 2.2: Repeat the reshaping using the melt method. Clean up the result so
# that it is the same as the result from 2.1 (without the POPCHANGE column).
popest_2020_2022 = popest_2020_2022.drop('POPCHANGE', axis=1)
popest_2020_2022_long_melt = popest_2020_2022.melt(id_vars=["STATE"], var_name="year", value_name="POPESTIMATE")
# Strip the stub from the year values and index by state-year to match 2.1
popest_2020_2022_long_melt['year'] = popest_2020_2022_long_melt['year'].str.replace('POPESTIMATE', '', regex=False).astype(int)
popest_2020_2022_long_melt = popest_2020_2022_long_melt.set_index(['STATE', 'year'])
print(popest_2020_2022_long_melt)
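# Verification sketch: once POPCHANGE is dropped from the wide_to_long result
# and both indexes are sorted the same way, the two frames should agree.
print(popest_2020_2022_long.drop(columns='POPCHANGE').sort_index().head())
print(popest_2020_2022_long_melt.sort_index().head())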
# Question 2.3: Open the state-visits.xlsx file in Excel, and fill in the VISITED
# column with a dummy variable for whether you've visited a state or not. If you
# haven't been to many states, then filling in a random selection of them
# is fine too. Save your changes. Then load the xlsx file as a dataframe in
# Python, and merge the VISITED column into your original wide-form population
# dataframe, only keeping values that appear in both dataframes. Are any
# observations dropped from this? Show code where you investigate your merge,
# and display any observations that weren't in both dataframes.
# Code inspired by https://gist.github.com/sethbunke/c8132a78432ae99a6410ceaaa5f06854
np.random.seed(30537)  # arbitrary fixed seed so the random VISITED fill is reproducible
state_visit = pd.read_excel(os.path.join(base_path, 'state-visits.xlsx'))
state_visit["VISITED"] = np.random.choice((1, 0), size=state_visit.shape[0])  # random fill, which the prompt allows
state_visit_merge = state_visit.merge(popest_2020_2022, on='STATE', how='inner')
start_len = len(state_visit)
end_len = len(state_visit_merge)
print("Start:", start_len, "End:", end_len)
# Rows for non-state entries drop out of the inner merge.
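# To actually display the observations that were not in both frames, one
# sketch is an outer merge with indicator=True:
merge_check = state_visit.merge(popest_2020_2022, on='STATE', how='outer', indicator=True)
print(merge_check[merge_check['_merge'] != 'both'])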
# Question 2.4: The file policy_uncertainty.xlsx contains monthly measures of
# economic policy uncertainty for each state, beginning in different years for
# each state but ending in 2022 for all states. The EPU_National column estimates
# uncertainty from national sources, EPU_State from state, and EPU_Composite
# from both (EPU-N, EPU-S, EPU-C). Load it as a dataframe, then calculate
# the mean EPU-C value for each state/year, leaving only columns for state,
# year, and EPU_Composite, with each row being a unique state-year combination.
policy_uncertainty = pd.read_excel(os.path.join(base_path, 'policy_uncertainty.xlsx'))
policy_uncertainty_filtered = (
    policy_uncertainty
    .groupby(['state', 'year'], as_index=False)['EPU_Composite']
    .mean()
    .rename(columns={'EPU_Composite': 'EPU_C_Mean'})
)
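# Sanity-check sketch: every state-year pair should now appear exactly once.
assert not policy_uncertainty_filtered.duplicated(['state', 'year']).any()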
# Question 2.5) Reshape the EPU data into wide format so that each row is unique
# by state, and the columns represent the EPU-C values for the years 2022,
# 2021, and 2020.
state_swap = us.states.mapping('name', 'abbr')
policy_uncertainty_filtered['state'] = policy_uncertainty_filtered['state'].replace(state_swap)
policy_uncertainty_last_3 = policy_uncertainty_filtered[policy_uncertainty_filtered['year'] > 2019].copy()  # .copy() avoids SettingWithCopyWarning below
policy_uncertainty_last_3.rename(columns={'state': 'STATE'}, inplace=True)
policy_uncertainty_last_3.loc[policy_uncertainty_last_3['STATE'] == 'District of Columbia', 'STATE'] = "DC"
policy_uncertainty_wide = policy_uncertainty_last_3.pivot(index='STATE', columns='year', values='EPU_C_Mean')
policy_uncertainty_wide = policy_uncertainty_wide[[2022, 2021, 2020]]  # column order per the question
print(policy_uncertainty_wide)
# Question 2.6) Finally, merge this data into your merged data from question 2.3,
# making sure the merge does what you expect.
# policy_uncertainty_last_3 is long (one row per state-year), so this
# many-to-one merge yields three rows per state, one for each of 2020-2022.
state_visit_merge = state_visit_merge.merge(policy_uncertainty_last_3, on='STATE', how='inner')
print(len(state_visit_merge))
print(state_visit_merge)
# Question 2.7: Using groupby on the VISITED column in the dataframe resulting
# from the previous question, answer the following questions and show how you
# calculated them: a) what is the single smallest state by 2022 population
# that you have visited, and not visited? b) what are the three largest states
# by 2022 population you have visited, and the three largest states by 2022
# population you have not visited? c) do states you have visited or states you
# have not visited have a higher average EPU-C value in 2022?
# Code here inspired by: PyCharm - Python : https://plugins.jetbrains.com/plugin/16630-introduction-to-python
# A
# The single smallest state visited by population is VT
# The single smallest state not visited by population is WY
states_visited = state_visit_merge[(state_visit_merge['VISITED'] == 1)]
states_n_visited = state_visit_merge[(state_visit_merge['VISITED'] == 0)]
smallest_v_state = states_visited.groupby('STATE')['POPESTIMATE2022'].min().nsmallest(1)
smallest_nv_state = states_n_visited.groupby('STATE')['POPESTIMATE2022'].min().nsmallest(1)
print(smallest_v_state)
print(smallest_nv_state)
# B
# The 3 largest states visited by population are TX, FL, IL
# The 3 largest states not visited by population are CA, NY, PA
largest_v_state = states_visited.groupby('STATE')['POPESTIMATE2022'].max().nlargest(3)
largest_nv_state = states_n_visited.groupby('STATE')['POPESTIMATE2022'].max().nlargest(3)
print(largest_v_state)
print(largest_nv_state)
# C
# The states I have not visited have a higher EPU-C mean
# Visited States: 296.89
# States not visited: 340.39
# The question asks about 2022, so restrict to that year's rows
states_visited_epu = states_visited.loc[states_visited['year'] == 2022, ['STATE', 'EPU_C_Mean']]
states_n_visited_epu = states_n_visited.loc[states_n_visited['year'] == 2022, ['STATE', 'EPU_C_Mean']]
v_state_epc_avg = states_visited_epu['EPU_C_Mean'].mean()
nv_state_epc_avg = states_n_visited_epu['EPU_C_Mean'].mean()
print(v_state_epc_avg)
print(nv_state_epc_avg)
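# The question asks for groupby on VISITED; a sketch that answers (c) in one
# line (assumes the 2.6 merge above, with one row per state-year):
print(state_visit_merge[state_visit_merge['year'] == 2022].groupby('VISITED')['EPU_C_Mean'].mean())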
# Question 2.8: Transforming data to have mean zero and unit standard deviation
# is often called "standardization", or a "zscore". The basic formula to
# apply to any given value is: (value - mean) / std
# Return to the long-form EPU data you created in step 2.4 and then, using groupby
# and a function you write, transform the data so that the values for EPU-C
# have mean zero and unit standard deviation for each state. Add these values
# to a new column named EPU_C_zscore.
# Code here inspired by: PyCharm - Python : https://plugins.jetbrains.com/plugin/16630-introduction-to-python
# The 2.4 data is already long (one row per state-year), so no reshape is
# needed; groupby + transform applies the zscore within each state.
def zscore(values):
    return (values - values.mean()) / values.std()

policy_uncertainty_filtered['EPU_C_zscore'] = policy_uncertainty_filtered.groupby('state')['EPU_C_Mean'].transform(zscore)
print(policy_uncertainty_filtered)
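# Verification sketch: within each state the zscores should have mean ~0 and
# std ~1.
print(policy_uncertainty_filtered.groupby('state')['EPU_C_zscore'].agg(['mean', 'std']).round(6).head())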