-
Notifications
You must be signed in to change notification settings - Fork 0
/
NRSDeaths_2000-2019DataCSVRead.R
304 lines (255 loc) · 15.4 KB
/
NRSDeaths_2000-2019DataCSVRead.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
################################################
# #
# #
# Read .csv files with deaths (2000-2019) #
# #
# #
################################################
# Set WD to ages data path
setwd(NRSAgesDataDirectory)
# Get list of .csv files in directory
DeathsFilesList = list.files(path = NRSAgesDataDirectory,
pattern = "*.csv",
ignore.case = TRUE
)
# This won't download the data each time as the files are zipped.
# Set up basic data frame that will be filled by loop
DeathsWeekly_pivot_aggregate_All = data.frame(Date = as.Date("2020-01-01", origin = "1970-01-01"),
ISODate = "2020-W1-01",
Year = 2020,
Week = 1,
Age = as.factor("0"),
Deaths = 0
)
DeathsWeekly_pivot_aggregate_All = DeathsWeekly_pivot_aggregate_All[NULL,]
DeathsWeekly_pivot_aggregate = DeathsWeekly_pivot_aggregate_All
DeathsColNames = c("Age","Total",as.character(seq(from = 1,to = 53,by =1)))
DWAggColNames = c("Date","ISODate","Year","Week","Age","Deaths")
# Run through all files and add into DeathsWeekly_pivot_aggregate_All
for (FileCounter in 1:length(DeathsFilesList)) {
DeathsWeekly_pivot_aggregate = DeathsWeekly_pivot_aggregate[NULL,]
# B4:BD4 - column names
# B5:BD24 - female data
# B25:BD44 - male data
# Read female data
DeathsWeekly_Female = read.csv(DeathsFilesList[FileCounter],skip = 3,nrows = 20)
DeathsWeekly_Female = DeathsWeekly_Female %>%
select(-Sex)
colnames(DeathsWeekly_Female) = DeathsColNames
FileYear = str_sub(str_split(DeathsFilesList[FileCounter],".csv")[[1]][1],-4)
DeathsWeekly_Female_pivot = DeathsWeekly_Female %>%
pivot_longer(!Age,names_to = "Week",values_to = "Deaths")
DeathsWeekly_Female_pivot = DeathsWeekly_Female_pivot %>%
mutate(Sex = rep("F",nrow(DeathsWeekly_Female_pivot))) %>% # Add Sex column
relocate(Sex) %>%
mutate(Year = rep(FileYear,nrow(DeathsWeekly_Female_pivot))) %>% # Add year column
relocate(Week) %>%
relocate(Year) %>%
filter(Week != "Total")
DeathsWeekly_Female_pivot = DeathsWeekly_Female_pivot %>% # Add ISO dates and dates (first day of week, Sunday)
mutate(ISODate = paste(Year,"-W",sprintf("%02d",as.integer(Week)),"-","1",sep = "")) %>%
mutate(Date = ISOweek2date(ISODate)) %>%
relocate(ISODate) %>%
relocate(Date)
# Read male data
DeathsWeekly_Male = read.csv(DeathsFilesList[FileCounter],skip = 23,nrows = 20)
DeathsWeekly_Male = DeathsWeekly_Male %>%
select(-X) # Inherets blank name from empty cell above.
colnames(DeathsWeekly_Male) = DeathsColNames
DeathsWeekly_Male_pivot = DeathsWeekly_Male %>%
pivot_longer(!Age,names_to = "Week",values_to = "Deaths")
DeathsWeekly_Male_pivot = DeathsWeekly_Male_pivot %>%
mutate(Sex = rep("F",nrow(DeathsWeekly_Male_pivot))) %>% # Add Sex column
relocate(Sex) %>%
mutate(Year = rep(FileYear,nrow(DeathsWeekly_Male_pivot))) %>% # Add year column
relocate(Week) %>%
relocate(Year) %>%
filter(Week != "Total")
DeathsWeekly_Male_pivot = DeathsWeekly_Male_pivot %>% # Add ISO dates and dates (first day of week, Sunday)
mutate(ISODate = paste(Year,"-W",sprintf("%02d",as.integer(Week)),"-","1",sep = "")) %>%
mutate(Date = ISOweek2date(ISODate)) %>%
relocate(ISODate) %>%
relocate(Date)
# Put all data together and calculate total deaths each week
DeathsWeekly_pivot = rbind.data.frame(DeathsWeekly_Female_pivot,DeathsWeekly_Male_pivot)
# Get formats correct
DeathsWeekly_pivot$Date = as.character(DeathsWeekly_pivot$Date)
DeathsWeekly_pivot$Date = as.Date(DeathsWeekly_pivot$Date,
origin = "1970-01-01")
DeathsWeekly_pivot$Year = as.numeric(DeathsWeekly_pivot$Year)
DeathsWeekly_pivot$Week = as.numeric(DeathsWeekly_pivot$Week)
DeathsWeekly_pivot$Age = as.factor(DeathsWeekly_pivot$Age)
# Aggregate sex results to give total per week
DeathsWeekly_pivot_aggregate = aggregate(x = DeathsWeekly_pivot$Deaths,by=list(DeathsWeekly_pivot$Date,
DeathsWeekly_pivot$ISODate,
DeathsWeekly_pivot$Year,
DeathsWeekly_pivot$Week,
DeathsWeekly_pivot$Age
),
FUN=sum)
colnames(DeathsWeekly_pivot_aggregate) = DWAggColNames
# Add aggregated df into basic df
DeathsWeekly_pivot_aggregate_All = rbind.data.frame(DeathsWeekly_pivot_aggregate_All,DeathsWeekly_pivot_aggregate)
}
DeathsWeekly_pivot_aggregate_All = DeathsWeekly_pivot_aggregate_All %>%
filter(ISODate != "2000-W53-1") %>%
filter(ISODate != "2001-W53-1") %>%
filter(ISODate != "2002-W53-1") %>% # Filter out these non-existent weeks. Doesn't work when ORred for no apparent reason.
filter(ISODate != "2003-W53-1") %>%
filter(ISODate != "2005-W53-1") %>%
filter(ISODate != "2006-W53-1") %>%
filter(ISODate != "2007-W53-1") %>%
filter(ISODate != "2008-W53-1") %>%
filter(ISODate != "2010-W53-1") %>%
filter(ISODate != "2011-W53-1") %>%
filter(ISODate != "2012-W53-1") %>%
filter(ISODate != "2013-W53-1") %>%
filter(ISODate != "2014-W53-1") %>%
filter(ISODate != "2016-W53-1") %>%
filter(ISODate != "2017-W53-1") %>%
filter(ISODate != "2018-W53-1") %>%
filter(ISODate != "2019-W53-1")
DeathsWeekly_pivot_aggregate_2015_2019 = DeathsWeekly_pivot_aggregate_All %>%
filter(Year >= "2015")
DeathsWeekly_mean_2015_2019 = aggregate(x = DeathsWeekly_pivot_aggregate_2015_2019$Deaths,by=list(DeathsWeekly_pivot_aggregate_2015_2019$Week,
DeathsWeekly_pivot_aggregate_2015_2019$Age),
FUN=mean)
colnames(DeathsWeekly_mean_2015_2019) = c("Week","Age","Deaths")
# Recode factors to match those in the NRS Deaths (2020-2021) data.
DeathsWeekly_mean_2015_2019$Age = recode_factor(DeathsWeekly_mean_2015_2019$Age,
"0" = "0",
"1-4" = "1 to 4",
"5-9" = "5 to 9",
"10-14" = "10 to 14",
"15-19" = "15 to 19",
"20-24" = "20 to 24",
"25-29" = "25 to 29",
"30-34" = "30 to 34",
"35-39" = "35 to 39",
"40-44" = "40 to 44",
"45-49" = "45 to 49",
"50-54" = "50 to 54",
"55-59" = "55 to 59",
"60-64" = "60 to 64",
"65-69" = "65 to 69",
"70-74" = "70 to 74",
"75-79" = "75 to 79",
"80-84" = "80 to 84",
"85-89" = "85 to 89",
"90+" = "90+")
# Create dates for 2020 and 2021 attached to DeathsWeekly_mean_2015_2019 data
# Basically, get 2020-W5-1 etc for each date and repeat dataset for 2021
# Do the same for DeathsWeekly_pivot_aggregate_2015_2019 (because...graphing order later on.)
DeathsWeekly_pivot_aggregate_2015_2019$Age = recode_factor(DeathsWeekly_pivot_aggregate_2015_2019$Age,
"0" = "0",
"1-4" = "1 to 4",
"5-9" = "5 to 9",
"10-14" = "10 to 14",
"15-19" = "15 to 19",
"20-24" = "20 to 24",
"25-29" = "25 to 29",
"30-34" = "30 to 34",
"35-39" = "35 to 39",
"40-44" = "40 to 44",
"45-49" = "45 to 49",
"50-54" = "50 to 54",
"55-59" = "55 to 59",
"60-64" = "60 to 64",
"65-69" = "65 to 69",
"70-74" = "70 to 74",
"75-79" = "75 to 79",
"80-84" = "80 to 84",
"85-89" = "85 to 89",
"90+" = "90+")
DeathsWeekly_ExcessBaseline = rbind.data.frame(DeathsWeekly_mean_2015_2019,DeathsWeekly_mean_2015_2019,DeathsWeekly_mean_2015_2019)
# This uses 2015-2019 deaths for the baseline for 2022. This should be updated to match latest NRS method (uses 2016-2019 + 2021).
Year2020 = as.data.frame(x = rep("2020",nrow(DeathsWeekly_mean_2015_2019)))
colnames(Year2020) <- "Year"
Year2021 = as.data.frame(x = rep("2021",nrow(DeathsWeekly_mean_2015_2019)))
colnames(Year2021) <- "Year"
Year2022 = as.data.frame(x = rep("2022",nrow(DeathsWeekly_mean_2015_2019)))
colnames(Year2022) <- "Year"
Year2020_2022 = rbind.data.frame(Year2020,Year2021,Year2022)
colnames(Year2020_2022) = "Year"
DeathsWeekly_ExcessBaseline = cbind.data.frame(Year2020_2022,DeathsWeekly_ExcessBaseline)
rm(Year2020,Year2021,Year2022,Year2020_2022)
DeathsWeekly_ExcessBaseline = DeathsWeekly_ExcessBaseline %>%
mutate(ISODate = paste(Year,"-W",sprintf("%02d",as.integer(Week)),"-","1",sep = "")) %>%
mutate(Date = ISOweek2date(ISODate)) %>%
relocate(ISODate) %>%
relocate(Date) %>%
filter(ISODate != "2022-W01-1") # Remove W01 of 2022 since W53 of 2021 overlaps it.
# Calculate excess deaths
# SUbtract average(2015-2019) from 2020 and 2021 deaths.
# Need to recategorise data first: NRS 2015-2019 has "90+" as oldest age category, but 2020-2021 weekly deaths has "90 to 94" and "95+"
NRSDeaths_AllSum_oldest = NRSDeaths_AllSum %>%
filter(Age == "90 to 94" | Age == "95+")
NRSDeaths_AllSum_oldest_agg = aggregate(x = NRSDeaths_AllSum_oldest$Deaths,by=list(NRSDeaths_AllSum_oldest$Date,
NRSDeaths_AllSum_oldest$Cause),
FUN=sum) # Add together death values from "90-94" and "95+" into "90+"
aggAges = rep("90+",nrow(NRSDeaths_AllSum_oldest_agg)) # Create list of "90+" factors of same length as NRSDeaths_AllSum_oldest_agg
NRSDeaths_AllSum_oldest_agg = cbind.data.frame(NRSDeaths_AllSum_oldest_agg,aggAges)
colnames(NRSDeaths_AllSum_oldest_agg) = c("Date","Cause","Deaths","Age")
NRSDeaths_AllSum_oldest_agg = NRSDeaths_AllSum_oldest_agg %>%
relocate(Age, .after = Date)
NRSDeaths_AllSum_refactored = NRSDeaths_AllSum %>%
filter(Age != "90 to 94", Age != "95+")
NRSDeaths_AllSum_refactored = rbind.data.frame(NRSDeaths_AllSum_refactored,NRSDeaths_AllSum_oldest_agg)
rm(NRSDeaths_AllSum_oldest_agg,aggAges,NRSDeaths_AllSum_oldest)
# Now that 2015-2019 deaths and 2020-2021 deaths are factored in the same way, calculate the excess.
# How?
# Trim DeathsWeekly_ExcessBaseline to same length as NRSDeaths_AllSum_refactored based on dates
DeathsWeekly_ExcessBaseline_trim = DeathsWeekly_ExcessBaseline %>%
filter(Date >= min(NRSDeaths_AllSum_refactored$Date) & Date <= max(NRSDeaths_AllSum_refactored$Date))
Cause = rep("Baseline",nrow(DeathsWeekly_ExcessBaseline_trim))
DeathsWeekly_ExcessBaseline_trim = cbind.data.frame(DeathsWeekly_ExcessBaseline_trim,Cause)
rm(Cause)
NRSDeaths_AllSum_refactored = NRSDeaths_AllSum_refactored %>%
complete(Date,nesting(Age,Cause),fill = list(Deaths = 0)) # This df has missing values - where deaths were 0 for a given factor on a certain date, the value was missed out.
# Form final DeathsTogether data frame.
DeathsTogether = bind_rows(select(DeathsWeekly_ExcessBaseline_trim,Date,Age,Deaths,Cause),NRSDeaths_AllSum_refactored) %>%
group_by(Date,Age) %>% # Age factors are named differently.
mutate(Excess = Deaths[Cause == "All"] - Deaths[Cause == "Baseline"])
# Reorder factors
DeathsTogether$Age <- factor(DeathsTogether$Age, levels=c("0","1 to 4","5 to 9","10 to 14",
"15 to 19","20 to 24","25 to 29",
"30 to 34","35 to 39","40 to 44",
"45 to 49","50 to 54","55 to 59",
"60 to 64","65 to 69","70 to 74",
"75 to 79","80 to 84","85 to 89",
"90+"))
###################################################
# #
# Add cumulative deaths to DeathsTogether df #
# #
###################################################
# May need a different data frame as this doesn't include COVID/non-COVID split. Just all causes & expected.
DeathsTogetherCum2020 <- DeathsTogether %>%
filter(Date >= as.Date("2019-12-01") & Date <= as.Date("2020-12-31")) %>%
filter(Cause == "All") %>%
group_by(Age) %>%
mutate(cumDeaths = cumsum(Deaths)) %>%
mutate(cumExcess = cumsum(Excess))
DeathsTogetherCum2021 <- DeathsTogether %>%
filter(Date >= as.Date("2021-01-01") & Date <= as.Date("2021-12-31")) %>%
filter(Cause == "All") %>%
group_by(Age) %>%
mutate(cumDeaths = cumsum(Deaths)) %>%
mutate(cumExcess = cumsum(Excess))
DeathsTogetherCum2022 <- DeathsTogether %>%
filter(Date >= as.Date("2022-01-01") & Date <= as.Date("2022-12-31")) %>%
filter(Cause == "All") %>%
group_by(Age) %>%
mutate(cumDeaths = cumsum(Deaths)) %>%
mutate(cumExcess = cumsum(Excess))
DeathsTogetherCum2020_2022 <- rbind.data.frame(DeathsTogetherCum2020,DeathsTogetherCum2021,DeathsTogetherCum2022)
# Add year, ISO week and gradient of cumExcess and cumDeaths.
DeathsTogetherCum2020_2022 <- DeathsTogetherCum2020_2022 %>%
group_by(Age) %>%
mutate(Year = as.factor(isoyear(Date))) %>%
mutate(WeekNo = isoweek(Date)) %>%
mutate(gradcumExcess = (cumExcess - lag(cumExcess))/(WeekNo - lag(WeekNo))) %>%
mutate(gradcumDeaths = (cumDeaths - lag(cumDeaths))/(WeekNo - lag(WeekNo)))
DeathsTogetherCum2020_2021 <- filter(DeathsTogetherCum2020_2022,Date < as.Date("2022-01-01"))
############################
setwd(RootDirectory)