# update_data.py
from argparse import ArgumentParser
from collections import namedtuple
import calendar
import datetime
import json

import geopandas
import pandas as pd

import data_analysis.compare_scheduled_and_rt as csrt
import data_analysis.plots as plots
from data_analysis.cache_manager import CacheManager
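

# DataUpdate bundles the two outputs of csrt.main together with the date
# range they cover; main() below builds one and hands it to each of the
# update_* functions.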
DataUpdate = namedtuple(
"DataUpdate", ["combined_long_df", "summary_df", "start_date", "end_date"]
)


def filter_dates(df: pd.DataFrame, start_date: str, end_date: str) -> pd.DataFrame:
    """Keep data between start_date and end_date (inclusive).

Args:
df (pd.DataFrame): A DataFrame with a 'date' column e.g.
date route_id trip_count_rt ...
0 2022-05-20 1 43
1 2022-05-20 100 33
2 2022-05-20 103 80
3 2022-05-20 106 121
4 2022-05-20 108 34
start_date (str): A date in 'YYYY-MM-DD' format.
Must be on or after 2022-05-20
end_date (str): A date in 'YYYY-MM-DD'. Must be on or before current date
Returns:
pd.DataFrame: A DataFrame filtered between start_date and end_date
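
    Example (illustrative, with made-up values):
        sample = pd.DataFrame({"date": ["2022-05-20", "2022-05-21"]})
        filter_dates(sample, "2022-05-20", "2022-05-20")  # keeps only the first row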
"""
df = df.copy()
return df.loc[(df["date"] >= start_date) & (df["date"] <= end_date)]


def aggregate(
df: pd.DataFrame, freq: str = "M", col_list: list = ["day_type", "route_id"]
) -> pd.DataFrame:
"""Sum columns in col_list by frequency freq.
Args:
df (pd.DataFrame): A DataFrame with the following contents
date day_type trip_count_rt trip_count_sched ratio
12 2022-06-01 wk 14908 18444 0.808285
13 2022-06-02 wk 14602 18443 0.791737
14 2022-06-04 sat 9533 13289 0.717360
15 2022-06-05 sun 7624 10641 0.716474
16 2022-06-06 wk 14713 18414 0.799012
freq (str, optional): frequency of grouping the data e.g. daily,
monthly, etc. Defaults to 'M'.
col_list (list, optional): The columns to group by.
Defaults to ['day_type', 'route_id'].
Returns:
pd.DataFrame: A DataFrame grouped by col_list summed by frequency freq.
date day_type trip_type count
0 2022-06-30 Saturday Actual Trips 38349
1 2022-06-30 Sunday Actual Trips 29879
2 2022-06-30 Weekday Actual Trips 291156
3 2022-07-31 Holiday Actual Trips 8326
4 2022-07-31 Saturday Actual Trips 28775
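
    Example (illustrative):
        # Monthly totals by day_type only, as used for the barchart data:
        aggregate(df, freq="M", col_list=["day_type"])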
"""
    df = df.copy()
df.loc[:, "date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
groupby_list = [pd.Grouper(freq=freq)] + col_list
agg_day_type = df.groupby(groupby_list).sum()
agg_day_type.drop(columns="ratio", inplace=True)
agg_day_type = agg_day_type.reset_index()
agg_day_type["date"] = agg_day_type["date"].astype(str)
id_vars = ["date"] + col_list
agg_day_type_melted = agg_day_type.melt(
id_vars=id_vars,
value_vars=["trip_count_rt", "trip_count_sched"],
value_name="count",
var_name="trip_type",
)
agg_day_type_melted["day_type"] = agg_day_type_melted["day_type"].map(
plots.DAY_NAMES
)
agg_day_type_melted["trip_type"] = agg_day_type_melted["trip_type"].map(
{"trip_count_rt": "Actual Trips", "trip_count_sched": "Scheduled Trips"}
)
return agg_day_type_melted


def update_interactive_map_data(data_update: DataUpdate) -> None:
    """Generate data for the interactive map.

Args:
data_update (DataUpdate): A DataUpdate object containing
combined_long_df (pd.DataFrame): first part of tuple output
from csrt.main.
Example.
date route_id trip_count_rt ...
0 2022-05-20 1 43
1 2022-05-20 100 33
2 2022-05-20 103 80
3 2022-05-20 106 121
4 2022-05-20 108 34
summary_df (pd.DataFrame): second part of tuple output from csrt.main
Example.
route_id day_type trip_count_rt trip_count_sched ratio
0 1 hol 47 59 0.796610
1 1 wk 9872 11281 0.875100
2 100 hol 38 53 0.716981
3 100 wk 9281 10177 0.911958
4 103 hol 475 552 0.860507
start_date (str): A date in 'YYYY-MM-DD' format.
Must be on or after 2022-05-20
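            end_date (str): A date in 'YYYY-MM-DD' format.
                Must be on or before the current date

    Writes frontend_data_{start_date}_to_{end_date}_wk.json, a GeoJSON
    FeatureCollection with an added "dates" attribute, to plots.DATA_PATH.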
"""
combined_long_df = data_update.combined_long_df.copy()
summary_df = data_update.summary_df.copy()
start_date = data_update.start_date
end_date = data_update.end_date
    # Keep weekdays only; the map calculations below use weekday service.
    combined_long_df = combined_long_df[combined_long_df.date.dt.dayofweek < 5]
    summary_df = summary_df[summary_df.day_type == "wk"]
# Remove 74 Fullerton bus from data
combined_long_df = combined_long_df.loc[combined_long_df["route_id"] != "74"]
summary_df = summary_df.loc[summary_df["route_id"] != "74"]
route_daily_mean = (
combined_long_df.groupby(["route_id"])["trip_count_rt"]
.mean()
.round(1)
.reset_index()
)
route_daily_mean.rename(
columns={"trip_count_rt": "avg_trip_count_rt"}, inplace=True
)
summary_df_mean = summary_df.merge(route_daily_mean, on="route_id")
combined_long_df.loc[:, "date"] = pd.to_datetime(combined_long_df["date"])
# Add ridership data to summary_df_mean
ridership_by_rte_date = plots.fetch_ridership_data()
ridership_end_date = ridership_by_rte_date["date"].max().strftime("%Y-%m-%d")
merged_df = plots.merge_ridership_combined(
combined_long_df=combined_long_df,
ridership_df=ridership_by_rte_date,
start_date=start_date,
ridership_end_date=ridership_end_date,
)
daily_means_riders = plots.calculate_trips_per_rider(merged_df)
# This is the average trip count corresponding to the ridership data,
# which is usually a few months out of date. So we can drop it here and use
# the up-to-date avg_trip_count_rt in summary_df_mean.
daily_means_riders.drop(columns="avg_trip_count_rt", inplace=True)
summary_df_mean = summary_df_mean.merge(daily_means_riders, on="route_id")
# Skip route_id and day_type in the percentile and ranking calculations
for col in summary_df_mean.columns[2:]:
summary_df_mean = plots.calculate_percentile_and_rank(summary_df_mean, col=col)
summary_df_wk = summary_df_mean
# data.json for frontend
    shapes_file = plots.ASSETS_PATH / "bus_route_shapes_simplified_linestring.json"
    shapes = geopandas.read_file(shapes_file, driver="GeoJSON")
    raw_data_json = summary_df_wk.set_index("route_id").join(
        shapes.set_index("route_id")
    )
    data_cols = [
        "route_id", "day_type", "ratio", "ratio_percentiles", "ratio_ranking",
        "shape_id", "direction", "trip_id", "route_short_name", "route_long_name",
        "route_type", "route_url", "route_color", "route_text_color", "geometry",
    ]
    # Serialize to GeoJSON, then parse it back so the dates attribute can be added
    data_json = geopandas.GeoDataFrame(
        raw_data_json.reset_index()[data_cols]
    ).to_json()
    data_json = json.loads(data_json)
    data_json.update({"dates": {"start": start_date, "end": end_date}})
data_json_path = plots.DATA_PATH / f"frontend_data_{start_date}_to_{end_date}_wk"
with open(f"{data_json_path}.json", 'w') as f:
json.dump(data_json, f)


def update_lineplot_data(data_update: DataUpdate) -> None:
    """Refresh data for lineplots of bus performance over time.

Args:
data_update (DataUpdate): A DataUpdate class containing
combined_long_df (pd.DataFrame): first part of output of csrt.main
Example.
date route_id trip_count_rt ...
0 2022-05-20 1 43
1 2022-05-20 100 33
2 2022-05-20 103 80
3 2022-05-20 106 121
4 2022-05-20 108 34
start_date (str): A date in 'YYYY-MM-DD' format.
Must be on or after 2022-05-20
end_date (str): A date in 'YYYY-MM-DD'. Must be on or before current date
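
    Writes four JSON files to plots.DATA_PATH: per-route and overall series,
    each for all day types and for weekdays only.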
"""
combined_long_df = data_update.combined_long_df.copy()
start_date = data_update.start_date
end_date = data_update.end_date
    # The frontend expects plain YYYY-MM-DD strings rather than datetime values
    combined_long_df["date"] = combined_long_df.date.dt.strftime("%Y-%m-%d")
# JSON files for lineplots
json_cols = ["date", "trip_count_rt", "trip_count_sched", "ratio", "route_id"]
combined_long_df[json_cols].to_json(
plots.DATA_PATH / f"schedule_vs_realtime_all_day_types_routes_"
f"{start_date}_to_{end_date}.json",
date_format="iso",
orient="records",
)
combined_long_df_wk = plots.filter_day_type(combined_long_df, "wk")
combined_long_df_wk[json_cols].to_json(
plots.DATA_PATH / f"schedule_vs_realtime_wk_routes"
f"_{start_date}_to_{end_date}.json",
date_format="iso",
orient="records",
)
    json_cols.pop()  # drop "route_id"; the overall files aggregate across all routes
combined_long_groupby_date = plots.groupby_long_df(combined_long_df, "date")
combined_long_groupby_date[json_cols].to_json(
plots.DATA_PATH / f"schedule_vs_realtime_all_day_types_overall_"
f"{start_date}_to_{end_date}.json",
date_format="iso",
orient="records",
)
combined_long_groupby_date_wk = plots.groupby_long_df(combined_long_df_wk, "date")
combined_long_groupby_date_wk[json_cols].to_json(
plots.DATA_PATH
/ f"schedule_vs_realtime_wk_overall_{start_date}_to_{end_date}.json",
date_format="iso",
orient="records",
)


def update_barchart_data(
    data_update: DataUpdate, bar_start_date: str = "2022-06-01"
) -> None:
    """Refresh data for barcharts over time.

Args:
data_update (DataUpdate): a DataUpdate object containing
combined_long_df (pd.DataFrame): first part of output of csrt.main
Example.
date route_id trip_count_rt ...
0 2022-05-20 1 43
1 2022-05-20 100 33
2 2022-05-20 103 80
3 2022-05-20 106 121
4 2022-05-20 108 34
bar_start_date (str, optional): The start date for bar plots.
It should start at the beginning of the month to ensure
a full month of data. Defaults to '2022-06-01'.
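
    Writes two JSON files to plots.DATA_PATH: monthly totals by day type,
    and monthly totals by day type and route.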
"""
# JSON files for barcharts over time
combined_long_df = data_update.combined_long_df.copy()
combined_long_groupby_day_type = plots.groupby_long_df(
combined_long_df, ["date", "day_type"]
)
    # Filter through the most recent full month; in January roll back to
    # December of the prior year (month - 1 would otherwise be 0).
    now = datetime.datetime.now()
    current_year, last_month = (now.year, now.month - 1) if now.month > 1 else (now.year - 1, 12)
    last_day = calendar.monthrange(current_year, last_month)[1]
    last_month_str = f"{last_month:02d}"
combined_long_groupby_day_type = filter_dates(
combined_long_groupby_day_type,
bar_start_date,
f"{current_year}-{last_month_str}-{last_day}",
)
bar_end_date = combined_long_groupby_day_type["date"].astype(str).max()
monthly_day_type_melted = aggregate(
combined_long_groupby_day_type, col_list=["day_type"]
)
monthly_day_type_melted.to_json(
plots.DATA_PATH / f"schedule_vs_realtime_barchart_by_day_type_"
f"{bar_start_date}_to_{bar_end_date}.json",
date_format="iso",
orient="records",
)
combined_long_df_bardates = filter_dates(
combined_long_df,
bar_start_date,
f"{current_year}-{last_month_str}-{last_day}",
)
monthly_day_type_melted_route = aggregate(combined_long_df_bardates)
monthly_day_type_melted_route.to_json(
plots.DATA_PATH / f"schedule_vs_realtime_barchart_by_day_type_routes_"
f"{bar_start_date}_to_{bar_end_date}.json",
date_format="iso",
orient="records",
)


class Updater:
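    """Wrap a previously generated comparison JSON file so a run can resume
    from the latest date it already contains."""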
def __init__(self, previous_file):
self.previous_df = pd.read_json(previous_file)

    def latest(self) -> datetime.datetime:
        # Convert the numpy datetime64 from pandas to a native datetime:
        # https://stackoverflow.com/questions/13703720/converting-between-datetime-timestamp-and-datetime64
        return pd.Timestamp(max(self.previous_df["date"].unique())).to_pydatetime()


def main() -> None:
"""Refresh data for interactive map, lineplots, and barcharts."""
parser = ArgumentParser(
prog='UpdateData',
description='Update Ghost Buses Data',
)
parser.add_argument('--update', nargs=1, required=False, help="Update all-day comparison file.")
    parser.add_argument('--frequency', nargs=1, required=False, default=['D'],
                        help="Frequency as described in pandas offset aliases.")
parser.add_argument('--verbose', action='store_true')
args = parser.parse_args()
start_date = None
existing_df = None
if args.update:
u = Updater(args.update[0])
start_date = u.latest()
existing_df = u.previous_df
    freq = args.frequency[0]  # default=['D'] keeps this a one-element list
cache_manager = CacheManager(verbose=args.verbose)
combined_long_df, summary_df = csrt.main(cache_manager, freq=freq, start_date=start_date, end_date=None,
existing=existing_df)
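    # Ratio of observed (real-time) trips to scheduled trips for each row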
combined_long_df.loc[:, "ratio"] = (
combined_long_df.loc[:, "trip_count_rt"]
/ combined_long_df.loc[:, "trip_count_sched"]
)
try:
start_date = combined_long_df["date"].min().strftime("%Y-%m-%d")
end_date = combined_long_df["date"].max().strftime("%Y-%m-%d")
    except AttributeError:
        # The date column already holds plain strings rather than Timestamps
start_date = combined_long_df["date"].min()
end_date = combined_long_df["date"].max()
data_update = DataUpdate(
combined_long_df=combined_long_df,
summary_df=summary_df,
start_date=start_date,
end_date=end_date,
)
update_interactive_map_data(data_update)
update_lineplot_data(data_update)
update_barchart_data(data_update)


if __name__ == "__main__":
main()