Skip to content

Commit cba3adb

Browse files
authored
Merge pull request #1496 from cal-itp/gtfs-digest
GTFS Digest - Operator Section Refactor
2 parents 80c731e + 50ce8b5 commit cba3adb

File tree

73 files changed

+24516
-84005
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

73 files changed

+24516
-84005
lines changed

_shared_utils/shared_utils/gtfs_analytics_data.yml

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,12 +61,13 @@ rt_vs_schedule_tables:
6161

6262
digest_tables:
6363
dir: ${gcs_paths.RT_SCHED_GCS}
64-
# Amanda: delete route_schedule_vp later
65-
route_schedule_vp: "digest/schedule_vp_metrics"
6664
monthly_route_schedule_vp: "digest/schedule_vp_metrics"
67-
quarterly_route_schedule_vp: "digest/quarterly_schedule_vp_metrics"
65+
monthly_route_schedule_vp_report: ${.monthly_route_schedule_vp}_report
66+
#quarterly_route_schedule_vp: "digest/quarterly_schedule_vp_metrics" # doesn't exist yet
6867
operator_profiles: "digest/operator_profiles"
68+
operator_profiles_report: ${.operator_profiles}_report
6969
operator_routes_map: "digest/operator_routes"
70+
operator_routes_map_report: ${.operator_routes_map}_report
7071
operator_sched_rt: "digest/operator_schedule_rt_category"
7172
scheduled_service_hours: "digest/total_scheduled_service_hours"
7273

_shared_utils/shared_utils/publish_utils.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -101,12 +101,15 @@ def filter_to_recent_date(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
101101
Example: By schedule_gtfs_dataset_name, keep the most recent
102102
service_date that shows up in scheduled trips.
103103
"""
104-
df2 = (
104+
most_recent_df = (
105105
df.groupby(group_cols, group_keys=False)
106106
.service_date.max()
107107
.reset_index()
108108
.sort_values(["service_date"] + group_cols, ascending=[False] + [True for c in group_cols])
109109
.reset_index(drop=True)
110110
# .astype({"service_date": "str"})
111111
)
112-
return df2
112+
113+
subset_df = pd.merge(df, most_recent_df, on=group_cols + ["service_date"], how="inner")
114+
115+
return subset_df

_shared_utils/shared_utils/shared_data_catalog.yml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,30 +23,35 @@ sources:
2323
args:
2424
# source: shared_utils/shared_data.py
2525
urlpath: gs://calitp-analytics-data/data-analyses/shared_data/state_highway_network.parquet
26+
use_fsspec: true
2627
ca_transit_routes:
2728
driver: geoparquet
2829
description: CA transit routes with line geometry at the operator-level (open data)
2930
args:
3031
# source: open_data/create_routes_data.py
3132
urlpath: gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet
33+
use_fsspec: true
3234
ca_transit_stops:
3335
driver: geoparquet
3436
description: CA transit stops with point geometry (open data)
3537
args:
3638
# source: open_data/create_stops_data.py
3739
urlpath: gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_stops.parquet
40+
use_fsspec: true
3841
hqta_stops:
3942
driver: geoparquet
4043
description: CA high quality transit areas with point geometry (open data)
4144
args:
4245
# source: high_quality_transit_areas/D1_assemble_hqta_points.py
4346
urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/hqta_points.parquet
47+
use_fsspec: true
4448
hqta_areas:
4549
driver: geoparquet
4650
description: CA high quality transit areas with polygon geometry (open data)
4751
args:
4852
# source: high_quality_transit_areas/D2_assemble_hqta_polygons.py
4953
urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/hqta_areas.parquet
54+
use_fsspec: true
5055
us_states:
5156
driver: geojson
5257
description: US state polygons
@@ -59,19 +64,22 @@ sources:
5964
args:
6065
# source: bus_service_increase/bus_service_utils/generate_calenviroscreen_lehd_data.py
6166
urlpath: gs://calitp-analytics-data/data-analyses/bus_service_increase/calenviroscreen_lehd_by_tract.parquet
67+
use_fsspec: true
6268
state_highway_network_postmiles:
6369
driver: geoparquet
6470
description: Caltrans State Highway Network postmiles (every 0.1 mile) with postmiles as point geometry.
6571
args:
6672
# source: https://gisdata-caltrans.opendata.arcgis.com/datasets/c22341fec9c74c6b9488ee4da23dd967_0/about
6773
# hitting url directly would limit us to 2,000 rows
6874
urlpath: gs://calitp-analytics-data/data-analyses/shared_data/state_highway_network_postmiles.parquet
75+
use_fsspec: true
6976
state_highway_network_postmile_segments:
7077
driver: geoparquet
7178
description: Caltrans State Highway Network postmile segments (postmiles converted to line segments)
7279
args:
7380
# source: shared_utils/shared_data.py
7481
urlpath: gs://calitp-analytics-data/data-analyses/shared_data/state_highway_network_postmile_segments.parquet
82+
use_fsspec: true
7583
assembly_districts:
7684
driver: geojson
7785
description: California assembly district polygons
File renamed without changes.

0 commit comments

Comments
 (0)