From f13cd44164fd233497871ecc1068cd00b9f1da89 Mon Sep 17 00:00:00 2001
From: Erika Pacheco
Date: Mon, 27 Oct 2025 11:33:13 -0700
Subject: [PATCH 1/3] Add documentation about how to run download_schedule_feeds
 from the command line. Small fixes to the code.

[#4354]
---
 .../dags/download_gtfs_schedule_v2/README.md  | 59 +++++++++++++++++++-
 .../download_schedule_feeds.py                | 21 ++++++-----
 2 files changed, 69 insertions(+), 11 deletions(-)

diff --git a/airflow/dags/download_gtfs_schedule_v2/README.md b/airflow/dags/download_gtfs_schedule_v2/README.md
index cf5d1efef7..b347f0d552 100644
--- a/airflow/dags/download_gtfs_schedule_v2/README.md
+++ b/airflow/dags/download_gtfs_schedule_v2/README.md
@@ -2,8 +2,63 @@
 
 Type: [Now / Scheduled](https://docs.calitp.org/data-infra/airflow/dags-maintenance.html)
 
-This DAG orchestrates raw data capture for GTFS schedule data. It reads GTFS data configuration files that are generated by the [`airtable_loader_2` DAG](../airtable_loader_v2/README.md) to determine the list of GTFS schedule URLs to scrape (this DAG will just find the latest such configuration file, so there is no formal dependency between the two DAGs on a daily run basis.)
+This DAG orchestrates raw data capture for GTFS schedule data.
+It reads GTFS data configuration files that are generated by the [`airtable_loader_2` DAG](../airtable_loader_v2/README.md) to determine the list of GTFS schedule URLs to scrape
+(this DAG just finds the latest such configuration file, so there is no formal dependency between the two DAGs on a daily run basis).
+
 
 ## Secrets
 
-You may need to change authentication information in [Secret Manager](https://console.cloud.google.com/security/secret-manager); auth keys are loaded from Secret Manager at the start of DAG executions. You may create new versions of existing secrets, or add entirely new secrets. Secrets must be tagged with `gtfs_schedule: true` to be loaded and are referenced by `url_secret_key_name` or `header_secret_key_name` in Airtable's GTFS dataset records.
+You may need to change authentication information in [Secret Manager](https://console.cloud.google.com/security/secret-manager);
+auth keys are loaded from Secret Manager at the start of DAG executions.
+You may create new versions of existing secrets, or add entirely new secrets.
+Secrets must be tagged with `gtfs_schedule: true` to be loaded and are referenced by `url_secret_key_name` or `header_secret_key_name` in Airtable's GTFS dataset records.
+
+
+## Running from the command line
+
+To download the GTFS schedule data manually, you can also run the script from the command line using `poetry run python download_schedule_feeds.py`.
+
+Follow these steps:
+
+
+1. Log in with gcloud.
+
+    ```bash
+    $ gcloud auth application-default login --login-config=../../../iac/login.json
+    ```
+
+2. Set the required environment variables.
+
+    * `GOOGLE_CLOUD_PROJECT`: The project where the secret keys can be found.
+    * `CALITP_BUCKET__GTFS_DOWNLOAD_CONFIG`: The source bucket where the configuration files are located.
+    * `CALITP_BUCKET__GTFS_SCHEDULE_RAW`: The destination bucket where the schedule result files will be saved.
+
+
+    > [!NOTE]
+    > Bucket names can change; make sure these bucket names are still correct before you run.
+
+3. Run the script with the environment variables set.
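+
+    If you prefer to set the variables once for your shell session instead of prefixing every run, a sketch using the Staging values from the examples below:
+
+    ```bash
+    $ export GOOGLE_CLOUD_PROJECT=cal-itp-data-infra-staging
+    $ export CALITP_BUCKET__GTFS_DOWNLOAD_CONFIG="gs://calitp-staging-gtfs-download-config"
+    $ export CALITP_BUCKET__GTFS_SCHEDULE_RAW="gs://calitp-staging-gtfs-schedule-raw-v2"
+    $ poetry run python download_schedule_feeds.py
+    ```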
+
+    To run on Staging, the full command should look like this:
+
+    ```bash
+    $ GOOGLE_CLOUD_PROJECT=cal-itp-data-infra-staging CALITP_BUCKET__GTFS_DOWNLOAD_CONFIG="gs://calitp-staging-gtfs-download-config" CALITP_BUCKET__GTFS_SCHEDULE_RAW="gs://calitp-staging-gtfs-schedule-raw-v2" poetry run python download_schedule_feeds.py
+    ```
+
+
+    To run on Production, the full command should look like this:
+
+    ```bash
+    $ GOOGLE_CLOUD_PROJECT=cal-itp-data-infra CALITP_BUCKET__GTFS_DOWNLOAD_CONFIG="gs://calitp-gtfs-download-config" CALITP_BUCKET__GTFS_SCHEDULE_RAW="gs://calitp-gtfs-schedule-raw-v2" poetry run python download_schedule_feeds.py
+    ```
+
diff --git a/airflow/dags/download_gtfs_schedule_v2/download_schedule_feeds.py b/airflow/dags/download_gtfs_schedule_v2/download_schedule_feeds.py
index 38aad85c8e..46b4a9da8a 100644
--- a/airflow/dags/download_gtfs_schedule_v2/download_schedule_feeds.py
+++ b/airflow/dags/download_gtfs_schedule_v2/download_schedule_feeds.py
@@ -92,14 +92,16 @@ def download_all(task_instance, execution_date, **kwargs):
     ]
 
     outcomes: List[GTFSDownloadOutcome] = []
-    logging.info(f"processing {len(configs)} configs")
+    print(f"processing {len(configs)} configs")
 
     for i, config in enumerate(configs, start=1):
         with sentry_sdk.push_scope() as scope:
-            logging.info(f"attempting to fetch {i}/{len(configs)} {config.url}")
+            print(f"attempting to fetch {i}/{len(configs)} {config.url}")
+
             scope.set_tag("config_name", config.name)
             scope.set_tag("config_url", config.url)
             scope.set_context("config", config.dict())
+
             try:
                 extract, content = download_feed(
                     config=config,
@@ -137,7 +139,7 @@ def download_all(task_instance, execution_date, **kwargs):
         )
 
     print(
-        f"took {humanize.naturaltime(pendulum.now() - start)} to process {len(configs)} configs"
+        f"took {humanize.naturaldelta(pendulum.now() - start)} to process {len(configs)} configs"
    )
 
     result = DownloadFeedsResult(
@@ -162,12 +164,13 @@ def download_all(task_instance, execution_date, **kwargs):
             str(f.exception) or str(type(f.exception)) for f in result.failures
         ),
     )
-    task_instance.xcom_push(
-        key="download_failures",
-        value=[
-            json.loads(f.json()) for f in result.failures
-        ],  # use the Pydantic serializer
-    )
+    # Commented out because it is only used by email_download_failures.py, which is temporarily disabled
+    # task_instance.xcom_push(
+    #     key="download_failures",
+    #     value=[
+    #         json.loads(f.json()) for f in result.failures
+    #     ],  # use the Pydantic serializer
+    # )
 
     success_rate = len(result.successes) / len(configs)
     if success_rate < GTFS_FEED_LIST_ERROR_THRESHOLD:

From a5bd8652be3f665bebf1c7e874898ee6f315d53d Mon Sep 17 00:00:00 2001
From: Erika Pacheco
Date: Tue, 28 Oct 2025 22:23:14 -0700
Subject: [PATCH 2/3] Add item to check for UTC time

---
 airflow/dags/download_gtfs_schedule_v2/README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/airflow/dags/download_gtfs_schedule_v2/README.md b/airflow/dags/download_gtfs_schedule_v2/README.md
index b347f0d552..43e422d0c7 100644
--- a/airflow/dags/download_gtfs_schedule_v2/README.md
+++ b/airflow/dags/download_gtfs_schedule_v2/README.md
@@ -62,3 +62,22 @@
     $ GOOGLE_CLOUD_PROJECT=cal-itp-data-infra CALITP_BUCKET__GTFS_DOWNLOAD_CONFIG="gs://calitp-gtfs-download-config" CALITP_BUCKET__GTFS_SCHEDULE_RAW="gs://calitp-gtfs-schedule-raw-v2" poetry run python download_schedule_feeds.py
     ```
 
+4. Check the timestamps of the result files.
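+
+    As a quick alternative to the console, you can list what was just written with `gsutil` (a sketch, assuming the Production bucket from the example above and today's UTC date):
+
+    ```bash
+    $ gsutil ls "gs://calitp-gtfs-schedule-raw-v2/schedule/dt=$(date -u +%Y-%m-%d)/"
+    ```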
+
+    Go to Google Cloud Storage and check that the destination bucket (`CALITP_BUCKET__GTFS_SCHEDULE_RAW`) contains the new files and that their timestamps are in UTC.
+
+    For example:
+
+    The schedule file and the Download Schedule Feed Results below were created with `ts=2025-10-29T03:00:23.941260+00:00`, where the `+00:00` offset means the time is in UTC.
+
+    * `gs://calitp-gtfs-schedule-raw-v2/schedule/dt=2025-10-29/ts=2025-10-29T03:00:23.941260+00:00/base64_url=XXXXX`
+    * `gs://calitp-gtfs-schedule-raw-v2/download_schedule_feed_results/dt=2025-10-29/ts=2025-10-29T03:00:23.941260+00:00/results.jsonl`
+
+
+    If the timestamp is in Pacific Time or any other time zone, the next process, `Unzip and Validate GTFS Schedule Hourly`, may not process those files.

From 28aa2e0da45e52bd326a791d766da3715eb49632 Mon Sep 17 00:00:00 2001
From: Erika Pacheco
Date: Tue, 4 Nov 2025 17:21:24 -0800
Subject: [PATCH 3/3] Add notes about when to execute the command.

---
 airflow/dags/download_gtfs_schedule_v2/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/airflow/dags/download_gtfs_schedule_v2/README.md b/airflow/dags/download_gtfs_schedule_v2/README.md
index 43e422d0c7..528f137fde 100644
--- a/airflow/dags/download_gtfs_schedule_v2/README.md
+++ b/airflow/dags/download_gtfs_schedule_v2/README.md
@@ -19,9 +19,13 @@ Secrets must be tagged with `gtfs_schedule: true` to be loaded and are reference
 
 To download the GTFS schedule data manually, you can also run the script from the command line using `poetry run python download_schedule_feeds.py`.
 
-Follow these steps:
+> [!IMPORTANT]
+> Run this command after the scheduled DAG has run but before noon Pacific Time so that the downloaded data becomes part of the regular GTFS feed.
+> Unless the schedule changes, aim for between 9 a.m. and noon Pacific Time because of downstream business logic rules.
 
+To execute the command, follow these steps:
+
 1. Log in with gcloud.
 
     ```bash