Merged
159 changes: 75 additions & 84 deletions airflow/dags/download_parse_and_validate_gtfs.py
@@ -17,29 +17,29 @@
from airflow.utils.trigger_rule import TriggerRule

GTFS_SCHEDULE_FILENAMES = {
"agency": "agency.txt",
"areas": "areas.txt",
"attributions": "attributions.txt",
"calendar_dates": "calendar_dates.txt",
"calendar": "calendar.txt",
"fare_attributes": "fare_attributes.txt",
"fare_leg_rules": "fare_leg_rules.txt",
"fare_media": "fare_media.txt",
"fare_products": "fare_products.txt",
"fare_rules": "fare_rules.txt",
"fare_transfer_rules": "fare_transfer_rules.txt",
"feed_info": "feed_info.txt",
"frequencies": "frequencies.txt",
"levels": "levels.txt",
"pathways": "pathways.txt",
"routes": "routes.txt",
"shapes": "shapes.txt",
"stop_areas": "stop_areas.txt",
"stop_times": "stop_times.txt",
"stops": "stops.txt",
"transfers": "transfers.txt",
"translations": "translations.txt",
"trips": "trips.txt",
"agency.txt": "agency",
"areas.txt": "areas",
"attributions.txt": "attributions",
"calendar_dates.txt": "calendar_dates",
"calendar.txt": "calendar",
"fare_attributes.txt": "fare_attributes",
"fare_leg_rules.txt": "fare_leg_rules",
"fare_media.txt": "fare_media",
"fare_products.txt": "fare_products",
"fare_rules.txt": "fare_rules",
"fare_transfer_rules.txt": "fare_transfer_rules",
"feed_info.txt": "feed_info",
"frequencies.txt": "frequencies",
"levels.txt": "levels",
"pathways.txt": "pathways",
"routes.txt": "routes",
"shapes.txt": "shapes",
"stop_areas.txt": "stop_areas",
"stop_times.txt": "stop_times",
"stops.txt": "stops",
"transfers.txt": "transfers",
"translations.txt": "translations",
"trips.txt": "trips",
}


@@ -50,11 +50,13 @@
start_date=datetime(2025, 11, 1),
catchup=False,
tags=["gtfs"],
user_defined_macros={"basename": os.path.basename},
):
latest_only = LatestOnlyOperator(task_id="latest_only", depends_on_past=False)

download_config = BigQueryToDownloadConfigOperator(
task_id="bigquery_to_download_config",
retries=1,
dataset_name="staging",
table_name="int_transit_database__gtfs_datasets_dim",
destination_bucket=os.getenv("CALITP_BUCKET__GTFS_DOWNLOAD_CONFIG"),
@@ -63,13 +65,16 @@

schedule_download_configs = GCSDownloadConfigFilterOperator(
task_id="download_config_filter",
limit=None,
retries=1,
feed_type="schedule",
source_bucket=os.getenv("CALITP_BUCKET__GTFS_DOWNLOAD_CONFIG"),
source_path="gtfs_download_configs/dt={{ ds }}/ts={{ ts }}/configs.jsonl.gz",
)

downloads = DownloadConfigToGCSOperator.partial(
task_id="download_config_to_gcs",
retries=1,
destination_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_RAW"),
destination_path="schedule/dt={{ ds }}/ts={{ ts }}",
results_path="download_schedule_feed_results/dt={{ ds }}/ts={{ ts }}",
@@ -98,75 +103,61 @@ def create_validate_kwargs(download):

ValidateGTFSToGCSOperator.partial(
task_id="validate_gtfs_to_gcs",
retries=1,
destination_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_VALIDATION_HOURLY"),
Contributor:
Question -- maybe for a follow-up: Is it worth revisiting these variable names if they no longer correspond to the DAG task names? (_HOURLY suffix likely to be confusing for future users?)

Contributor:
Yeah, on the new branch I added labels to all map indexes ;) Loved it too!!
Still testing, so more to come.

For sure, when we delete the hourly DAGs we will rename those buckets, but for now they keep the same names to make sure we are using the same data and files.
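For context, the labels mentioned here come from `map_index_template`, which Airflow (2.9+) renders once per expanded task instance so the UI shows a readable name instead of a bare integer map index; this PR uses it as `map_index_template="{{ task.download_schedule_feed_results['config']['name'] }}"`. A minimal sketch of the same mechanism, assuming Airflow 2.9+ and a hypothetical `echo_feed` task (illustrative only, not code from this PR):

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(
    dag_id="map_index_labels_example",
    start_date=datetime(2025, 11, 1),
    schedule=None,
    catchup=False,
):
    # map_index_template is rendered per expanded task instance, against that
    # instance's own operator attributes, so the grid and list views show
    # "echo feed_a" and "echo feed_b" instead of map indexes 0 and 1.
    BashOperator.partial(
        task_id="echo_feed",
        map_index_template="{{ task.bash_command }}",
    ).expand(bash_command=["echo feed_a", "echo feed_b"])
```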

source_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_RAW"),
map_index_template="{{ task.download_schedule_feed_results['config']['name'] }}",
trigger_rule=TriggerRule.ALL_DONE,
).expand_kwargs(XComArg(downloads).map(create_validate_kwargs))

for schedule_file_type, schedule_filename in GTFS_SCHEDULE_FILENAMES.items():
def unzip_files_kwargs(download):
return {
"download_schedule_feed_results": download[
"download_schedule_feed_results"
],
"source_path": download["schedule_feed_path"],
"base64_url": download["base64_url"],
}

def create_unzip_kwargs(download):
return {
"download_schedule_feed_results": download[
"download_schedule_feed_results"
],
"source_path": download["schedule_feed_path"],
"base64_url": download["base64_url"],
"destination_path": os.path.join(
schedule_filename,
"dt={{ ds }}",
"ts={{ ts }}",
f"base64_url={download['base64_url']}",
schedule_filename,
),
"results_path": os.path.join(
"unzipping_results",
"dt={{ ds }}",
"ts={{ ts }}",
f"{download['base64_url']}_{schedule_filename}.jsonl",
),
}
unzipped_files = UnzipGTFSToGCSOperator.partial(
task_id="unzip_to_gcs",
retries=1,
filenames=list(GTFS_SCHEDULE_FILENAMES.keys()),
source_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_RAW"),
destination_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_UNZIPPED_HOURLY"),
destination_path_fragment="dt={{ ds }}/ts={{ ts }}/base64_url={{ task.base64_url }}",
results_path="unzipping_results/dt={{ ds }}/ts={{ ts }}/{{ task.base64_url }}.jsonl",
map_index_template="{{ task.download_schedule_feed_results['config']['name'] }}",
trigger_rule=TriggerRule.ALL_DONE,
).expand_kwargs(downloads.output.map(unzip_files_kwargs))

unzipped_files = UnzipGTFSToGCSOperator.partial(
task_id=f"unzip_{schedule_file_type}_to_gcs",
filename=schedule_filename,
source_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_RAW"),
destination_bucket=os.getenv(
"CALITP_BUCKET__GTFS_SCHEDULE_UNZIPPED_HOURLY"
def list_unzipped_files(unzipped_file):
return {
"unzip_results": unzipped_file["unzip_results"],
"source_path_fragment": os.path.join(
"dt={{ ds }}",
"ts={{ ts }}",
f"base64_url={unzipped_file['base64_url']}",
),
trigger_rule=TriggerRule.ALL_DONE,
).expand_kwargs(XComArg(downloads).map(create_unzip_kwargs))

def create_parse_kwargs(unzipped_file):
return {
"unzip_results": unzipped_file["unzip_results"],
"source_path": os.path.join(
schedule_filename,
"dt={{ ds }}",
"ts={{ ts }}",
f"base64_url={unzipped_file['base64_url']}",
schedule_filename,
),
"results_path": os.path.join(
f"{schedule_filename}_parsing_results",
"dt={{ ds }}",
"ts={{ ts }}",
f"{unzipped_file['base64_url']}_{schedule_filename}.jsonl",
),
"destination_path": os.path.join(
schedule_file_type,
"dt={{ ds }}",
"ts={{ ts }}",
f"base64_url={unzipped_file['base64_url']}",
f"{schedule_file_type}.jsonl.gz",
),
}
"results_path_fragment": os.path.join(
"dt={{ ds }}",
"ts={{ ts }}",
f"{unzipped_file['base64_url']}.jsonl",
),
"destination_path_fragment": os.path.join(
"dt={{ ds }}",
"ts={{ ts }}",
f"base64_url={unzipped_file['base64_url']}",
),
}

GTFSCSVToJSONLOperator.partial(
task_id=f"convert_{schedule_file_type}_to_jsonl",
source_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_UNZIPPED_HOURLY"),
destination_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_PARSED_HOURLY"),
trigger_rule=TriggerRule.ALL_DONE,
).expand_kwargs(XComArg(unzipped_files).map(create_parse_kwargs))
GTFSCSVToJSONLOperator.partial(
task_id="convert_to_jsonl",
retries=1,
source_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_UNZIPPED_HOURLY"),
destination_bucket=os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_PARSED_HOURLY"),
map_index_template="{{ task.unzip_results['extract']['config']['name'] }}",
trigger_rule=TriggerRule.ALL_DONE,
).expand_kwargs(XComArg(unzipped_files).map(list_unzipped_files))

latest_only >> download_config >> schedule_download_configs
80 changes: 47 additions & 33 deletions airflow/plugins/hooks/gtfs_unzip_hook.py
@@ -7,6 +7,30 @@
from airflow.hooks.base import BaseHook


class GTFSExtractedFile:
def __init__(
self,
current_date: pendulum.DateTime,
config: dict[str, str],
filename: str,
original_filename: str,
content: bytes,
) -> None:
self.filename = filename
self.original_filename = original_filename
self.content = content
self.current_date = current_date
self.config = config

def metadata(self) -> dict[str, str]:
return {
"filename": self.filename,
"original_filename": self.original_filename,
"ts": self.current_date.isoformat(),
"extract_config": self.config,
}


class GTFSUnzipResult:
def __init__(
self,
@@ -19,9 +43,7 @@ def __init__(
self._md5hash = hashlib.md5()
self._files = []
self._directories = []
self._content_name = None
self._content_original_name = None
self._content = None
self._extracted_files = []

def valid(self) -> bool:
if not self._directories:
@@ -56,34 +78,28 @@ def add_directory(self, directory) -> None:
def add_hash(self, content: bytes) -> None:
self._md5hash.update(content)

def add_content(self, name: str, original_name: str, content: bytes) -> None:
self._content_name = name
self._content_original_name = original_name
self._content = content
def add_content(self, name: str, original_filename: str, content: bytes) -> None:
self._extracted_files.append(
GTFSExtractedFile(
filename=name,
original_filename=original_filename,
content=content,
config=self.extract_config(),
current_date=self.current_date,
)
)

def files(self) -> list[str]:
return sorted(self._files)

def directories(self) -> list[str]:
return sorted(self._directories)

def content(self) -> bytes:
return self._content

def md5hash(self) -> str:
return self._md5hash.hexdigest()

def extracted_files(self) -> list[str]:
if self._content_name is None:
return []
return [
{
"filename": self._content_name,
"original_filename": self._content_original_name,
"ts": self.current_date.isoformat(),
"extract_config": self.extract_config(),
}
]
def extracted_files(self) -> list[GTFSExtractedFile]:
return self._extracted_files

def results(self) -> dict:
return {
@@ -93,7 +109,7 @@ def results(self) -> dict:
"zipfile_extract_md5hash": self.md5hash(),
"zipfile_files": self.files(),
"zipfile_dirs": self.directories(),
"extracted_files": self.extracted_files(),
"extracted_files": [ef.metadata() for ef in self.extracted_files()],
}


@@ -105,7 +121,7 @@ class GTFSZip:
def __init__(self, filename: str) -> None:
self.filename = filename

def extract(self, path_to_extract: str, result: GTFSUnzipResult) -> None:
def extract(self, filenames: list[str], result: GTFSUnzipResult) -> None:
with zipfile.ZipFile(self.filename, "r") as zip_file:
for name in zip_file.namelist():
if name.startswith("__MACOSX"):
@@ -119,24 +135,22 @@ def extract(self, path_to_extract: str, result: GTFSUnzipResult) -> None:
with zip_file.open(file_path) as child_file:
content = child_file.read()
result.add_hash(content)
if (
result.valid()
and os.path.basename(file_path) == path_to_extract
):
filename = os.path.basename(file_path)
if result.valid() and filename in filenames:
result.add_content(
name=path_to_extract,
original_name=file_path,
name=filename,
original_filename=file_path,
content=content,
)


class GTFSUnzipHook(BaseHook):
filename: str
filenames: list[str]
current_date: pendulum.DateTime

def __init__(self, filename: str, current_date: pendulum.DateTime):
def __init__(self, filenames: list[str], current_date: pendulum.DateTime):
super().__init__()
self.filename = filename
self.filenames = filenames
self.current_date = current_date

def run(
@@ -149,7 +163,7 @@ def run(

try:
GTFSZip(filename=zipfile_path).extract(
path_to_extract=self.filename, result=result
filenames=self.filenames, result=result
)
if not result.valid():
raise ValueError(
4 changes: 3 additions & 1 deletion airflow/plugins/hooks/gtfs_validator_hook.py
@@ -41,7 +41,9 @@ def metadata(self) -> dict:
}

def notices(self) -> list:
return [{"metadata": self.metadata(), **n} for n in self.report.get("notices")]
return [
{"metadata": self.metadata(), **n} for n in self.report.get("notices", [])
]

def set_exception(self, exception: Exception) -> None:
self._exception = exception
@@ -55,6 +55,19 @@ def resolve(
}


class ActiveRowQuery:
def __init__(self, rows: list[dict], current_time: pendulum.DateTime):
self.rows = rows
self.current_time = current_time

def resolve(self) -> list[dict]:
resolved = []
for row in self.rows:
if row["_is_current"] and row["deprecated_date"] is None:
resolved.append(row)
return resolved


class BigQueryToDownloadConfigOperator(BaseOperator):
template_fields: Sequence[str] = (
"dataset_name",
@@ -96,9 +109,9 @@ def download_config_rows(self, current_time: pendulum.DateTime) -> list:
response = self.bigquery_hook().list_rows(
dataset_id=self.dataset_name, table_id=self.table_name
)
active_rows = [
r for r in response if r["_is_current"] and r["deprecated_date"] is None
]
active_rows = ActiveRowQuery(
rows=list(response), current_time=current_time
).resolve()
mapped_rows = {row["key"]: row for row in active_rows}
return [
DownloadConfigRow(row).resolve(current_time, mapped_rows)