CDCgov · Copilot · Jan 6, 2026 · Jan 6, 2026 · Jan 6, 2026 · Jan 6, 2026
diff --git a/pipelines/epiautogp/README.md b/pipelines/epiautogp/README.md
@@ -10,6 +10,16 @@ The EpiAutoGP pipeline supports forecasting of:
 
 It operates on both **daily** and **epiweekly** temporal frequencies, with optional percentage transformations for ED visit data.
 
+### Key Data Options for Gaussian Process Models
+
+EpiAutoGP uses Gaussian processes (GPs) which have different data requirements than traditional time series models:
+
+1. **Weekly-only data option**: Use `--frequency epiweekly` to work with weekly data when the target is weekly. EpiAutoGP can work with either daily or weekly data; time steps are less critical for GP models than for sequential models like ARIMA.
+
+2. **Extended data horizon**: Use `--n-training-days` with larger values (e.g., 180, 365, or more) to provide a longer training window. Unlike traditional renewal models, which typically use 90-150 days of data, GPs can benefit from much longer time series when inferring temporal autocovariance kernels.
+
+3. **Excluding problem periods**: Use `--exclude-date-ranges` to remove periods with known reporting problems. GPs don't require regularly spaced sequential data, so gaps left by excluded periods are acceptable. Specify ranges as comma-separated `start:end` pairs in `YYYY-MM-DD` format, with both dates inclusive (e.g., `2024-01-15:2024-01-20,2024-03-01:2024-03-07`).
+
 ## Pipeline Architecture
 
 The forecasting pipeline consists of five main steps:
@@ -34,6 +44,7 @@ Main entry point for the forecasting pipeline.
 - `--target`: Data type (`nssp` or `nhsn`)
 - `--frequency`: Temporal frequency (`daily` or `epiweekly`)
 - `--use-percentage`: Convert ED visits to percentage of total visits
+- `--exclude-date-ranges`: Comma-separated list of `start:end` date ranges to exclude from training data, in `YYYY-MM-DD` format with both dates inclusive (e.g., `2024-01-15:2024-01-20,2024-03-01:2024-03-07`)
 - `--n-particles`: Number of particles for Sequential Monte Carlo (default: 24)
 - `--n-mcmc`: MCMC steps for GP kernel structure (default: 100)
 - `--n-hmc`: HMC steps for GP kernel hyperparameters (default: 50)

diff --git a/pipelines/epiautogp/forecast_epiautogp.py b/pipelines/epiautogp/forecast_epiautogp.py
@@ -1,4 +1,5 @@
 import argparse
+import datetime as dt
 import logging
 from pathlib import Path
 
@@ -13,6 +14,58 @@
 )
 
 
+def _parse_exclude_date_ranges(
+    exclude_date_ranges_str: str | None,
+) -> list[tuple[dt.date, dt.date]] | None:
+    """
+    Parse comma-separated 'start:end' date ranges into a list of date tuples.
+
+    Parameters
+    ----------
+    exclude_date_ranges_str : str | None
+        Comma-separated list of date ranges, each 'start:end' (YYYY-MM-DD).
+        Example: '2024-01-15:2024-01-20,2024-03-01:2024-03-07'
+
+    Returns
+    -------
+    list[tuple[dt.date, dt.date]] | None
+        List of (start_date, end_date) tuples, or None if input is None or blank
+
+    Raises
+    ------
+    ValueError
+        If a range lacks ':', a date can't be parsed, or start > end
+    """
+    if exclude_date_ranges_str is None or not exclude_date_ranges_str.strip():
+        return None
+
+    parsed_ranges = []
+    for date_range_str in exclude_date_ranges_str.split(","):
+        date_range_str = date_range_str.strip()
+        if ":" not in date_range_str:
+            raise ValueError(
+                f"Invalid date range format: '{date_range_str}'. "
+                "Expected format: 'start_date:end_date' (e.g., '2024-01-15:2024-01-20')"
+            )
+        start_str, end_str = date_range_str.split(":", 1)
+        try:
+            start_date = dt.datetime.strptime(start_str.strip(), "%Y-%m-%d").date()
+            end_date = dt.datetime.strptime(end_str.strip(), "%Y-%m-%d").date()
+        except ValueError as e:
+            raise ValueError(
+                f"Invalid date format in range '{date_range_str}'. "
+                f"Expected YYYY-MM-DD format. Error: {e}"
+            ) from e  # chain so the strptime failure stays visible as __cause__
+        if start_date > end_date:
+            raise ValueError(
+                f"Invalid date range '{date_range_str}': "
+                f"start_date ({start_date}) must be <= end_date ({end_date})"
+            )
+        parsed_ranges.append((start_date, end_date))
+
+    return parsed_ranges
+
+
 def run_epiautogp_forecast(
     json_input_path: Path,
     model_dir: Path,
@@ -113,6 +166,7 @@ def main(
     n_forecast_draws: int = 2000,
     smc_data_proportion: float = 0.1,
     n_threads: int = 1,
+    exclude_date_ranges: str | None = None,
 ) -> None:
     """
     Run the complete EpiAutoGP forecasting pipeline for a single location.
@@ -176,6 +230,12 @@ def main(
         Proportion of data used in each SMC step
     n_threads : int, default=1
         Number of threads for Julia execution
+    exclude_date_ranges : str | None, default=None
+        Comma-separated list of date ranges to exclude from training data.
+        Each range should be specified as 'start:end' (both dates inclusive).
+        Example: '2024-01-15:2024-01-20,2024-03-01:2024-03-07' excludes
+        two periods with known reporting problems. GPs don't require regular
+        sequential data, so gaps are acceptable.
 
     Returns
     -------
@@ -224,6 +284,13 @@ def main(
     else:
         n_ahead = n_forecast_days
 
+    # Parse exclude_date_ranges from string to list of tuples
+    parsed_exclude_date_ranges = _parse_exclude_date_ranges(exclude_date_ranges)
+    if parsed_exclude_date_ranges:
+        logger.info(
+            f"Will exclude {len(parsed_exclude_date_ranges)} date range(s) from training data"
+        )
+
     # Epiautogp params and execution settings
     params = {
         "n_ahead": n_ahead,
@@ -277,6 +344,7 @@ def main(
     epiautogp_input_json_path = convert_to_epiautogp_json(
         context=context,
         paths=paths,
+        exclude_date_ranges=parsed_exclude_date_ranges,
     )
 
     # Step 4: Run EpiAutoGP forecast
@@ -390,5 +458,18 @@ def main(
         help="Number of threads to use for EpiAutoGP computations (default: 1).",
     )
 
+    parser.add_argument(
+        "--exclude-date-ranges",
+        type=str,
+        default=None,
+        help=(
+            "Comma-separated list of date ranges to exclude from training data. "
+            "Each range should be 'start:end' (both dates inclusive, YYYY-MM-DD format). "
+            "Example: '2024-01-15:2024-01-20,2024-03-01:2024-03-07' "
+            "excludes two periods with known reporting problems. "
+            "GPs don't require regular sequential data, so gaps are acceptable."
+        ),
+    )
+
     args = parser.parse_args()
     main(**vars(args))
diff --git a/pipelines/epiautogp/prep_epiautogp_data.py b/pipelines/epiautogp/prep_epiautogp_data.py
@@ -69,6 +69,7 @@ def convert_to_epiautogp_json(
     paths: ModelPaths,
     nowcast_dates: list[dt.date] | None = None,
     nowcast_reports: list[list[float]] | None = None,
+    exclude_date_ranges: list[tuple[dt.date, dt.date]] | None = None,
 ) -> Path:
     """
     Convert surveillance data to EpiAutoGP JSON format.
@@ -91,6 +92,12 @@ def convert_to_epiautogp_json(
     nowcast_reports : list[list[float]] | `None`, default=`None`
         Samples for nowcast dates to represent nowcast uncertainty. If `None`,
         defaults to empty list. Not currently used.
+    exclude_date_ranges : list[tuple[dt.date, dt.date]] | `None`, default=`None`
+        List of date ranges to exclude from the data. Each tuple represents
+        (start_date, end_date) where both dates are inclusive. This is useful
+        for removing periods with known reporting problems. If `None`, no
+        dates are excluded. GPs don't require regular sequential data, so
+        gaps from excluded periods are acceptable.
 
     Returns
     -------
@@ -135,11 +142,13 @@ def convert_to_epiautogp_json(
         context.target, context.frequency, context.use_percentage, context.ed_visit_type
     )
 
-    # Set defaults for nowcasting
+    # Set defaults for nowcasting and date exclusion
     if nowcast_dates is None:
         nowcast_dates = []
     if nowcast_reports is None:
         nowcast_reports = []
+    if exclude_date_ranges is None:
+        exclude_date_ranges = []
 
     # Define input data JSON path
     input_json_path = paths.model_output_dir / f"{context.model_name}_input.json"
@@ -160,6 +169,7 @@ def convert_to_epiautogp_json(
         context.use_percentage,
         context.ed_visit_type,
         logger,
+        exclude_date_ranges=exclude_date_ranges,
     )
 
     # Create EpiAutoGP input structure
@@ -199,6 +209,7 @@ def _read_tsv_data(
     use_percentage: bool,
     ed_visit_type: str,
     logger: logging.Logger,
+    exclude_date_ranges: list[tuple[dt.date, dt.date]] | None = None,
 ) -> tuple[list[dt.date], list[float]]:
     """
     Read surveillance data from TSV files and extract target variable.
@@ -225,6 +236,10 @@ def _read_tsv_data(
         Type of ED visits: "observed" or "other" (only for NSSP)
     logger : logging.Logger
         Logger for progress messages
+    exclude_date_ranges : list[tuple[dt.date, dt.date]] | `None`, default=`None`
+        List of date ranges to exclude from the data. Each tuple represents
+        (start_date, end_date) where both dates are inclusive. If `None`,
+        no dates are excluded.
 
     Returns
     -------
@@ -268,6 +283,16 @@ def _read_tsv_data(
     df_pivot = df_pivot.with_columns(pl.col("date").cast(pl.Date))
     df_pivot = df_pivot.sort("date")
 
+    # Filter out excluded date ranges if specified
+    if exclude_date_ranges:
+        logger.info(f"Excluding {len(exclude_date_ranges)} date range(s) from data")
+        for start_date, end_date in exclude_date_ranges:
+            # Filter out dates in the range [start_date, end_date] (inclusive)
+            df_pivot = df_pivot.filter(
+                ~((pl.col("date") >= start_date) & (pl.col("date") <= end_date))
+            )
+            logger.info(f"Excluded dates from {start_date} to {end_date}")
+
     # Extract data based on target
     if target == "nssp":
         dates, reports = _extract_nssp_from_pivot(