Skip to content
11 changes: 11 additions & 0 deletions pipelines/epiautogp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ The EpiAutoGP pipeline supports forecasting of:

It operates on both **daily** and **epiweekly** temporal frequencies, with optional percentage transformations for ED visit data.

### Key Data Options for Gaussian Process Models

EpiAutoGP uses Gaussian processes (GPs), which have different data requirements than traditional time series models:

1. **Weekly-only data option**: Use `--frequency epiweekly` to work with weekly data when the target is weekly. EpiAutoGP can work with either daily or weekly data; time steps are less critical for GP models than for sequential models like ARIMA.

2. **Extended data horizon**: Use `--n-training-days` with larger values (e.g., 180, 365, or more) to provide longer training data horizons. Unlike traditional renewal models that typically use 90-150 days, GPs can benefit from much longer data to better infer temporal autocovariance kernels.

3. **Date-range exclusion**: Use `--exclude-date-ranges` to remove periods with known reporting problems. GPs don't require regular sequential data, so gaps from excluded periods are acceptable. Specify ranges as comma-separated `start:end` pairs in `YYYY-MM-DD` format, with both dates inclusive (e.g., `2024-01-15:2024-01-20,2024-03-01:2024-03-07`).

## Pipeline Architecture

The forecasting pipeline consists of five main steps:
Expand All @@ -34,6 +44,7 @@ Main entry point for the forecasting pipeline.
- `--target`: Data type (`nssp` or `nhsn`)
- `--frequency`: Temporal frequency (`daily` or `epiweekly`)
- `--use-percentage`: Convert ED visits to percentage of total visits
- `--exclude-date-ranges`: Comma-separated list of `start:end` date ranges to exclude from training data; both dates are inclusive and use `YYYY-MM-DD` format (e.g., `2024-01-15:2024-01-20,2024-03-01:2024-03-07`)
- `--n-particles`: Number of particles for Sequential Monte Carlo (default: 24)
- `--n-mcmc`: MCMC steps for GP kernel structure (default: 100)
- `--n-hmc`: HMC steps for GP kernel hyperparameters (default: 50)
Expand Down
81 changes: 81 additions & 0 deletions pipelines/epiautogp/forecast_epiautogp.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import datetime as dt
import logging
from pathlib import Path

Expand All @@ -13,6 +14,58 @@
)


def _parse_exclude_date_ranges(
exclude_date_ranges_str: str | None,
) -> list[tuple[dt.date, dt.date]] | None:
"""
Parse comma-separated date ranges from string to list of tuples.

Parameters
----------
exclude_date_ranges_str : str | None
Comma-separated list of date ranges in format 'start:end'.
Example: '2024-01-15:2024-01-20,2024-03-01:2024-03-07'

Returns
-------
list[tuple[dt.date, dt.date]] | None
List of (start_date, end_date) tuples, or None if input is None/empty

Raises
------
ValueError
If date range format is invalid, dates can't be parsed, or start > end
"""
if exclude_date_ranges_str is None or not exclude_date_ranges_str.strip():
return None

parsed_ranges = []
for date_range_str in exclude_date_ranges_str.split(","):
date_range_str = date_range_str.strip()
if ":" not in date_range_str:
raise ValueError(
f"Invalid date range format: '{date_range_str}'. "
"Expected format: 'start_date:end_date' (e.g., '2024-01-15:2024-01-20')"
)
start_str, end_str = date_range_str.split(":", 1)
try:
start_date = dt.datetime.strptime(start_str.strip(), "%Y-%m-%d").date()
end_date = dt.datetime.strptime(end_str.strip(), "%Y-%m-%d").date()
except ValueError as e:
raise ValueError(
f"Invalid date format in range '{date_range_str}'. "
f"Expected YYYY-MM-DD format. Error: {e}"
)
if start_date > end_date:
raise ValueError(
f"Invalid date range '{date_range_str}': "
f"start_date ({start_date}) must be <= end_date ({end_date})"
)
parsed_ranges.append((start_date, end_date))

return parsed_ranges


def run_epiautogp_forecast(
json_input_path: Path,
model_dir: Path,
Expand Down Expand Up @@ -113,6 +166,7 @@ def main(
n_forecast_draws: int = 2000,
smc_data_proportion: float = 0.1,
n_threads: int = 1,
exclude_date_ranges: str | None = None,
) -> None:
"""
Run the complete EpiAutoGP forecasting pipeline for a single location.
Expand Down Expand Up @@ -176,6 +230,12 @@ def main(
Proportion of data used in each SMC step
n_threads : int, default=1
Number of threads for Julia execution
exclude_date_ranges : str | None, default=None
Comma-separated list of date ranges to exclude from training data.
Each range should be specified as 'start:end' (both dates inclusive).
Example: '2024-01-15:2024-01-20,2024-03-01:2024-03-07' excludes
two periods with known reporting problems. GPs don't require regular
sequential data, so gaps are acceptable.

Returns
-------
Expand Down Expand Up @@ -224,6 +284,13 @@ def main(
else:
n_ahead = n_forecast_days

# Parse exclude_date_ranges from string to list of tuples
parsed_exclude_date_ranges = _parse_exclude_date_ranges(exclude_date_ranges)
if parsed_exclude_date_ranges:
logger.info(
f"Will exclude {len(parsed_exclude_date_ranges)} date range(s) from training data"
)

# Epiautogp params and execution settings
params = {
"n_ahead": n_ahead,
Expand Down Expand Up @@ -277,6 +344,7 @@ def main(
epiautogp_input_json_path = convert_to_epiautogp_json(
context=context,
paths=paths,
exclude_date_ranges=parsed_exclude_date_ranges,
)

# Step 4: Run EpiAutoGP forecast
Expand Down Expand Up @@ -390,5 +458,18 @@ def main(
help="Number of threads to use for EpiAutoGP computations (default: 1).",
)

parser.add_argument(
"--exclude-date-ranges",
type=str,
default=None,
help=(
"Comma-separated list of date ranges to exclude from training data. "
"Each range should be 'start:end' (both dates inclusive, YYYY-MM-DD format). "
"Example: '2024-01-15:2024-01-20,2024-03-01:2024-03-07' "
"excludes two periods with known reporting problems. "
"GPs don't require regular sequential data, so gaps are acceptable."
),
)

args = parser.parse_args()
main(**vars(args))
27 changes: 26 additions & 1 deletion pipelines/epiautogp/prep_epiautogp_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def convert_to_epiautogp_json(
paths: ModelPaths,
nowcast_dates: list[dt.date] | None = None,
nowcast_reports: list[list[float]] | None = None,
exclude_date_ranges: list[tuple[dt.date, dt.date]] | None = None,
) -> Path:
"""
Convert surveillance data to EpiAutoGP JSON format.
Expand All @@ -91,6 +92,12 @@ def convert_to_epiautogp_json(
nowcast_reports : list[list[float]] | `None`, default=`None`
Samples for nowcast dates to represent nowcast uncertainty. If `None`,
defaults to empty list. Not currently used.
exclude_date_ranges : list[tuple[dt.date, dt.date]] | `None`, default=`None`
List of date ranges to exclude from the data. Each tuple represents
(start_date, end_date) where both dates are inclusive. This is useful
for removing periods with known reporting problems. If `None`, no
dates are excluded. GPs don't require regular sequential data, so
gaps from excluded periods are acceptable.

Returns
-------
Expand Down Expand Up @@ -135,11 +142,13 @@ def convert_to_epiautogp_json(
context.target, context.frequency, context.use_percentage, context.ed_visit_type
)

# Set defaults for nowcasting
# Set defaults for nowcasting and date exclusion
if nowcast_dates is None:
nowcast_dates = []
if nowcast_reports is None:
nowcast_reports = []
if exclude_date_ranges is None:
exclude_date_ranges = []

# Define input data JSON path
input_json_path = paths.model_output_dir / f"{context.model_name}_input.json"
Expand All @@ -160,6 +169,7 @@ def convert_to_epiautogp_json(
context.use_percentage,
context.ed_visit_type,
logger,
exclude_date_ranges=exclude_date_ranges,
)

# Create EpiAutoGP input structure
Expand Down Expand Up @@ -199,6 +209,7 @@ def _read_tsv_data(
use_percentage: bool,
ed_visit_type: str,
logger: logging.Logger,
exclude_date_ranges: list[tuple[dt.date, dt.date]] | None = None,
) -> tuple[list[dt.date], list[float]]:
"""
Read surveillance data from TSV files and extract target variable.
Expand All @@ -225,6 +236,10 @@ def _read_tsv_data(
Type of ED visits: "observed" or "other" (only for NSSP)
logger : logging.Logger
Logger for progress messages
exclude_date_ranges : list[tuple[dt.date, dt.date]] | `None`, default=`None`
List of date ranges to exclude from the data. Each tuple represents
(start_date, end_date) where both dates are inclusive. If `None`,
no dates are excluded.

Returns
-------
Expand Down Expand Up @@ -268,6 +283,16 @@ def _read_tsv_data(
df_pivot = df_pivot.with_columns(pl.col("date").cast(pl.Date))
df_pivot = df_pivot.sort("date")

# Filter out excluded date ranges if specified
if exclude_date_ranges:
logger.info(f"Excluding {len(exclude_date_ranges)} date range(s) from data")
for start_date, end_date in exclude_date_ranges:
# Filter out dates in the range [start_date, end_date] (inclusive)
df_pivot = df_pivot.filter(
~((pl.col("date") >= start_date) & (pl.col("date") <= end_date))
)
logger.info(f"Excluded dates from {start_date} to {end_date}")

# Extract data based on target
if target == "nssp":
dates, reports = _extract_nssp_from_pivot(
Expand Down
Loading
Loading