160 changes: 160 additions & 0 deletions examples/07_agentic_model_selection.py
@@ -0,0 +1,160 @@
#!/usr/bin/env python
"""
Example 7: Agentic Model Selection Workflow

This example demonstrates a simple LLM-style selection loop using the current
sktime-mcp tool surface:
1. Discover forecasting estimators
2. Inspect candidate capabilities
3. Instantiate and evaluate candidates
4. Select the best-performing model
5. Generate a forecast
6. Export reproducible Python code

Run this example:
python examples/07_agentic_model_selection.py
"""

from __future__ import annotations

import statistics
import sys
from typing import Any

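# Make the in-repo package importable when the script is run from the repository root.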
sys.path.insert(0, "src")

from sktime_mcp.tools.codegen import export_code_tool
from sktime_mcp.tools.describe_estimator import describe_estimator_tool
from sktime_mcp.tools.evaluate import evaluate_estimator_tool
from sktime_mcp.tools.fit_predict import fit_predict_tool
from sktime_mcp.tools.instantiate import instantiate_estimator_tool
from sktime_mcp.tools.list_estimators import list_estimators_tool

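# Candidate forecasters and the constructor arguments to try for each.
# sp=12 assumes monthly seasonality, which matches the airline dataset used below;
# ThetaForecaster is left on its defaults.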
PREFERRED_CANDIDATES: dict[str, dict[str, Any]] = {
"NaiveForecaster": {"strategy": "last", "sp": 12},
"ThetaForecaster": {},
"ExponentialSmoothing": {"trend": "add", "seasonal": "add", "sp": 12},
}


def print_section(title: str) -> None:
"""Print a section header."""
print("\n" + "=" * 72)
print(f" {title}")
print("=" * 72)


def summarize_metric(result: dict[str, Any], metric_key: str) -> float:
"""Return the mean value of a fold metric from evaluate_estimator output."""
scores = [fold[metric_key] for fold in result["results"] if metric_key in fold]
return statistics.fmean(scores)


def main() -> None:
"""Run the agentic workflow example."""
print("\n🤖 sktime-mcp: Agentic Model-Selection Workflow Demo")
print("=" * 72)
print("Scenario: an LLM needs to pick a forecasting model for the airline dataset.")

print_section("STEP 1: Discover forecasting estimators")
discovery = list_estimators_tool(task="forecasting", limit=200)
if not discovery["success"]:
print(f"Discovery failed: {discovery.get('error')}")
return

available_names = {est["name"] for est in discovery["estimators"]}
candidate_names = [name for name in PREFERRED_CANDIDATES if name in available_names]
print(f"Discovered {discovery['total']} forecasting estimators.")
print(f"Shortlisted candidates: {candidate_names}")

print_section("STEP 2: Inspect candidate capabilities")
for name in candidate_names:
details = describe_estimator_tool(name)
if not details["success"]:
print(f"- {name}: description unavailable")
continue
pred_int = details["tags"].get("capability:pred_int")
print(f"- {name}: module={details['module']}, pred_int={pred_int}")

print_section("STEP 3: Instantiate and evaluate candidates")
leaderboard: list[dict[str, Any]] = []

for name in candidate_names:
params = PREFERRED_CANDIDATES[name]
instantiation = instantiate_estimator_tool(name, params)
if not instantiation["success"]:
print(f"- {name}: instantiation failed -> {instantiation.get('error')}")
continue

handle = instantiation["handle"]
evaluation = evaluate_estimator_tool(handle, "airline", 3)
if not evaluation["success"]:
print(f"- {name}: evaluation failed -> {evaluation.get('error')}")
continue

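        # Key assumed to follow sktime's evaluate() naming for per-fold scores
        # (test_<MetricName>) when scoring with MeanAbsolutePercentageError.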
metric_key = "test_MeanAbsolutePercentageError"
mean_mape = summarize_metric(evaluation, metric_key)
leaderboard.append(
{
"name": name,
"handle": handle,
"params": params,
"mean_mape": mean_mape,
"cv_folds": evaluation["cv_folds_run"],
}
)
print(f"- {name}: mean MAPE={mean_mape:.4f} across {evaluation['cv_folds_run']} folds")

if not leaderboard:
print("No candidate evaluations succeeded.")
return

leaderboard.sort(key=lambda item: item["mean_mape"])

print_section("STEP 4: Rank candidates and select a winner")
print("Leaderboard (lower mean MAPE is better):")
for idx, row in enumerate(leaderboard, start=1):
print(f"{idx}. {row['name']}: {row['mean_mape']:.4f}")

winner = leaderboard[0]
print(f"\nSelected winner: {winner['name']}")
print(f"Chosen params: {winner['params']}")

print_section("STEP 5: Generate forecast with the winner")
forecast = fit_predict_tool(
estimator_handle=winner["handle"],
dataset="airline",
horizon=12,
)
if not forecast["success"]:
print(f"Forecast failed: {forecast.get('error')}")
return

print(f"Forecast generated for {forecast['horizon']} steps.")
for step, value in list(forecast["predictions"].items())[:5]:
print(f"- {step}: {value:.2f}")

print_section("STEP 6: Export reproducible code")
exported = export_code_tool(
winner["handle"],
var_name="selected_model",
include_fit_example=True,
dataset="airline",
)
if not exported["success"]:
print(f"Code export failed: {exported.get('error')}")
return

code_preview = "\n".join(exported["code"].splitlines()[:12])
print("Exported code preview:")
print(code_preview)

print("\n" + "=" * 72)
print("✅ Agentic workflow complete!")
print("=" * 72)
print("The example used only the existing MCP tool surface to choose a model,")
print("evaluate it, forecast, and export reproducible code.")


if __name__ == "__main__":
main()
29 changes: 29 additions & 0 deletions tests/test_agentic_workflow_example.py
@@ -0,0 +1,29 @@
"""Smoke test for the agentic model-selection workflow example."""

import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]
EXAMPLE_PATH = REPO_ROOT / "examples" / "07_agentic_model_selection.py"


def test_agentic_model_selection_example_runs_successfully():
"""The agentic workflow example should complete end to end."""
result = subprocess.run(
[sys.executable, str(EXAMPLE_PATH)],
cwd=REPO_ROOT,
capture_output=True,
text=True,
check=True,
)

output = result.stdout

assert "Agentic Model-Selection Workflow Demo" in output
assert "Shortlisted candidates:" in output
assert "Leaderboard (lower mean MAPE is better):" in output
assert "Selected winner:" in output
assert "Forecast generated for 12 steps." in output
assert "Exported code preview:" in output
assert "Agentic workflow complete!" in output