diff --git a/examples/07_agentic_model_selection.py b/examples/07_agentic_model_selection.py new file mode 100644 index 00000000..dc655341 --- /dev/null +++ b/examples/07_agentic_model_selection.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +""" +Example 7: Agentic Model Selection Workflow + +This example demonstrates a simple LLM-style selection loop using the current +sktime-mcp tool surface: +1. Discover forecasting estimators +2. Inspect candidate capabilities +3. Instantiate and evaluate candidates +4. Select the best-performing model +5. Generate a forecast +6. Export reproducible Python code + +Run this example: + python examples/07_agentic_model_selection.py +""" + +from __future__ import annotations + +import statistics +import sys +from typing import Any + +sys.path.insert(0, "src") + +from sktime_mcp.tools.codegen import export_code_tool +from sktime_mcp.tools.describe_estimator import describe_estimator_tool +from sktime_mcp.tools.evaluate import evaluate_estimator_tool +from sktime_mcp.tools.fit_predict import fit_predict_tool +from sktime_mcp.tools.instantiate import instantiate_estimator_tool +from sktime_mcp.tools.list_estimators import list_estimators_tool + +PREFERRED_CANDIDATES: dict[str, dict[str, Any]] = { + "NaiveForecaster": {"strategy": "last", "sp": 12}, + "ThetaForecaster": {}, + "ExponentialSmoothing": {"trend": "add", "seasonal": "add", "sp": 12}, +} + + +def print_section(title: str) -> None: + """Print a section header.""" + print("\n" + "=" * 72) + print(f" {title}") + print("=" * 72) + + +def summarize_metric(result: dict[str, Any], metric_key: str) -> float: + """Return the mean value of a fold metric from evaluate_estimator output.""" + scores = [fold[metric_key] for fold in result["results"] if metric_key in fold] + return statistics.fmean(scores) + + +def main() -> None: + """Run the agentic workflow example.""" + print("\nšŸ¤– sktime-mcp: Agentic Model-Selection Workflow Demo") + print("=" * 72) + print("Scenario: an LLM needs to pick a forecasting model for the airline dataset.") + + print_section("STEP 1: Discover forecasting estimators") + discovery = list_estimators_tool(task="forecasting", limit=200) + if not discovery["success"]: + print(f"Discovery failed: {discovery.get('error')}") + return + + available_names = {est["name"] for est in discovery["estimators"]} + candidate_names = [name for name in PREFERRED_CANDIDATES if name in available_names] + print(f"Discovered {discovery['total']} forecasting estimators.") + print(f"Shortlisted candidates: {candidate_names}") + + print_section("STEP 2: Inspect candidate capabilities") + for name in candidate_names: + details = describe_estimator_tool(name) + if not details["success"]: + print(f"- {name}: description unavailable") + continue + pred_int = details["tags"].get("capability:pred_int") + print(f"- {name}: module={details['module']}, pred_int={pred_int}") + + print_section("STEP 3: Instantiate and evaluate candidates") + leaderboard: list[dict[str, Any]] = [] + + for name in candidate_names: + params = PREFERRED_CANDIDATES[name] + instantiation = instantiate_estimator_tool(name, params) + if not instantiation["success"]: + print(f"- {name}: instantiation failed -> {instantiation.get('error')}") + continue + + handle = instantiation["handle"] + evaluation = evaluate_estimator_tool(handle, "airline", 3) + if not evaluation["success"]: + print(f"- {name}: evaluation failed -> {evaluation.get('error')}") + continue + + metric_key = "test_MeanAbsolutePercentageError" + mean_mape = summarize_metric(evaluation, metric_key) + leaderboard.append( + { + "name": name, + "handle": handle, + "params": params, + "mean_mape": mean_mape, + "cv_folds": evaluation["cv_folds_run"], + } + ) + print(f"- {name}: mean MAPE={mean_mape:.4f} across {evaluation['cv_folds_run']} folds") + + if not leaderboard: + print("No candidate evaluations succeeded.") + return + + leaderboard.sort(key=lambda item: item["mean_mape"]) + + print_section("STEP 4: Rank candidates and select a winner") + print("Leaderboard (lower mean MAPE is better):") + for idx, row in enumerate(leaderboard, start=1): + print(f"{idx}. {row['name']}: {row['mean_mape']:.4f}") + + winner = leaderboard[0] + print(f"\nSelected winner: {winner['name']}") + print(f"Chosen params: {winner['params']}") + + print_section("STEP 5: Generate forecast with the winner") + forecast = fit_predict_tool( + estimator_handle=winner["handle"], + dataset="airline", + horizon=12, + ) + if not forecast["success"]: + print(f"Forecast failed: {forecast.get('error')}") + return + + print(f"Forecast generated for {forecast['horizon']} steps.") + for step, value in list(forecast["predictions"].items())[:5]: + print(f"- {step}: {value:.2f}") + + print_section("STEP 6: Export reproducible code") + exported = export_code_tool( + winner["handle"], + var_name="selected_model", + include_fit_example=True, + dataset="airline", + ) + if not exported["success"]: + print(f"Code export failed: {exported.get('error')}") + return + + code_preview = "\n".join(exported["code"].splitlines()[:12]) + print("Exported code preview:") + print(code_preview) + + print("\n" + "=" * 72) + print("āœ… Agentic workflow complete!") + print("=" * 72) + print("The example used only the existing MCP tool surface to choose a model,") + print("evaluate it, forecast, and export reproducible code.") + + +if __name__ == "__main__": + main() diff --git a/tests/test_agentic_workflow_example.py b/tests/test_agentic_workflow_example.py new file mode 100644 index 00000000..80c58973 --- /dev/null +++ b/tests/test_agentic_workflow_example.py @@ -0,0 +1,29 @@ +"""Smoke test for the agentic model-selection workflow example.""" + +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +EXAMPLE_PATH = REPO_ROOT / "examples" / "07_agentic_model_selection.py" + + +def test_agentic_model_selection_example_runs_successfully(): + """The agentic workflow example should complete end to end.""" + result = subprocess.run( + [sys.executable, str(EXAMPLE_PATH)], + cwd=REPO_ROOT, + capture_output=True, + text=True, + check=True, + ) + + output = result.stdout + + assert "Agentic Model-Selection Workflow Demo" in output + assert "Shortlisted candidates:" in output + assert "Leaderboard (lower mean MAPE is better):" in output + assert "Selected winner:" in output + assert "Forecast generated for 12 steps." in output + assert "Exported code preview:" in output + assert "Agentic workflow complete!" in output