Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions examples/01_forecasting_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ def main():
if result.warnings:
print(f" Warnings: {result.warnings}")

# Invalid pipeline: Forecaster -> Forecaster
print("\n❌ Testing: ['NaiveForecaster', 'ExponentialSmoothing']")
result = validator.validate_pipeline(["NaiveForecaster", "ExponentialSmoothing"])
# Invalid pipeline: Forecaster -> Transformer
print("\n❌ Testing: ['NaiveForecaster', 'Imputer']")
result = validator.validate_pipeline(["NaiveForecaster", "Imputer"])
print(f" Valid: {result.valid}")
if result.errors:
print(f" Errors: {result.errors}")
Expand Down
22 changes: 11 additions & 11 deletions examples/02_llm_query_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,15 @@ def simulate_query_1():

def simulate_query_2():
"""
Query: "Compare ARIMA and Theta for my sunspot data"
Query: "Compare ARIMA and Theta for my airline data"
"""
print("\n" + "=" * 70)
print(" QUERY 2: Compare Two Forecasters")
print("=" * 70)
print('\nUser: "Compare NaiveForecaster and ThetaForecaster for sunspot data"')
print('\nUser: "Compare NaiveForecaster and ThetaForecaster for airline data"')

# Step 1: LLM plans comparison
print_llm_thought("I'll describe both estimators and run them on sunspot data")
print_llm_thought("I'll describe both estimators and run them on airline data")

# Step 2: Describe first estimator
print_tool_call("describe_estimator", {"estimator": "NaiveForecaster"})
Expand Down Expand Up @@ -138,21 +138,21 @@ def simulate_query_2():
# Step 5: Run predictions
if h1:
print_tool_call(
"fit_predict", {"estimator_handle": h1, "dataset": "sunspots", "horizon": 6}
"fit_predict", {"estimator_handle": h1, "dataset": "airline", "horizon": 6}
)
pred1 = fit_predict_tool(h1, "sunspots", 6)
pred1 = fit_predict_tool(h1, "airline", 6)
print_result({"success": pred1["success"], "horizon": pred1.get("horizon")})

if h2:
print_tool_call(
"fit_predict", {"estimator_handle": h2, "dataset": "sunspots", "horizon": 6}
"fit_predict", {"estimator_handle": h2, "dataset": "airline", "horizon": 6}
)
pred2 = fit_predict_tool(h2, "sunspots", 6)
pred2 = fit_predict_tool(h2, "airline", 6)
print_result({"success": pred2["success"], "horizon": pred2.get("horizon")})

# Step 6: Generate comparison
print("\n🤖 LLM Response:")
print(" Comparison of NaiveForecaster vs ThetaForecaster on Sunspots:")
print(" Comparison of NaiveForecaster vs ThetaForecaster on airline:")
print(" - NaiveForecaster: Simple baseline, uses last season's values")
print(" - ThetaForecaster: Decomposition-based, better for trended data")
if h1 and pred1["success"]:
Expand All @@ -163,19 +163,19 @@ def simulate_query_2():

def simulate_query_3():
"""
Query: "Can I use ARIMA after LogTransformer?"
Query: "Can I use ARIMA after Detrender?"
"""
print("\n" + "=" * 70)
print(" QUERY 3: Validate Pipeline Composition")
print("=" * 70)
print('\nUser: "Can I build a pipeline with Imputer -> Detrend -> NaiveForecaster?"')
print('\nUser: "Can I build a pipeline with Imputer -> Detrender -> NaiveForecaster?"')

# Step 1: LLM uses composition validator
print_llm_thought("Let me validate this pipeline composition...")

validator = get_composition_validator()

pipeline = ["Imputer", "Detrend", "NaiveForecaster"]
pipeline = ["Imputer", "Detrender", "NaiveForecaster"]
print_tool_call("validate_pipeline", {"components": pipeline})
result = validator.validate_pipeline(pipeline)
print_result(result.to_dict())
Expand Down
48 changes: 48 additions & 0 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Smoke tests for documented agentic/MCP workflow examples."""

import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]
EXAMPLES_DIR = REPO_ROOT / "examples"


def _run_example(name: str) -> str:
    """Run an example script in a subprocess and return its stdout.

    Args:
        name: File name of the script inside the ``examples/`` directory.

    Returns:
        The script's captured standard output.

    Raises:
        RuntimeError: If the script exits non-zero. The message includes the
            captured stderr, so a failing example is debuggable directly from
            the pytest report (plain ``check=True`` would swallow it).
    """
    result = subprocess.run(
        [sys.executable, str(EXAMPLES_DIR / name)],
        cwd=REPO_ROOT,
        capture_output=True,
        text=True,
        check=False,  # handle the failure ourselves so stderr is surfaced
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"{name} exited with status {result.returncode}:\n{result.stderr}"
        )
    return result.stdout


def test_forecasting_workflow_example_runs_successfully():
    """Running the forecasting workflow demo end-to-end should succeed,
    exercise the invalid-pipeline branch, and emit no error lines."""
    output = _run_example("01_forecasting_workflow.py")

    # Both the completion banner and the negative validation case must appear.
    for expected in ("Demo Complete", "Valid: False"):
        assert expected in output
    # No step of the demo may have printed an error.
    assert "Error:" not in output


def test_llm_query_simulation_example_runs_successfully():
    """Every simulated LLM query must complete, and no workflow step may
    fail silently (the demo prints tool results even when they fail)."""
    output = _run_example("02_llm_query_simulation.py")

    required_fragments = (
        "All LLM Query Simulations Complete",
        "ThetaForecaster predictions generated successfully",
        "NaiveForecaster predictions generated successfully",
    )
    for fragment in required_fragments:
        assert fragment in output

    # A failed tool call or the old bad component name must never appear.
    forbidden_fragments = ('"success": false', "Unknown estimator: Detrend")
    for fragment in forbidden_fragments:
        assert fragment not in output


def test_pipeline_demo_example_runs_successfully():
    """The MCP pipeline demo should still complete its two-call workflow
    and report success in its JSON tool output."""
    output = _run_example("04_mcp_pipeline_demo.py")

    success_banner = "SUCCESS! LLM created and used a complete pipeline"
    assert success_banner in output
    assert '"success": true' in output
Loading