Patch notes (free text before the first "diff --git" header; patch tools skip it).

Purpose: stop format_data_handle from running datetime-specific frequency
inference on integer / RangeIndex data (the in-code comment references #390).

src/sktime_mcp/runtime/executor.py
  * Step 3: when y.index is a pd.RangeIndex, or has no "freq" attribute, the
    inference and reindex logic is skipped and changes_made records
    frequency_set=True and frequency="Integer". Otherwise the previous logic
    (index.freq, then pd.infer_freq, then the most-common-difference fallback,
    then reindexing onto a full pd.date_range) runs unchanged, one indentation
    level deeper.
  * Step 5: index.freq is only assigned when the index has "freq", is not a
    RangeIndex, and the recorded frequency is truthy and not "Integer".
  * Returned metadata: "frequency" reads y.index.freq only under the same
    guards; otherwise it falls back to changes_made.get("frequency").

tests/test_core.py
  * Adds TestRangeIndexFormatting with three tests: no crash on a RangeIndex
    series, "Integer" reported in metadata, and load_data_source succeeding on
    inline data without an explicit index.

NOTE(review): "not hasattr(y.index, 'freq')" is true for every index type that
  lacks a freq attribute, not only integer ones, so such indexes are also
  labelled "Integer" -- confirm this is intended.
NOTE(review): frequency_set is reported as True on the "Integer" path although
  no frequency is set on the index (step 5 skips it) -- confirm consumers of
  changes_made expect that.
NOTE(review): test_load_data_source_rangeindex_end_to_end asserts only
  result["success"]; it does not check the reported frequency.
NOTE(review): line breaks and indentation below were reconstructed from a
  whitespace-collapsed copy of this patch. Hunk line counts match the @@
  headers, but context-line indentation should be verified against the target
  files before applying.

diff --git a/src/sktime_mcp/runtime/executor.py b/src/sktime_mcp/runtime/executor.py
index 0dc2673f..378cc70a 100644
--- a/src/sktime_mcp/runtime/executor.py
+++ b/src/sktime_mcp/runtime/executor.py
@@ -788,46 +788,54 @@ def format_data_handle(
 
         # 3. Infer and set frequency
         if auto_infer_freq:
-            freq = y.index.freq
-
-            if freq is None:
-                # Try to infer
-                freq = pd.infer_freq(y.index)
+            # Guard: integer / RangeIndex data should not go through
+            # datetime-specific frequency inference (fixes #390).
+            if isinstance(y.index, pd.RangeIndex) or not hasattr(y.index, "freq"):
+                changes_made["frequency_set"] = True
+                changes_made["frequency"] = "Integer"
+            else:
+                freq = y.index.freq
 
                 if freq is None:
-                    # Manual inference
-                    time_diffs = y.index.to_series().diff().dropna()
-                    if len(time_diffs) > 0:
-                        most_common_diff = time_diffs.mode()[0]
-
-                        if most_common_diff == pd.Timedelta(days=1):
-                            freq = "D"
-                        elif most_common_diff == pd.Timedelta(hours=1):
-                            freq = "h"
-                        elif most_common_diff == pd.Timedelta(minutes=1):
-                            freq = "min"
-                        elif most_common_diff == pd.Timedelta(seconds=1):
-                            freq = "s"
-                        elif most_common_diff == pd.Timedelta(days=7):
-                            freq = "W"
-                        elif most_common_diff.days >= 28 and most_common_diff.days <= 31:
-                            freq = "MS"
-                        else:
-                            freq = "D"
-
-            # Create complete date range
-            if freq:
-                full_range = pd.date_range(start=y.index.min(), end=y.index.max(), freq=freq)
-
-                n_gaps = len(full_range) - len(y)
-
-                y = y.reindex(full_range)
-                if X is not None:
-                    X = X.reindex(full_range)
-
-                changes_made["gaps_filled"] = n_gaps
-                changes_made["frequency_set"] = True
-                changes_made["frequency"] = freq
+                    # Try to infer
+                    freq = pd.infer_freq(y.index)
+
+                    if freq is None:
+                        # Manual inference
+                        time_diffs = y.index.to_series().diff().dropna()
+                        if len(time_diffs) > 0:
+                            most_common_diff = time_diffs.mode()[0]
+
+                            if most_common_diff == pd.Timedelta(days=1):
+                                freq = "D"
+                            elif most_common_diff == pd.Timedelta(hours=1):
+                                freq = "h"
+                            elif most_common_diff == pd.Timedelta(minutes=1):
+                                freq = "min"
+                            elif most_common_diff == pd.Timedelta(seconds=1):
+                                freq = "s"
+                            elif most_common_diff == pd.Timedelta(days=7):
+                                freq = "W"
+                            elif most_common_diff.days >= 28 and most_common_diff.days <= 31:
+                                freq = "MS"
+                            else:
+                                freq = "D"
+
+                # Create complete date range
+                if freq:
+                    full_range = pd.date_range(
+                        start=y.index.min(), end=y.index.max(), freq=freq
+                    )
+
+                    n_gaps = len(full_range) - len(y)
+
+                    y = y.reindex(full_range)
+                    if X is not None:
+                        X = X.reindex(full_range)
+
+                    changes_made["gaps_filled"] = n_gaps
+                    changes_made["frequency_set"] = True
+                    changes_made["frequency"] = freq
 
         # 4. Fill missing values
         if fill_missing and y.isna().any():
@@ -837,8 +845,13 @@ def format_data_handle(
                 X = X.ffill().bfill()
             changes_made["missing_filled"] = n_missing
 
-        # 5. Set frequency explicitly on index
-        if hasattr(y.index, "freq") and changes_made.get("frequency"):
+        # 5. Set frequency explicitly on index (only for datetime-like indexes)
+        if (
+            hasattr(y.index, "freq")
+            and not isinstance(y.index, pd.RangeIndex)
+            and changes_made.get("frequency")
+            and changes_made["frequency"] != "Integer"
+        ):
             y.index.freq = changes_made["frequency"]
             if X is not None:
                 X.index.freq = changes_made["frequency"]
@@ -853,7 +866,13 @@ def format_data_handle(
             "metadata": {
                 **data_info["metadata"],
                 "formatted": True,
-                "frequency": str(y.index.freq) if y.index.freq else changes_made.get("frequency"),
+                "frequency": (
+                    str(y.index.freq)
+                    if hasattr(y.index, "freq")
+                    and not isinstance(y.index, pd.RangeIndex)
+                    and y.index.freq
+                    else changes_made.get("frequency")
+                ),
                 "rows": len(y),
                 "start_date": str(y.index.min()),
                 "end_date": str(y.index.max()),
diff --git a/tests/test_core.py b/tests/test_core.py
index 5f478e75..2c64924a 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -256,6 +256,73 @@ def fake_save_model(**kwargs):
         assert calls["serialization_format"] == "pickle"
 
 
+class TestRangeIndexFormatting:
+    """Tests for RangeIndex (integer-indexed) time series formatting (issue #390)."""
+
+    def test_format_data_handle_rangeindex_no_crash(self):
+        """format_data_handle should not crash on integer-indexed data."""
+        import pandas as pd
+
+        from sktime_mcp.runtime.executor import Executor
+
+        executor = Executor()
+
+        # Manually store a RangeIndex series as if load_data_source created it
+        y = pd.Series([1, 2, 3, 4, 5], name="target")
+        handle = "data_test_int"
+        executor._data_handles[handle] = {
+            "y": y,
+            "X": None,
+            "metadata": {"columns": ["target"]},
+            "validation": {},
+            "config": {},
+        }
+
+        result = executor.format_data_handle(handle, auto_infer_freq=True)
+
+        assert result["success"], f"Expected success but got: {result}"
+        assert result["changes_made"]["frequency"] == "Integer"
+        assert result["changes_made"]["frequency_set"] is True
+
+    def test_format_data_handle_rangeindex_metadata(self):
+        """Metadata should contain 'Integer' as the frequency for RangeIndex data."""
+        import pandas as pd
+
+        from sktime_mcp.runtime.executor import Executor
+
+        executor = Executor()
+
+        y = pd.Series([10, 20, 30], name="value")
+        handle = "data_test_meta"
+        executor._data_handles[handle] = {
+            "y": y,
+            "X": None,
+            "metadata": {"columns": ["value"]},
+            "validation": {},
+            "config": {},
+        }
+
+        result = executor.format_data_handle(handle)
+
+        assert result["success"]
+        assert result["metadata"]["frequency"] == "Integer"
+
+    def test_load_data_source_rangeindex_end_to_end(self):
+        """load_data_source with integer-indexed inline data should succeed."""
+        from sktime_mcp.runtime.executor import Executor
+
+        executor = Executor()
+
+        config = {
+            "type": "pandas",
+            "data": {"y": [1, 2, 3, 4]},
+        }
+
+        result = executor.load_data_source(config)
+
+        assert result["success"], f"Expected success but got: {result}"
+
+
 class TestSearchEstimatorsLimit:
     """Tests for the limit parameter validation in search_estimators_tool."""
 