From ee5b9e64eb7fee54bade3e66edc37cb7937e72a3 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 20 Jul 2023 10:11:12 -0400 Subject: [PATCH 01/14] Implement baseline pipeline --- evalml/pipelines/components/component_base.py | 1 + ...tiseries_time_series_baseline_regressor.py | 122 ++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 12b6603bb4..5c00a053e1 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -31,6 +31,7 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta): # Referring to the pandas nullable dtypes; not just woodwork logical types _integer_nullable_incompatibilities = [] _boolean_nullable_incompatibilities = [] + is_multiseries = False def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): """Base class for all components. diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py new file mode 100644 index 0000000000..27e7ab0099 --- /dev/null +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -0,0 +1,122 @@ +"""Time series estimator that predicts using the naive forecasting approach.""" +import numpy as np +import pandas as pd + +from evalml.model_family import ModelFamily +from evalml.pipelines.components.estimators import Estimator +from evalml.problem_types import ProblemTypes +from evalml.utils import infer_feature_types + + +class MultiseriesTimeSeriesBaselineRegressor(Estimator): + """Multiseries time series regressor that predicts using the naive forecasting approach. + + This is useful as a simple baseline estimator for multiseries time series problems. + + Args: + gap (int): Gap between prediction date and target date and must be a positive integer. If gap is 0, target date will be shifted ahead by 1 time period. Defaults to 1. + forecast_horizon (int): Number of time steps the model is expected to predict. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + name = "Time Series Baseline Regressor" + hyperparameter_ranges = {} + """{}""" + model_family = ModelFamily.BASELINE + """ModelFamily.BASELINE""" + is_multiseries = True + supported_problem_types = [ + ProblemTypes.TIME_SERIES_REGRESSION, + ] + """[ + ProblemTypes.TIME_SERIES_REGRESSION, + ]""" + + def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs): + self._prediction_value = None + self.start_delay = forecast_horizon + gap + self._num_features = None + + if gap < 0: + raise ValueError( + f"gap value must be a positive integer. {gap} was provided.", + ) + + parameters = {"gap": gap, "forecast_horizon": forecast_horizon} + parameters.update(kwargs) + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + + def fit(self, X, y=None): + """Fits multiseries time series baseline regressor to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series): The target training data of length [n_samples]. + + Returns: + self + + Raises: + ValueError: If input y is None. + """ + if y is None: + raise ValueError( + "Cannot train Multiseries Time Series Baseline Regressor if y is None", + ) + self._series_names = y.columns + + delay_columns = pd.DataFrame( + np.zeros((self.start_delay, y.shape[1])), + columns=self._series_names, + index=range(y.index[-1], self.start_delay + y.index[-1]), + ) + y = pd.concat([y, delay_columns]) + + self._delayed_target = y.shift(self.start_delay, fill_value=0) + + return self + + def predict(self, X): + """Make predictions using fitted multiseries time series baseline regressor. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.Series: Predicted values. + + Raises: + ValueError: If input y is None. + """ + X = infer_feature_types(X) + self._num_features = X.shape[1] + + in_sample_delay = self._delayed_target[self._delayed_target.index.isin(X.index)] + + out_of_sample_delay = pd.DataFrame(columns=self._series_names) + out_of_sample_offset = X.index[-1] - self._delayed_target.index[-1] + if out_of_sample_offset > 0: + out_of_sample_delay = pd.DataFrame( + np.zeros((out_of_sample_offset, len(self._series_names))), + columns=self._series_names, + index=range(self._delayed_target.index[-1] + 1, X.index[-1] + 1), + ) + + y_pred = pd.concat([in_sample_delay, out_of_sample_delay]) + return y_pred + + @property + def feature_importance(self): + """Returns importance associated with each feature. + + Since baseline estimators do not use input features to calculate predictions, returns an array of zeroes. + + Returns: + np.ndarray (float): An array of zeroes. + """ + importance = np.array([0] * self._num_features) + return importance From 004a1ddb1064c6d004a25979ab945a0a6105cd60 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 20 Jul 2023 10:13:48 -0400 Subject: [PATCH 02/14] Add multiseries to init files --- evalml/pipelines/components/__init__.py | 1 + evalml/pipelines/components/estimators/__init__.py | 1 + evalml/pipelines/components/estimators/regressors/__init__.py | 3 +++ 3 files changed, 5 insertions(+) diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index 1d00a850cf..30a200256b 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -21,6 +21,7 @@ DecisionTreeClassifier, DecisionTreeRegressor, TimeSeriesBaselineEstimator, + MultiseriesTimeSeriesBaselineRegressor, KNeighborsClassifier, ProphetRegressor, SVMClassifier, diff --git a/evalml/pipelines/components/estimators/__init__.py b/evalml/pipelines/components/estimators/__init__.py index 1528742106..ce9dc742a1 100644 --- a/evalml/pipelines/components/estimators/__init__.py +++ b/evalml/pipelines/components/estimators/__init__.py @@ -25,6 +25,7 @@ ExtraTreesRegressor, BaselineRegressor, TimeSeriesBaselineEstimator, + MultiseriesTimeSeriesBaselineRegressor, DecisionTreeRegressor, SVMRegressor, ExponentialSmoothingRegressor, diff --git a/evalml/pipelines/components/estimators/regressors/__init__.py b/evalml/pipelines/components/estimators/regressors/__init__.py index a35167d54b..b98e3a7fdb 100644 --- a/evalml/pipelines/components/estimators/regressors/__init__.py +++ b/evalml/pipelines/components/estimators/regressors/__init__.py @@ -29,6 +29,9 @@ from evalml.pipelines.components.estimators.regressors.time_series_baseline_estimator import ( TimeSeriesBaselineEstimator, ) +from evalml.pipelines.components.estimators.regressors.multiseries_time_series_baseline_regressor import ( + MultiseriesTimeSeriesBaselineRegressor, +) from evalml.pipelines.components.estimators.regressors.prophet_regressor import ( ProphetRegressor, ) From 3f45eb29793f0c4ebfc8077cee4b5bc90829ab46 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 20 Jul 2023 10:54:32 -0400 Subject: [PATCH 03/14] Add tests --- ...tiseries_time_series_baseline_regressor.py | 2 +- .../test_multiseries_baseline_regressor.py | 57 +++++++++++++++++++ evalml/tests/conftest.py | 16 ++++++ 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 evalml/tests/component_tests/test_multiseries_baseline_regressor.py diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index 27e7ab0099..ad83f42cb7 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -72,7 +72,7 @@ def fit(self, X, y=None): delay_columns = pd.DataFrame( np.zeros((self.start_delay, y.shape[1])), columns=self._series_names, - index=range(y.index[-1], self.start_delay + y.index[-1]), + index=range(y.index[-1] + 1, self.start_delay + y.index[-1] + 1), ) y = pd.concat([y, delay_columns]) diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py new file mode 100644 index 0000000000..031f9b20b0 --- /dev/null +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -0,0 +1,57 @@ +import pandas as pd +import pytest + +from evalml.model_family import ModelFamily +from evalml.pipelines.components import MultiseriesTimeSeriesBaselineRegressor + + +def test_multiseries_time_series_baseline_regressor_init(): + baseline = MultiseriesTimeSeriesBaselineRegressor() + assert baseline.model_family == ModelFamily.BASELINE + assert baseline.is_multiseries + assert baseline.start_delay == 2 + + baseline = MultiseriesTimeSeriesBaselineRegressor(gap=2, forecast_horizon=5) + assert baseline.start_delay == 7 + + +def test_multiseries_time_series_baseline_gap_negative(): + with pytest.raises(ValueError, match="gap value must be a positive integer."): + MultiseriesTimeSeriesBaselineRegressor(gap=-1) + + +def test_multiseries_time_series_baseline_estimator_y_is_none( + X_y_multiseries_regression, +): + X, y = X_y_multiseries_regression + + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) + + with pytest.raises(ValueError, match="if y is None"): + estimator.fit(X, None) + + +def test_multiseries_time_series_baseline_lags(X_y_multiseries_regression): + X, y = X_y_multiseries_regression + + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) + estimator.fit(X, y) + + assert len(estimator._delayed_target) == len(y) + 2 + assert (estimator._delayed_target.columns == y.columns).all() + + +def test_multiseries_time_series_baseline_includes_future(X_y_multiseries_regression): + X, y = X_y_multiseries_regression + + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=1, forecast_horizon=2) + estimator.fit(X, y) + + X_future = pd.DataFrame(columns=X.columns, index=range(len(X), len(X) + 10)) + y_pred = estimator.predict(X_future) + + pd.testing.assert_frame_equal( + y_pred[:3].reset_index(drop=True), + y[-3:].reset_index(drop=True), + ) + assert (y_pred[3:] == 0).all().all() diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 608ccda275..35a71a60ab 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -830,6 +830,22 @@ def X_y_regression(): return X, y +@pytest.fixture +def X_y_multiseries_regression(): + X, _ = datasets.make_regression( + n_samples=100, + n_features=20, + n_informative=3, + random_state=0, + ) + y, _ = datasets.make_regression(n_samples=100, n_features=4) + X = pd.DataFrame(X) + X.ww.init(logical_types={col: "double" for col in X.columns}) + y = pd.DataFrame(y) + y.ww.init(logical_types={col: "double" for col in y.columns}) + return X, y + + @pytest.fixture def X_y_multi(): X, y = datasets.make_classification( From 308c8bca2981fb37c3cb141dd88596abda6184e6 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 24 Jul 2023 11:48:54 -0400 Subject: [PATCH 04/14] Release notes --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 620e5df1c0..eca03c993c 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,6 +3,7 @@ Release Notes **Future Releases** * Enhancements * Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233` + * Added baseline regressor for multiseries time series problems :pr:`4246` * Fixes * Changes * Unpinned sktime version :pr:`4214` From 0f35eec6350e10808c788e57b55b4d129aba11a6 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 24 Jul 2023 15:10:38 -0400 Subject: [PATCH 05/14] Test fixes --- .../multiseries_time_series_baseline_regressor.py | 12 ++++++++++-- evalml/tests/component_tests/test_components.py | 15 +++++++++++++-- .../test_multiseries_baseline_regressor.py | 6 ++++-- evalml/tests/component_tests/test_utils.py | 1 + evalml/tests/pipeline_tests/test_pipelines.py | 4 ++-- evalml/utils/gen_utils.py | 1 + 6 files changed, 31 insertions(+), 8 deletions(-) diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index ad83f42cb7..8938d59d82 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -19,7 +19,7 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator): random_seed (int): Seed for the random number generator. Defaults to 0. """ - name = "Time Series Baseline Regressor" + name = "Multiseries Time Series Baseline Regressor" hyperparameter_ranges = {} """{}""" model_family = ModelFamily.BASELINE @@ -67,7 +67,14 @@ def fit(self, X, y=None): raise ValueError( "Cannot train Multiseries Time Series Baseline Regressor if y is None", ) + if isinstance(y, pd.Series): + raise ValueError( + "y must be a DataFrame with multiple columns for Multiseries Time Series Baseline Regressor", + ) + self._num_features = X.shape[1] self._series_names = y.columns + if not y.index.is_numeric(): + y = y.reset_index(drop=True) delay_columns = pd.DataFrame( np.zeros((self.start_delay, y.shape[1])), @@ -93,7 +100,8 @@ def predict(self, X): ValueError: If input y is None. """ X = infer_feature_types(X) - self._num_features = X.shape[1] + if not X.index.is_numeric(): + X = X.reset_index(drop=True) in_sample_delay = self._delayed_target[self._delayed_target.index.isin(X.index)] diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py index 3245a57185..00a2a13190 100644 --- a/evalml/tests/component_tests/test_components.py +++ b/evalml/tests/component_tests/test_components.py @@ -1015,9 +1015,9 @@ def test_components_can_be_used_for_partial_dependence_fast_mode(): # Expected number is hardcoded so that this test will fail when new components are added # It should be len(all_native_components) - num_invalid_for_pd_fast_mode if ProphetRegressor not in all_native_components: - expected_num_valid_for_pd_fast_mode = 63 - else: expected_num_valid_for_pd_fast_mode = 64 + else: + expected_num_valid_for_pd_fast_mode = 65 assert num_valid_for_pd_fast_mode == expected_num_valid_for_pd_fast_mode @@ -1224,6 +1224,8 @@ def test_all_estimators_check_fit( in component_class.supported_problem_types ): X, _, y = ts_data() + if component_class.is_multiseries: + y = pd.DataFrame({"target_a": y, "target_b": y}) else: X, y = X_y_binary @@ -1367,6 +1369,9 @@ def test_serialization( else: X, y = X_y_binary + if component_class.is_multiseries: + y = pd.DataFrame({"target_a": y, "target_b": y}) + component.fit(X, y) for pickle_protocol in range(cloudpickle.DEFAULT_PROTOCOL + 1): @@ -1740,6 +1745,9 @@ def test_estimator_fit_respects_custom_indices( X = pd.DataFrame(X) y = pd.Series(y) + if estimator_class.is_multiseries: + y = pd.DataFrame({"target_a": y, "target_b": y}) + if use_custom_index and ts_problem: X.index = pd.date_range("2020-10-01", periods=40) y.index = pd.date_range("2020-10-01", periods=40) @@ -1961,6 +1969,9 @@ def test_components_support_nullable_types( else: X = X.ww.select(["numeric", "Boolean", "BooleanNullable"]) + if component.is_multiseries: + y = pd.DataFrame({"target_a": y, "target_b": y}) + component.fit(X, y) if issubclass(component_class, Estimator): component.predict(X) diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py index 031f9b20b0..753337bd06 100644 --- a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -20,15 +20,17 @@ def test_multiseries_time_series_baseline_gap_negative(): MultiseriesTimeSeriesBaselineRegressor(gap=-1) -def test_multiseries_time_series_baseline_estimator_y_is_none( +def test_multiseries_time_series_baseline_estimator_invalid_y( X_y_multiseries_regression, ): - X, y = X_y_multiseries_regression + X, _ = X_y_multiseries_regression estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) with pytest.raises(ValueError, match="if y is None"): estimator.fit(X, None) + with pytest.raises(ValueError, match="y must be a DataFrame"): + estimator.fit(X, pd.Series(range(100))) def test_multiseries_time_series_baseline_lags(X_y_multiseries_regression): diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index 4d59202d3a..3e4d0f6c56 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -75,6 +75,7 @@ "Target Imputer", "Natural Language Featurizer", "Time Series Baseline Estimator", + "Multiseries Time Series Baseline Regressor", "Time Series Imputer", "Time Series Regularizer", "URL Featurizer", diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index 6720f4732d..e2660d9e1e 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -113,9 +113,9 @@ def test_all_estimators( is_using_conda, ): if is_using_conda: - n_estimators = 13 - else: n_estimators = 14 + else: + n_estimators = 15 assert len(_all_estimators_used_in_search()) == n_estimators diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index c0aabf7424..af253fa021 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -209,6 +209,7 @@ def _get_subclasses(base_class): "BaselineClassifier", "BaselineRegressor", "TimeSeriesBaselineEstimator", + "MultiseriesTimeSeriesBaselineRegressor", "StackedEnsembleClassifier", "StackedEnsembleRegressor", "KNeighborsClassifier", From 3f597b4f0170ccbb39ea68d27b217923b62b9108 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 24 Jul 2023 15:38:49 -0400 Subject: [PATCH 06/14] Revert test fix --- evalml/tests/pipeline_tests/test_pipelines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index e2660d9e1e..6720f4732d 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -113,9 +113,9 @@ def test_all_estimators( is_using_conda, ): if is_using_conda: - n_estimators = 14 + n_estimators = 13 else: - n_estimators = 15 + n_estimators = 14 assert len(_all_estimators_used_in_search()) == n_estimators From c179eeb0cd5d2f6c1646735eea94fa313666f237 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 26 Jul 2023 14:05:26 -0400 Subject: [PATCH 07/14] Update baseline to expect delayed columns in X --- ...tiseries_time_series_baseline_regressor.py | 38 +++++-------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index 8938d59d82..c8027d08b2 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -71,19 +71,7 @@ def fit(self, X, y=None): raise ValueError( "y must be a DataFrame with multiple columns for Multiseries Time Series Baseline Regressor", ) - self._num_features = X.shape[1] - self._series_names = y.columns - if not y.index.is_numeric(): - y = y.reset_index(drop=True) - - delay_columns = pd.DataFrame( - np.zeros((self.start_delay, y.shape[1])), - columns=self._series_names, - index=range(y.index[-1] + 1, self.start_delay + y.index[-1] + 1), - ) - y = pd.concat([y, delay_columns]) - - self._delayed_target = y.shift(self.start_delay, fill_value=0) + self._target_column_names = list(y.columns) return self @@ -100,22 +88,16 @@ def predict(self, X): ValueError: If input y is None. """ X = infer_feature_types(X) - if not X.index.is_numeric(): - X = X.reset_index(drop=True) - - in_sample_delay = self._delayed_target[self._delayed_target.index.isin(X.index)] - - out_of_sample_delay = pd.DataFrame(columns=self._series_names) - out_of_sample_offset = X.index[-1] - self._delayed_target.index[-1] - if out_of_sample_offset > 0: - out_of_sample_delay = pd.DataFrame( - np.zeros((out_of_sample_offset, len(self._series_names))), - columns=self._series_names, - index=range(self._delayed_target.index[-1] + 1, X.index[-1] + 1), + feature_names = [ + f"{col}_delay_{self.start_delay}" for col in self._target_column_names + ] + if not set(feature_names).issubset(set(X.columns)): + raise ValueError( + "Multiseries Time Series Baseline Regressor is meant to be used in a pipeline with " + "a Time Series Featurizer", ) - - y_pred = pd.concat([in_sample_delay, out_of_sample_delay]) - return y_pred + self._num_features = X.shape[1] + return X.ww[feature_names] @property def feature_importance(self): From fc932d5602a3008096263cfe44925e50d92c5710 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 26 Jul 2023 14:22:32 -0400 Subject: [PATCH 08/14] Refactor TS featurizer to handle y as a df --- .../preprocessing/time_series_featurizer.py | 58 ++++++++++++------- .../test_time_series_featurizer.py | 15 +++++ evalml/tests/conftest.py | 2 +- 3 files changed, 52 insertions(+), 23 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index c966dc3162..0cb0c31e7f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -124,12 +124,15 @@ def fit(self, X, y=None): """ if self.time_index is None: raise ValueError("time_index cannot be None!") - self.statistically_significant_lags = self._find_significant_lags( - y, - conf_level=self.conf_level, - start_delay=self.start_delay, - max_delay=self.max_delay, - ) + if isinstance(y, pd.DataFrame): + self.statistically_significant_lags = [self.start_delay] + else: + self.statistically_significant_lags = self._find_significant_lags( + y, + conf_level=self.conf_level, + start_delay=self.start_delay, + max_delay=self.max_delay, + ) return self @staticmethod @@ -215,6 +218,22 @@ def _compute_rolling_transforms(self, X, y, original_features): ) return data + def _delay_df( + self, + data, + cols_to_delay, + categorical_columns=None, + X_categorical=None, + ): + lagged_features = {} + for col_name in cols_to_delay: + col = data[col_name] + if categorical_columns and col_name in categorical_columns: + col = X_categorical[col_name] + for t in self.statistically_significant_lags: + lagged_features[f"{col_name}_delay_{t}"] = col.shift(t) + return lagged_features + def _compute_delays(self, X_ww, y): """Computes the delayed features for numeric/categorical features in X and y. @@ -234,33 +253,28 @@ def _compute_delays(self, X_ww, y): ).columns, ) categorical_columns = self._get_categorical_columns(X_ww) - cols_derived_from_categoricals = [] lagged_features = {} if self.delay_features and len(X_ww) > 0: X_categorical = self._encode_X_while_preserving_index( X_ww[categorical_columns], ) - for col_name in cols_to_delay: - col = X_ww[col_name] - if col_name in categorical_columns: - col = X_categorical[col_name] - for t in self.statistically_significant_lags: - feature_name = f"{col_name}_delay_{t}" - lagged_features[f"{col_name}_delay_{t}"] = col.shift(t) - if col_name in categorical_columns: - cols_derived_from_categoricals.append(feature_name) + lagged_features.update( + self._delay_df(X_ww, cols_to_delay, categorical_columns, X_categorical), + ) # Handle cases where the target was passed in if self.delay_target and y is not None: - if type(y.ww.logical_type) == logical_types.Categorical: - y = self._encode_y_while_preserving_index(y) - for t in self.statistically_significant_lags: - lagged_features[self.target_colname_prefix.format(t)] = y.shift(t) + if isinstance(y, pd.DataFrame): + lagged_features.update(self._delay_df(y, y.columns)) + else: + if type(y.ww.logical_type) == logical_types.Categorical: + y = self._encode_y_while_preserving_index(y) + for t in self.statistically_significant_lags: + lagged_features[self.target_colname_prefix.format(t)] = y.shift(t) # Features created from categorical columns should no longer be categorical - lagged_features = pd.DataFrame(lagged_features) + lagged_features = pd.DataFrame(lagged_features, index=X_ww.index) lagged_features.ww.init( logical_types={col: "Double" for col in lagged_features.columns}, ) - lagged_features.index = X_ww.index return ww.concat_columns([X_ww, lagged_features]) def transform(self, X, y=None): diff --git a/evalml/tests/component_tests/test_time_series_featurizer.py b/evalml/tests/component_tests/test_time_series_featurizer.py index db206495aa..386db4190f 100644 --- a/evalml/tests/component_tests/test_time_series_featurizer.py +++ b/evalml/tests/component_tests/test_time_series_featurizer.py @@ -981,3 +981,18 @@ def test_delay_feature_transformer_works_for_non_numeric_ordinal_categories(ts_d output.fit(X, y) X_t = output.transform(X, y) assert set(X_t["cats_delay_1"].value_counts().to_dict().keys()) == {2.0, 0.0, 1.0} + + +def test_featurizer_y_dataframe(X_y_multiseries_regression): + X, y = X_y_multiseries_regression + X.index = pd.date_range("2021-01-01", periods=X.shape[0]) + + featurizer = TimeSeriesFeaturizer(time_index="index", gap=1, forecast_horizon=5) + featurizer.fit(X, y) + + assert featurizer.statistically_significant_lags == [6] + + expected_y_cols = [f"y_{i}_delay_6" for i in range(y.shape[1])] + X_t = featurizer.transform(X, y) + for expected_y_col in expected_y_cols: + assert expected_y_col in X_t.columns diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 35a71a60ab..d413eb9241 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -841,7 +841,7 @@ def X_y_multiseries_regression(): y, _ = datasets.make_regression(n_samples=100, n_features=4) X = pd.DataFrame(X) X.ww.init(logical_types={col: "double" for col in X.columns}) - y = pd.DataFrame(y) + y = pd.DataFrame(y, columns=[f"y_{i}" for i in range(y.shape[1])]) y.ww.init(logical_types={col: "double" for col in y.columns}) return X, y From 7b20f7f99b3214b9f6b8f1201a3a511be939fb41 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 26 Jul 2023 14:30:41 -0400 Subject: [PATCH 09/14] Adjust baseline tests to account for change --- .../test_multiseries_baseline_regressor.py | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py index 753337bd06..685b912e16 100644 --- a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -2,7 +2,10 @@ import pytest from evalml.model_family import ModelFamily -from evalml.pipelines.components import MultiseriesTimeSeriesBaselineRegressor +from evalml.pipelines.components import ( + MultiseriesTimeSeriesBaselineRegressor, + TimeSeriesFeaturizer, +) def test_multiseries_time_series_baseline_regressor_init(): @@ -36,24 +39,14 @@ def test_multiseries_time_series_baseline_estimator_invalid_y( def test_multiseries_time_series_baseline_lags(X_y_multiseries_regression): X, y = X_y_multiseries_regression - estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) - estimator.fit(X, y) - - assert len(estimator._delayed_target) == len(y) + 2 - assert (estimator._delayed_target.columns == y.columns).all() + feat = TimeSeriesFeaturizer(time_index="index", gap=0, forecast_horizon=2) + feat.fit(X, y) + X_t = feat.transform(X, y) + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) + estimator.fit(X_t, y) -def test_multiseries_time_series_baseline_includes_future(X_y_multiseries_regression): - X, y = X_y_multiseries_regression - - estimator = MultiseriesTimeSeriesBaselineRegressor(gap=1, forecast_horizon=2) - estimator.fit(X, y) - - X_future = pd.DataFrame(columns=X.columns, index=range(len(X), len(X) + 10)) - y_pred = estimator.predict(X_future) - - pd.testing.assert_frame_equal( - y_pred[:3].reset_index(drop=True), - y[-3:].reset_index(drop=True), - ) - assert (y_pred[3:] == 0).all().all() + pred = estimator.predict(X_t) + expected = y.shift(2) + expected.columns = [f"{col}_delay_2" for col in expected.columns] + pd.testing.assert_frame_equal(pred, expected) From c66b5723522429c7f7db94664a7ff8fef2483025 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 26 Jul 2023 14:36:35 -0400 Subject: [PATCH 10/14] Missing test case --- .../multiseries_time_series_baseline_regressor.py | 2 +- .../test_multiseries_baseline_regressor.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index c8027d08b2..2c595ca680 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -85,7 +85,7 @@ def predict(self, X): pd.Series: Predicted values. Raises: - ValueError: If input y is None. + ValueError: If the lagged columns are not present in X. """ X = infer_feature_types(X) feature_names = [ diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py index 685b912e16..d0e71618df 100644 --- a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -36,6 +36,16 @@ def test_multiseries_time_series_baseline_estimator_invalid_y( estimator.fit(X, pd.Series(range(100))) +def test_multiseries_baseline_no_featurizer(X_y_multiseries_regression): + X, y = X_y_multiseries_regression + + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) + estimator.fit(X, y) + + with pytest.raises(ValueError, match="is meant to be used in a pipeline with "): + estimator.predict(X) + + def test_multiseries_time_series_baseline_lags(X_y_multiseries_regression): X, y = X_y_multiseries_regression From 42f2a26dc9e24acde1d8927d3143a890a4e2ae6a Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 26 Jul 2023 14:59:24 -0400 Subject: [PATCH 11/14] Test fixes --- .../multiseries_time_series_baseline_regressor.py | 2 +- evalml/tests/component_tests/test_components.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index 2c595ca680..3c59464c03 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -72,6 +72,7 @@ def fit(self, X, y=None): "y must be a DataFrame with multiple columns for Multiseries Time Series Baseline Regressor", ) self._target_column_names = list(y.columns) + self._num_features = X.shape[1] return self @@ -96,7 +97,6 @@ def predict(self, X): "Multiseries Time Series Baseline Regressor is meant to be used in a pipeline with " "a Time Series Featurizer", ) - self._num_features = X.shape[1] return X.ww[feature_names] @property diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py index 00a2a13190..918a237d18 100644 --- a/evalml/tests/component_tests/test_components.py +++ b/evalml/tests/component_tests/test_components.py @@ -40,6 +40,7 @@ LinearDiscriminantAnalysis, LinearRegressor, LogisticRegressionClassifier, + MultiseriesTimeSeriesBaselineRegressor, NaturalLanguageFeaturizer, OneHotEncoder, Oversampler, @@ -1210,6 +1211,7 @@ def test_all_estimators_check_fit( StackedEnsembleClassifier, StackedEnsembleRegressor, TimeSeriesBaselineEstimator, + MultiseriesTimeSeriesBaselineRegressor, VowpalWabbitBinaryClassifier, VowpalWabbitMulticlassClassifier, VowpalWabbitRegressor, @@ -1224,8 +1226,6 @@ def test_all_estimators_check_fit( in component_class.supported_problem_types ): X, _, y = ts_data() - if component_class.is_multiseries: - y = pd.DataFrame({"target_a": y, "target_b": y}) else: X, y = X_y_binary @@ -1923,7 +1923,10 @@ def test_components_support_nullable_types( component is added that has nullable type incompatibilities, this should fail.""" cannot_handle_boolean_target = [CatBoostRegressor] - if component_class == TimeSeriesBaselineEstimator: + if ( + component_class == TimeSeriesBaselineEstimator + or component_class == MultiseriesTimeSeriesBaselineRegressor + ): pytest.skip( "Time Series Baseline Estimator can only be used within a Pipeline.", ) @@ -1969,9 +1972,6 @@ def test_components_support_nullable_types( else: X = X.ww.select(["numeric", "Boolean", "BooleanNullable"]) - if component.is_multiseries: - y = pd.DataFrame({"target_a": y, "target_b": y}) - component.fit(X, y) if issubclass(component_class, Estimator): component.predict(X) From 6489510d4f4aa8c9073483b8d15ba234a8c06e96 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 31 Jul 2023 11:40:26 -0400 Subject: [PATCH 12/14] Consolidate test fixtures to agreed framework --- evalml/pipelines/utils.py | 10 +++------ .../test_multiseries_baseline_regressor.py | 14 ++++++------ .../test_time_series_featurizer.py | 9 ++++---- evalml/tests/conftest.py | 22 +------------------ .../pipeline_tests/test_pipeline_utils.py | 15 ------------- 5 files changed, 15 insertions(+), 55 deletions(-) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index c95a3bbc51..750d9e4b5c 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -1357,7 +1357,6 @@ def unstack_multiseries( series_id, time_index, target_name, - keep_time_in_index=True, ): """Converts multiseries data with one series_id column and one target column to one target column per series id. @@ -1367,8 +1366,6 @@ def unstack_multiseries( series_id (str): The column which identifies which series each row belongs to. time_index (str): Specifies the name of the column in X that provides the datetime objects. target_name (str): The name of the target column. - keep_time_in_index (bool): Whether to maintain the time index as the index of the returned dataframes. Defaults to True. - If set to false, will discard the time index information entirely. Returns: pd.DataFrame, pd.DataFrame: The unstacked X and y data. @@ -1401,10 +1398,9 @@ def unstack_multiseries( X_unstacked = pd.concat(X_unstacked_cols, axis=1) y_unstacked = pd.concat(y_unstacked_cols, axis=1) - # Reset the axis if need be - if not keep_time_in_index: - X_unstacked.reset_index(drop=True, inplace=True) - y_unstacked.reset_index(drop=True, inplace=True) + # Reset the axes now that they've been unstacked, keep time info in X + X_unstacked = X_unstacked.reset_index() + y_unstacked = y_unstacked.reset_index(drop=True) return X_unstacked, y_unstacked diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py index d0e71618df..0b63eca318 100644 --- a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -24,9 +24,9 @@ def test_multiseries_time_series_baseline_gap_negative(): def test_multiseries_time_series_baseline_estimator_invalid_y( - X_y_multiseries_regression, + multiseries_ts_data_unstacked, ): - X, _ = X_y_multiseries_regression + X, _ = multiseries_ts_data_unstacked estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) @@ -36,8 +36,8 @@ def test_multiseries_time_series_baseline_estimator_invalid_y( estimator.fit(X, pd.Series(range(100))) -def test_multiseries_baseline_no_featurizer(X_y_multiseries_regression): - X, y = X_y_multiseries_regression +def test_multiseries_baseline_no_featurizer(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) estimator.fit(X, y) @@ -46,10 +46,10 @@ def test_multiseries_baseline_no_featurizer(X_y_multiseries_regression): estimator.predict(X) -def test_multiseries_time_series_baseline_lags(X_y_multiseries_regression): - X, y = X_y_multiseries_regression +def test_multiseries_time_series_baseline_lags(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked - feat = TimeSeriesFeaturizer(time_index="index", gap=0, forecast_horizon=2) + feat = TimeSeriesFeaturizer(time_index="date", gap=0, forecast_horizon=2) feat.fit(X, y) X_t = feat.transform(X, y) diff --git a/evalml/tests/component_tests/test_time_series_featurizer.py b/evalml/tests/component_tests/test_time_series_featurizer.py index 386db4190f..0458d8cfd0 100644 --- a/evalml/tests/component_tests/test_time_series_featurizer.py +++ b/evalml/tests/component_tests/test_time_series_featurizer.py @@ -983,16 +983,15 @@ def test_delay_feature_transformer_works_for_non_numeric_ordinal_categories(ts_d assert set(X_t["cats_delay_1"].value_counts().to_dict().keys()) == {2.0, 0.0, 1.0} -def test_featurizer_y_dataframe(X_y_multiseries_regression): - X, y = X_y_multiseries_regression - X.index = pd.date_range("2021-01-01", periods=X.shape[0]) +def test_featurizer_y_dataframe(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked - featurizer = TimeSeriesFeaturizer(time_index="index", gap=1, forecast_horizon=5) + featurizer = TimeSeriesFeaturizer(time_index="date", gap=1, forecast_horizon=5) featurizer.fit(X, y) assert featurizer.statistically_significant_lags == [6] - expected_y_cols = [f"y_{i}_delay_6" for i in range(y.shape[1])] + expected_y_cols = [f"target_{i}_delay_6" for i in range(y.shape[1])] X_t = featurizer.transform(X, y) for expected_y_col in expected_y_cols: assert expected_y_col in X_t.columns diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index f67c5ef55d..1882f7c05e 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -830,22 +830,6 @@ def X_y_regression(): return X, y -@pytest.fixture -def X_y_multiseries_regression(): - X, _ = datasets.make_regression( - n_samples=100, - n_features=20, - n_informative=3, - random_state=0, - ) - y, _ = datasets.make_regression(n_samples=100, n_features=4) - X = pd.DataFrame(X) - X.ww.init(logical_types={col: "double" for col in X.columns}) - y = pd.DataFrame(y, columns=[f"y_{i}" for i in range(y.shape[1])]) - y.ww.init(logical_types={col: "double" for col in y.columns}) - return X, y - - @pytest.fixture def X_y_multi(): X, y = datasets.make_classification( @@ -1050,11 +1034,7 @@ def multiseries_ts_data_unstacked(): y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) - X.index = pd.date_range(start="1/1/2018", periods=20) - X.index.name = "date" - y.index = pd.date_range(start="1/1/2018", periods=20) - y.index.name = "date" - + X["date"] = pd.date_range(start="1/1/2018", periods=20) return X, y diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index d1b74d5283..2064dcc835 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -1379,10 +1379,8 @@ def test_make_pipeline_features_and_dfs(X_y_binary): @pytest.mark.parametrize("target_name", ["target", "Target_Data"]) -@pytest.mark.parametrize("keep_time_in_index", [True, False]) def test_unstack_multiseries( target_name, - keep_time_in_index, multiseries_ts_data_stacked, multiseries_ts_data_unstacked, ): @@ -1392,9 +1390,6 @@ def test_unstack_multiseries( y_unstacked.columns = [ f"{target_name}_{i}" for i in range(len(y_unstacked.columns)) ] - if not keep_time_in_index: - X_unstacked.reset_index(drop=True, inplace=True) - y_unstacked.reset_index(drop=True, inplace=True) X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries( X, @@ -1402,7 +1397,6 @@ def test_unstack_multiseries( "series_id", "date", target_name=target_name, - keep_time_in_index=keep_time_in_index, ) pd.testing.assert_frame_equal( X_unstacked.sort_index(axis=1), @@ -1418,11 +1412,9 @@ def test_unstack_multiseries( @pytest.mark.parametrize("include_series_id", [True, False]) @pytest.mark.parametrize("series_id_name", [None, "SERIES"]) -@pytest.mark.parametrize("index_type", ["datetime", "int"]) def test_stack_data( include_series_id, series_id_name, - index_type, multiseries_ts_data_stacked, multiseries_ts_data_unstacked, ): @@ -1430,13 +1422,6 @@ def test_stack_data( _, y_stacked = multiseries_ts_data_stacked y_stacked.name = "target" - - if index_type == "datetime": - y_stacked.index = pd.date_range(start="1/1/2018", periods=20).repeat(5) - y_stacked.index.name = "date" - else: - y = y.reset_index(drop=True) - y_stacked_transformed = stack_data( y, include_series_id=include_series_id, From e08f1c5dfa9bbfb0d646069032a18ff754568b9e Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 31 Jul 2023 11:42:11 -0400 Subject: [PATCH 13/14] Docstring corrections --- .../multiseries_time_series_baseline_regressor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index 3c59464c03..536aa1bbaa 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -54,8 +54,8 @@ def fit(self, X, y=None): """Fits multiseries time series baseline regressor to data. Args: - X (pd.DataFrame): The input training data of shape [n_samples, n_features]. - y (pd.Series): The target training data of length [n_samples]. + X (pd.DataFrame): The input training data of shape [n_samples, n_features * n_series]. + y (pd.DataFrame): The target training data of shape [n_samples, n_features * n_series]. Returns: self @@ -83,7 +83,7 @@ def predict(self, X): X (pd.DataFrame): Data of shape [n_samples, n_features]. Returns: - pd.Series: Predicted values. + pd.DataFrame: Predicted values. Raises: ValueError: If the lagged columns are not present in X. From 38eb4346be6d24620ebcaf5d0a58d15d1c38c511 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 31 Jul 2023 16:51:35 -0400 Subject: [PATCH 14/14] PR comments --- .../multiseries_time_series_baseline_regressor.py | 6 ++++-- .../transformers/preprocessing/time_series_featurizer.py | 8 +++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index 536aa1bbaa..f01132a608 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -4,6 +4,7 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator +from evalml.pipelines.components.transformers import TimeSeriesFeaturizer from evalml.problem_types import ProblemTypes from evalml.utils import infer_feature_types @@ -61,7 +62,7 @@ def fit(self, X, y=None): self Raises: - ValueError: If input y is None. + ValueError: If input y is None or if y is not a DataFrame with multiple columns. """ if y is None: raise ValueError( @@ -90,7 +91,8 @@ def predict(self, X): """ X = infer_feature_types(X) feature_names = [ - f"{col}_delay_{self.start_delay}" for col in self._target_column_names + TimeSeriesFeaturizer.df_colname_prefix.format(col, self.start_delay) + for col in self._target_column_names ] if not set(feature_names).issubset(set(X.columns)): raise ValueError( diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index 0cb0c31e7f..f812471090 100644 --- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -61,6 +61,8 @@ class TimeSeriesFeaturizer(Transformer): needs_fitting = True target_colname_prefix = "target_delay_{}" """target_delay_{}""" + df_colname_prefix = "{}_delay_{}" + """{}_delay_{}""" def __init__( self, @@ -124,6 +126,8 @@ def fit(self, X, y=None): """ if self.time_index is None: raise ValueError("time_index cannot be None!") + + # For the multiseries case, where we only want the start delay lag for the baseline if isinstance(y, pd.DataFrame): self.statistically_significant_lags = [self.start_delay] else: @@ -231,7 +235,9 @@ def _delay_df( if categorical_columns and col_name in categorical_columns: col = X_categorical[col_name] for t in self.statistically_significant_lags: - lagged_features[f"{col_name}_delay_{t}"] = col.shift(t) + lagged_features[self.df_colname_prefix.format(col_name, t)] = col.shift( + t, + ) return lagged_features def _compute_delays(self, X_ww, y):