From 65955663bae2fdc2b9ef71f9058d72850e2c8bb7 Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Thu, 31 Oct 2024 17:19:22 +0100 Subject: [PATCH 1/7] BUG: Fix feature transform failure when test_id_column=None --- .../transformer/feature_transformers/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/autogluon_assistant/transformer/feature_transformers/base.py b/src/autogluon_assistant/transformer/feature_transformers/base.py index b8f16f2..a9d9754 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/base.py +++ b/src/autogluon_assistant/transformer/feature_transformers/base.py @@ -51,7 +51,9 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: errors="ignore", ) train_y = task.train_data[task.label_column] - test_x = task.test_data.drop(columns=[task.test_id_column]) + if task.test_id_column in task.test_data.columns: + # Skip if test_id_column is not found + test_x = task.test_data.drop(columns=[task.test_id_column]) train_x, test_x = self._transform_dataframes(train_X=train_x, test_X=test_x) From c2ed395598668a956eeca4a9cbe53fdc876ca315 Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Thu, 31 Oct 2024 17:26:00 +0100 Subject: [PATCH 2/7] assign test_x for else condition --- .../transformer/feature_transformers/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/autogluon_assistant/transformer/feature_transformers/base.py b/src/autogluon_assistant/transformer/feature_transformers/base.py index a9d9754..9a79840 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/base.py +++ b/src/autogluon_assistant/transformer/feature_transformers/base.py @@ -54,6 +54,8 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: if task.test_id_column in task.test_data.columns: # Skip if test_id_column is not found test_x = task.test_data.drop(columns=[task.test_id_column]) + else: + test_x = task.test_data train_x, test_x = self._transform_dataframes(train_X=train_x, test_X=test_x) From 6ecef8c9578562f09851841c3aadba9a46fe27f8 Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Thu, 31 Oct 2024 17:36:20 +0100 Subject: [PATCH 3/7] Handle transformed_test_data similarly --- .../transformer/feature_transformers/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/autogluon_assistant/transformer/feature_transformers/base.py b/src/autogluon_assistant/transformer/feature_transformers/base.py index 9a79840..ba31957 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/base.py +++ b/src/autogluon_assistant/transformer/feature_transformers/base.py @@ -65,7 +65,10 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: transformed_train_data = pd.concat( [transformed_train_data, task.train_data[task.test_id_column]], axis=1 ) - transformed_test_data = pd.concat([test_x, task.test_data[task.test_id_column]], axis=1) + if task.test_id_column in task.test_data.columns: + transformed_test_data = pd.concat([test_x, task.test_data[task.test_id_column]], axis=1) + else: + transformed_test_data = test_x task = copy.deepcopy(task) task.train_data = transformed_train_data From 0f8213615f16f9592a59ae024e800e2c40292af0 Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Thu, 31 Oct 2024 18:26:08 +0100 Subject: [PATCH 4/7] BUG: Drop train_id_column for train_x --- .../transformer/feature_transformers/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/autogluon_assistant/transformer/feature_transformers/base.py b/src/autogluon_assistant/transformer/feature_transformers/base.py index ba31957..f0c594e 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/base.py +++ b/src/autogluon_assistant/transformer/feature_transformers/base.py @@ -21,7 +21,7 @@ def _fit_dataframes(self, train_X: pd.DataFrame, train_y: pd.Series, **kwargs) - def fit(self, task: TabularPredictionTask) -> "BaseFeatureTransformer": try: train_x = task.train_data.drop( - columns=task.columns_in_train_but_not_test + [task.test_id_column], + columns=task.columns_in_train_but_not_test + [task.train_id_column], errors="ignore", ) train_y = task.train_data[task.label_column] @@ -47,7 +47,7 @@ def _transform_dataframes(self, train_X: pd.DataFrame, test_X: pd.DataFrame) -> def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: try: train_x = task.train_data.drop( - columns=task.columns_in_train_but_not_test + [task.test_id_column], + columns=task.columns_in_train_but_not_test + [task.train_id_column], errors="ignore", ) train_y = task.train_data[task.label_column] @@ -61,9 +61,9 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: # add back id and label columns transformed_train_data = pd.concat([train_x, train_y.rename(task.label_column)], axis=1) - if task.test_id_column in task.train_data.columns: + if task.train_id_column in task.train_data.columns: transformed_train_data = pd.concat( - [transformed_train_data, task.train_data[task.test_id_column]], axis=1 + [transformed_train_data, task.train_data[task.train_id_column]], axis=1 ) if task.test_id_column in task.test_data.columns: transformed_test_data = pd.concat([test_x, task.test_data[task.test_id_column]], axis=1) From 548816acc23dbea609b0f1615d68b4fbd9607c2d Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Thu, 31 Oct 2024 18:55:09 +0100 Subject: [PATCH 5/7] BUG: Handle datasets with test set containing label columns --- .../transformer/feature_transformers/base.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/autogluon_assistant/transformer/feature_transformers/base.py b/src/autogluon_assistant/transformer/feature_transformers/base.py index f0c594e..4a2409e 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/base.py +++ b/src/autogluon_assistant/transformer/feature_transformers/base.py @@ -50,25 +50,43 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: columns=task.columns_in_train_but_not_test + [task.train_id_column], errors="ignore", ) + label_column_avaialable_in_test_set = False + if task.label_column in train_x.columns: + # Label Column also present in test data + # requires explicit dropping of the column + # from train set before feature transformation + train_x = train_x.drop(columns=[task.label_column]) train_y = task.train_data[task.label_column] + if task.test_id_column in task.test_data.columns: # Skip if test_id_column is not found test_x = task.test_data.drop(columns=[task.test_id_column]) else: test_x = task.test_data + if task.label_column in test_x.columns: + # Label Column also present in test data + # requires explicit dropping of the column + # from test set before feature transformation + label_column_avaialable_in_test_set = True + test_x = test_x.drop(columns=[task.label_column]) train_x, test_x = self._transform_dataframes(train_X=train_x, test_X=test_x) - # add back id and label columns + # add back label columns transformed_train_data = pd.concat([train_x, train_y.rename(task.label_column)], axis=1) + if label_column_avaialable_in_test_set: + # Add back label column to test set as it was available before + transformed_test_data = pd.concat([test_x, test_y.rename(task.label_column)], axis=1) + else: + transformed_test_data = test_x + + # add back id columns if task.train_id_column in task.train_data.columns: transformed_train_data = pd.concat( [transformed_train_data, task.train_data[task.train_id_column]], axis=1 ) if task.test_id_column in task.test_data.columns: - transformed_test_data = pd.concat([test_x, task.test_data[task.test_id_column]], axis=1) - else: - transformed_test_data = test_x + transformed_test_data = pd.concat([transformed_test_data, task.test_data[task.test_id_column]], axis=1) task = copy.deepcopy(task) task.train_data = transformed_train_data From 1bd780072853a6bc45c91b3bb857bb5d3ffd532a Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Thu, 31 Oct 2024 19:13:50 +0100 Subject: [PATCH 6/7] assign test_y if label column available --- src/autogluon_assistant/transformer/feature_transformers/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/autogluon_assistant/transformer/feature_transformers/base.py b/src/autogluon_assistant/transformer/feature_transformers/base.py index 4a2409e..c7d0d90 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/base.py +++ b/src/autogluon_assistant/transformer/feature_transformers/base.py @@ -69,6 +69,7 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: # from test set before feature transformation label_column_avaialable_in_test_set = True test_x = test_x.drop(columns=[task.label_column]) + test_y = task.test_data[task.label_column] train_x, test_x = self._transform_dataframes(train_X=train_x, test_X=test_x) From e2aa9e89a77bfa69d4e42dd2d2a16585eeb27a5f Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Fri, 1 Nov 2024 02:31:39 +0100 Subject: [PATCH 7/7] log error in except block; fix typo --- .../transformer/feature_transformers/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/autogluon_assistant/transformer/feature_transformers/base.py b/src/autogluon_assistant/transformer/feature_transformers/base.py index c7d0d90..df7b728 100644 --- a/src/autogluon_assistant/transformer/feature_transformers/base.py +++ b/src/autogluon_assistant/transformer/feature_transformers/base.py @@ -50,7 +50,7 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: columns=task.columns_in_train_but_not_test + [task.train_id_column], errors="ignore", ) - label_column_avaialable_in_test_set = False + label_column_available_in_test_set = False if task.label_column in train_x.columns: # Label Column also present in test data # requires explicit dropping of the column @@ -67,7 +67,7 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: # Label Column also present in test data # requires explicit dropping of the column # from test set before feature transformation - label_column_avaialable_in_test_set = True + label_column_available_in_test_set = True test_x = test_x.drop(columns=[task.label_column]) test_y = task.test_data[task.label_column] @@ -75,7 +75,7 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: # add back label columns transformed_train_data = pd.concat([train_x, train_y.rename(task.label_column)], axis=1) - if label_column_avaialable_in_test_set: + if label_column_available_in_test_set: # Add back label column to test set as it was available before transformed_test_data = pd.concat([test_x, test_y.rename(task.label_column)], axis=1) else: @@ -92,7 +92,7 @@ def transform(self, task: TabularPredictionTask) -> TabularPredictionTask: task = copy.deepcopy(task) task.train_data = transformed_train_data task.test_data = transformed_test_data - except: - logger.warning(f"FeatureTransformer {self.__class__.__name__} failed to transform.") + except Exception as e: + logger.warning(f"FeatureTransformer {self.__class__.__name__} failed to transform. Error: {str(e)}") finally: return task