diff --git a/ibis_ml/steps/_standardize.py b/ibis_ml/steps/_standardize.py
index 2ccefa1..7df080f 100644
--- a/ibis_ml/steps/_standardize.py
+++ b/ibis_ml/steps/_standardize.py
@@ -11,6 +11,8 @@ from collections.abc import Iterable
 
 _DOCS_PAGE_NAME = "standardization"
 
+# scales whose magnitude is below this epsilon (1e-6) mark a near-constant column
+_APPROX_EPS = 1e-6
 
 
 class ScaleMinMax(Step):
@@ -68,7 +70,11 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
             [
-                ((table[c] - min) / (max - min)).name(c)  # type: ignore
+                # for near-constant column, set the scale to 1.0
+                (
+                    (table[c] - min)
+                    / (1.0 if abs(max - min) < _APPROX_EPS else max - min)
+                ).name(c)
                 for c, (max, min) in self.stats_.items()
             ]
         )
@@ -128,7 +134,10 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
             [
-                ((table[c] - center) / scale).name(c)  # type: ignore
+                # for near-constant column, set the scale to 1.0
+                (
+                    (table[c] - center) / (1.0 if abs(scale) < _APPROX_EPS else scale)
+                ).name(c)
                 for c, (center, scale) in self.stats_.items()
             ]
         )
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
new file mode 100644
index 0000000..ad35217
--- /dev/null
+++ b/tests/test_standardize.py
@@ -0,0 +1,42 @@
+import ibis
+import numpy as np
+import pandas as pd
+import pandas.testing as tm
+import pytest
+
+import ibis_ml as ml
+
+
+def test_scalestandard():
+    cols = np.arange(0, 100)
+    mean = np.mean(cols)
+    std = np.std(cols)
+    table = ibis.memtable({"col": cols})
+    step = ml.ScaleStandard("col")
+    step.fit_table(table, ml.core.Metadata())
+    result = step.transform_table(table)
+    expected = pd.DataFrame({"col": (cols - mean) / std})
+    tm.assert_frame_equal(result.execute(), expected, check_exact=False)
+
+
+def test_scaleminmax():
+    cols = np.arange(0, 100)
+    min_val = np.min(cols)
+    max_val = np.max(cols)
+    table = ibis.memtable({"col": cols})
+    step = ml.ScaleMinMax("col")
+    step.fit_table(table, ml.core.Metadata())
+    result = step.transform_table(table)
+    expected = pd.DataFrame({"col": (cols - min_val) / (max_val - min_val)})
+    tm.assert_frame_equal(result.execute(), expected, check_exact=False)
+
+
+@pytest.mark.parametrize("scaler", ["ScaleStandard", "ScaleMinMax"])
+def test_constant_columns(scaler):
+    table = ibis.memtable({"int_col": [100], "float_col": [100.0]})
+    scaler_class = getattr(ml, scaler)
+    scale_step = scaler_class(ml.numeric())
+    scale_step.fit_table(table, ml.core.Metadata())
+    result = scale_step.transform_table(table)
+    expected = pd.DataFrame({"int_col": [0.0], "float_col": [0.0]})
+    tm.assert_frame_equal(result.execute(), expected)