diff --git a/setup.py b/setup.py
index 0bf74e5d..a6dc6527 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@
     keywords='data science ydata',
     url='https://github.com/ydataai/ydata-quality',
     license="https://github.com/ydataai/ydata-quality/blob/master/LICENSE",
-    python_requires=">=3.7, <3.9",
+    python_requires=">=3.9",
     packages=find_namespace_packages('src'),
     package_dir={'':'src'},
     include_package_data=True,
diff --git a/src/ydata_quality/bias_fairness/engine.py b/src/ydata_quality/bias_fairness/engine.py
index 94b099a1..a5c13cc9 100644
--- a/src/ydata_quality/bias_fairness/engine.py
+++ b/src/ydata_quality/bias_fairness/engine.py
@@ -49,8 +49,7 @@ def proxy_identification(self, th=0.5):
         Non-sensitive features can serve as proxy for protected attributes, exposing
         the data to a possible subsequent bias in the data pipeline.
         High association values indicate that alternative features can
-        be used in place of the original sensitive attributes.
-        """
+        be used in place of the original sensitive attributes."""
         # TODO: multiple thresholds per association type (num/num, num/cat, cat/cat)
 
         # Compute association measures for sensitive features
diff --git a/src/ydata_quality/utils/correlations.py b/src/ydata_quality/utils/correlations.py
index b9fc6786..9cfd8f0e 100644
--- a/src/ydata_quality/utils/correlations.py
+++ b/src/ydata_quality/utils/correlations.py
@@ -7,6 +7,7 @@
 from typing import List, Optional
 
 from matplotlib.pyplot import figure as pltfigure, show as pltshow
+import numpy as np
 from numpy import (
     nan,
     fill_diagonal,
@@ -55,8 +56,9 @@
     Returns
         corrs (Series): map of feature_pair to association metric value, filtered
     """
+    corrs = corrs if isinstance(corrs, DataFrame) else DataFrame(corrs)  # convert to DataFrame if needed
     corrs = corrs.copy()  # keep original
-    fill_diagonal(corrs.values, nan)  # remove the same column pairs
+    np.fill_diagonal(corrs.to_numpy(), nan)  # NaN the diagonal to drop same-column pairs
     corrs = corrs[subset] if subset is not None else corrs  # subset features
     corrs = corrs[(corrs > th) | (corrs < -th)].melt(ignore_index=False).reset_index().dropna()  # subset by threshold
     corrs['features'] = ['_'.join(sorted((i.index, i.variable)))
diff --git a/src/ydata_quality/utils/enum.py b/src/ydata_quality/utils/enum.py
index 4eb66456..458f23b4 100644
--- a/src/ydata_quality/utils/enum.py
+++ b/src/ydata_quality/utils/enum.py
@@ -42,26 +42,35 @@ def __lt__(self, other):
 
 
 class StringEnum(Enum):
+    """Enum that allows case-insensitive string lookup."""
     @classmethod
     def _missing_(cls, value):
         if isinstance(value, str):
             upper_value = value.upper()
-            key = StringEnum._key_from_str_(upper_value)
+            key = cls.find_member(upper_value)
             if key is not None:
                 return key
             lower_value = value.lower()
-            key = StringEnum._key_from_str_(lower_value)
+            key = cls.find_member(lower_value)
             if key is not None:
                 return key
         raise ValueError(f"{value} is not a valid {cls.__name__}")
 
     @classmethod
-    def _key_from_str_(cls, value: str):
+    def find_member(cls, value: str):
+        """Find an enum member matching the given string.
+
+        Args:
+            value: The string value to look up
+
+        Returns:
+            The matching enum member if found, None otherwise
+        """
         if value in cls.__members__:
             return cls(value)
diff --git a/tests/engines/test_bias_fairness.py b/tests/engines/test_bias_fairness.py
new file mode 100644
index 00000000..6a5c4c45
--- /dev/null
+++ b/tests/engines/test_bias_fairness.py
@@ -0,0 +1,40 @@
+"""
+Unit tests for the bias fairness engine.
+"""
+
+import pandas as pd
+
+from ydata_quality.bias_fairness.engine import BiasFairness
+
+
+def get_fake_data():
+    """Returns fake data for tests."""
+    return pd.DataFrame({
+        'age': [25, 35, 45, 55],
+        'salary': [30000, 45000, 60000, 75000],
+        'gender': ['M', 'F', 'M', 'F'],
+        'department': ['IT', 'HR', 'IT', 'HR']
+    })
+
+
+class TestBiasFairness:
+    """Test class for BiasFairness."""
+
+    def test_sensitive_features_property(self):
+        """Test sensitive features property returns the configured features."""
+        df = get_fake_data()
+        sensitive_features = ['gender', 'age']
+        bf = BiasFairness(df=df, sensitive_features=sensitive_features)
+        assert bf.sensitive_features == sensitive_features
+
+    def test_proxy_identification(self):
+        """Test proxy identification only returns associations above the threshold."""
+        df = get_fake_data()
+        sensitive_features = ['gender']
+        bf = BiasFairness(
+            df=df,
+            sensitive_features=sensitive_features
+        )
+        correlations = bf.proxy_identification(th=0.5)
+        assert isinstance(correlations, pd.Series)
+        assert (correlations.abs() > 0.5).all()
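
For reviewers, a quick usage sketch of the patched filter_associations. It assumes the function is importable from ydata_quality.utils.correlations with its remaining parameters left at their defaults; the toy matrix and the x/y/z column names are illustrative, not taken from the codebase:

import pandas as pd

from ydata_quality.utils.correlations import filter_associations

# Toy all-float association matrix. filter_associations NaN-fills the
# diagonal, so self-associations never pass the threshold.
corr = pd.DataFrame(
    [[1.0, 0.9, 0.1],
     [0.9, 1.0, 0.2],
     [0.1, 0.2, 1.0]],
    index=['x', 'y', 'z'],
    columns=['x', 'y', 'z'],
)

# With th=0.5, only the x/y pair (0.9) should survive the filter.
filtered = filter_associations(corr, th=0.5)
print(filtered)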
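
Similarly, a minimal sketch of the case-insensitive lookup behind the find_member rename. The Severity enum below is hypothetical; its values mirror its names, which is the pattern the name-based check in find_member expects, and a miss is assumed to fall through to None (the contract _missing_ relies on):

from ydata_quality.utils.enum import StringEnum


class Severity(StringEnum):
    # Hypothetical members; values equal names so that find_member's
    # cls(value) lookup resolves directly by value.
    LOW = 'LOW'
    HIGH = 'HIGH'


# _missing_ retries unmatched strings upper- and lower-cased via find_member,
# making construction effectively case-insensitive.
assert Severity('low') is Severity.LOW
assert Severity('High') is Severity.HIGH
assert Severity.find_member('HIGH') is Severity.HIGH
assert Severity.find_member('unknown') is None  # assumed: miss returns None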