Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
keywords='data science ydata',
url='https://github.com/ydataai/ydata-quality',
license="https://github.com/ydataai/ydata-quality/blob/master/LICENSE",
python_requires=">=3.7, <3.9",
python_requires=">3.9",
packages=find_namespace_packages('src'),
package_dir={'':'src'},
include_package_data=True,
Expand Down
3 changes: 1 addition & 2 deletions src/ydata_quality/bias_fairness/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ def proxy_identification(self, th=0.5):

Non-sensitive features can serve as proxy for protected attributes, exposing the data to a possible
subsequent bias in the data pipeline. High association values indicate that alternative features can
be used in place of the original sensitive attributes.
"""
be used in place of the original sensitive attributes."""
# TODO: multiple thresholds per association type (num/num, num/cat, cat/cat)

# Compute association measures for sensitive features
Expand Down
4 changes: 3 additions & 1 deletion src/ydata_quality/utils/correlations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import List, Optional

from matplotlib.pyplot import figure as pltfigure, show as pltshow
import numpy as np
from numpy import (
nan,
fill_diagonal,
Expand Down Expand Up @@ -55,8 +56,9 @@ def filter_associations(corrs: DataFrame, th: float,
Returns
corrs (Series): map of feature_pair to association metric value, filtered
"""
corrs = corrs if isinstance(corrs, DataFrame) else DataFrame(corrs) # convert to DataFrame if needed
corrs = corrs.copy() # keep original
fill_diagonal(corrs.values, nan) # remove the same column pairs
np.fill_diagonal(corrs.to_numpy(), nan) # remove the same column pairs using numpy array
corrs = corrs[subset] if subset is not None else corrs # subset features
corrs = corrs[(corrs > th) | (corrs < -th)].melt(ignore_index=False).reset_index().dropna() # subset by threshold
corrs['features'] = ['_'.join(sorted((i.index, i.variable)))
Expand Down
15 changes: 12 additions & 3 deletions src/ydata_quality/utils/enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,35 @@ def __lt__(self, other):


class StringEnum(Enum):
"""Enum that allows case-insensitive string lookup."""

@classmethod
def _missing_(cls, value):
if isinstance(value, str):
upper_value = value.upper()

key = StringEnum._key_from_str_(upper_value)
key = cls.find_member(upper_value)
if key is not None:
return key

lower_value = value.lower()

key = StringEnum._key_from_str_(lower_value)
key = cls.find_member(lower_value)
if key is not None:
return key

raise ValueError(f"{value} is not a valid {cls.__name__}")

@classmethod
def _key_from_str_(cls, value: str):
def find_member(cls, value: str):
"""Find an enum member by its string value.

Args:
value: The string value to look up

Returns:
The enum member if found, None otherwise
"""
if value in cls.__members__:
return cls(value)

Expand Down
39 changes: 39 additions & 0 deletions tests/engines/test_bias_fairness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
Unit tests for the bias fairness engine
"""

import pandas as pd

from src.ydata_quality.bias_fairness.engine import BiasFairness


def get_fake_data() -> pd.DataFrame:
    """Build the small, fixed DataFrame shared by the tests in this module.

    Returns:
        A four-row DataFrame with two numeric columns (``age``, ``salary``)
        and two string columns (``gender``, ``department``). A new object is
        created on every call, so tests cannot affect each other through it.
    """
    return pd.DataFrame({
        'age': [25, 35, 45, 55],
        'salary': [30000, 45000, 60000, 75000],
        'gender': ['M', 'F', 'M', 'F'],
        'department': ['IT', 'HR', 'IT', 'HR'],
    })


class TestBiasFairness:
    """Tests for the BiasFairness engine."""

    def test_sensitive_features_property(self):
        """The ``sensitive_features`` property equals the list given to the constructor."""
        df = get_fake_data()
        sensitive_features = ['gender', 'age']
        bf = BiasFairness(df=df, sensitive_features=sensitive_features)
        assert bf.sensitive_features == sensitive_features

    def test_proxy_identification(self):
        """``proxy_identification`` runs on the fake data and returns a sized result."""
        df = get_fake_data()
        sensitive_features = ['gender']
        bf = BiasFairness(
            df=df,
            sensitive_features=sensitive_features,
        )
        correlations = bf.proxy_identification(th=0.5)
        # NOTE(review): ``len(...) >= 0`` holds for every sized object, so this
        # only verifies that the call succeeds and that the result supports
        # ``len()``. Tighten it once the return contract of
        # ``proxy_identification`` is confirmed (presumably association values
        # beyond the threshold -- TODO confirm against the engine).
        assert len(correlations) >= 0