Skip to content

Commit

Permalink
Code review
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman committed May 16, 2020
1 parent 9dfdf2b commit 9d9730f
Show file tree
Hide file tree
Showing 22 changed files with 1,226 additions and 1,291 deletions.
3 changes: 3 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ include LICENSE

global-include README.rst
global-exclude *.py[cod] __pycache__ *.so
exclude docs tests .readthedocs.yml
recursive-exclude tests *.py
recursive-exclude docs *
2 changes: 1 addition & 1 deletion docs/source/introduction.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ The ``PhiK`` correlation analyzer library contains several useful functions to h
* Visualizing the dependency between variables can be tricky, especially when dealing with (unordered) categorical variables.
To help interpret any variable relationship found, we provide a method for the detection of
significant excesses or deficits of records with respect to the expected values in a contingency table, so-called outliers,
using a statistically independent evaluation for expected frequency of records, accouncting for the uncertainty on the expectation.
using a statistically independent evaluation for expected frequency of records, accounting for the uncertainty on the expectation.
We evaluate the significance of each outlier frequency in a table, and normalize and visualize these accordingly.
The resulting plots we find to be very valuable to help interpret variable dependencies,
and work alike for interval, ordinal and categorical variables.
Expand Down
34 changes: 34 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Minimal tour of the phik API on the bundled fake car insurance data set.

Each statement below evaluates to a result that is not stored or printed;
run the lines interactively (e.g. in a notebook or REPL) to inspect them.
"""
import pandas as pd

# NOTE(review): the DataFrame methods used below (phik_matrix, global_phik, ...)
# are not plain pandas; presumably importing phik registers them -- confirm.
import phik
from phik import resources, report

# open fake car insurance data
df = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))
df.head()

# Pearson's correlation matrix between numeric variables (pandas functionality).
# Select the numeric/boolean columns explicitly: since pandas 2.0,
# DataFrame.corr() no longer silently drops non-numeric columns and raises
# instead, while this form behaves the same on older pandas versions.
df.select_dtypes(include=['number', 'bool']).corr()

# get the phi_k correlation matrix between all variables
df.phik_matrix()

# get global correlations based on phi_k correlation matrix
df.global_phik()

# get the significance matrix (expressed as one-sided Z)
# of the hypothesis test of each variable-pair dependency
df.significance_matrix()

# contingency table of two columns
cols = ['mileage', 'car_size']
df[cols].hist2d()

# normalized residuals of contingency test applied to cols
df[cols].outlier_significance_matrix()

# show the normalized residuals of each variable-pair
df.outlier_significance_matrices()

# generate a phik correlation report and save as test.pdf
report.correlation_report(df, pdf_file_name='test.pdf')
30 changes: 15 additions & 15 deletions python/phik/betainc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
"""
import numpy as np
from scipy.special import gammaln
from typing import Union
from typing import Tuple


def contfractbeta(a: float, b: float, x: float, ITMAX: int = 5000, EPS:float = 1.0e-7) -> float:
"""Continued fraction form of the incomplete Beta function.
Expand Down Expand Up @@ -58,7 +59,6 @@ def contfractbeta(a: float, b: float, x: float, ITMAX: int = 5000, EPS:float = 1
return az

raise ValueError('a={0:f} or b={1:f} too large, or ITMAX={2:d} too small to compute incomplete beta function.'.format(a,b,ITMAX))
return 0


def incompbeta(a: float, b: float, x: float) -> float:
Expand All @@ -79,20 +79,20 @@ def incompbeta(a: float, b: float, x: float) -> float:
:rtype: float
'''
# special cases
if (x == 0):
return 0;
elif (x == 1):
return 1;
if x == 0:
return 0
elif x == 1:
return 1
# default
lbeta = gammaln(a+b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1-x)
if (x < (a+1) / (a+b+2)):
if x < (a + 1) / (a + b + 2):
p = np.exp(lbeta) * contfractbeta(a, b, x) / a
else:
p = 1 - np.exp(lbeta) * contfractbeta(b, a, 1-x) / b
return p


def log_incompbeta(a: float, b: float, x: float) -> Union[float,float]:
def log_incompbeta(a: float, b: float, x: float) -> Tuple[float,float]:
'''Evaluation of logarithm of incomplete beta function
Logarithm of incomplete beta function is implemented to ensure sufficient precision
Expand All @@ -113,19 +113,19 @@ def log_incompbeta(a: float, b: float, x: float) -> Union[float,float]:
:rtype: tuple
'''
# special cases
if (x == 0):
return (-np.inf, 0)
elif (x == 1):
return (0, -np.inf)
if x == 0:
return -np.inf, 0
elif x == 1:
return 0, -np.inf
# default
lbeta = gammaln(a+b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1-x)

if (x < (a+1) / (a+b+2)):
if x < (a + 1) / (a + b + 2):
p = np.exp(lbeta) * contfractbeta(a, b, x) / a
logp = lbeta + np.log(contfractbeta(a, b, x)) - np.log(a)
logq = np.log(1-p)
else:
p = 1 - np.exp(lbeta) * ( contfractbeta(b, a, 1-x) / b )
p = 1 - np.exp(lbeta) * (contfractbeta(b, a, 1-x) / b)
logp = np.log(p)
logq = lbeta + np.log(contfractbeta(b, a, 1-x)) - np.log(b)
return (logp, logq)
return logp, logq
117 changes: 62 additions & 55 deletions python/phik/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@
modification, are permitted according to the terms listed in the file
LICENSE.
"""
from typing import List, Tuple, Union, Optional

import numpy as np
import pandas as pd
import warnings

from phik import definitions as defs
from phik.utils import array_like_to_dataframe, guess_interval_cols


def bin_edges(arr, nbins:int, quantile:bool = False) -> np.ndarray:
def bin_edges(arr: Union[np.ndarray, list, pd.Series], nbins:int, quantile:bool = False) -> np.ndarray:
"""
Create uniform or quantile bin-edges for the input array.
Expand All @@ -29,20 +31,18 @@ def bin_edges(arr, nbins:int, quantile:bool = False) -> np.ndarray:
:param bool quantile: uniform bins (False) or bins based on quantiles (True)
:returns: array with bin edges
"""
if not isinstance(arr, (np.ndarray, list, pd.Series)):
raise TypeError('arr is not array like.')

if quantile:
quantiles = np.linspace(0, 1, nbins + 1)
xbins = np.quantile(arr[~np.isnan(arr)], quantiles)
xbins[0] = xbins[0] - 1E-14
xbins[0] -= 1E-14
else:
xbins = np.linspace(min(arr[~np.isnan(arr)]) - 1E-14, max(arr[~np.isnan(arr)]), nbins + 1)

return xbins


def bin_array(arr, bin_edges):
def bin_array(arr: Union[np.ndarray, list], bin_edges: Union[np.ndarray, list]) -> Tuple[np.ndarray, list]:
"""
Index the data given the bin_edges.
Expand All @@ -52,10 +52,6 @@ def bin_array(arr, bin_edges):
:param bin_edges: list with bin edges.
:returns: indexed data
"""
if not isinstance(arr, (np.ndarray, list)):
raise TypeError('arr is not a list or numpy array.')
if not isinstance(bin_edges, (np.ndarray, list)):
raise TypeError('bin_edges is not a list or numpy array.')

# Bin data
binned_arr = np.searchsorted(bin_edges, arr).astype(object)
Expand All @@ -79,35 +75,33 @@ def bin_array(arr, bin_edges):
return binned_arr, bin_labels


def bin_data(data, cols:list=[], bins=10, quantile: bool=False, retbins: bool=False):
def bin_data(data: pd.DataFrame, cols: Union[list, np.ndarray, tuple]=(), bins:Union[int,list,np.ndarray,dict]=10,
quantile: bool=False, retbins: bool=False):
"""
Index the input dataframe given the bin_edges for the columns specified in cols.
Index the input DataFrame given the bin_edges for the columns specified in cols.
:param DataFrame data: input data
:param list cols: list of columns with numeric data which needs to be indexed
:param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
:param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
:returns: rebinned dataframe
:returns: rebinned DataFrame
:rtype: pandas.DataFrame
"""
if not isinstance(data, pd.DataFrame):
raise TypeError('data is not a pandas DataFrame.')
if not isinstance(cols, (list,np.ndarray)):
raise TypeError('cols is not array-like.')
if not isinstance(bins, (int,list,np.ndarray,dict)):
raise TypeError('bins is of incorrect type.')

if isinstance(bins, dict):
for col in cols:
if col not in bins:
raise AssertionError('column {0} is not included in bins dictionary.'.format(col))
raise ValueError('column {0} is not included in bins dictionary.'.format(col))

# check for numeric bins
for col in list(set(data._get_numeric_data().columns) - set(cols)):
nuq = data[col].nunique()
if (nuq > 0.9 * len(data)) or (nuq > 100):
warnings.warn(
"numeric variable {1:s} has {0:d} unique values. Are you sure you don't want to bin it?".format(nuq, str(col)), Warning)
"numeric variable {1:s} has {0:d} unique values. Are you sure you don't want to bin it?".format(nuq, str(col)),
Warning
)

binned_data = data.copy()

Expand All @@ -133,44 +127,40 @@ def bin_data(data, cols:list=[], bins=10, quantile: bool=False, retbins: bool=Fa
return binned_data


def create_correlation_overview_table(vals:dict) -> pd.DataFrame:
def create_correlation_overview_table(vals: List[Tuple[str, str, float]]) -> pd.DataFrame:
"""
Create overview table of phik/significance data.
:param dict vals: dictionary holding data for each variable pair formatted as {'var1:var2' : value}
:param list vals: list holding tuples of data for each variable pair formatted as ('var1', 'var2', value)
:returns: symmetric table with phik/significances of all variable pairs
:rtype: pandas.DataFrame
"""
if not isinstance(vals, dict):
raise TypeError('vals is not a dict.')

ll = []
for k, v in vals.items():
ll.append(k.split(':')+[v])
ll.append(list(reversed(k.split(':')))+[v])
for c0, c1, v in vals:
ll.append([c0, c1, v])
ll.append([c1, c0, v])

corr_matrix = pd.DataFrame(ll, columns=['var1', 'var2', 'vals'])\
.pivot_table(index='var1', columns='var2', values='vals')
corr_matrix = pd.DataFrame(ll, columns=['var1', 'var2', 'vals']).pivot_table(index='var1', columns='var2', values='vals')
corr_matrix.columns.name = None
corr_matrix.index.name = None
return corr_matrix


def hist2d_from_rebinned_df(data_binned:pd.DataFrame, dropna:bool=True, drop_underflow:bool=True, drop_overflow:bool=True) -> pd.DataFrame:
"""
Give binned 2d dataframe of two colums of rebinned input dataframe
Give binned 2d DataFrame of two columns of rebinned input DataFrame
:param df: input data. Dataframe must contain exactly two columns
:param df: input data. DataFrame must contain exactly two columns
:param bool dropna: remove NaN values with True
:param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\
a numeric variable)
:param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
a numeric variable)
:returns: histogram dataframe
:returns: histogram DataFrame
"""
if not isinstance(data_binned, pd.DataFrame):
raise TypeError('data_binned is not a pandas DataFrame.')

assert len(data_binned.columns) == 2, 'DataFrame should contain only two columns'
c0, c1 = data_binned.columns

if not dropna:
data_binned.fillna(defs.NaN, inplace=True)
Expand All @@ -180,19 +170,20 @@ def hist2d_from_rebinned_df(data_binned:pd.DataFrame, dropna:bool=True, drop_und
data_binned.replace(defs.OF, np.nan, inplace=True)

# create a contingency table
c0, c1 = data_binned.columns
df_datahist = data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0)
df_datahist.columns = df_datahist.columns.droplevel()

return df_datahist


def hist2d(df, interval_cols=None, bins=10, quantile:bool=False, dropna:bool=True, drop_underflow:bool=True,
drop_overflow:bool=True, retbins:bool=False) -> pd.DataFrame:
def hist2d(df: pd.DataFrame, interval_cols:Optional[Union[list, np.ndarray]]=None,
bins:Union[int,float,list,np.ndarray,dict]=10,
quantile:bool=False, dropna:bool=True, drop_underflow:bool=True,
drop_overflow:bool=True, retbins:bool=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]:
"""
Give binned 2d dataframe of two colums of input dataframe
Give binned 2d DataFrame of two columns of input DataFrame
:param df: input data. Dataframe must contain exactly two columns
:param df: input data. DataFrame must contain exactly two columns
:param interval_cols: columns with interval variables which need to be binned
:param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
Expand All @@ -202,27 +193,43 @@ def hist2d(df, interval_cols=None, bins=10, quantile:bool=False, dropna:bool=Tru
a numeric variable)
:param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
a numeric variable)
:returns: histogram dataframe
:returns: histogram DataFrame
"""
if not isinstance(df, pd.DataFrame):
raise TypeError('df is not a pandas DataFrame.')
if not isinstance(interval_cols, (type(None), list, np.ndarray)):
raise TypeError('interval_cols is not None or a list.')
if not isinstance(bins, (int,float,list,np.ndarray,dict)):
raise TypeError('bins is of incorrect type.')

assert len(df.columns) == 2, 'DataFrame should contain only two columns'
if len(df.columns) != 2:
raise ValueError('DataFrame should contain only two columns')

if isinstance( interval_cols, type(None) ):
interval_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if interval_cols:
print('interval_cols not set, guessing: {0:s}'.format(str(interval_cols)))
assert isinstance( interval_cols, list ), 'interval_cols is not a list.'
if interval_cols is None:
interval_cols = guess_interval_cols(df)

data_binned, binning_dict = bin_data(df, interval_cols, retbins=True, bins=bins, quantile=quantile)
datahist = hist2d_from_rebinned_df(data_binned, dropna=dropna, drop_underflow=drop_underflow, drop_overflow=drop_overflow)
datahist = hist2d_from_rebinned_df(
data_binned, dropna=dropna, drop_underflow=drop_underflow, drop_overflow=drop_overflow
)

if retbins:
return datahist, binning_dict

return datahist


def hist2d_from_array(x: Union[pd.Series, list, np.ndarray], y: Union[pd.Series, list, np.ndarray], **kwargs) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]:
    """
    Give binned 2d DataFrame of two input arrays

    The two inputs are combined with ``array_like_to_dataframe`` and the result
    is passed to ``hist2d``; all keyword arguments are forwarded to ``hist2d``
    unchanged.

    :param x: input data. First array-like.
    :param y: input data. Second array-like.
    :param interval_cols: (keyword, forwarded) columns with interval variables which need to be binned
    :param bins: (keyword, forwarded) number of bins, or a list of bin edges (same for all columns), or a
        dictionary where per column the bins are specified. (default=10)
        E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param bool quantile: (keyword, forwarded) when the number of bins is specified, use uniform binning (False)
        or quantile binning (True)
    :param bool dropna: (keyword, forwarded) remove NaN values with True
    :param bool drop_underflow: (keyword, forwarded) do not take into account records in underflow bin when True
        (relevant when binning a numeric variable)
    :param bool drop_overflow: (keyword, forwarded) do not take into account records in overflow bin when True
        (relevant when binning a numeric variable)
    :param bool retbins: (keyword, forwarded) when True, ``hist2d`` also returns the binning dictionary
    :returns: histogram DataFrame, or a (histogram DataFrame, binning dict) tuple when ``retbins`` is True
    """
    # NOTE(review): the column names of the intermediate frame are chosen by
    # array_like_to_dataframe, so a per-column ``bins`` dict or ``interval_cols``
    # presumably has to use those names -- confirm against phik.utils.
    df = array_like_to_dataframe(x, y)
    return hist2d(df, **kwargs)
Loading

0 comments on commit 9d9730f

Please sign in to comment.