Code review

KaveIO · May 16, 2020 · 9d9730f · 9d9730f
1 parent 9dfdf2b
commit 9d9730f
Show file tree

Hide file tree

Showing 22 changed files with 1,226 additions and 1,291 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -3,3 +3,6 @@ include LICENSE
 
 global-include README.rst
 global-exclude *.py[cod] __pycache__ *.so
+exclude docs tests .readthedocs.yml
+recursive-exclude tests *.py
+recursive-exclude docs *
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
@@ -31,7 +31,7 @@ The ``PhiK`` correlation analyzer library contains several useful functions to h
 * Visualizing the dependency between variables can be tricky, especially when dealing with (unordered) categorical variables. 
   To help interpret any variable relationship found, we provide a method for the detection of
   significant excesses or deficits of records with respect to the expected values in a contingency table, so-called outliers,
-  using a statistically independent evaluation for expected frequency of records, accouncting for the uncertainty on the expectation.
+  using a statistically independent evaluation for expected frequency of records, accounting for the uncertainty on the expectation.
   We evaluate the significance of each outlier frequency in a table, and normalize and visualize these accordingly.
   The resulting plots we find to be very valuable to help interpret variable dependencies,
   and work alike for interval, ordinal and categorical variables.

diff --git a/example.py b/example.py
@@ -0,0 +1,34 @@
+import pandas as pd
+
+import phik
+from phik import resources, report
+
+# open fake car insurance data
+df = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') )
+df.head()
+
+# Pearson's correlation matrix between numeric variables (pandas functionality)
+df.corr()
+
+# get the phi_k correlation matrix between all variables
+df.phik_matrix()
+
+# get global correlations based on phi_k correlation matrix
+df.global_phik()
+
+# get the significance matrix (expressed as one-sided Z)
+# of the hypothesis test of each variable-pair dependency
+df.significance_matrix()
+
+# contingency table of two columns
+cols = ['mileage', 'car_size']
+df[cols].hist2d()
+
+# normalized residuals of contingency test applied to cols
+df[cols].outlier_significance_matrix()
+
+# show the normalized residuals of each variable-pair
+df.outlier_significance_matrices()
+
+# generate a phik correlation report and save as test.pdf
+report.correlation_report(df, pdf_file_name='test.pdf')
diff --git a/python/phik/betainc.py b/python/phik/betainc.py
@@ -14,7 +14,8 @@
 """
 import numpy as np
 from scipy.special import gammaln
-from typing import Union
+from typing import Tuple
+
 
 def contfractbeta(a: float, b: float, x: float, ITMAX: int = 5000, EPS:float = 1.0e-7) -> float:
     """Continued fraction form of the incomplete Beta function.
@@ -58,7 +59,6 @@ def contfractbeta(a: float, b: float, x: float, ITMAX: int = 5000, EPS:float = 1
             return az
 
     raise ValueError('a={0:f} or b={1:f} too large, or ITMAX={2:d} too small to compute incomplete beta function.'.format(a,b,ITMAX))
-    return 0
 
 
 def incompbeta(a: float, b: float, x: float) -> float:
@@ -79,20 +79,20 @@ def incompbeta(a: float, b: float, x: float) -> float:
     :rtype: float
     ''' 
     # special cases
-    if (x == 0):
-        return 0;
-    elif (x == 1):
-        return 1;
+    if x == 0:
+        return 0
+    elif x == 1:
+        return 1
     # default
     lbeta = gammaln(a+b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1-x)
-    if (x < (a+1) / (a+b+2)):
+    if x < (a + 1) / (a + b + 2):
         p = np.exp(lbeta) * contfractbeta(a, b, x) / a
     else:
         p = 1 - np.exp(lbeta) * contfractbeta(b, a, 1-x) / b
     return p
 
 
-def log_incompbeta(a: float, b: float, x: float) -> Union[float,float]: 
+def log_incompbeta(a: float, b: float, x: float) -> Tuple[float,float]:
     '''Evaluation of logarithm of incomplete beta function
 
     Logarithm of incomplete beta function is implemented to ensure sufficient precision
@@ -113,19 +113,19 @@ def log_incompbeta(a: float, b: float, x: float) -> Union[float,float]:
     :rtype: tuple
     '''
     # special cases
-    if (x == 0):
-        return (-np.inf, 0)
-    elif (x == 1):
-        return (0, -np.inf)
+    if x == 0:
+        return -np.inf, 0
+    elif x == 1:
+        return 0, -np.inf
     # default
     lbeta = gammaln(a+b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1-x)
 
-    if (x < (a+1) / (a+b+2)):
+    if x < (a + 1) / (a + b + 2):
         p = np.exp(lbeta) * contfractbeta(a, b, x) / a
         logp = lbeta + np.log(contfractbeta(a, b, x)) - np.log(a)
         logq = np.log(1-p)
     else:
-        p = 1 - np.exp(lbeta) * ( contfractbeta(b, a, 1-x) / b )
+        p = 1 - np.exp(lbeta) * (contfractbeta(b, a, 1-x) / b)
         logp = np.log(p)
         logq = lbeta + np.log(contfractbeta(b, a, 1-x)) - np.log(b)
-    return (logp, logq)
+    return logp, logq
diff --git a/python/phik/binning.py b/python/phik/binning.py
@@ -12,15 +12,17 @@
 modification, are permitted according to the terms listed in the file
 LICENSE.
 """
+from typing import List, Tuple, Union, Optional
 
 import numpy as np
 import pandas as pd
 import warnings
 
 from phik import definitions as defs
+from phik.utils import array_like_to_dataframe, guess_interval_cols
 
 
-def bin_edges(arr, nbins:int, quantile:bool = False) -> np.ndarray:
+def bin_edges(arr: Union[np.ndarray, list, pd.Series], nbins:int, quantile:bool = False) -> np.ndarray:
     """
     Create uniform or quantile bin-edges for the input array.
 
@@ -29,20 +31,18 @@ def bin_edges(arr, nbins:int, quantile:bool = False) -> np.ndarray:
     :param bool quantile: uniform bins (False) or bins based on quantiles (True)
     :returns: array with bin edges
     """
-    if not isinstance(arr, (np.ndarray, list, pd.Series)):
-        raise TypeError('arr is not array like.')
 
     if quantile:
         quantiles = np.linspace(0, 1, nbins + 1)
         xbins = np.quantile(arr[~np.isnan(arr)], quantiles)
-        xbins[0] = xbins[0] - 1E-14
+        xbins[0] -= 1E-14
     else:
         xbins = np.linspace(min(arr[~np.isnan(arr)]) - 1E-14, max(arr[~np.isnan(arr)]), nbins + 1)
 
     return xbins
 
 
-def bin_array(arr, bin_edges):
+def bin_array(arr: Union[np.ndarray, list], bin_edges: Union[np.ndarray, list]) -> Tuple[np.ndarray, list]:
     """
     Index the data given the bin_edges. 
 
@@ -52,10 +52,6 @@ def bin_array(arr, bin_edges):
     :param bin_edges: list with bin edges.
     :returns: indexed data
     """
-    if not isinstance(arr, (np.ndarray, list)):
-        raise TypeError('arr is not a list or numpy array.')
-    if not isinstance(bin_edges, (np.ndarray, list)):
-        raise TypeError('bin_edges is not a list or numpy array.')
 
     # Bin data
     binned_arr = np.searchsorted(bin_edges, arr).astype(object)
@@ -79,35 +75,33 @@ def bin_array(arr, bin_edges):
     return binned_arr, bin_labels
 
 
-def bin_data(data, cols:list=[], bins=10, quantile: bool=False, retbins: bool=False):
+def bin_data(data: pd.DataFrame, cols: Union[list, np.ndarray, tuple]=(), bins:Union[int,list,np.ndarray,dict]=10,
+             quantile: bool=False, retbins: bool=False):
     """
-    Index the input dataframe given the bin_edges for the columns specified in cols.
+    Index the input DataFrame given the bin_edges for the columns specified in cols.
 
     :param DataFrame data: input data
     :param list cols: list of columns with numeric data which needs to be indexed
     :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
     E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
     :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
-    :returns: rebinned dataframe
+    :returns: rebinned DataFrame
     :rtype: pandas.DataFrame
     """
-    if not isinstance(data, pd.DataFrame):
-        raise TypeError('data is not a pandas DataFrame.')    
-    if not isinstance(cols, (list,np.ndarray)):
-        raise TypeError('cols is not array-like.')    
-    if not isinstance(bins, (int,list,np.ndarray,dict)):
-        raise TypeError('bins is of incorrect type.')
+
     if isinstance(bins, dict):
         for col in cols:
             if col not in bins:
-                raise AssertionError('column {0} is not included in bins dictionary.'.format(col))
+                raise ValueError('column {0} is not included in bins dictionary.'.format(col))
 
     # check for numeric bins
     for col in list(set(data._get_numeric_data().columns) - set(cols)):
         nuq = data[col].nunique()
         if (nuq > 0.9 * len(data)) or (nuq > 100):
             warnings.warn(
-                "numeric variable {1:s} has {0:d} unique values. Are you sure you don't want to bin it?".format(nuq, str(col)), Warning)
+                "numeric variable {1:s} has {0:d} unique values. Are you sure you don't want to bin it?".format(nuq, str(col)),
+                Warning
+            )
 
     binned_data = data.copy()
 
@@ -133,44 +127,40 @@ def bin_data(data, cols:list=[], bins=10, quantile: bool=False, retbins: bool=Fa
     return binned_data
 
 
-def create_correlation_overview_table(vals:dict) -> pd.DataFrame:
+def create_correlation_overview_table(vals: List[Tuple[str, str, float]]) -> pd.DataFrame:
     """
     Create overview table of phik/significance data.
 
-    :param dict vals: dictionary holding data for each variable pair formatted as {'var1:var2' : value}
+    :param list vals: list holding tuples of data for each variable pair formatted as ('var1', 'var2', value)
     :returns: symmetric table with phik/significances of all variable pairs
     :rtype: pandas.DataFrame
     """
-    if not isinstance(vals, dict):
-        raise TypeError('vals is not a dict.')    
 
     ll = []
-    for k, v in vals.items():
-        ll.append(k.split(':')+[v])
-        ll.append(list(reversed(k.split(':')))+[v])
+    for c0, c1, v in vals:
+        ll.append([c0, c1, v])
+        ll.append([c1, c0, v])
 
-    corr_matrix = pd.DataFrame(ll, columns=['var1', 'var2', 'vals'])\
-        .pivot_table(index='var1', columns='var2', values='vals')
-        
+    corr_matrix = pd.DataFrame(ll, columns=['var1', 'var2', 'vals']).pivot_table(index='var1', columns='var2', values='vals')
+    corr_matrix.columns.name = None
+    corr_matrix.index.name = None
     return corr_matrix
 
 
 def hist2d_from_rebinned_df(data_binned:pd.DataFrame, dropna:bool=True, drop_underflow:bool=True, drop_overflow:bool=True) -> pd.DataFrame:
     """
-    Give binned 2d dataframe of two colums of rebinned input dataframe
+    Give binned 2d DataFrame of two columns of rebinned input DataFrame
 
-    :param df: input data. Dataframe must contain exactly two columns
+    :param data_binned: input data. DataFrame must contain exactly two columns
     :param bool dropna: remove NaN values with True
     :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\
     a numeric variable)
     :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
     a numeric variable)
-    :returns: histogram dataframe
+    :returns: histogram DataFrame
     """
-    if not isinstance(data_binned, pd.DataFrame):
-        raise TypeError('data_binned is not a pandas DataFrame.')    
 
-    assert len(data_binned.columns) == 2, 'DataFrame should contain only two columns'
+    c0, c1 = data_binned.columns
 
     if not dropna:
         data_binned.fillna(defs.NaN, inplace=True)
@@ -180,19 +170,20 @@ def hist2d_from_rebinned_df(data_binned:pd.DataFrame, dropna:bool=True, drop_und
         data_binned.replace(defs.OF, np.nan, inplace=True)
 
     # create a contingency table
-    c0, c1 = data_binned.columns
     df_datahist = data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0)
     df_datahist.columns = df_datahist.columns.droplevel()
 
     return df_datahist
 
 
-def hist2d(df, interval_cols=None, bins=10, quantile:bool=False, dropna:bool=True, drop_underflow:bool=True,
-           drop_overflow:bool=True, retbins:bool=False) -> pd.DataFrame:
+def hist2d(df: pd.DataFrame, interval_cols:Optional[Union[list, np.ndarray]]=None,
+           bins:Union[int,float,list,np.ndarray,dict]=10,
+           quantile:bool=False, dropna:bool=True, drop_underflow:bool=True,
+           drop_overflow:bool=True, retbins:bool=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]:
     """
-    Give binned 2d dataframe of two colums of input dataframe
+    Give binned 2d DataFrame of two columns of input DataFrame
 
-    :param df: input data. Dataframe must contain exactly two columns
+    :param df: input data. DataFrame must contain exactly two columns
     :param interval_cols: columns with interval variables which need to be binned
     :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
     E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
@@ -202,27 +193,43 @@ def hist2d(df, interval_cols=None, bins=10, quantile:bool=False, dropna:bool=Tru
     a numeric variable)
     :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
     a numeric variable)
-    :returns: histogram dataframe
+    :returns: histogram DataFrame
     """
-    if not isinstance(df, pd.DataFrame):
-        raise TypeError('df is not a pandas DataFrame.')    
-    if not isinstance(interval_cols, (type(None), list, np.ndarray)):
-        raise TypeError('interval_cols is not None or a list.')    
-    if not isinstance(bins, (int,float,list,np.ndarray,dict)):
-        raise TypeError('bins is of incorrect type.')    
 
-    assert len(df.columns) == 2, 'DataFrame should contain only two columns'
+    if len(df.columns) != 2:
+        raise ValueError('DataFrame should contain only two columns')
 
-    if isinstance( interval_cols, type(None) ):
-        interval_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-        if interval_cols:
-            print('interval_cols not set, guessing: {0:s}'.format(str(interval_cols)))
-    assert isinstance( interval_cols, list ), 'interval_cols is not a list.'
+    if interval_cols is None:
+        interval_cols = guess_interval_cols(df)
 
     data_binned, binning_dict = bin_data(df, interval_cols, retbins=True, bins=bins, quantile=quantile)
-    datahist = hist2d_from_rebinned_df(data_binned, dropna=dropna, drop_underflow=drop_underflow, drop_overflow=drop_overflow)
+    datahist = hist2d_from_rebinned_df(
+        data_binned, dropna=dropna, drop_underflow=drop_underflow, drop_overflow=drop_overflow
+    )
 
     if retbins:
         return datahist, binning_dict
 
     return datahist
+
+
+def hist2d_from_array(x: Union[pd.Series, list, np.ndarray], y: Union[pd.Series, list, np.ndarray], **kwargs) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]:
+    """
+    Give binned 2d DataFrame of two input arrays
+
+    :param x: input data. First array-like.
+    :param y: input data. Second array-like.
+    :param interval_cols: columns with interval variables which need to be binned
+    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
+    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
+    :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True)
+    :param bool dropna: remove NaN values with True
+    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\
+    a numeric variable)
+    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
+    a numeric variable)
+    :returns: histogram DataFrame
+    """
+
+    df = array_like_to_dataframe(x, y)
+    return hist2d(df, **kwargs)