Refined KDE inference - uploaded more examples
robinthibaut committed Jun 3, 2021
1 parent 01528b0 · commit 0b2354e
Showing 6 changed files with 394 additions and 50 deletions.
setup.py — 2 changes: 1 addition & 1 deletion
@@ -4,7 +4,7 @@

 setup(
     name="skbel",
-    version="1.0.3",
+    version="1.0.4",
     packages=my_pckg,
     include_package_data=True,
     url="https://github.com/robinthibaut/skbel",
skbel/algorithms/_statistics.py — 77 changes: 33 additions & 44 deletions
@@ -488,11 +488,9 @@ def mvn_inference(
     return y_posterior_mean, y_posterior_covariance


-_MESSAGE = "The integral is probably divergent, or slowly convergent."
-
-
 def _convergent(quadrature):
-    if len(quadrature) > 3 and quadrature[3] == _MESSAGE:
+    msg = "The integral is probably divergent, or slowly convergent."
+    if len(quadrature) > 3 and quadrature[3] == msg:
         return False
     else:
         return True
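A quick sketch of the tuple _convergent inspects: scipy's quad, when called with full_output=1, appends its warning text as a fourth tuple element if the result is suspect. The call site is outside this hunk, so the integrand below is only an illustrative choice that triggers the message:

import numpy as np
from scipy import integrate

# 1/(1+x) diverges logarithmically on [0, inf), so quad reports the
# "probably divergent, or slowly convergent" message as quadrature[3]
quadrature = integrate.quad(lambda x: 1.0 / (1.0 + x), 0, np.inf, full_output=1)
print(_convergent(quadrature))  # False: the fourth element carries the warning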
@@ -528,7 +526,6 @@ def normalize(pdf, lower_bd=-np.inf, upper_bd=np.inf, vectorize=False):
         raise ValueError('PDF integral likely divergent.')
     A = quadrature[0]

-    # pdf_normed = lambda x: pdf(x) / A if lower_bd <= x <= upper_bd else 0
     def pdf_normed(x):
         if lower_bd <= x <= upper_bd:
             return pdf(x) / A
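As a usage sketch, assuming normalize keeps the signature shown in the hunk header: the returned density is rescaled by the quadrature of the input pdf over the requested support, and is zero outside it. The standard normal and the bounds here are illustrative only:

from scipy.stats import norm

# Rescale a pdf so it integrates to 1 over a finite support
pdf_normed = normalize(norm.pdf, lower_bd=-8, upper_bd=8)
print(pdf_normed(0.0))   # ≈ norm.pdf(0), since the truncated tail mass is negligible
print(pdf_normed(10.0))  # 0, outside the support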
@@ -589,16 +586,6 @@ def cdf_vector(x):
     return cdf_vector


-############################################
-# CHEBYSHEV APPROXIMATION OF PDF & CDF
-############################################
-
-# This section follows the work of Olver & Townsend (2013), who suggest using
-# Chebyshev polynomials to approximate the PDF. This allows for trivial
-# construction of the CDF, as these polynomials can be integrated in closed
-# form.
-
-
 def _chebnodes(a, b, n):
     """Chebyshev nodes of rank n on interval [a, b]."""
     if not a < b:
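The removed banner summarizes the Olver & Townsend (2013) idea that the retained code still implements: approximate the pdf by a Chebyshev series, then integrate the series in closed form to obtain the CDF. A self-contained sketch of that construction (this is not the package's chebcdf; the function name and the degree n=64 are illustrative):

import numpy as np
from numpy.polynomial import chebyshev as C

def chebcdf_sketch(pdf, a, b, n=64):
    x = C.chebpts2(n + 1)                        # Chebyshev points on [-1, 1]
    t = 0.5 * (a + b) + 0.5 * (b - a) * x        # mapped onto [a, b]
    coeffs = C.chebfit(x, pdf(t), n)             # series approximation of the pdf
    antider = C.chebint(coeffs) * 0.5 * (b - a)  # closed-form integral (Jacobian of the map)
    lo = C.chebval(-1.0, antider)

    def cdf(y):
        s = (2.0 * y - (a + b)) / (b - a)        # map back to [-1, 1]
        return C.chebval(s, antider) - lo

    return cdf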
@@ -694,8 +681,6 @@ def cdf(x_):
         return cdf


-# Sample via root-finding on CDF
-
 def it_sampling(pdf,
                 num_samples,
                 lower_bd=-np.inf,
@@ -716,8 +701,6 @@ def it_sampling(pdf,
         manually establish cutoffs for the density.
     upper_bd : float
         Upper bound of the support of the pdf.
-    guess : float or int
-        Initial guess for the numerical solver to use when inverting the CDF.
     chebyshev: Boolean, optional (default=False)
         If True, then the CDF is approximated using Chebyshev polynomials.
@@ -741,36 +724,42 @@
         for >100k samples.
     """
     seeds = uniform(0, 1, num_samples)

     if chebyshev:
-
         if not (np.isfinite(lower_bd) and np.isfinite(upper_bd)):
             raise ValueError('Bounds must be finite for Chebyshev approximation of CDF.')
-
         cdf = chebcdf(pdf, lower_bd, upper_bd)
+        samples = []
+        # Get initial guess
+        cdf_y = np.cumsum(pdf.y)  # cumulative distribution function, cdf
+        cdf_y = cdf_y / cdf_y.max()  # takes care of normalizing cdf to 1.0
+        inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")  # this is a function
+        simple_samples = inverse_cdf(seeds)
+        # Compute empirical mean and standard deviation
+        mean = sum(pdf.x * pdf.y) / sum(pdf.y)
+        estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y))
+        # If the distribution is too tight (small std), use simple inverse transform sampling
+        if estd <= 0.6:
+            return simple_samples
+        # Otherwise, use the Chebyshev approach
+        else:
+            guess = np.mean(simple_samples)
+
+            for seed in seeds:
+                def shifted(x):
+                    return cdf(x) - seed
+
+                soln = root(shifted, guess)
+                samples.append(soln.x[0])
+
+            return np.array(samples)
     else:
         cdf = get_cdf(pdf, lower_bd, upper_bd)
-    cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200))
-    inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")
-    simple_samples = inverse_cdf(seeds)
-
-    # Compute empirical mean and standard deviation
-    mean = sum(pdf.x * pdf.y) / sum(pdf.y)
-    estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y))
-    # If the distribution is too tight (small std), use simple inverse transform sampling
-    if estd <= 0.6:
-        return simple_samples
-    # Otherwise, use the Chebyshev approach
-    else:
-        samples = []
-
-        guess = np.mean(simple_samples)
-
-        for seed in seeds:
-            def shifted(x):
-                return cdf(x) - seed
-
-            soln = root(shifted, guess)
-            samples.append(soln.x[0])
-
-    return np.array(samples)
+        cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200))
+        inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")
+        simple_samples = inverse_cdf(seeds)
+        return simple_samples
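A minimal calling sketch for the refined sampler. Note that pdf must now expose .x and .y attributes (the evaluation grid and the densities) in addition to being callable, since the refined code reads pdf.x and pdf.y directly; a scipy interp1d built from a KDE evaluated on a finite grid satisfies both. The toy data, grid, and sample count below are illustrative only:

import numpy as np
from scipy import interpolate
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
data = rng.normal(loc=0.0, scale=2.0, size=500)  # toy 1-D sample
grid = np.linspace(-10.0, 10.0, 200)
kde = gaussian_kde(data)
pdf = interpolate.interp1d(grid, kde(grid))      # callable, with .x and .y

samples = it_sampling(pdf, num_samples=400, lower_bd=-10.0, upper_bd=10.0)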
skbel/examples/demo_kde.py — 168 changes: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
# Copyright (c) 2021. Robin Thibaut, Ghent University

import os
from os.path import join as jp

import joblib
import pandas as pd
from loguru import logger
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer

import demo_visualization as myvis

from skbel import utils
from skbel.learning.bel import BEL


def init_bel():
"""
Set all BEL pipelines. This is the blueprint of the framework.
"""
# Pipeline before CCA
X_pre_processing = Pipeline(
[
("scaler", StandardScaler(with_mean=False)),
("pca", PCA()),
]
)
Y_pre_processing = Pipeline(
[
("scaler", StandardScaler(with_mean=False)),
("pca", PCA()),
]
)

# Canonical Correlation Analysis
cca = CCA()

# Pipeline after CCA
X_post_processing = Pipeline(
[("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))]
)
Y_post_processing = Pipeline(
[("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))]
)

# Initiate BEL object
bel_model = BEL(
X_pre_processing=X_pre_processing,
X_post_processing=X_post_processing,
Y_pre_processing=Y_pre_processing,
Y_post_processing=Y_post_processing,
cca=cca,
)

return bel_model


def bel_training(
bel_,
*,
X_train_: pd.DataFrame,
x_test_: pd.DataFrame,
y_train_: pd.DataFrame,
y_test_: pd.DataFrame = None,
directory: str = None,
):
"""
:param bel_: BEL model
:param X_train_: Predictor set for training
:param x_test_: Predictor "test"
:param y_train_: Target set for training
:param y_test_: "True" target (optional)
:param directory: Path to the directory in which to unload the results
:return:
"""
# %% Directory in which to unload the results
if directory is None:
sub_dir = os.getcwd()
else:
sub_dir = directory

# Folders
obj_dir = jp(sub_dir, "obj") # Location to save the BEL model
fig_data_dir = jp(sub_dir, "data") # Location to save the raw data figures
fig_pca_dir = jp(sub_dir, "pca") # Location to save the PCA figures
fig_cca_dir = jp(sub_dir, "cca") # Location to save the CCA figures
fig_pred_dir = jp(sub_dir, "uq") # Location to save the prediction figures

# Create the output directories
[
utils.dirmaker(f, erase=True)
for f in [
obj_dir,
fig_data_dir,
fig_pca_dir,
fig_cca_dir,
fig_pred_dir,
]
]

# %% Fit BEL model
bel_.Y_obs = y_test_
bel_.fit(X=X_train_, Y=y_train_)

# %% Sample for the observation
# Extract n random samples (target CVs).
# The posterior distribution is computed within the method below.
bel_.predict(x_test_)

# Save the fitted BEL model
joblib.dump(bel_, jp(obj_dir, "bel.pkl"))
msg = f"model trained and saved in {obj_dir}"
logger.info(msg)


if __name__ == "__main__":

# Set directories
data_dir = jp(os.getcwd(), "dataset")
output_dir = jp(os.getcwd(), "results_kde")

# Load dataset
X_train = pd.read_pickle(jp(data_dir, "X_train.pkl"))
X_test = pd.read_pickle(jp(data_dir, "X_test.pkl"))
y_train = pd.read_pickle(jp(data_dir, "y_train.pkl"))
y_test = pd.read_pickle(jp(data_dir, "y_test.pkl"))

# Initiate BEL model
model = init_bel()

# Set model parameters
model.mode = "kde" # How to compute the posterior conditional distribution
# Set PC cut
model.X_n_pc = 50
model.Y_n_pc = 30
# Save original dimensions of both predictor and target
model.X_shape = (6, 200) # Six curves with 200 time steps each
model.Y_shape = (1, 100, 87) # One matrix with 100 rows and 87 columns
# The number of CCA components is chosen as the minimum number of PCs
n_cca = min(model.X_n_pc, model.Y_n_pc)
model.cca.n_components = n_cca
# Number of samples to be extracted from the posterior distribution
model.n_posts = 400

# Train model
bel_training(
bel_=model,
X_train_=X_train,
x_test_=X_test,
y_train_=y_train,
y_test_=y_test,
directory=output_dir,
)

# Plot raw data
myvis.plot_results(model, base_dir=output_dir)

# Plot PCA
myvis.pca_vision(
model,
base_dir=output_dir,
)

# Plot CCA
myvis.cca_vision(bel=model, base_dir=output_dir)
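Once the demo has run, the fitted model persists under results_kde/obj. A hypothetical reuse sketch, using the same paths as the script above; only predict is shown in this commit, so no further sampling API is assumed:

import joblib
import pandas as pd
from os.path import join as jp

# Reload the test predictor and the fitted BEL model saved by bel_training
X_test = pd.read_pickle(jp("dataset", "X_test.pkl"))
bel = joblib.load(jp("results_kde", "obj", "bel.pkl"))
bel.predict(X_test)  # recomputes the posterior for this predictor, as in bel_training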
