diff --git a/setup.py b/setup.py
index a71e612..45d9ce1 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ setup(
     name="skbel",
-    version="1.0.3",
+    version="1.0.4",
     packages=my_pckg,
     include_package_data=True,
     url="https://github.com/robinthibaut/skbel",
diff --git a/skbel/algorithms/_statistics.py b/skbel/algorithms/_statistics.py
index a0f15ad..49d81c4 100644
--- a/skbel/algorithms/_statistics.py
+++ b/skbel/algorithms/_statistics.py
@@ -488,11 +488,9 @@ def mvn_inference(
     return y_posterior_mean, y_posterior_covariance


-_MESSAGE = "The integral is probably divergent, or slowly convergent."
-
-
 def _convergent(quadrature):
-    if len(quadrature) > 3 and quadrature[3] == _MESSAGE:
+    msg = "The integral is probably divergent, or slowly convergent."
+    if len(quadrature) > 3 and quadrature[3] == msg:
         return False
     else:
         return True
@@ -528,7 +526,6 @@ def normalize(pdf, lower_bd=-np.inf, upper_bd=np.inf, vectorize=False):
        raise ValueError('PDF integral likely divergent.')
     A = quadrature[0]

-    # pdf_normed = lambda x: pdf(x) / A if lower_bd <= x <= upper_bd else 0
     def pdf_normed(x):
         if lower_bd <= x <= upper_bd:
             return pdf(x) / A
@@ -589,16 +586,6 @@ def cdf_vector(x):
     return cdf_vector


-############################################
-# CHEBYSHEV APPROXIMATION OF PDF & CDF
-############################################
-
-# This section follows the work of Olver & Townsend (2013), who suggest using
-# Chebyshev polynomials to approximate the PDF. This allows for trivial
-# construction of the CDF, as these polynomials can be integrated in closed
-# form.
-
-
 def _chebnodes(a, b, n):
     """Chebyshev nodes of rank n on interval [a,b]."""
     if not a < b:
@@ -694,8 +681,6 @@ def cdf(x_):
     return cdf


-# Sample via root-finding on CDF
-
 def it_sampling(pdf,
                 num_samples,
                 lower_bd=-np.inf,
@@ -716,8 +701,6 @@ def it_sampling(pdf,
         manually establish cutoffs for the density.
     upper_bd : float
         Upper bound of the support of the pdf.
-    guess : float or int
-        Initial guess for the numerical solver to use when inverting the CDF.
     chebyshev: Boolean, optional (default=False)
         If True, then the CDF is approximated using Chebyshev polynomials.
@@ -741,36 +724,42 @@ def it_sampling(pdf,
         for >100k samples.
     """
+    seeds = uniform(0, 1, num_samples)
+
     if chebyshev:
+        if not (np.isfinite(lower_bd) and np.isfinite(upper_bd)):
             raise ValueError('Bounds must be finite for Chebyshev approximation of CDF.')
+        cdf = chebcdf(pdf, lower_bd, upper_bd)
-    else:
-        cdf = get_cdf(pdf, lower_bd, upper_bd)
+        cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200))
+        inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")
+        simple_samples = inverse_cdf(seeds)
+
+        # Compute the empirical mean and standard deviation
+        mean = sum(pdf.x * pdf.y) / sum(pdf.y)
+        estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y))
+        # If the distribution is tight (small std), fall back to simple inverse transform sampling
+        if estd <= 0.6:
+            return simple_samples
+        # Otherwise, invert the Chebyshev CDF by root-finding
+        else:
+            samples = []
-    seeds = uniform(0, 1, num_samples)
-    samples = []
-    # Get initial guess
-    cdf_y = np.cumsum(pdf.y)  # cumulative distribution function, cdf
-    cdf_y = cdf_y / cdf_y.max()  # takes care of normalizing cdf to 1.0
-    inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")  # this is a function
-    simple_samples = inverse_cdf(seeds)
-
-    # Compute empirical mean and standard deviation
-    mean = sum(pdf.x * pdf.y) / sum(pdf.y)
-    estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y))
-    # If distribution is too tight (small std) then use of simple inverse transform sampling
-    if estd <= 0.6:
-        return simple_samples
-    # Otherwise, use Chebyshev approach
-    else:
-        guess = np.mean(simple_samples)
+            guess = np.mean(simple_samples)
+
+            for seed in seeds:
+                def shifted(x):
+                    return cdf(x) - seed
-        for seed in seeds:
-            def shifted(x):
-                return cdf(x) - seed
+                soln = root(shifted, guess)
+                samples.append(soln.x[0])
-            soln = root(shifted, guess)
-            samples.append(soln.x[0])
+
+            return np.array(samples)
+    else:
+        cdf = get_cdf(pdf, lower_bd, upper_bd)
+        cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200))
+        inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")
+        simple_samples = inverse_cdf(seeds)
+        return simple_samples
-    return np.array(samples)
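Both branches of the restructured `it_sampling` draw uniform seeds and map them through an inverse CDF. A minimal, self-contained sketch of the simple inverse-transform path, assuming only a density tabulated on a grid (the data and variable names below are illustrative, not skbel's API):

```python
import numpy as np
from scipy import interpolate

rng = np.random.default_rng(0)

# A density tabulated on a grid, standing in for the pdf.x / pdf.y pair
# that it_sampling receives (hypothetical data, not skbel's API).
x = np.linspace(-5.0, 5.0, 500)
y = np.exp(-0.5 * x ** 2)  # unnormalized standard Gaussian

# Build the empirical CDF and normalize it to 1.0.
cdf_y = np.cumsum(y)
cdf_y = cdf_y / cdf_y.max()

# Invert the CDF by interpolation: uniform seeds become samples.
inverse_cdf = interpolate.interp1d(cdf_y, x, kind="linear", fill_value="extrapolate")
seeds = rng.uniform(0.0, 1.0, 10_000)
samples = inverse_cdf(seeds)

print(samples.mean(), samples.std())  # roughly 0 and 1
```

The Chebyshev branch replaces this interpolated inverse with root-finding on `chebcdf`, which, per the docstring above, only pays off for large sample counts.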
""" + seeds = uniform(0, 1, num_samples) + if chebyshev: + if not (np.isfinite(lower_bd) and np.isfinite(upper_bd)): raise ValueError('Bounds must be finite for Chebyshev approximation of CDF.') + cdf = chebcdf(pdf, lower_bd, upper_bd) - else: - cdf = get_cdf(pdf, lower_bd, upper_bd) + cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200)) + inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate") + simple_samples = inverse_cdf(seeds) + + # Compute empirical mean and standard deviation + mean = sum(pdf.x * pdf.y) / sum(pdf.y) + estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y)) + # If distribution is too tight (small std) then use of simple inverse transform sampling + if estd <= 0.6: + return simple_samples + # Otherwise, use Chebyshev approach + else: + samples = [] - seeds = uniform(0, 1, num_samples) - samples = [] - # Get initial guess - cdf_y = np.cumsum(pdf.y) # cumulative distribution function, cdf - cdf_y = cdf_y / cdf_y.max() # takes care of normalizing cdf to 1.0 - inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate") # this is a function - simple_samples = inverse_cdf(seeds) - - # Compute empirical mean and standard deviation - mean = sum(pdf.x * pdf.y) / sum(pdf.y) - estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y)) - # If distribution is too tight (small std) then use of simple inverse transform sampling - if estd <= 0.6: - return simple_samples - # Otherwise, use Chebyshev approach - else: - guess = np.mean(simple_samples) + guess = np.mean(simple_samples) + + for seed in seeds: + def shifted(x): + return cdf(x) - seed - for seed in seeds: - def shifted(x): - return cdf(x) - seed + soln = root(shifted, guess) + samples.append(soln.x[0]) - soln = root(shifted, guess) - samples.append(soln.x[0]) + return np.array(samples) + else: + cdf = get_cdf(pdf, lower_bd, upper_bd) + cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200)) + inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate") + simple_samples = inverse_cdf(seeds) + return simple_samples - return np.array(samples) diff --git a/skbel/examples/demo_kde.py b/skbel/examples/demo_kde.py new file mode 100644 index 0000000..ae549f1 --- /dev/null +++ b/skbel/examples/demo_kde.py @@ -0,0 +1,168 @@ +# Copyright (c) 2021. Robin Thibaut, Ghent University + +import os +from os.path import join as jp + +import joblib +import pandas as pd +from loguru import logger +from sklearn.cross_decomposition import CCA +from sklearn.decomposition import PCA +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, PowerTransformer + +import demo_visualization as myvis + +from skbel import utils +from skbel.learning.bel import BEL + + +def init_bel(): + """ + Set all BEL pipelines. This is the blueprint of the framework. 
+ """ + # Pipeline before CCA + X_pre_processing = Pipeline( + [ + ("scaler", StandardScaler(with_mean=False)), + ("pca", PCA()), + ] + ) + Y_pre_processing = Pipeline( + [ + ("scaler", StandardScaler(with_mean=False)), + ("pca", PCA()), + ] + ) + + # Canonical Correlation Analysis + cca = CCA() + + # Pipeline after CCA + X_post_processing = Pipeline( + [("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))] + ) + Y_post_processing = Pipeline( + [("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))] + ) + + # Initiate BEL object + bel_model = BEL( + X_pre_processing=X_pre_processing, + X_post_processing=X_post_processing, + Y_pre_processing=Y_pre_processing, + Y_post_processing=Y_post_processing, + cca=cca, + ) + + return bel_model + + +def bel_training( + bel_, + *, + X_train_: pd.DataFrame, + x_test_: pd.DataFrame, + y_train_: pd.DataFrame, + y_test_: pd.DataFrame = None, + directory: str = None, +): + """ + :param bel_: BEL model + :param X_train_: Predictor set for training + :param x_test_: Predictor "test" + :param y_train_: Target set for training + :param y_test_: "True" target (optional) + :param directory: Path to the directory in which to unload the results + :return: + """ + #%% Directory in which to load forecasts + if directory is None: + sub_dir = os.getcwd() + else: + sub_dir = directory + + # Folders + obj_dir = jp(sub_dir, "obj") # Location to save the BEL model + fig_data_dir = jp(sub_dir, "data") # Location to save the raw data figures + fig_pca_dir = jp(sub_dir, "pca") # Location to save the PCA figures + fig_cca_dir = jp(sub_dir, "cca") # Location to save the CCA figures + fig_pred_dir = jp(sub_dir, "uq") # Location to save the prediction figures + + # Creates directories + [ + utils.dirmaker(f, erase=True) + for f in [ + obj_dir, + fig_data_dir, + fig_pca_dir, + fig_cca_dir, + fig_pred_dir, + ] + ] + + # %% Fit BEL model + bel_.Y_obs = y_test_ + bel_.fit(X=X_train_, Y=y_train_) + + # %% Sample for the observation + # Extract n random sample (target CV's). + # The posterior distribution is computed within the method below. 
diff --git a/skbel/examples/demo_kde_cheby.py b/skbel/examples/demo_kde_cheby.py
new file mode 100644
index 0000000..e1bc833
--- /dev/null
+++ b/skbel/examples/demo_kde_cheby.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2021. Robin Thibaut, Ghent University
+
+import os
+from os.path import join as jp
+
+import joblib
+import pandas as pd
+from loguru import logger
+from sklearn.cross_decomposition import CCA
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler, PowerTransformer
+
+import demo_visualization as myvis
+
+from skbel import utils
+from skbel.learning.bel import BEL
+
+
+def init_bel():
+    """
+    Set all BEL pipelines. This is the blueprint of the framework.
+ """ + # Pipeline before CCA + X_pre_processing = Pipeline( + [ + ("scaler", StandardScaler(with_mean=False)), + ("pca", PCA()), + ] + ) + Y_pre_processing = Pipeline( + [ + ("scaler", StandardScaler(with_mean=False)), + ("pca", PCA()), + ] + ) + + # Canonical Correlation Analysis + cca = CCA() + + # Pipeline after CCA + X_post_processing = Pipeline( + [("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))] + ) + Y_post_processing = Pipeline( + [("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))] + ) + + # Initiate BEL object + bel_model = BEL( + X_pre_processing=X_pre_processing, + X_post_processing=X_post_processing, + Y_pre_processing=Y_pre_processing, + Y_post_processing=Y_post_processing, + cca=cca, + ) + + return bel_model + + +def bel_training( + bel_, + *, + X_train_: pd.DataFrame, + x_test_: pd.DataFrame, + y_train_: pd.DataFrame, + y_test_: pd.DataFrame = None, + directory: str = None, +): + """ + :param bel_: BEL model + :param X_train_: Predictor set for training + :param x_test_: Predictor "test" + :param y_train_: Target set for training + :param y_test_: "True" target (optional) + :param directory: Path to the directory in which to unload the results + :return: + """ + #%% Directory in which to load forecasts + if directory is None: + sub_dir = os.getcwd() + else: + sub_dir = directory + + # Folders + obj_dir = jp(sub_dir, "obj") # Location to save the BEL model + fig_data_dir = jp(sub_dir, "data") # Location to save the raw data figures + fig_pca_dir = jp(sub_dir, "pca") # Location to save the PCA figures + fig_cca_dir = jp(sub_dir, "cca") # Location to save the CCA figures + fig_pred_dir = jp(sub_dir, "uq") # Location to save the prediction figures + + # Creates directories + [ + utils.dirmaker(f, erase=True) + for f in [ + obj_dir, + fig_data_dir, + fig_pca_dir, + fig_cca_dir, + fig_pred_dir, + ] + ] + + # %% Fit BEL model + bel_.Y_obs = y_test_ + bel_.fit(X=X_train_, Y=y_train_) + + # %% Sample for the observation + # Extract n random sample (target CV's). + # The posterior distribution is computed within the method below. 
diff --git a/skbel/goggles/_visualization.py b/skbel/goggles/_visualization.py
index f0cd8cd..6841a10 100644
--- a/skbel/goggles/_visualization.py
+++ b/skbel/goggles/_visualization.py
@@ -552,7 +552,7 @@ def _kde_cca(
     bel.Y_obs_f = bel.transform(Y=Y_obs)

     # load prediction object
-    post_test = bel.random_sample(n_posts=5000)
+    post_test = bel.random_sample(n_posts=400)

     for comp_n in range(bel.cca.n_components):
         # Get figure default parameters
@@ -565,7 +565,7 @@
         # Plot h posterior given d
         density, support = kde_params(
-            x=bel.X_f.T[comp_n], y=bel.Y_f.T[comp_n], gridsize=1000
+            x=bel.X_f.T[comp_n], y=bel.Y_f.T[comp_n], gridsize=200
         )
         xx, yy = support
it_sampling(pdf=pdf, @@ -345,10 +362,12 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X, y).transform(X, y) - def predict(self, X_obs) -> (np.array, np.array): + def predict(self, X_obs, mode: str = None) -> (np.array, np.array): """ Make predictions, in the BEL fashion. """ + if mode is not None: + self.mode = mode self.X_obs = X_obs # Save dataframe with name try: X_obs = check_array(self.X_obs)