Refined KDE inference - uploaded more examples
robinthibaut committed Jun 3, 2021
1 parent 01528b0 · commit 0b2354e
Showing 6 changed files with 394 additions and 50 deletions.
setup.py — 2 changes: 1 addition & 1 deletion
@@ -4,7 +4,7 @@

 setup(
     name="skbel",
-    version="1.0.3",
+    version="1.0.4",
     packages=my_pckg,
     include_package_data=True,
     url="https://github.com/robinthibaut/skbel",
skbel/algorithms/_statistics.py — 77 changes: 33 additions & 44 deletions
@@ -488,11 +488,9 @@ def mvn_inference(
     return y_posterior_mean, y_posterior_covariance


-_MESSAGE = "The integral is probably divergent, or slowly convergent."
-
-
 def _convergent(quadrature):
-    if len(quadrature) > 3 and quadrature[3] == _MESSAGE:
+    msg = "The integral is probably divergent, or slowly convergent."
+    if len(quadrature) > 3 and quadrature[3] == msg:
         return False
     else:
         return True
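A quick sketch of the tuple _convergent inspects: scipy's quad, when called with full_output=1, appends its warning text as a fourth tuple element if the result is suspect. The call site is outside this hunk, so the integrand below is only an illustrative choice that triggers the message:

import numpy as np
from scipy import integrate

# 1/(1+x) diverges logarithmically on [0, inf), so quad reports the
# "probably divergent, or slowly convergent" message as quadrature[3]
quadrature = integrate.quad(lambda x: 1.0 / (1.0 + x), 0, np.inf, full_output=1)
print(_convergent(quadrature))  # False: the fourth element carries the warning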
@@ -528,7 +526,6 @@ def normalize(pdf, lower_bd=-np.inf, upper_bd=np.inf, vectorize=False):
         raise ValueError('PDF integral likely divergent.')
     A = quadrature[0]

-    # pdf_normed = lambda x: pdf(x) / A if lower_bd <= x <= upper_bd else 0
     def pdf_normed(x):
         if lower_bd <= x <= upper_bd:
             return pdf(x) / A
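As a usage sketch, assuming normalize keeps the signature shown in the hunk header: the returned density is rescaled by the quadrature of the input pdf over the requested support, and is zero outside it. The standard normal and the bounds here are illustrative only:

from scipy.stats import norm

# Rescale a pdf so it integrates to 1 over a finite support
pdf_normed = normalize(norm.pdf, lower_bd=-8, upper_bd=8)
print(pdf_normed(0.0))   # ≈ norm.pdf(0), since the truncated tail mass is negligible
print(pdf_normed(10.0))  # 0, outside the support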
@@ -589,16 +586,6 @@ def cdf_vector(x):
     return cdf_vector


-############################################
-# CHEBYSHEV APPROXIMATION OF PDF & CDF
-############################################
-
-# This section follows the work of Olver & Townsend (2013), who suggest using
-# Chebyshev polynomials to approximate the PDF. This allows for trivial
-# construction of the CDF, as these polynomials can be integrated in closed
-# form.
-
-
 def _chebnodes(a, b, n):
     """Chebyshev nodes of rank n on interval [a, b]."""
     if not a < b:
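The removed banner summarizes the Olver & Townsend (2013) idea that the retained code still implements: approximate the pdf by a Chebyshev series, then integrate the series in closed form to obtain the CDF. A self-contained sketch of that construction (this is not the package's chebcdf; the function name and the degree n=64 are illustrative):

import numpy as np
from numpy.polynomial import chebyshev as C

def chebcdf_sketch(pdf, a, b, n=64):
    x = C.chebpts2(n + 1)                        # Chebyshev points on [-1, 1]
    t = 0.5 * (a + b) + 0.5 * (b - a) * x        # mapped onto [a, b]
    coeffs = C.chebfit(x, pdf(t), n)             # series approximation of the pdf
    antider = C.chebint(coeffs) * 0.5 * (b - a)  # closed-form integral (Jacobian of the map)
    lo = C.chebval(-1.0, antider)

    def cdf(y):
        s = (2.0 * y - (a + b)) / (b - a)        # map back to [-1, 1]
        return C.chebval(s, antider) - lo

    return cdf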
@@ -694,8 +681,6 @@ def cdf(x_):
         return cdf


-# Sample via root-finding on CDF
-
 def it_sampling(pdf,
                 num_samples,
                 lower_bd=-np.inf,
@@ -716,8 +701,6 @@ def it_sampling(pdf,
         manually establish cutoffs for the density.
     upper_bd : float
         Upper bound of the support of the pdf.
-    guess : float or int
-        Initial guess for the numerical solver to use when inverting the CDF.
     chebyshev: Boolean, optional (default=False)
         If True, then the CDF is approximated using Chebyshev polynomials.
@@ -741,36 +724,42 @@
         for >100k samples.
     """
     seeds = uniform(0, 1, num_samples)

     if chebyshev:
-
         if not (np.isfinite(lower_bd) and np.isfinite(upper_bd)):
             raise ValueError('Bounds must be finite for Chebyshev approximation of CDF.')
-
         cdf = chebcdf(pdf, lower_bd, upper_bd)
+        samples = []
+        # Get initial guess
+        cdf_y = np.cumsum(pdf.y)  # cumulative distribution function, cdf
+        cdf_y = cdf_y / cdf_y.max()  # takes care of normalizing cdf to 1.0
+        inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")  # this is a function
+        simple_samples = inverse_cdf(seeds)
+        # Compute empirical mean and standard deviation
+        mean = sum(pdf.x * pdf.y) / sum(pdf.y)
+        estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y))
+        # If the distribution is too tight (small std), use simple inverse transform sampling
+        if estd <= 0.6:
+            return simple_samples
+        # Otherwise, use the Chebyshev approach
+        else:
+            guess = np.mean(simple_samples)
+
+            for seed in seeds:
+                def shifted(x):
+                    return cdf(x) - seed
+
+                soln = root(shifted, guess)
+                samples.append(soln.x[0])
+
+            return np.array(samples)
     else:
         cdf = get_cdf(pdf, lower_bd, upper_bd)
-    cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200))
-    inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")
-    simple_samples = inverse_cdf(seeds)
-
-    # Compute empirical mean and standard deviation
-    mean = sum(pdf.x * pdf.y) / sum(pdf.y)
-    estd = np.sqrt(sum(pdf.y * (pdf.x - mean) ** 2) / sum(pdf.y))
-    # If the distribution is too tight (small std), use simple inverse transform sampling
-    if estd <= 0.6:
-        return simple_samples
-    # Otherwise, use the Chebyshev approach
-    else:
-        samples = []
-
-        guess = np.mean(simple_samples)
-
-        for seed in seeds:
-            def shifted(x):
-                return cdf(x) - seed
-
-            soln = root(shifted, guess)
-            samples.append(soln.x[0])
-
-    return np.array(samples)
+        cdf_y = cdf(np.linspace(lower_bd, upper_bd, 200))
+        inverse_cdf = interpolate.interp1d(cdf_y, pdf.x, kind="linear", fill_value="extrapolate")
+        simple_samples = inverse_cdf(seeds)
+        return simple_samples
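A minimal calling sketch for the refined sampler. Note that pdf must now expose .x and .y attributes (the evaluation grid and the densities) in addition to being callable, since the refined code reads pdf.x and pdf.y directly; a scipy interp1d built from a KDE evaluated on a finite grid satisfies both. The toy data, grid, and sample count below are illustrative only:

import numpy as np
from scipy import interpolate
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
data = rng.normal(loc=0.0, scale=2.0, size=500)  # toy 1-D sample
grid = np.linspace(-10.0, 10.0, 200)
kde = gaussian_kde(data)
pdf = interpolate.interp1d(grid, kde(grid))      # callable, with .x and .y

samples = it_sampling(pdf, num_samples=400, lower_bd=-10.0, upper_bd=10.0)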
skbel/examples/demo_kde.py — 168 changes: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
# Copyright (c) 2021. Robin Thibaut, Ghent University

import os
from os.path import join as jp

import joblib
import pandas as pd
from loguru import logger
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer

import demo_visualization as myvis

from skbel import utils
from skbel.learning.bel import BEL


def init_bel():
"""
Set all BEL pipelines. This is the blueprint of the framework.
"""
# Pipeline before CCA
X_pre_processing = Pipeline(
[
("scaler", StandardScaler(with_mean=False)),
("pca", PCA()),
]
)
Y_pre_processing = Pipeline(
[
("scaler", StandardScaler(with_mean=False)),
("pca", PCA()),
]
)

# Canonical Correlation Analysis
cca = CCA()

# Pipeline after CCA
X_post_processing = Pipeline(
[("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))]
)
Y_post_processing = Pipeline(
[("normalizer", PowerTransformer(method="yeo-johnson", standardize=True))]
)

# Initiate BEL object
bel_model = BEL(
X_pre_processing=X_pre_processing,
X_post_processing=X_post_processing,
Y_pre_processing=Y_pre_processing,
Y_post_processing=Y_post_processing,
cca=cca,
)

return bel_model


def bel_training(
bel_,
*,
X_train_: pd.DataFrame,
x_test_: pd.DataFrame,
y_train_: pd.DataFrame,
y_test_: pd.DataFrame = None,
directory: str = None,
):
"""
:param bel_: BEL model
:param X_train_: Predictor set for training
:param x_test_: Predictor "test"
:param y_train_: Target set for training
:param y_test_: "True" target (optional)
:param directory: Path to the directory in which to unload the results
:return:
"""
# %% Directory in which to unload the results
if directory is None:
sub_dir = os.getcwd()
else:
sub_dir = directory

# Folders
obj_dir = jp(sub_dir, "obj") # Location to save the BEL model
fig_data_dir = jp(sub_dir, "data") # Location to save the raw data figures
fig_pca_dir = jp(sub_dir, "pca") # Location to save the PCA figures
fig_cca_dir = jp(sub_dir, "cca") # Location to save the CCA figures
fig_pred_dir = jp(sub_dir, "uq") # Location to save the prediction figures

# Create the output directories
[
utils.dirmaker(f, erase=True)
for f in [
obj_dir,
fig_data_dir,
fig_pca_dir,
fig_cca_dir,
fig_pred_dir,
]
]

# %% Fit BEL model
bel_.Y_obs = y_test_
bel_.fit(X=X_train_, Y=y_train_)

# %% Sample for the observation
# Extract n random samples (target CVs).
# The posterior distribution is computed within the method below.
bel_.predict(x_test_)

# Save the fitted BEL model
joblib.dump(bel_, jp(obj_dir, "bel.pkl"))
msg = f"model trained and saved in {obj_dir}"
logger.info(msg)


if __name__ == "__main__":

# Set directories
data_dir = jp(os.getcwd(), "dataset")
output_dir = jp(os.getcwd(), "results_kde")

# Load dataset
X_train = pd.read_pickle(jp(data_dir, "X_train.pkl"))
X_test = pd.read_pickle(jp(data_dir, "X_test.pkl"))
y_train = pd.read_pickle(jp(data_dir, "y_train.pkl"))
y_test = pd.read_pickle(jp(data_dir, "y_test.pkl"))

# Initiate BEL model
model = init_bel()

# Set model parameters
model.mode = "kde" # How to compute the posterior conditional distribution
# Set PC cut
model.X_n_pc = 50
model.Y_n_pc = 30
# Save original dimensions of both predictor and target
model.X_shape = (6, 200) # Six curves with 200 time steps each
model.Y_shape = (1, 100, 87) # One matrix with 100 rows and 87 columns
# The number of CCA components is chosen as the minimum number of PCs
n_cca = min(model.X_n_pc, model.Y_n_pc)
model.cca.n_components = n_cca
# Number of samples to be extracted from the posterior distribution
model.n_posts = 400

# Train model
bel_training(
bel_=model,
X_train_=X_train,
x_test_=X_test,
y_train_=y_train,
y_test_=y_test,
directory=output_dir,
)

# Plot raw data
myvis.plot_results(model, base_dir=output_dir)

# Plot PCA
myvis.pca_vision(
model,
base_dir=output_dir,
)

# Plot CCA
myvis.cca_vision(bel=model, base_dir=output_dir)
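Once the demo has run, the fitted model persists under results_kde/obj. A hypothetical reuse sketch, using the same paths as the script above; only predict is shown in this commit, so no further sampling API is assumed:

import joblib
import pandas as pd
from os.path import join as jp

# Reload the test predictor and the fitted BEL model saved by bel_training
X_test = pd.read_pickle(jp("dataset", "X_test.pkl"))
bel = joblib.load(jp("results_kde", "obj", "bel.pkl"))
bel.predict(X_test)  # recomputes the posterior for this predictor, as in bel_training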
