diff --git a/lightgbmlss/distributions/Gaussian.py b/lightgbmlss/distributions/Gaussian.py
index e591f5a..b813635 100644
--- a/lightgbmlss/distributions/Gaussian.py
+++ b/lightgbmlss/distributions/Gaussian.py
@@ -35,6 +35,8 @@ def __init__(self,
                  response_fn: str = "exp",
                  loss_fn: str = "nll",
                  natural_gradient: bool = False,
+                 quantile_clipping: bool = False,
+                 clip_value: float = None,
                  ):
 
         # Input Checks
@@ -66,4 +68,6 @@ def __init__(self,
                          distribution_arg_names=list(param_dict.keys()),
                          loss_fn=loss_fn,
                          natural_gradient=natural_gradient,
+                         quantile_clipping=quantile_clipping,
+                         clip_value=clip_value,
                          )
diff --git a/lightgbmlss/distributions/distribution_utils.py b/lightgbmlss/distributions/distribution_utils.py
index f15c990..0181126 100644
--- a/lightgbmlss/distributions/distribution_utils.py
+++ b/lightgbmlss/distributions/distribution_utils.py
@@ -35,13 +35,36 @@ class DistributionClass:
     distribution_arg_names: List
         List of distributional parameter names.
     loss_fn: str
-        Loss function. Options are "nll" (negative log-likelihood) or "crps" (continuous ranked probability score).
-        Note that if "crps" is used, the Hessian is set to 1, as the current CRPS version is not twice differentiable.
-        Hence, using the CRPS disregards any variation in the curvature of the loss function.
+        Loss function. Options are "nll" (negative log-likelihood) or "crps"
+        (continuous ranked probability score). Note that if "crps" is used, the
+        Hessian is set to 1, as the current CRPS version is not twice
+        differentiable. Hence, using the CRPS disregards any variation in the
+        curvature of the loss function.
+    natural_gradient: bool
+        Specifies whether to use natural gradients instead of standard gradients
+        for optimization. Natural gradients scale the gradients by the inverse
+        of the Fisher Information Matrix (FIM), often leading to more stable and
+        efficient convergence. When set to True, natural gradients are applied;
+        otherwise, standard gradients are used.
+    quantile_clipping: bool
+        Indicates whether to use quantile-based clipping for
+        gradients and Hessians during optimization. When set to True, the values of
+        gradients and Hessians are clipped based on specified quantile ranges (e.g.,
+        0.1 and 0.9), effectively removing extreme outliers while preserving most of
+        the data distribution. This approach dynamically adapts the clipping bounds
+        to the gradient distribution in each training step.
+    clip_value: float
+        Defines the maximum absolute value for gradient and Hessian clipping.
+        Clipping helps to stabilize training by capping extreme values,
+        preventing issues like exploding gradients. When specified, gradients
+        are clipped to lie within the range [-clip_value, clip_value]. If not
+        provided, no clipping is applied, or alternative strategies (like
+        quantile-based clipping) might be used.
     tau: List
         List of expectiles. Only used for Expectile distributon.
     penalize_crossing: bool
-        Whether to include a penalty term to discourage crossing of expectiles. Only used for Expectile distribution.
+        Whether to include a penalty term to discourage crossing of expectiles.
+        Only used for Expectile distribution.
     """
     def __init__(self,
                  distribution: torch.distributions.Distribution = None,
@@ -53,6 +76,8 @@ def __init__(self,
                  distribution_arg_names: List = None,
                  loss_fn: str = "nll",
                  natural_gradient: bool = False,
+                 quantile_clipping: bool = False,
+                 clip_value: float = None,
                  tau: Optional[List[torch.Tensor]] = None,
                  penalize_crossing: bool = False,
                  ):
@@ -66,6 +91,8 @@ def __init__(self,
         self.distribution_arg_names = distribution_arg_names
         self.loss_fn = loss_fn
         self.natural_gradient = natural_gradient
+        self.quantile_clipping = quantile_clipping
+        self.clip_value = clip_value
         self.tau = tau
         self.penalize_crossing = penalize_crossing
 
@@ -471,6 +498,29 @@ def compute_gradients_and_hessians(self,
             warnings.warn("Natural Gradient is not implemented for CRPS. Using standard Gradient instead.")
         else:
             pass
+
+        if self.quantile_clipping:
+            # Clip Gradients and Hessians
+            # Ensure gradients and Hessians are detached before computing quantiles
+            # Convert list of tensors to NumPy arrays
+            grad_np = np.concatenate([g.detach().numpy() for g in grad])
+            hess_np = np.concatenate([h.detach().numpy() for h in hess])
+
+            grad_min = np.quantile(grad_np, self.clip_value)
+            grad_max = np.quantile(grad_np, 1 - self.clip_value)
+            hess_min = np.quantile(hess_np, self.clip_value)
+            hess_max = np.quantile(hess_np, 1 - self.clip_value)
+
+            # Clip Gradients and Hessians
+            grad = [torch.clamp(g, min=grad_min, max=grad_max) for g in grad]
+            hess = [torch.clamp(h, min=hess_min, max=hess_max) for h in hess]
+        elif self.clip_value is not None:
+            # Fixed-value Clipping
+            grad = [torch.clamp(g, min=-self.clip_value, max=self.clip_value) for g in grad]
+            hess = [torch.clamp(h, min=self.clip_value, max=1) for h in hess]
+        else:
+            pass
+
         # # Approximation of Hessian
         # step_size = 1e-6
         # predt_upper = [
diff --git a/lightgbmlss/model.py b/lightgbmlss/model.py
index 33896ce..9e1aa5a 100644
--- a/lightgbmlss/model.py
+++ b/lightgbmlss/model.py
@@ -375,6 +375,13 @@ def objective(trial):
                         log=param_log
                     )
                 })
+            else:
+                raise ValueError("Invalid parameter type")
+
+            if param_name == "clip_value":
+                print(f"clip_value: {hyper_params[param_name]}")
+                self.dist.clip_value = hyper_params[param_name]
+                del hyper_params["clip_value"]
 
             # Add booster if not included in dictionary
             if "boosting" not in hyper_params.keys():