
Commit

gradient_clipping
ujohn33 committed Jan 10, 2025
1 parent 2e15db7 commit 1db916b
Showing 3 changed files with 65 additions and 4 deletions.
4 changes: 4 additions & 0 deletions lightgbmlss/distributions/Gaussian.py
@@ -35,6 +35,8 @@ def __init__(self,
response_fn: str = "exp",
loss_fn: str = "nll",
natural_gradient: bool = False,
quantile_clipping: bool = False,
clip_value: float = None,
):

# Input Checks
@@ -66,4 +68,6 @@ def __init__(self,
distribution_arg_names=list(param_dict.keys()),
loss_fn=loss_fn,
natural_gradient=natural_gradient,
quantile_clipping=quantile_clipping,
clip_value=clip_value,
)
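
For context, a minimal usage sketch of the new arguments on the Gaussian distribution (values are illustrative only; all other constructor arguments keep their defaults):

from lightgbmlss.distributions.Gaussian import Gaussian

# Quantile-based clipping: clip_value is the quantile level, so 0.05 clips
# gradients and Hessians to their empirical 5th/95th percentiles at each
# boosting step.
dist = Gaussian(
    response_fn="exp",
    loss_fn="nll",
    natural_gradient=False,
    quantile_clipping=True,
    clip_value=0.05,
)

With quantile_clipping=False and a numeric clip_value, the fixed-value branch in compute_gradients_and_hessians (see the distribution_utils.py diff below) instead clamps gradients to [-clip_value, clip_value] and Hessians to [clip_value, 1].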
58 changes: 54 additions & 4 deletions lightgbmlss/distributions/distribution_utils.py
@@ -35,13 +35,36 @@ class DistributionClass:
distribution_arg_names: List
List of distributional parameter names.
loss_fn: str
Loss function. Options are "nll" (negative log-likelihood) or "crps" (continuous ranked probability score).
Note that if "crps" is used, the Hessian is set to 1, as the current CRPS version is not twice differentiable.
Hence, using the CRPS disregards any variation in the curvature of the loss function.
Loss function. Options are "nll" (negative log-likelihood) or "crps"
(continuous ranked probability score). Note that if "crps" is used, the
Hessian is set to 1, as the current CRPS version is not twice
differentiable. Hence, using the CRPS disregards any variation in the
curvature of the loss function.
natural_gradient: bool
Specifies whether to use natural gradients instead of standard gradients
for optimization. Natural gradients scale the gradients by the inverse
of the Fisher Information Matrix (FIM), often leading to more stable and
efficient convergence. When set to True, natural gradients are applied;
otherwise, standard gradients are used.
quantile_clipping: bool
Indicates whether to use quantile-based clipping for gradients and Hessians
during optimization. When set to True, gradients and Hessians are clipped to
the [clip_value, 1 - clip_value] quantiles of their current values (e.g., the
0.1 and 0.9 quantiles), effectively removing extreme outliers while preserving
most of the data distribution. This approach dynamically adapts the clipping
bounds to the gradient distribution in each training step.
clip_value: float
Threshold used for gradient and Hessian clipping. If quantile_clipping is True,
clip_value is interpreted as the quantile level, i.e., values are clipped to the
[clip_value, 1 - clip_value] quantiles. If quantile_clipping is False, gradients
are clipped to the fixed range [-clip_value, clip_value] and Hessians to
[clip_value, 1], capping extreme values and helping to prevent issues such as
exploding gradients. If not provided, no clipping is applied.
tau: List
List of expectiles. Only used for Expectile distribution.
penalize_crossing: bool
Whether to include a penalty term to discourage crossing of expectiles.
Only used for Expectile distribution.
"""
def __init__(self,
distribution: torch.distributions.Distribution = None,
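
Referring back to the natural_gradient entry in the docstring above, the idea reduces to scaling the gradient by the inverse Fisher Information Matrix. A toy, self-contained illustration (the FIM values here are made up purely for demonstration):

import torch

grad = torch.tensor([0.4, -1.2])          # standard gradient for two parameters
fim = torch.tensor([[2.0, 0.0],
                    [0.0, 0.5]])          # illustrative Fisher Information Matrix
nat_grad = torch.linalg.solve(fim, grad)  # FIM^{-1} @ grad
print(nat_grad)                           # approximately [0.2, -2.4]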
@@ -53,6 +76,8 @@ def __init__(self,
distribution_arg_names: List = None,
loss_fn: str = "nll",
natural_gradient: bool = False,
quantile_clipping: bool = False,
clip_value: float = None,
tau: Optional[List[torch.Tensor]] = None,
penalize_crossing: bool = False,
):
@@ -66,6 +91,8 @@ def __init__(self,
self.distribution_arg_names = distribution_arg_names
self.loss_fn = loss_fn
self.natural_gradient = natural_gradient
self.quantile_clipping = quantile_clipping
self.clip_value = clip_value
self.tau = tau
self.penalize_crossing = penalize_crossing

@@ -471,6 +498,29 @@ def compute_gradients_and_hessians(self,
warnings.warn("Natural Gradient is not implemented for CRPS. Using standard Gradient instead.")
else:
pass

if self.quantile_clipping:
# Quantile-based clipping of gradients and Hessians.
# Detach the per-parameter tensors and pool them into NumPy arrays
# before computing the quantile bounds.
grad_np = np.concatenate([g.detach().numpy() for g in grad])
hess_np = np.concatenate([h.detach().numpy() for h in hess])

# self.clip_value is interpreted as the quantile level here, e.g.,
# clip_value=0.05 clips to the 5th and 95th percentiles.
grad_min = np.quantile(grad_np, self.clip_value)
grad_max = np.quantile(grad_np, 1 - self.clip_value)
hess_min = np.quantile(hess_np, self.clip_value)
hess_max = np.quantile(hess_np, 1 - self.clip_value)

# Clip gradients and Hessians to the quantile bounds
grad = [torch.clamp(g, min=grad_min, max=grad_max) for g in grad]
hess = [torch.clamp(h, min=hess_min, max=hess_max) for h in hess]
elif self.clip_value is not None:
# Fixed-value clipping: gradients are capped at +/- clip_value;
# Hessians are floored at clip_value and capped at 1.
grad = [torch.clamp(g, min=-self.clip_value, max=self.clip_value) for g in grad]
hess = [torch.clamp(h, min=self.clip_value, max=1) for h in hess]
else:
pass

# # Approximation of Hessian
# step_size = 1e-6
# predt_upper = [
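
To make the quantile-clipping branch of compute_gradients_and_hessians concrete, here is a small self-contained sketch that mirrors its logic on synthetic per-parameter gradient lists (names and values are illustrative, not part of the library):

import numpy as np
import torch

clip_value = 0.05  # quantile level, as used when quantile_clipping=True

# Two distributional parameters (e.g. loc and scale), one gradient tensor each
grad = [torch.randn(1000) * 3.0, torch.randn(1000)]

# Pool all gradients and compute the lower/upper quantile bounds
grad_np = np.concatenate([g.detach().numpy() for g in grad])
grad_min = np.quantile(grad_np, clip_value)      # 5th percentile
grad_max = np.quantile(grad_np, 1 - clip_value)  # 95th percentile

# Clamp each per-parameter gradient tensor to the pooled bounds
grad_clipped = [torch.clamp(g, min=grad_min, max=grad_max) for g in grad]

print(f"bounds: [{grad_min:.3f}, {grad_max:.3f}]")
print("max |grad| before:", max(g.abs().max().item() for g in grad))
print("max |grad| after: ", max(g.abs().max().item() for g in grad_clipped))

Note that the bounds are computed over gradients pooled across all distributional parameters, so every parameter shares the same clipping range within a boosting step.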
7 changes: 7 additions & 0 deletions lightgbmlss/model.py
@@ -375,6 +375,13 @@ def objective(trial):
log=param_log
)
})
else:
raise ValueError("Invalid parameter type")

if param_name == "clip_value":
print(f"clip_value: {hyper_params[param_name]}")
self.dist.clip_value = hyper_params[param_name]
del hyper_params["clip_value"]

# Add booster if not included in dictionary
if "boosting" not in hyper_params.keys():
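
Since objective() now intercepts clip_value and assigns it to the distribution rather than passing it to the booster, it can be tuned like any other hyperparameter. The sketch below is an assumption about how that might look: the search-space format and the hyper_opt signature follow the usual LightGBMLSS interface but may differ between versions, so the call itself is left commented out.

import numpy as np
import lightgbm as lgb
from lightgbmlss.model import LightGBMLSS
from lightgbmlss.distributions.Gaussian import Gaussian

# Synthetic data, just to have a Dataset to hand to the tuner
X = np.random.randn(500, 5)
y = X[:, 0] + 0.5 * np.random.randn(500)
dtrain = lgb.Dataset(X, label=y)

lgblss = LightGBMLSS(Gaussian(quantile_clipping=True, clip_value=0.05))

# Hypothetical search space: "clip_value" sits next to booster parameters but
# is picked up by objective() and routed to dist.clip_value instead of being
# passed on to lgb.train.
param_dict = {
    "eta":        ["float", {"low": 1e-3, "high": 0.3, "log": True}],
    "max_depth":  ["int",   {"low": 2,    "high": 10,  "log": False}],
    "clip_value": ["float", {"low": 0.01, "high": 0.2, "log": False}],
}

# opt_params = lgblss.hyper_opt(param_dict, dtrain, num_boost_round=100, n_trials=20)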
