
Commit

gradient_clipping
ujohn33 committed Jan 10, 2025
1 parent 2e15db7 commit 1db916b
Showing 3 changed files with 65 additions and 4 deletions.
4 changes: 4 additions & 0 deletions lightgbmlss/distributions/Gaussian.py
@@ -35,6 +35,8 @@ def __init__(self,
response_fn: str = "exp",
loss_fn: str = "nll",
natural_gradient: bool = False,
quantile_clipping: bool = False,
clip_value: float = None,
):

# Input Checks
@@ -66,4 +68,6 @@ def __init__(self,
distribution_arg_names=list(param_dict.keys()),
loss_fn=loss_fn,
natural_gradient=natural_gradient,
quantile_clipping=quantile_clipping,
clip_value=clip_value,
)
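
For context, a minimal usage sketch of the new arguments on the Gaussian distribution (values are illustrative only; all other constructor arguments keep their defaults):

from lightgbmlss.distributions.Gaussian import Gaussian

# Quantile-based clipping: clip_value is the quantile level, so 0.05 clips
# gradients and Hessians to their empirical 5th/95th percentiles at each
# boosting step.
dist = Gaussian(
    response_fn="exp",
    loss_fn="nll",
    natural_gradient=False,
    quantile_clipping=True,
    clip_value=0.05,
)

With quantile_clipping=False and a numeric clip_value, the fixed-value branch in compute_gradients_and_hessians (see the distribution_utils.py diff below) instead clamps gradients to [-clip_value, clip_value] and Hessians to [clip_value, 1].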
58 changes: 54 additions & 4 deletions lightgbmlss/distributions/distribution_utils.py
@@ -35,13 +35,36 @@ class DistributionClass:
distribution_arg_names: List
List of distributional parameter names.
loss_fn: str
Loss function. Options are "nll" (negative log-likelihood) or "crps" (continuous ranked probability score).
Note that if "crps" is used, the Hessian is set to 1, as the current CRPS version is not twice differentiable.
Hence, using the CRPS disregards any variation in the curvature of the loss function.
Loss function. Options are "nll" (negative log-likelihood) or "crps"
(continuous ranked probability score). Note that if "crps" is used, the
Hessian is set to 1, as the current CRPS version is not twice
differentiable. Hence, using the CRPS disregards any variation in the
curvature of the loss function.
natural_gradient: bool
Specifies whether to use natural gradients instead of standard gradients
for optimization. Natural gradients scale the gradients by the inverse
of the Fisher Information Matrix (FIM), often leading to more stable and
efficient convergence. When set to True, natural gradients are applied;
otherwise, standard gradients are used.
quantile_clipping: bool
Indicates whether to use quantile-based clipping for gradients and Hessians
during optimization. When set to True, gradients and Hessians are clipped to
the [clip_value, 1 - clip_value] quantiles of their current values (e.g., the
0.1 and 0.9 quantiles), effectively removing extreme outliers while preserving
most of the data distribution. This approach dynamically adapts the clipping
bounds to the gradient distribution in each training step.
clip_value: float
Threshold used for gradient and Hessian clipping. If quantile_clipping is True,
clip_value is interpreted as the quantile level, i.e., values are clipped to the
[clip_value, 1 - clip_value] quantiles. If quantile_clipping is False, gradients
are clipped to the fixed range [-clip_value, clip_value] and Hessians to
[clip_value, 1], capping extreme values and helping to prevent issues such as
exploding gradients. If not provided, no clipping is applied.
tau: List
List of expectiles. Only used for Expectile distribution.
penalize_crossing: bool
Whether to include a penalty term to discourage crossing of expectiles.
Only used for Expectile distribution.
"""
def __init__(self,
distribution: torch.distributions.Distribution = None,
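
Referring back to the natural_gradient entry in the docstring above, the idea reduces to scaling the gradient by the inverse Fisher Information Matrix. A toy, self-contained illustration (the FIM values here are made up purely for demonstration):

import torch

grad = torch.tensor([0.4, -1.2])          # standard gradient for two parameters
fim = torch.tensor([[2.0, 0.0],
                    [0.0, 0.5]])          # illustrative Fisher Information Matrix
nat_grad = torch.linalg.solve(fim, grad)  # FIM^{-1} @ grad
print(nat_grad)                           # approximately [0.2, -2.4]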
@@ -53,6 +76,8 @@ def __init__(self,
distribution_arg_names: List = None,
loss_fn: str = "nll",
natural_gradient: bool = False,
quantile_clipping: bool = False,
clip_value: float = None,
tau: Optional[List[torch.Tensor]] = None,
penalize_crossing: bool = False,
):
@@ -66,6 +91,8 @@ def __init__(self,
self.distribution_arg_names = distribution_arg_names
self.loss_fn = loss_fn
self.natural_gradient = natural_gradient
self.quantile_clipping = quantile_clipping
self.clip_value = clip_value
self.tau = tau
self.penalize_crossing = penalize_crossing

@@ -471,6 +498,29 @@ def compute_gradients_and_hessians(self,
warnings.warn("Natural Gradient is not implemented for CRPS. Using standard Gradient instead.")
else:
pass

if self.quantile_clipping:
# Quantile-based clipping of gradients and Hessians.
# Detach the per-parameter tensors and pool them into NumPy arrays
# before computing the quantile bounds.
grad_np = np.concatenate([g.detach().numpy() for g in grad])
hess_np = np.concatenate([h.detach().numpy() for h in hess])

# self.clip_value is interpreted as the quantile level here, e.g.,
# clip_value=0.05 clips to the 5th and 95th percentiles.
grad_min = np.quantile(grad_np, self.clip_value)
grad_max = np.quantile(grad_np, 1 - self.clip_value)
hess_min = np.quantile(hess_np, self.clip_value)
hess_max = np.quantile(hess_np, 1 - self.clip_value)

# Clip gradients and Hessians to the quantile bounds
grad = [torch.clamp(g, min=grad_min, max=grad_max) for g in grad]
hess = [torch.clamp(h, min=hess_min, max=hess_max) for h in hess]
elif self.clip_value is not None:
# Fixed-value clipping: gradients are capped at +/- clip_value;
# Hessians are floored at clip_value and capped at 1.
grad = [torch.clamp(g, min=-self.clip_value, max=self.clip_value) for g in grad]
hess = [torch.clamp(h, min=self.clip_value, max=1) for h in hess]
else:
pass

# # Approximation of Hessian
# step_size = 1e-6
# predt_upper = [
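
To make the quantile-clipping branch of compute_gradients_and_hessians concrete, here is a small self-contained sketch that mirrors its logic on synthetic per-parameter gradient lists (names and values are illustrative, not part of the library):

import numpy as np
import torch

clip_value = 0.05  # quantile level, as used when quantile_clipping=True

# Two distributional parameters (e.g. loc and scale), one gradient tensor each
grad = [torch.randn(1000) * 3.0, torch.randn(1000)]

# Pool all gradients and compute the lower/upper quantile bounds
grad_np = np.concatenate([g.detach().numpy() for g in grad])
grad_min = np.quantile(grad_np, clip_value)      # 5th percentile
grad_max = np.quantile(grad_np, 1 - clip_value)  # 95th percentile

# Clamp each per-parameter gradient tensor to the pooled bounds
grad_clipped = [torch.clamp(g, min=grad_min, max=grad_max) for g in grad]

print(f"bounds: [{grad_min:.3f}, {grad_max:.3f}]")
print("max |grad| before:", max(g.abs().max().item() for g in grad))
print("max |grad| after: ", max(g.abs().max().item() for g in grad_clipped))

Note that the bounds are computed over gradients pooled across all distributional parameters, so every parameter shares the same clipping range within a boosting step.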
7 changes: 7 additions & 0 deletions lightgbmlss/model.py
@@ -375,6 +375,13 @@ def objective(trial):
log=param_log
)
})
else:
raise ValueError("Invalid parameter type")

if param_name == "clip_value":
print(f"clip_value: {hyper_params[param_name]}")
self.dist.clip_value = hyper_params[param_name]
del hyper_params["clip_value"]

# Add booster if not included in dictionary
if "boosting" not in hyper_params.keys():
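
Since objective() now intercepts clip_value and assigns it to the distribution rather than passing it to the booster, it can be tuned like any other hyperparameter. The sketch below is an assumption about how that might look: the search-space format and the hyper_opt signature follow the usual LightGBMLSS interface but may differ between versions, so the call itself is left commented out.

import numpy as np
import lightgbm as lgb
from lightgbmlss.model import LightGBMLSS
from lightgbmlss.distributions.Gaussian import Gaussian

# Synthetic data, just to have a Dataset to hand to the tuner
X = np.random.randn(500, 5)
y = X[:, 0] + 0.5 * np.random.randn(500)
dtrain = lgb.Dataset(X, label=y)

lgblss = LightGBMLSS(Gaussian(quantile_clipping=True, clip_value=0.05))

# Hypothetical search space: "clip_value" sits next to booster parameters but
# is picked up by objective() and routed to dist.clip_value instead of being
# passed on to lgb.train.
param_dict = {
    "eta":        ["float", {"low": 1e-3, "high": 0.3, "log": True}],
    "max_depth":  ["int",   {"low": 2,    "high": 10,  "log": False}],
    "clip_value": ["float", {"low": 0.01, "high": 0.2, "log": False}],
}

# opt_params = lgblss.hyper_opt(param_dict, dtrain, num_boost_round=100, n_trials=20)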
