diff --git a/src/peft/tuners/loha/config.py b/src/peft/tuners/loha/config.py index 79c1f63013..747c860711 100644 --- a/src/peft/tuners/loha/config.py +++ b/src/peft/tuners/loha/config.py @@ -14,7 +14,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Optional, Union +from typing import Literal, Optional, Union from peft.tuners.lycoris_utils import LycorisConfig from peft.utils import PeftType @@ -49,9 +49,12 @@ class LoHaConfig(LycorisConfig): The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. When passing a list of strings, either an exact match will be performed or it is checked if the name of the module ends with any of the passed strings. - init_weights (`bool`): - Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is - discouraged. + init_weights (`Union[bool, Literal["abba"]]`): + How to initialize the weights of the LoHa layers. Pass `True` (default) for default initialization, `False` + for random initialization, or `'abba'` for ABBA initialization which approximates pretrained weights using + SVD decomposition, potentially improving training stability and convergence. Based on the ABBA paper: + https://arxiv.org/pdf/2505.14238 See https://github.com/huggingface/peft/issues/2587 for implementation + details. layers_to_transform (`Union[List[int], int]`): The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices that are specified in this list. If a single integer is passed, it will apply the transformations on the @@ -69,7 +72,24 @@ class LoHaConfig(LycorisConfig): List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. """ - r: int = field(default=8, metadata={"help": "LoHa rank"}) + r: int = field( + default=8, + metadata={ + "help": ( + "LoHa rank for the first Hadamard component. For standard LoHa, both components use this rank. " + "For asymmetric ranks, use r2 to specify a different rank for the second component." + ) + }, + ) + r2: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Rank for the second Hadamard component (w2a @ w2b). " + "If not specified, defaults to r (symmetric ranks)." + ) + }, + ) alpha: int = field(default=8, metadata={"help": "LoHa alpha"}) rank_dropout: float = field( default=0.0, metadata={"help": "The dropout probability for rank dimension during training"} @@ -86,6 +106,19 @@ class LoHaConfig(LycorisConfig): ) }, ) + use_khatri_rao: Union[bool, Literal["auto"]] = field( + default="auto", + metadata={ + "help": ( + "Use Khatri-Rao product optimization to reduce memory overhead. " + "This reparameterizes the update using Khatri-Rao product instead of " + "constructing full B1A1 and B2A2 matrices, reducing memory footprint " + "to be similar to LoRA while maintaining expressiveness. " + "When set to 'auto' (default), it is enabled for ABBA initialization (per paper recommendation) " + "and disabled for standard LoHa. Set to True or False to explicitly control this behavior." 
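+                " Note that enabling this also switches the delta-weight computation to the ABBA per-component "
+                "scaling (alpha/sqrt(r) and alpha/sqrt(r2)) instead of alpha/r; the set of trainable parameters is "
+                "unchanged."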
+ ) + }, + ) target_modules: Optional[Union[list[str], str]] = field( default=None, metadata={ @@ -98,12 +131,17 @@ class LoHaConfig(LycorisConfig): default=None, metadata={"help": "List of module names or regex expression of the module names to exclude from LoHa."}, ) - init_weights: bool = field( + init_weights: Union[bool, Literal["abba"]] = field( default=True, metadata={ "help": ( - "Whether to initialize the weights of the LoHa layers with their default initialization. Don't change " - "this setting, except if you know exactly what you're doing." + "How to initialize the weights of the LoHa layers. " + "Pass `True` (default) for default initialization (zeros for one matrix), " + "`False` for random initialization, or `'abba'` for ABBA initialization " + "which initializes weights to approximate the pretrained weights using SVD decomposition. " + "ABBA initialization can improve training stability and convergence. " + "Based on the ABBA paper: https://arxiv.org/pdf/2505.14238. " + "See https://github.com/huggingface/peft/issues/2587 for implementation details." ), }, ) diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 96f9b1e016..f46368f65f 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -13,7 +13,7 @@ # limitations under the License. import math -from typing import Any +from typing import Any, Literal, Union import torch import torch.nn as nn @@ -25,7 +25,18 @@ class LoHaLayer(nn.Module, LycorisLayer): # All names of layers that may contain adapter weights adapter_layer_names = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2") - # other_param_names is defined on parent class + # Override other_param_names to include ABBA-specific parameters + other_param_names = ( + "r", + "alpha", + "scaling", + "rank_dropout", + "module_dropout", + "r2", + "use_khatri_rao", + "scaling1", + "scaling2", + ) def __init__(self, base_layer: nn.Module): super().__init__() @@ -39,34 +50,45 @@ def __init__(self, base_layer: nn.Module): self.hada_t1 = nn.ParameterDict({}) self.hada_t2 = nn.ParameterDict({}) + # Khatri-Rao optimization flag + self.use_khatri_rao = {} + + # Store second rank for ABBA (r is first component, r2 is second component, defaults to r) + self.r2 = {} + + # Separate scaling factors for ABBA (α/√r and α/√r₂) + self.scaling1 = {} + self.scaling2 = {} + @property def _available_adapters(self) -> set[str]: return {*self.hada_w1_a, *self.hada_w1_b, *self.hada_w2_a, *self.hada_w2_b, *self.hada_t1, *self.hada_t2} - def create_adapter_parameters(self, adapter_name: str, r: int, shape: tuple[int, ...]): + def create_adapter_parameters(self, adapter_name: str, r1: int, r2: int, shape: tuple[int, ...]): # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75 + # Support different ranks for the two Hadamard components (ABBA-style) if len(shape) == 4: # Conv2d - self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) - self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode - self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r1, r1, shape[2], shape[3])) + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r1, shape[0])) # out_dim, 1-mode + self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r1, shape[1])) # in_dim , 2-mode - self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, 
r, shape[2], shape[3])) - self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode - self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r2, r2, shape[2], shape[3])) + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r2, shape[0])) # out_dim, 1-mode + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r2, shape[1])) # in_dim , 2-mode elif len(shape) == 3: # Conv1d - self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1)) - self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode - self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r1, r1, shape[2], 1)) + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r1, shape[0])) # out_dim, 1-mode + self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r1, shape[1])) # in_dim , 2-mode - self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1)) - self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode - self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r2, r2, shape[2], 1)) + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r2, shape[0])) # out_dim, 1-mode + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r2, shape[1])) # in_dim , 2-mode else: # Linear - self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) - self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r1)) + self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r1, shape[1])) - self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) - self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r2)) + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r2, shape[1])) def reset_adapter_parameters(self, adapter_name: str): # Original implementation performs initialization with normal distribution @@ -98,6 +120,109 @@ def reset_adapter_parameters_random(self, adapter_name: str): nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + def reset_adapter_parameters_abba(self, adapter_name: str): + """ + ABBA initialization: Initialize LoHa weights to approximate the pretrained weights. This is based on the ABBA + paper which proposes initializing adapters to approximate the identity function or the pretrained weights. + + For LoHa with separate ranks: (w1a @ w1b) ⊙ (w2a @ w2b) where w1a,w1b have rank r and w2a,w2b have rank r2 + (defaults to r). We want this to approximate the pretrained weight W. + + Strategy: Use SVD to split the weight matrix, allocating r singular values to the first component and r2 to the + second component. 
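+        Concretely (an illustrative sketch of the assignments below; see the inline comments for rank-deficient edge
+        cases): with W = U @ diag(S) @ Vh from torch.linalg.svd, set w1a = U[:, :r] * S[:r].pow(0.25) and
+        w1b = S[:r].pow(0.25).unsqueeze(1) * Vh[:r, :], and build w2a / w2b analogously from the next r2 singular
+        triplets.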
+ """ + if adapter_name in self.hada_w1_a.keys(): + base_layer = self.get_base_layer() + + # Get the ranks (r for first component, r2 for second component) + r1 = self.r[adapter_name] # First component uses r + r2 = self.r2[adapter_name] # Second component uses r2 + + # Step 1: Get weight tensor + weight = base_layer.weight + + # ABBA doesn't support quantized models yet + is_quantized = hasattr(weight, "quant_state") or type(weight).__name__ in ("Params4bit", "Int8Params") + if is_quantized: + raise NotImplementedError( + "ABBA initialization does not support quantized models (int4/int8) yet. " + "Please use dtype='float32', 'float16', or 'bfloat16' instead of quantized dtypes." + ) + + # Get weight data (should be float32, bfloat16, or float16) + weight = weight.data if hasattr(weight, "data") else weight + + # Step 2: Prepare weight for SVD + # For Linear layers, weight is already 2D with shape (out_features, in_features) + # For Conv layers, flatten to 2D + if isinstance(base_layer, nn.Linear): + W = weight + else: + # For conv layers, flatten to 2D: (out_channels, in_channels * kernel_size) + W = weight.reshape(weight.shape[0], -1) + + # Step 3: Always cast to float32 for SVD + # PyTorch's torch.linalg.svd does NOT support: float16, bfloat16, or any integer types + if W.dtype != torch.float32: + W = W.float() + + # Step 4: Perform SVD on GPU (results are in float32) + U, S, Vh = torch.linalg.svd(W, full_matrices=False) + + # Split singular values between r1 and r2 + # Take top r1+r2 singular values and split them + total_r = min(r1 + r2, len(S)) + actual_r1 = min(r1, total_r) + actual_r2 = min(r2, total_r) + + # Get components for first Hadamard term (rank r1) + U_r1 = U[:, :actual_r1] # (m, r1) + S_r1 = S[:actual_r1] # (r1,) + Vh_r1 = Vh[:actual_r1, :] # (r1, n) + + # Get components for second Hadamard term (rank r2) + # Use next r2 singular values or reuse if not enough + if actual_r1 + actual_r2 <= len(S): + U_r2 = U[:, actual_r1 : actual_r1 + actual_r2] # (m, r2) + S_r2 = S[actual_r1 : actual_r1 + actual_r2] # (r2,) + Vh_r2 = Vh[actual_r1 : actual_r1 + actual_r2, :] # (r2, n) + else: + # Reuse early singular values if needed + U_r2 = U[:, :actual_r2] # (m, r2) + S_r2 = S[:actual_r2] # (r2,) + Vh_r2 = Vh[:actual_r2, :] # (r2, n) + + # Step 5: Initialize adapter parameters from SVD results + # Use fourth root so that (w1a @ w1b) ⊙ (w2a @ w2b) ≈ W + + # Get adapter dtype from PEFT config (respects user configuration) + # Adapters can be float32 (default), bfloat16, float16, etc. 
+ adapter_dtype = self.hada_w1_a[adapter_name].dtype + + # Initialize first Hadamard component: w1a @ w1b + fourth_root_S1 = torch.pow(S_r1, 0.25) + w1a_init = U_r1 * fourth_root_S1 + w1b_init = fourth_root_S1.unsqueeze(1) * Vh_r1 + + # Cast from float32 (SVD output) to adapter dtype and copy + self.hada_w1_a[adapter_name].data.copy_(w1a_init.to(adapter_dtype)) + self.hada_w1_b[adapter_name].data.copy_(w1b_init.to(adapter_dtype)) + + # Initialize second Hadamard component: w2a @ w2b + fourth_root_S2 = torch.pow(S_r2, 0.25) + w2a_init = U_r2 * fourth_root_S2 + w2b_init = fourth_root_S2.unsqueeze(1) * Vh_r2 + + # Cast from float32 (SVD output) to adapter dtype and copy + self.hada_w2_a[adapter_name].data.copy_(w2a_init.to(adapter_dtype)) + self.hada_w2_b[adapter_name].data.copy_(w2b_init.to(adapter_dtype)) + + if adapter_name in self.hada_t1.keys(): + # For convolutional layers with effective decomposition, use random init + # ABBA initialization for CP decomposition is more complex + nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + def update_layer( self, adapter_name: str, @@ -105,8 +230,10 @@ def update_layer( alpha: float, rank_dropout: float, module_dropout: float, - init_weights: bool, + init_weights: Union[bool, Literal["abba"]], use_effective_conv2d: bool = False, + use_khatri_rao: Union[bool, Literal["auto"]] = "auto", + r2: int = None, inference_mode: bool = False, **kwargs, ) -> None: @@ -114,23 +241,58 @@ def update_layer( Args: adapter_name (`str`): Name for the adapter to add. - r (`int`): Rank for the added adapter. + r (`int`): Rank for the added adapter. For standard LoHa, both Hadamard components use + this rank. For ABBA mode, this is the rank of the first Hadamard component (the second component's rank + is controlled by r2). alpha (`float`): Alpha for the added adapter. rank_dropout (`float`): The dropout probability for rank dimension during training. module_dropout (`float`): The dropout probability for disabling adapter during training. - init_weights (`bool`): Whether to initialize weights. + init_weights (`Union[bool, Literal["abba"]]`): How to initialize weights. + `True` for default initialization (one matrix initialized to zeros), `False` for random initialization, + or `"abba"` for ABBA initialization which approximates the pretrained weights using SVD decomposition. + ABBA initialization enables the adapter to start with behavior close to the original model, potentially + improving training stability and convergence. Based on the ABBA paper: https://arxiv.org/pdf/2505.14238 + See https://github.com/huggingface/peft/issues/2587 for implementation details. use_effective_conv2d (`bool`, *optional*, defaults to `False`): Use parameter effective decomposition for Conv2d with ksize > 1. + use_khatri_rao (`Union[bool, Literal["auto"]]`, *optional*, defaults to `"auto"`): + Use Khatri-Rao product optimization to reduce memory overhead. When set to `"auto"`, it is enabled for + ABBA initialization (recommended by the paper) and disabled for standard LoHa. Set to `True` or `False` + to explicitly control this behavior. + r2 (`int`, *optional*): Rank for the second Hadamard component. If None, defaults to r + (symmetric ranks). Only relevant when using different ranks for the two components. 
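+                For example (illustrative): `alpha=8`, `r=8`, `r2=2` yields per-component scalings `8/sqrt(8)` and
+                `8/sqrt(2)`, and `init_weights="abba"` together with the default `use_khatri_rao="auto"` enables the
+                Khatri-Rao path automatically.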
""" if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + # Determine r2 + # If not specified, r2 defaults to r (symmetric ranks) + if r2 is None: + r2 = r + + # Ensure at least rank 1 + r = max(1, r) + r2 = max(1, r2) + self.r[adapter_name] = r + self.r2[adapter_name] = r2 self.alpha[adapter_name] = alpha - self.scaling[adapter_name] = alpha / r + self.scaling[adapter_name] = alpha / r # Original scaling (for backward compatibility) + + # ABBA paper: separate scaling factors α/√r and α/√r₂ + self.scaling1[adapter_name] = alpha / math.sqrt(r) + self.scaling2[adapter_name] = alpha / math.sqrt(r2) + self.rank_dropout[adapter_name] = rank_dropout self.module_dropout[adapter_name] = module_dropout + # Handle use_khatri_rao: "auto" enables it for ABBA, disables for standard LoHa + # User can explicitly set True/False to override + is_abba = isinstance(init_weights, str) and init_weights.lower() == "abba" + if use_khatri_rao == "auto": + use_khatri_rao = is_abba # True for ABBA, False for standard LoHa + self.use_khatri_rao[adapter_name] = use_khatri_rao + # Determine shape of LoHa weights base_layer = self.get_base_layer() if isinstance(base_layer, nn.Linear): @@ -165,11 +327,13 @@ def update_layer( else: raise TypeError(f"LoHa is not implemented for base layers of type {type(base_layer).__name__}") - # Create weights with provided shape - self.create_adapter_parameters(adapter_name, r, shape) + # Create weights with provided shape (using r and r2) + self.create_adapter_parameters(adapter_name, r, r2, shape) # Initialize weights - if init_weights: + if isinstance(init_weights, str) and init_weights.lower() == "abba": + self.reset_adapter_parameters_abba(adapter_name) + elif init_weights: self.reset_adapter_parameters(adapter_name) else: self.reset_adapter_parameters_random(adapter_name) @@ -191,13 +355,27 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: scale=torch.tensor(self.scaling[adapter_name]), ) else: - weight = make_weight( - self.hada_w1_a[adapter_name], - self.hada_w1_b[adapter_name], - self.hada_w2_a[adapter_name], - self.hada_w2_b[adapter_name], - scale=torch.tensor(self.scaling[adapter_name]), - ) + # Check if Khatri-Rao optimization is enabled + use_kr = self.use_khatri_rao.get(adapter_name, False) + + if use_kr: + # Use ABBA paper formula with separate scales: α/√r₁ and α/√r₂ + weight = make_weight_kr( + self.hada_w1_a[adapter_name], + self.hada_w1_b[adapter_name], + self.hada_w2_a[adapter_name], + self.hada_w2_b[adapter_name], + scale1=torch.tensor(self.scaling1[adapter_name]), + scale2=torch.tensor(self.scaling2[adapter_name]), + ) + else: + weight = make_weight( + self.hada_w1_a[adapter_name], + self.hada_w1_b[adapter_name], + self.hada_w2_a[adapter_name], + self.hada_w2_b[adapter_name], + scale=torch.tensor(self.scaling[adapter_name]), + ) base_layer = self.get_base_layer() @@ -255,14 +433,26 @@ def __init__( alpha: float = 0.0, rank_dropout: float = 0.0, module_dropout: float = 0.0, - init_weights: bool = True, + init_weights: Union[bool, Literal["abba"]] = True, + use_khatri_rao: Union[bool, Literal["auto"]] = "auto", + r2: int = None, **kwargs, ): super().__init__(base_layer) # Create adapter and set it active self._active_adapter = adapter_name - self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs) + self.update_layer( + adapter_name, + r, + alpha, + rank_dropout, + module_dropout, + init_weights, + use_khatri_rao=use_khatri_rao, + r2=r2, + **kwargs, + ) def 
_get_delta_activations( self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any @@ -289,7 +479,9 @@ def __init__( rank_dropout: float = 0.0, module_dropout: float = 0.0, use_effective_conv2d: bool = False, - init_weights: bool = True, + init_weights: Union[bool, Literal["abba"]] = True, + use_khatri_rao: Union[bool, Literal["auto"]] = "auto", + r2: int = None, **kwargs, ): super().__init__(base_layer) @@ -297,7 +489,16 @@ def __init__( # Create adapter and set it active self._active_adapter = adapter_name self.update_layer( - adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + adapter_name, + r, + alpha, + rank_dropout, + module_dropout, + init_weights, + use_effective_conv2d, + use_khatri_rao=use_khatri_rao, + r2=r2, + **kwargs, ) def _get_delta_activations( @@ -333,7 +534,9 @@ def __init__( rank_dropout: float = 0.0, module_dropout: float = 0.0, use_effective_conv2d: bool = False, - init_weights: bool = True, + init_weights: Union[bool, Literal["abba"]] = True, + use_khatri_rao: Union[bool, Literal["auto"]] = "auto", + r2: int = None, **kwargs, ): super().__init__(base_layer) @@ -341,7 +544,16 @@ def __init__( # Create adapter and set it active self._active_adapter = adapter_name self.update_layer( - adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + adapter_name, + r, + alpha, + rank_dropout, + module_dropout, + init_weights, + use_effective_conv2d, + use_khatri_rao=use_khatri_rao, + r2=r2, + **kwargs, ) def _get_delta_activations( @@ -365,6 +577,113 @@ def __repr__(self) -> str: return "loha." + rep +# For abba +class HadaWeightKR(torch.autograd.Function): + """ + Khatri-Rao optimized version of HadaWeight that avoids materializing the full B1A1 and B2A2 matrices, significantly + reducing memory overhead. + + Key Innovation: Instead of computing (w1a @ w1b) * (w2a @ w2b) which requires storing two m×n matrices, we compute + the result row-by-row (or in chunks), never storing the full intermediate matrices in memory. + + ABBA paper formula: ΔW = (α/√r₁ · B₁A₁) ⊙ (α/√r₂ · B₂A₂) where scale1 = α/√r₁ and scale2 = α/√r₂ + + Mathematical equivalence: result[i,j] = scale1 * (sum_k w1a[i,k]*w1b[k,j]) * scale2 * (sum_k w2a[i,k]*w2b[k,j]) + + This can be computed without materializing full matrices by processing one row at a time or using einsum with no + intermediate storage. 
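+    A non-chunked reference for testing (it materializes the full m x n result, and possibly intermediates depending
+    on the contraction path): scale1 * scale2 * torch.einsum("ik,kj,il,lj->ij", w1a, w1b, w2a, w2b).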
+ + Memory savings: O(m*n) -> O(n) for forward pass (processing row by row) + """ + + @staticmethod + def forward(ctx, w1a, w1b, w2a, w2b, scale1, scale2): + ctx.save_for_backward(w1a, w1b, w2a, w2b, scale1, scale2) + + # Handle different ranks: w1a/w1b may have rank r1, w2a/w2b may have rank r2 + # w1a: (m, r1), w1b: (r1, n) + # w2a: (m, r2), w2b: (r2, n) + + m = w1a.shape[0] + n = w1b.shape[1] + + # Allocate output + diff_weight = torch.empty(m, n, dtype=w1a.dtype, device=w1a.device) + + # Process in chunks to save memory (chunk_size can be tuned) + # Smaller chunk_size = less memory, but more overhead + chunk_size = min(128, m) # Process 128 rows at a time + + for i in range(0, m, chunk_size): + end_i = min(i + chunk_size, m) + # Compute chunk of term1: scale1 * (w1a[i:end_i] @ w1b) -> (chunk_size, n) + term1_chunk = scale1 * (w1a[i:end_i] @ w1b) # Only materialize chunk_size × n + # Compute chunk of term2: scale2 * (w2a[i:end_i] @ w2b) -> (chunk_size, n) + term2_chunk = scale2 * (w2a[i:end_i] @ w2b) # Only materialize chunk_size × n + # Element-wise multiply and store + # Result: (α/√r₁ · B₁A₁) ⊙ (α/√r₂ · B₂A₂) + diff_weight[i:end_i] = term1_chunk * term2_chunk + # These chunks are automatically freed after use + + return diff_weight + + @staticmethod + def backward(ctx, grad_out): + (w1a, w1b, w2a, w2b, scale1, scale2) = ctx.saved_tensors + + # Handle different ranks: w1a/w1b may have rank r1, w2a/w2b may have rank r2 + # w1a: (m, r1), w1b: (r1, n) + # w2a: (m, r2), w2b: (r2, n) + m = w1a.shape[0] + n = w1b.shape[1] + + # Initialize gradients + grad_w1a = torch.zeros_like(w1a) + grad_w1b = torch.zeros_like(w1b) + grad_w2a = torch.zeros_like(w2a) + grad_w2b = torch.zeros_like(w2b) + + # Process in chunks to save memory + chunk_size = min(128, m) + + for i in range(0, m, chunk_size): + end_i = min(i + chunk_size, m) + + # Recompute forward pass chunks (trade computation for memory) + # term1_chunk = scale1 * (w1a @ w1b), term2_chunk = scale2 * (w2a @ w2b) + term1_chunk = scale1 * (w1a[i:end_i] @ w1b) # (chunk_size, n) + term2_chunk = scale2 * (w2a[i:end_i] @ w2b) # (chunk_size, n) + + grad_out_chunk = grad_out[i:end_i] # (chunk_size, n) + + # Gradients for w1a and w1b + # d(ΔW)/d(B₁A₁) = grad_out ⊙ scale1 ⊙ (scale2 · B₂A₂) + # Chain rule: d/dw1a = scale1 * (grad_out ⊙ term2_chunk) @ w1b.T + grad_term1_chunk = scale1 * (grad_out_chunk * term2_chunk) # (chunk_size, n) + grad_w1a[i:end_i] = grad_term1_chunk @ w1b.T # (chunk_size, r1) + grad_w1b += w1a[i:end_i].T @ grad_term1_chunk # (r1, n) + + # Gradients for w2a and w2b + # d(ΔW)/d(B₂A₂) = grad_out ⊙ scale2 ⊙ (scale1 · B₁A₁) + # Chain rule: d/dw2a = scale2 * (grad_out ⊙ term1_chunk) @ w2b.T + grad_term2_chunk = scale2 * (grad_out_chunk * term1_chunk) # (chunk_size, n) + grad_w2a[i:end_i] = grad_term2_chunk @ w2b.T # (chunk_size, r2) + grad_w2b += w2a[i:end_i].T @ grad_term2_chunk # (r2, n) + + # Chunks are freed here + + return grad_w1a, grad_w1b, grad_w2a, grad_w2b, None, None + + +def make_weight_kr(w1a, w1b, w2a, w2b, scale1, scale2): + """ + Generate weights using Khatri-Rao optimization with separate scaling. 
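+    Expected shapes (as used from `get_delta_weight`): w1a (m, r1), w1b (r1, n), w2a (m, r2), w2b (r2, n); scale1 and
+    scale2 are 0-dim tensors holding alpha/sqrt(r1) and alpha/sqrt(r2).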
+ + ABBA paper formula: ΔW = (α/√r₁ · B₁A₁) ⊙ (α/√r₂ · B₂A₂) where scale1 = α/√r₁ and scale2 = α/√r₂ + """ + return HadaWeightKR.apply(w1a, w1b, w2a, w2b, scale1, scale2) + + # Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9 diff --git a/src/peft/tuners/loha/model.py b/src/peft/tuners/loha/model.py index c39be6434d..df833ac39f 100644 --- a/src/peft/tuners/loha/model.py +++ b/src/peft/tuners/loha/model.py @@ -108,6 +108,8 @@ def _create_and_replace( kwargs = config.to_dict() kwargs["r"] = config.rank_pattern.get(r_key, config.r) kwargs["alpha"] = config.alpha_pattern.get(alpha_key, config.alpha) + # Pass r2 if specified in config (r is always passed, r2 defaults to r if not specified) + kwargs["r2"] = getattr(config, "r2", None) if isinstance(target, LoHaLayer): target.update_layer(adapter_name, **kwargs) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index ed83db98cb..5e5ae6b183 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -286,6 +286,41 @@ {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": False}, ), ("Conv2d 1x1 LOHA", "Conv2d1x1", LoHaConfig, {"target_modules": ["conv2d"]}), + ######## + # ABBA # + ######## + # ABBA tests are included in main TEST_CASES for basic functionality + # Note: ABBA uses SVD-based initialization, so parameters are non-zero from start + ("Vanilla MLP 1 ABBA", "MLP", LoHaConfig, {"target_modules": "lin0", "init_weights": "abba"}), + ("Vanilla MLP 2 ABBA", "MLP", LoHaConfig, {"target_modules": ["lin0"], "init_weights": "abba"}), + ( + "Vanilla MLP 3 ABBA", + "MLP", + LoHaConfig, + { + "target_modules": ["lin0"], + "alpha": 4, + "module_dropout": 0.1, + "init_weights": "abba", + }, + ), + ("Vanilla MLP 4 ABBA", "MLP", LoHaConfig, {"target_modules": "lin0", "rank_dropout": 0.5, "init_weights": "abba"}), + ( + "Vanilla MLP 5 ABBA with Khatri-Rao", + "MLP", + LoHaConfig, + {"target_modules": ["lin0"], "init_weights": "abba", "use_khatri_rao": True}, + ), + ("Conv2d 1 ABBA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"], "init_weights": "abba"}), + ("Conv1d ABBA 1", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"], "init_weights": "abba"}), + ("Conv1d ABBA 2", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"], "r": 2, "init_weights": "abba"}), + ( + "Conv1d ABBA 3", + "Conv1dBigger", + LoHaConfig, + {"target_modules": ["conv1d"], "r": 2, "init_weights": "abba"}, + ), + ("Conv2d 1x1 ABBA", "Conv2d1x1", LoHaConfig, {"target_modules": ["conv2d"], "init_weights": "abba"}), # LoKr ("Vanilla MLP 1 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0"}), ("Vanilla MLP 2 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0"]}), @@ -2056,6 +2091,13 @@ def test_parameters_after_loading_model(self, test_name, model_id, config_cls, c lr = 1e-3 # we get exploding gradients with MHA when learning rate is too high elif issubclass(config_cls, (VBLoRAConfig, RandLoraConfig, OSFConfig)): lr = 0.01 # otherwise we get nan + elif config_kwargs.get("init_weights") == "abba": + # ABBA starts closer to pretrained, use gentler updates than standard (0.5) + # Conv layers with ABBA need much lower LR due to Hadamard product amplification + if model_id in ["Conv1d", "Conv1dBigger", "Conv2d", "Conv2d1x1"]: + lr = 0.01 # Very low LR to prevent exploding gradients with Hadamard products + else: + lr = 0.1 optimizer = torch.optim.SGD(model.parameters(), lr=lr) # train at least 3 steps for all parameters to be updated (probably 
this is required because of symmetry @@ -2105,8 +2147,12 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs): torch.nn.init.zeros_(model.vblora_vector_bank["default"]) model.eval() outputs_before = model(**X) + # ABBA uses SVD initialization, so outputs won't match base model initially - skip that assertion # OSF uses SVD reconstruction which introduces small numerical differences - if issubclass(config_cls, OSFConfig): + if config_kwargs.get("init_weights") == "abba": + # ABBA uses non-zero initialization, so skip the assertion + pass + elif issubclass(config_cls, OSFConfig): assert torch.allclose(outputs_base, outputs_before, rtol=1e-4, atol=1e-4) else: assert torch.allclose(outputs_base, outputs_before) @@ -2147,8 +2193,12 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs): else: rtol, atol = 1e-5, 1e-8 assert not torch.allclose(outputs_before, outputs_after, rtol=rtol, atol=atol) + # For ABBA: outputs_before != outputs_disabled because ABBA uses non-zero init + # But outputs_disabled should equal base model for both ABBA and others # OSF uses SVD reconstruction which introduces small numerical differences - if issubclass(config_cls, OSFConfig): + if config_kwargs.get("init_weights") == "abba": + assert torch.allclose(outputs_base, outputs_disabled) + elif issubclass(config_cls, OSFConfig): assert torch.allclose(outputs_before, outputs_disabled, rtol=1e-4, atol=1e-4) else: assert torch.allclose(outputs_before, outputs_disabled) @@ -2161,6 +2211,8 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co # same as test_disable_adapters, but with merging X = self.prepare_inputs_for_testing() model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + model.eval() + outputs_base = model(**X) # Save base model outputs for ABBA comparison config = config_cls( base_model_name_or_path=model_id, **config_kwargs, @@ -2183,6 +2235,14 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co else: # Adam optimizer since SGD isn't great for small models with IA3 + Conv1D lr = 0.01 + # ABBA Conv layers need lower learning rate to prevent gradient explosion + if config_kwargs.get("init_weights") == "abba" and model_id in [ + "Conv1d", + "Conv1dBigger", + "Conv2d", + "Conv2d1x1", + ]: + lr = 0.001 # Very low LR for ABBA Conv with Adam optimizer = torch.optim.Adam(model.parameters(), lr=lr) # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry @@ -2218,6 +2278,15 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co if config_kwargs.get("use_dora") and model_id == "EmbConv1D": atol, rtol = 1e-4, 1e-4 + # ABBA Conv layers have slightly more numerical instability during merge/unmerge + if config_kwargs.get("init_weights") == "abba" and model_id in [ + "Conv1d", + "Conv1dBigger", + "Conv2d", + "Conv2d1x1", + ]: + atol, rtol = 1e-4, 1e-4 + # check that there is a difference in results after training assert not torch.allclose(outputs_before, outputs_after, atol=atol, rtol=rtol) @@ -2228,7 +2297,12 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co assert torch.allclose(outputs_after, outputs_unmerged, atol=atol, rtol=rtol) # check that disabling adapters gives the same results as before training - assert torch.allclose(outputs_before, outputs_disabled, atol=atol, rtol=rtol) + # For ABBA: outputs_before != outputs_disabled because ABBA uses non-zero init + # But 
outputs_disabled should equal base model for both ABBA and others + if config_kwargs.get("init_weights") == "abba": + assert torch.allclose(outputs_base, outputs_disabled, atol=atol, rtol=rtol) + else: + assert torch.allclose(outputs_before, outputs_disabled, atol=atol, rtol=rtol) # check that enabling + disabling adapters does not change the results assert torch.allclose(outputs_after, outputs_enabled_after_disable, atol=atol, rtol=rtol) diff --git a/tests/test_stablediffusion.py b/tests/test_stablediffusion.py index 8eb18dc9a6..ed38ed2832 100644 --- a/tests/test_stablediffusion.py +++ b/tests/test_stablediffusion.py @@ -17,6 +17,7 @@ import numpy as np import pytest +import torch from diffusers import StableDiffusionPipeline from peft import ( @@ -99,6 +100,38 @@ }, }, ), + ( + LoHaConfig, + { + "text_encoder": { + "r": 8, + "alpha": 32, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "rank_dropout": 0.0, + "module_dropout": 0.0, + "init_weights": "abba", + "use_khatri_rao": True, + }, + "unet": { + "r": 8, + "alpha": 32, + "target_modules": [ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", + ], + "rank_dropout": 0.0, + "module_dropout": 0.0, + "init_weights": "abba", + "use_khatri_rao": True, + }, + }, + ), ( LoKrConfig, { @@ -227,6 +260,17 @@ class TestStableDiffusionModel(PeftCommonTester): transformers_class = StableDiffusionPipeline sd_model = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe") + @pytest.fixture(autouse=True) + def autofixture(self): + # Disable TF32 in cudnn + prev_value = torch.backends.cudnn.allow_tf32 + try: + torch.backends.cudnn.allow_tf32 = False + yield + finally: + torch.backends.cudnn.allow_tf32 = prev_value + torch.cuda.empty_cache() + def instantiate_sd_peft(self, model_id, config_cls, config_kwargs): # Instantiate StableDiffusionPipeline if model_id == "hf-internal-testing/tiny-sd-pipe": @@ -268,6 +312,9 @@ def test_merge_layers(self, model_id, config_cls, config_kwargs): if (config_cls == LoKrConfig) and (self.torch_device not in ["cuda", "xpu"]): pytest.skip("Merging test with LoKr fails without GPU") + # Store original config_kwargs before modification (deep copy) + original_config_kwargs = copy.deepcopy(config_kwargs) + # Instantiate model & adapters config_kwargs = set_init_weights_false(config_cls, config_kwargs) model = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) @@ -293,6 +340,10 @@ def test_merge_layers(self, model_id, config_cls, config_kwargs): @pytest.mark.parametrize("model_id", PEFT_DIFFUSERS_SD_MODELS_TO_TEST) @pytest.mark.parametrize("config_cls,config_kwargs", DIFFUSERS_CONFIGS) def test_merge_layers_safe_merge(self, model_id, config_cls, config_kwargs): + init_weights = ( + config_kwargs.get("text_encoder", {})["init_weights"] if config_cls == LoHaConfig else "not loha" + ) + if (config_cls == LoKrConfig) and (self.torch_device not in ["cuda", "xpu"]): pytest.skip("Merging test with LoKr fails without GPU") diff --git a/tests/test_vision_models.py b/tests/test_vision_models.py index 74e5d654de..307404e569 100644 --- a/tests/test_vision_models.py +++ b/tests/test_vision_models.py @@ -45,6 +45,12 @@ CONFIGS = { "lora": LoraConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]), "loha": LoHaConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]), + "abba": LoHaConfig( + target_modules=["convolution"], + modules_to_save=["classifier", 
"normalization"], + init_weights="abba", + r=2, + ), "lokr": LoKrConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]), "oft": OFTConfig( r=1, oft_block_size=0, target_modules=["convolution"], modules_to_save=["classifier", "normalization"]