Commit 19d513d

feat: lora manager
1 parent 91bf332 commit 19d513d

File tree

6 files changed: +1317 -101 lines changed


python/sgl_jax/srt/lora/__init__.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

python/sgl_jax/srt/lora/layers.py

Lines changed: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
# Copyright 2023-2024 SGLang Team
# Modifications copyright 2025 SGLang-JAX Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LoRA layer wrappers using Flax Model Surgery."""

from __future__ import annotations

import jax
import jax.numpy as jnp
from flax import nnx


class LoRALinear(nnx.Module):
    """
    LoRA wrapper for Linear layers using Flax NNX.

    This wraps an existing Linear layer and adds LoRA (Low-Rank Adaptation)
    computation. Uses Model Surgery to preserve the original weights and sharding.

    The forward pass computes:
        output = base_layer(x) + scaling * (x @ lora_A @ lora_B)

    where the LoRA term is only added when `enabled=True`.

    Attributes:
        base_layer: Original Linear layer (preserves weights and sharding)
        lora_A: LoRA A matrix (in_features, lora_rank)
        lora_B: LoRA B matrix (lora_rank, out_features)
        scaling: LoRA scaling factor (typically alpha / rank)
        enabled: Whether LoRA computation is active
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        lora_rank: int,
        base_layer: nnx.Linear | None = None,
        rngs: nnx.Rngs | None = None,
    ):
        """
        Initialize LoRA Linear layer.

        Args:
            in_features: Input dimension
            out_features: Output dimension
            lora_rank: Rank of LoRA matrices
            base_layer: Existing Linear layer to wrap (optional)
            rngs: Random number generators for initialization
        """
        self.in_features = in_features
        self.out_features = out_features
        self.lora_rank = lora_rank

        # Base layer - will be populated via nnx.update() during surgery
        if base_layer is not None:
            self.base_layer = base_layer
        else:
            # Create placeholder base layer
            if rngs is None:
                rngs = nnx.Rngs(0)
            self.base_layer = nnx.Linear(
                in_features,
                out_features,
                use_bias=True,
                rngs=rngs,
            )

        # LoRA parameters (initialized to small random values)
        if rngs is None:
            rngs = nnx.Rngs(0)

        # Initialize lora_A with normal distribution scaled by 1/sqrt(rank)
        self.lora_A = nnx.Param(
            jax.random.normal(rngs(), (in_features, lora_rank)) / jnp.sqrt(lora_rank)
        )

        # Initialize lora_B to zeros (standard LoRA initialization)
        self.lora_B = nnx.Param(jnp.zeros((lora_rank, out_features)))

        # Control variables (not trainable)
        self.scaling = nnx.Variable(1.0)  # Will be set to alpha / rank
        self.enabled = nnx.Variable(False)  # Whether LoRA is active

    def __call__(self, x: jax.Array) -> jax.Array:
        """
        Forward pass with optional LoRA computation.

        Args:
            x: Input tensor

        Returns:
            Output tensor with LoRA delta added (if enabled)
        """
        # Base layer computation (preserves original behavior)
        output = self.base_layer(x)

        # Add LoRA delta if enabled
        if self.enabled.value:
            # Compute: x @ lora_A @ lora_B
            lora_delta = (x @ self.lora_A.value) @ self.lora_B.value
            output = output + self.scaling.value * lora_delta

        return output


class LoRAEmbedding(nnx.Module):
    """
    LoRA wrapper for Embedding layers.

    Similar to LoRALinear but for embedding layers.
    Currently a placeholder for future implementation.
    """

    def __init__(
        self,
        num_embeddings: int,
        features: int,
        lora_rank: int,
        base_layer: nnx.Embed | None = None,
        rngs: nnx.Rngs | None = None,
    ):
        """
        Initialize LoRA Embedding layer.

        Args:
            num_embeddings: Size of vocabulary
            features: Embedding dimension
            lora_rank: Rank of LoRA matrices
            base_layer: Existing Embed layer to wrap (optional)
            rngs: Random number generators
        """
        self.num_embeddings = num_embeddings
        self.features = features
        self.lora_rank = lora_rank

        # Base layer
        if base_layer is not None:
            self.base_layer = base_layer
        else:
            if rngs is None:
                rngs = nnx.Rngs(0)
            self.base_layer = nnx.Embed(
                num_embeddings,
                features,
                rngs=rngs,
            )

        # LoRA parameters for embeddings
        if rngs is None:
            rngs = nnx.Rngs(0)

        self.lora_A = nnx.Param(jax.random.normal(rngs(), (num_embeddings, lora_rank)))
        self.lora_B = nnx.Param(jnp.zeros((lora_rank, features)))

        self.scaling = nnx.Variable(1.0)
        self.enabled = nnx.Variable(False)

    def __call__(self, x: jax.Array) -> jax.Array:
        """
        Forward pass for embedding with LoRA.

        Args:
            x: Input token indices

        Returns:
            Embedded output with LoRA delta
        """
        output = self.base_layer(x)

        if self.enabled.value:
            # Embedding LoRA: lookup lora_A then multiply by lora_B
            lora_a_embed = self.lora_A.value[x]  # Shape: [..., lora_rank]
            lora_delta = lora_a_embed @ self.lora_B.value  # Shape: [..., features]
            output = output + self.scaling.value * lora_delta

        return output
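
Below is a minimal usage sketch, not part of this commit, showing how the new LoRALinear wrapper might be exercised on its own; the layer sizes, alpha value, and input are illustrative assumptions rather than values taken from the diff.

import jax.numpy as jnp
from flax import nnx

from sgl_jax.srt.lora.layers import LoRALinear

rngs = nnx.Rngs(0)
base = nnx.Linear(16, 32, rngs=rngs)  # stands in for a pretrained layer
lora = LoRALinear(16, 32, lora_rank=8, base_layer=base, rngs=rngs)

alpha = 16.0  # hypothetical LoRA alpha
lora.scaling.value = alpha / lora.lora_rank  # scaling = alpha / rank, as the docstring notes
lora.enabled.value = True  # with enabled=False the wrapper is a no-op around base_layer

x = jnp.ones((4, 16))
y = lora(x)  # base_layer(x) + scaling * (x @ lora_A @ lora_B)
assert y.shape == (4, 32)

The same pattern would apply to LoRAEmbedding, except that the input is a batch of token indices and the delta comes from indexing lora_A before multiplying by lora_B.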

python/sgl_jax/srt/lora/lora.py

Lines changed: 0 additions & 81 deletions
@@ -22,7 +22,6 @@
 import re
 
 import jax
-import jax.numpy as jnp
 from flax import nnx
 
 from sgl_jax.srt.configs.load_config import LoadConfig
@@ -93,83 +92,3 @@ def initialize_weights(self):
                 self.layers[layer_id].weights[name] = loaded_weight
             else:
                 self.weights[name] = loaded_weight
-
-        # normalize kv_proj and gate_up_proj
-        for layer in self.layers:
-            weight_names = list(layer.weights.keys())
-            self.normalize_qkv_proj(weight_names, layer.weights)
-            self.normalize_gate_up_proj(weight_names, layer.weights)
-
-    def normalize_qkv_proj(self, weight_names: list[str], weights: dict[str, jax.Array]):
-        # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
-        target_module = set()
-        for weight_name in weight_names:
-            if "k_proj" in weight_name:
-                target_module.add("k_proj")
-            if "q_proj" in weight_name:
-                target_module.add("q_proj")
-            if "v_proj" in weight_name:
-                target_module.add("v_proj")
-            if "qkv_proj" in weight_name:
-                target_module.add("qkv_proj")
-        if len(target_module) == 0:
-            return
-
-        for weight_name in weight_names:
-            # We assume every lora adaptor should contain lora modules for q_proj
-            if "q_proj" in weight_name:
-                q_name = weight_name
-                k_name = weight_name.replace("q_proj", "k_proj")
-                v_name = weight_name.replace("q_proj", "v_proj")
-                qkv_name = weight_name.replace("q_proj", "qkv_proj")
-
-                # If k_proj doesn't have lora, initialize it to zero
-                k_proj_weight = (
-                    weights[k_name]
-                    if "k_proj" in target_module
-                    else jnp.zeros_like(weights[v_name])
-                )
-                weights[qkv_name] = jnp.concatenate(
-                    (
-                        weights[q_name],
-                        k_proj_weight,
-                        weights[v_name],
-                    ),
-                    0,
-                )
-                weights.pop(q_name)
-                if "k_proj" in target_module:
-                    weights.pop(k_name)
-                weights.pop(v_name)
-            elif "qkv_proj" in weight_name:
-                # If qkv_proj is already stacked, we normalize it following the SGL convention.
-                qkv_name = weight_name
-                q_name = weight_name.replace("qkv_proj", "q_proj")
-                k_name = weight_name.replace("qkv_proj", "k_proj")
-                v_name = weight_name.replace("qkv_proj", "v_proj")
-                if "lora_A" in weight_name:
-                    weights[qkv_name] = weights[qkv_name].repeat(3, 1)
-                # else: no-op as LoRA B weight is already stacked.
-
-    def normalize_gate_up_proj(self, weight_names: list[str], weights: dict[str, jax.Array]):
-        for weight_name in weight_names:
-            if "gate_proj" in weight_name:
-                up_name = weight_name.replace("gate_proj", "up_proj")
-                gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
-                if up_name not in weights:
-                    weights[up_name] = jax.zeros_like(weights[weight_name])
-                    assert isinstance(self.lora_backend, SUPPORTED_BACKENDS), (
-                        f"LoRA weight initialization currently only supported for LoRA backends: {', '.join(b.name for b in SUPPORTED_BACKENDS)}"
-                        f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
-                        f"or consider implementing custom initialization logic for other backends."
-                    )
-                weights[gate_up_name] = jnp.concatenate((weights[weight_name], weights[up_name]), 0)
-                weights.pop(weight_name)
-                if up_name in weights:
-                    weights.pop(up_name)
-            elif "gate_up_proj" in weight_name:
-                # If gate_up_proj is already stacked, we normalize it following the SGL convention
-                gate_up_name = weight_name
-                if "lora_A" in weight_name:
-                    weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
-                # else: no-op as LoRA B weight is already stacked.
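
For context, the following standalone sketch, not part of the diff, illustrates the stacking behavior that the removed normalize_qkv_proj implemented: per-projection LoRA B weights are concatenated along axis 0 into a single qkv_proj entry, with a zero-filled block standing in for k_proj when the adapter carries no k_proj module. The weight names and shapes here are assumptions for illustration only.

import jax.numpy as jnp

rank, head_dim = 8, 64
# Hypothetical adapter with q_proj and v_proj LoRA B weights but no k_proj.
weights = {
    "layers.0.self_attn.q_proj.lora_B.weight": jnp.ones((head_dim, rank)),
    "layers.0.self_attn.v_proj.lora_B.weight": jnp.ones((head_dim, rank)),
}

q = weights.pop("layers.0.self_attn.q_proj.lora_B.weight")
v = weights.pop("layers.0.self_attn.v_proj.lora_B.weight")
k = jnp.zeros_like(v)  # missing k_proj is padded with zeros, mirroring the removed code

weights["layers.0.self_attn.qkv_proj.lora_B.weight"] = jnp.concatenate((q, k, v), 0)
assert weights["layers.0.self_attn.qkv_proj.lora_B.weight"].shape == (3 * head_dim, rank)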

0 commit comments
