from typing import Optional

import jax
import jax.numpy as jnp
import numpy as np

from sgl_jax.srt.lora.backend.base_backend import BaseLoRABackend
from sgl_jax.srt.lora.utils import LoRABatchInfo
from sgl_jax.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode

MIN_CHUNK_SIZE = 16


class BgmvLoRABackend(BaseLoRABackend):
    """
    Bgmv LoRA backend using batched grouped matrix-vector multiplication.
    """

    name = "bgmv"

    def __init__(
        self,
        max_loras_per_batch: int,
        max_lora_rank: int,
    ):
        super().__init__(max_loras_per_batch)
        self.max_lora_rank = max_lora_rank
    def run_lora_a_gemm(
        self,
        x: jax.Array,  # (s, input_dim)
        weights: jax.Array,  # (num_lora, r, input_dim)
        *args,
        **kwargs,
    ) -> jax.Array:
        # Project each token into its adapter's low-rank space with a single
        # bgmv_shrink call; per-token scaling is applied inside bgmv_shrink.
        return bgmv_shrink(
            x,
            weights,
            self.batch_info.token_lora_indices,
            self.batch_info.scalings,
        )

    def run_lora_b_gemm(
        self,
        x: jax.Array,  # (s, r)
        weights: jax.Array,  # (num_lora, output_dim, r)
        base_output: jax.Array | None = None,
        *args,
        **kwargs,
    ) -> jax.Array:
        s = x.shape[0]
        output_dim = weights.shape[1]

        return bgmv_expand_slice(
            x,
            weights,
            base_output,
            self.batch_info.token_lora_indices,
            0,
            output_dim,
            (s, output_dim),
        )

    def run_qkv_lora(
        self,
        x: jax.Array,  # (s, input_dim)
        qkv_lora_a: jax.Array,  # (num_lora, 3 * r, input_dim)
        qkv_lora_b: jax.Array | tuple[jax.Array],  # (num_lora, output_dim_q + 2 * output_dim_kv, r) or ((1, num_lora, output_dim_q, r), (2, num_lora, output_dim_kv, r))
        output_slices: tuple,  # (output_dim_q, output_dim_kv, output_dim_kv)
        base_output: jax.Array | None = None,
        *args,
        **kwargs,
    ) -> jax.Array:
        """Run the LoRA pass for the QKV layer.

        Args:
            x: input matrix with shape (s, input_dim), where s is the sum of all sequence lengths
            qkv_lora_a: lora_a module for qkv, with shape (num_lora, 3 * r, input_dim)
            qkv_lora_b: lora_b module for qkv.
                If passed in as a tensor, its shape should be (num_lora, output_dim_q + 2 * output_dim_kv, r)
                If passed in as a tuple of two tensors, it should contain:
                a lora_b module for q, with shape (1, num_lora, output_dim_q, r)
                and a combined lora_b module for kv, with shape (2, num_lora, output_dim_kv, r)
            output_slices: a fixed tuple of three items, (output_dim_q, output_dim_kv, output_dim_kv)
        Returns:
            result with shape (s, output_dim_q + 2 * output_dim_kv)
        """
        # Sketch implementation; assumes prepare_lora_batch has populated self.batch_info.
        if isinstance(qkv_lora_b, tuple):
            # Stack ((1, N, dq, r), (2, N, dkv, r)) into (N, dq + 2 * dkv, r).
            q_b, kv_b = qkv_lora_b
            qkv_lora_b = jnp.concatenate([q_b[0], kv_b[0], kv_b[1]], axis=1)
        s = x.shape[0]
        r = qkv_lora_a.shape[1] // 3
        total_output_dim = sum(output_slices)
        # Project into the stacked low-rank space: (s, 3 * r), then expand each slice.
        lora_a_output = bgmv_shrink(
            x, qkv_lora_a, self.batch_info.token_lora_indices, self.batch_info.scalings
        )
        output = base_output
        slice_offset = 0
        for i, slice_size in enumerate(output_slices):
            output = bgmv_expand_slice(
                lora_a_output[:, i * r : (i + 1) * r],
                qkv_lora_b[:, slice_offset : slice_offset + slice_size, :],
                output,
                self.batch_info.token_lora_indices,
                slice_offset,
                slice_size,
                (s, total_output_dim),
            )
            slice_offset += slice_size
        return output

    def run_gate_up_lora(
        self,
        x: jax.Array,
        gate_up_lora_a: jax.Array,
        gate_up_lora_b: jax.Array | tuple[jax.Array],
        base_output: jax.Array | None = None,
        *args,
        **kwargs,
    ) -> jax.Array:
        """Run the LoRA pass for gate_up_proj.

        Args:
            x: input matrix with shape (s, input_dim), where s is the sum of all sequence lengths
            gate_up_lora_a: lora_a module for gate_up_proj, with shape (num_lora, 2 * r, input_dim)
            gate_up_lora_b: lora_b module for gate_up_proj.
                If passed in as a tensor, its shape should be (num_lora, 2 * output_dim, r)
                If passed in as a tuple, it should contain two tensors with shape (num_lora, output_dim, r)
        Returns:
            result with shape (s, 2 * output_dim)
        """
        # Sketch implementation; assumes prepare_lora_batch has populated self.batch_info.
        if isinstance(gate_up_lora_b, tuple):
            # Stack two (num_lora, output_dim, r) tensors into (num_lora, 2 * output_dim, r).
            gate_up_lora_b = jnp.concatenate(gate_up_lora_b, axis=1)
        s = x.shape[0]
        r = gate_up_lora_a.shape[1] // 2
        output_dim = gate_up_lora_b.shape[1] // 2
        # Project into the stacked low-rank space: (s, 2 * r), then expand each half.
        lora_a_output = bgmv_shrink(
            x, gate_up_lora_a, self.batch_info.token_lora_indices, self.batch_info.scalings
        )
        output = base_output
        for i in range(2):
            output = bgmv_expand_slice(
                lora_a_output[:, i * r : (i + 1) * r],
                gate_up_lora_b[:, i * output_dim : (i + 1) * output_dim, :],
                output,
                self.batch_info.token_lora_indices,
                i * output_dim,
                output_dim,
                (s, 2 * output_dim),
            )
        return output

    def prepare_lora_batch(
        self,
        forward_batch: ForwardBatch,
        weight_indices: list[int],  # (bs,), please pad with -1
        lora_ranks: list[int],  # (max_loras_per_batch,)
        scalings: list[float],  # (max_loras_per_batch,)
        batch_info: Optional[LoRABatchInfo] = None,
    ):
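        # Illustrative example (hypothetical values): for an EXTEND batch with
        # weight_indices = [0, 1], seq_lens = [3, 2], and input_ids padded to 8
        # tokens, the per-token metadata becomes
        #   token_lora_indices = [0, 0, 0, 1, 1, -1, -1, -1]
        # with scalings and lora_ranks expanded and padded in the same way.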
        lora_ranks_bs = []
        scalings_bs = []
        for index in weight_indices:
            if index != -1:
                lora_ranks_bs.append(lora_ranks[index])
                scalings_bs.append(scalings[index])
            else:
                lora_ranks_bs.append(0)
                scalings_bs.append(0.0)

        assert len(forward_batch.seq_lens) == len(weight_indices)
        assert len(forward_batch.seq_lens) == len(lora_ranks_bs)
        assert len(forward_batch.seq_lens) == len(scalings_bs)

        target_len = forward_batch.input_ids.shape[0]

        if forward_batch.forward_mode == ForwardMode.EXTEND:
            # Expand per-request metadata to per-token metadata.
            scalings_cpu = np.repeat(np.array(scalings_bs, dtype=np.float32), forward_batch.seq_lens)
            token_lora_indices_cpu = np.repeat(np.array(weight_indices, dtype=np.int32), forward_batch.seq_lens)
            lora_ranks_cpu = np.repeat(np.array(lora_ranks_bs, dtype=np.int32), forward_batch.seq_lens)

            num_to_pad = target_len - jnp.sum(forward_batch.seq_lens)

            if num_to_pad > 0:
                padded_scalings_cpu = np.pad(scalings_cpu, [0, num_to_pad], mode="constant", constant_values=0.0)
                padded_token_lora_indices_cpu = np.pad(token_lora_indices_cpu, [0, num_to_pad], mode="constant", constant_values=-1)
                padded_lora_ranks_cpu = np.pad(lora_ranks_cpu, [0, num_to_pad], mode="constant", constant_values=0)
            else:
                padded_scalings_cpu = scalings_cpu
                padded_token_lora_indices_cpu = token_lora_indices_cpu
                padded_lora_ranks_cpu = lora_ranks_cpu
        elif forward_batch.forward_mode == ForwardMode.DECODE:
            # One token per request in decode mode, so no expansion is needed.
            padded_scalings_cpu = np.array(scalings_bs, dtype=np.float32)
            padded_token_lora_indices_cpu = np.array(weight_indices, dtype=np.int32)
            padded_lora_ranks_cpu = np.array(lora_ranks_bs, dtype=np.int32)

        batch_info = LoRABatchInfo(
            bs=forward_batch.batch_size,
            scalings=jnp.array(padded_scalings_cpu, dtype=jnp.float32),
            token_lora_indices=jnp.array(padded_token_lora_indices_cpu, dtype=jnp.int32),
            lora_ranks=jnp.array(padded_lora_ranks_cpu, dtype=jnp.int32),
        )

        self.batch_info = batch_info


def bgmv_shrink(
    inputs,  # (s, input_dim)
    lora_weights,  # (num_lora, c * r, input_dim)
    lora_indices,  # (num_tokens,)
    scaling: float | jax.Array = 1.0,  # scalar or per-token array of shape (s,)
):
    """
    Shrink operation: maps the input to the low-rank space.

    Args:
        inputs: (s, input_dim)
        lora_weights: (num_lora, c * r, input_dim), c is a multiplier for stacked modules (e.g., c=3 for qkv_proj, c=2 for gate_up_proj)
        lora_indices: (num_tokens,)
        scaling: LoRA scaling factor, either a scalar or a per-token array of shape (s,)
    Returns:
        (s, c * r)
    """
    outputs = bgmv_jax(inputs, lora_weights, lora_indices)
    scaling = jnp.asarray(scaling, dtype=outputs.dtype)
    if scaling.ndim == 1:
        # Per-token scaling: broadcast over the rank dimension.
        scaling = scaling[:, None]
    return scaling * outputs


def bgmv_expand_slice(
    inputs,  # (num_tokens, lora_rank)
    lora_weights,  # (num_loras, out_features, lora_rank) or (num_loras, 1, out_features, lora_rank)
    base_output,  # (num_tokens, total_out_features) or None
    lora_indices,  # (num_tokens,)
    slice_offset: int,
    slice_size: int,
    output_shape: tuple,
):
    """
    Expand operation: maps from the low-rank space back to the output space.

    Args:
        inputs: (num_tokens, lora_rank)
        lora_weights: (num_loras, out_features, lora_rank), optionally with an extra singleton axis
        base_output: (num_tokens, total_out_features) or None
        lora_indices: (num_tokens,)
        slice_offset: column offset of this slice inside the full output
        slice_size: width of this slice (must equal out_features)
        output_shape: (num_tokens, total_out_features)
    Returns:
        (num_tokens, total_out_features)
    """
    if len(lora_weights.shape) == 4:
        lora_weights = jnp.squeeze(lora_weights, axis=1)

    outputs = bgmv_jax(inputs, lora_weights, lora_indices)

    # Zero-pad the slice into the full output width.
    pad_left = slice_offset
    pad_right = output_shape[-1] - (slice_offset + slice_size)
    outputs = jnp.pad(outputs, ((0, 0), (pad_left, pad_right)), mode="constant", constant_values=0)

    if base_output is not None:
        return base_output + outputs
    else:
        return outputs
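
# Example (illustrative): with output_shape = (s, output_dim_q + 2 * output_dim_kv),
# calling bgmv_expand_slice three times with slice_offset = 0, output_dim_q and
# output_dim_q + output_dim_kv accumulates the q, k and v LoRA deltas into
# disjoint column ranges of the same output tensor.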

def bgmv_jax(
    inputs,  # (s, input_dim)
    loras,  # (num_lora, c * r, input_dim)
    idxs,  # (num_tokens,)
):
    """
    Batched grouped matrix-vector multiplication.
    For each token, select the corresponding LoRA and apply matrix multiplication.
    """
    # one_hot(idxs) gathers each token's LoRA weight; the einsum then computes a
    # per-token matrix-vector product: out[t] = loras[idxs[t]] @ inputs[t].
    return jnp.einsum(
        "td,tX,Xld->tl",
        inputs,
        jax.nn.one_hot(idxs, loras.shape[0], dtype=inputs.dtype),
        loras,
    )
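

# Minimal usage sketch (illustrative only; the shapes, values and __main__ guard
# below are assumptions for demonstration, not part of the backend API). It shows
# how bgmv_shrink and bgmv_expand_slice compose into a per-token LoRA update,
# i.e. loras_b[idx] @ (loras_a[idx] @ x[t]) for each token t.
if __name__ == "__main__":
    s, input_dim, r, output_dim, num_lora = 4, 8, 2, 6, 3
    k1, k2, k3 = jax.random.split(jax.random.PRNGKey(0), 3)
    x = jax.random.normal(k1, (s, input_dim))
    lora_a = jax.random.normal(k2, (num_lora, r, input_dim))
    lora_b = jax.random.normal(k3, (num_lora, output_dim, r))
    token_lora_indices = jnp.array([0, 0, 2, 1], dtype=jnp.int32)

    hidden = bgmv_shrink(x, lora_a, token_lora_indices)  # (s, r)
    delta = bgmv_expand_slice(
        hidden, lora_b, None, token_lora_indices, 0, output_dim, (s, output_dim)
    )  # (s, output_dim)
    print(delta.shape)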