Optimize multinomial sampling (InternLM#4056)

grimoire · web-flow · commit adac4b185789 · 2025-10-23T16:22:52.000+08:00
* optimize multinomial sampling kernel

* remove

* add comments

* optimize

* remove sync

* recovery

* remove print

* fix

* optimize output pipeline
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
@@ -831,6 +831,10 @@ def _make_infer_outputs(
         logits = batched_outputs.logits
         logprobs = batched_outputs.logprobs
 
+        if logprobs is not None:
+            logprobs.vals = logprobs.vals.tolist()
+            logprobs.indices = logprobs.indices.tolist()
+
         seq_length = [seq.num_token_ids for seq in running]
         is_run = [seq.status == MessageStatus.LOCKED for seq in running]
         self.seq_strategy.update_running(running=running, batched_outputs=batched_outputs, is_decoding=is_decoding)
@@ -858,7 +862,7 @@ def _make_infer_outputs(
             num_logprobs = msg.sampling_param.num_logprobs
             cur_logprobs = None
             if num_logprobs >= 0:
-                cur_logprobs = (logprobs.vals[idx, :num_logprobs + 1], logprobs.indices[idx, :num_logprobs + 1])
+                cur_logprobs = (logprobs.vals[idx][:num_logprobs + 1], logprobs.indices[idx][:num_logprobs + 1])
 
             req_metrics = RequestMetrics(new_token_timestamp, msg.engine_events)
             out = InferOutput(session_id=session_id,
@@ -953,15 +957,7 @@ def __log_resps(outputs: List[InferOutput]):
         def __send_resp(out: InferOutput):
             """Send response."""
             resp_type = (ResponseType.FINISH if out.finish else ResponseType.SUCCESS)
-            cur_logprobs = out.logprobs
-            logprobs = None
-            if cur_logprobs is not None:
-                # logprobs to dict
-                vals = cur_logprobs[0].tolist()
-                indices = cur_logprobs[1].tolist()
-                cur_logprobs = dict(zip(indices, vals))
-                logprobs = [] if out.resp.data is None else out.resp.data.get('logprobs', [])
-                logprobs = logprobs + [cur_logprobs]
+            logprobs = None if out.resp.data is None else out.resp.data.get('logprobs', None)
             self._response(out.resp,
                            resp_type,
                            data=dict(token_ids=out.token_ids,
@@ -970,10 +966,33 @@ def __send_resp(out: InferOutput):
                                      req_metrics=out.req_metrics,
                                      logprobs=logprobs))
 
+        def __update_logprobs(step_outputs: List[InferOutput]):
+            for out in step_outputs:
+                cur_logprobs = out.logprobs
+                if cur_logprobs is None:
+                    continue
+
+                if out.resp.data is None:
+                    out.resp.data = dict()
+                out.resp.data.setdefault('logprobs', [])
+
+                # build an {index: value} dict from this step's logprobs
+                vals = cur_logprobs[0]
+                indices = cur_logprobs[1]
+                cur_logprobs = dict(zip(indices, vals))
+                logprobs = out.resp.data['logprobs']
+                logprobs.append(cur_logprobs)
+
         def __send_resps(step_outputs: List[InferOutput]):
             """Send response callback."""
             __log_resps(step_outputs)
-            for out in step_outputs:
+            __update_logprobs(step_outputs)
+
+            is_done = set()
+            for out in reversed(step_outputs):
+                if out.session_id in is_done:
+                    continue
+                is_done.add(out.session_id)
                 __send_resp(out)
 
         while True:
diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py
@@ -236,7 +236,7 @@ def __random_sampling(scores: torch.Tensor, indices: torch.LongTensor):
             if max_topk <= 0:
                 max_topk = scores.size(1)
                 if top_k is not None:
-                    top_k = torch.where(top_k <= 0, top_k.new_tensor(max_topk), top_k)
+                    top_k = torch.masked_fill(top_k, top_k <= 0, max_topk)
 
             if top_k is not None:
                 scores = _filter_topk_sorted_(scores, top_k)
diff --git a/lmdeploy/pytorch/kernels/cuda/multinomial_sampling.py b/lmdeploy/pytorch/kernels/cuda/multinomial_sampling.py
@@ -6,47 +6,60 @@
 
 @triton.jit
 def _multinomial_sampling_kernel(Scores, Seeds, Offsets, Indices, Outputs, stride_sb, stride_st, stride_ib, stride_it,
-                                 num_batchs, num_tokens, BLOCK: tl.constexpr, BLOCK_N: tl.constexpr):
+                                 num_tokens, BLOCK_N: tl.constexpr):
     """Kernel."""
-    batch_block_id = tl.program_id(0)
-
-    off = batch_block_id * BLOCK + tl.arange(0, BLOCK)
+    batch_id = tl.program_id(0)
     n_off = tl.arange(0, BLOCK_N)
 
-    off_mask = off < num_batchs
-    seed = tl.load(Seeds + off, mask=off_mask)
-    offset = tl.load(Offsets + off, mask=off_mask).to(tl.int32)
-
-    samp = tl.rand(seed, offset)[:, None]
-    acc = tl.zeros((BLOCK, ), dtype=tl.float32)
-    output = tl.load(Indices + off * stride_ib, mask=off_mask)
-
-    for b_idx in range(0, num_tokens, BLOCK_N):
-        s_off = b_idx + n_off
-        s_mask = off_mask[:, None] & (s_off[None, :] < num_tokens)
-        scores = tl.load(Scores + off[:, None] * stride_sb + s_off[None, :] * stride_st, mask=s_mask,
-                         other=0.0).to(tl.float32)
-        c_scores = tl.cumsum(scores, 1)
-        cum_scores = acc[:, None] + c_scores
-        acc += tl.max(c_scores, 1)
-
-        pre_cum_scores = cum_scores - scores
-        valid_mask = (samp > pre_cum_scores) & (samp <= cum_scores)
-        found_mask = tl.sum(valid_mask, 1) > 0
-
-        valid_pos = b_idx + tl.argmax(valid_mask.to(tl.int32), 1)
-        indices = tl.load(Indices + off * stride_ib + valid_pos * stride_it, mask=found_mask & off_mask, other=-1)
-        output = tl.where(found_mask, indices, output)
-
-    tl.store(Outputs + off, output, mask=off_mask)
+    # draw one random sample for this batch entry from its seed and offset
+    seed = tl.load(Seeds + batch_id)
+    offset = tl.load(Offsets + batch_id).to(tl.int32)
+    samp = tl.rand(seed, offset)
+
+    # initialize
+    acc = 0.0
+    score_ptr = Scores + batch_id * stride_sb + n_off * stride_st
+    indice_ptr = Indices + batch_id * stride_ib
+    output = tl.load(indice_ptr)
+
+    found_mask = False
+    for b_idx in tl.range(0, num_tokens, BLOCK_N):
+        # Triton has no `break` statement; guard the body with a flag to skip work once found
+        if not found_mask:
+            s_off = b_idx + n_off
+            s_mask = (s_off < num_tokens)
+            scores = tl.load(score_ptr, mask=s_mask, other=0.0).to(tl.float32)
+            c_scores = tl.cumsum(scores, 0)
+            cum_scores = acc + c_scores
+            acc += tl.max(c_scores, 0)
+
+            pre_cum_scores = cum_scores - scores
+            valid_mask = (samp > pre_cum_scores) & (samp <= cum_scores)
+            found_mask = tl.sum(valid_mask, 0) > 0
+
+            if found_mask:
+                valid_pos = tl.argmax(valid_mask.to(tl.int32), 0)
+                indice = tl.load(indice_ptr + valid_pos * stride_it)
+                output = indice
+        score_ptr += stride_st * BLOCK_N
+        indice_ptr += stride_it * BLOCK_N
+
+    tl.store(Outputs + batch_id, output)
 
 
 def multinomial_sampling(scores: torch.Tensor,
                          seeds: torch.LongTensor,
                          offsets: torch.LongTensor,
                          indices: torch.Tensor = None):
-    """Multinomial sampling."""
+    """Multinomial sampling.
+
+    Note that this kernel assumes the input scores are already sorted in descending order.
 
+    scores: [batch_size, num_tokens], sorted softmax scores
+    seeds: [batch_size]
+    offsets: [batch_size]
+    indices: [batch_size, num_tokens], original token indices before sorting
+    """
     assert scores.dim() == 2
     batch_size, num_tokens = scores.size()
     device = scores.device
@@ -63,10 +76,9 @@ def multinomial_sampling(scores: torch.Tensor,
 
     outputs = indices[:, 0].clone()
 
-    BLOCK = 8
     BLOCK_N = 128
 
-    grid = [triton.cdiv(batch_size, BLOCK)]
+    grid = [batch_size]
     _multinomial_sampling_kernel[grid](scores,
                                        seeds,
                                        offsets,
@@ -76,10 +88,8 @@ def multinomial_sampling(scores: torch.Tensor,
                                        stride_st=scores.stride(1),
                                        stride_ib=indices.stride(0),
                                        stride_it=indices.stride(1),
-                                       num_batchs=batch_size,
                                        num_tokens=num_tokens,
-                                       BLOCK=BLOCK,
                                        BLOCK_N=BLOCK_N,
-                                       num_warps=8)
+                                       num_warps=1)
 
     return outputs