From a68171e0042204b51e66ea2fbc7f03ad82f2248b Mon Sep 17 00:00:00 2001
From: CAICAIIs <3360776475@qq.com>
Date: Wed, 10 Jun 2026 20:37:45 +0800
Subject: [PATCH 1/3] fix(dsv2lite): reuse nccl dense exchange scratch

---
 docs/index.md                                 |   2 +-
 .../decode-attribution-gate.md                |  16 +-
 .../device-resident-nccl-combine.md           |  12 +-
 docs/models/deepseek-v2-lite/status.md        |   9 +-
 openinfer-core/src/ops.rs                     |   6 +-
 .../src/nccl_backend.rs                       | 364 +++++++++++++++++-
 openinfer-deepseek-v2-lite/src/runtime/moe.rs |  21 +-
 .../src/runtime/readiness.rs                  |  29 +-
 openinfer-kernels/src/ops.rs                  |   9 +-
 openinfer-kernels/src/ops/elementwise.rs      |  23 +-
 10 files changed, 432 insertions(+), 59 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index 328acac7..39fc5e3d 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -66,7 +66,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
 | `models/deepseek-v2-lite/hf-accuracy-gate.md` | DeepSeek-V2-Lite EP2 HF accuracy gate after PR #149/#150: HF incremental greedy, host-staged EP2, and NCCL EP2 are token/text exact for `Hello`, output_len=16. |
 | `models/deepseek-v2-lite/decode-attribution-gate.md` | DeepSeek-V2-Lite EP2 decode attribution gate for `Hello`/16-token batch sizes 1/4/8: structured JSON with accuracy hashes, CPU-side timing, selected CUDA event/NVTX attribution, host-staged/NCCL EP counts, and explicit no-throughput claim boundary. |
 | `models/deepseek-v2-lite/source-layout.md` | DeepSeek-V2-Lite runtime layout refactor: `runtime.rs` split by responsibility, HF/host-staged/NCCL EP2 E2E exact on 2x RTX 5090; NCCL CUDA Graph smoke remains a diagnostic blocker on that host, independent of the passed correctness gate. |
-| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine now uses reusable device-resident f32 scratch, HF/host-staged/NCCL exact on 2x RTX 5090, and remaining graph blockers are dense exchange plus host-directed routing. |
+| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine uses reusable device-resident f32 scratch; after issue #276, dense exchange also uses reusable scratch, so remaining graph blockers are host-directed routing/expert accumulation. |
 
 ## models / kimi-k2
 
diff --git a/docs/models/deepseek-v2-lite/decode-attribution-gate.md b/docs/models/deepseek-v2-lite/decode-attribution-gate.md
index 5f3d207a..e4adf037 100644
--- a/docs/models/deepseek-v2-lite/decode-attribution-gate.md
+++ b/docs/models/deepseek-v2-lite/decode-attribution-gate.md
@@ -133,13 +133,13 @@ ncclMaxSharedMem 82240 exceeds device/fn maxSharedMem 79856
 NCCL WARN Cuda failure 1 'invalid argument'
 ```
 
-Use a newer NCCL runtime through the normal library path if the system runtime fails this way. The project code path should not change just to work around local NCCL installation age.
+The NCCL loader now tries explicit overrides first (`OPENINFER_NCCL_LIB`, then `OPENINFER_NCCL_LIB_DIR` / `OPENINFER_NCCL_LIBRARY_PATH`), then Python wheel NCCL directories discoverable from `OPENINFER_NCCL_PYTHON`, `OPENINFER_TRITON_PYTHON`, `VIRTUAL_ENV`, or `CONDA_PREFIX`, and finally the system `libnccl.so.2` / `libnccl.so`. This keeps the code path unchanged while avoiding a stale system NCCL when the validation environment already has a newer CUDA wheel runtime.
 
 The HF oracle needs a Python environment that can load DeepSeek-V2-Lite with `trust_remote_code=True`. The helper script tolerates the model file's optional `flash_attn` import check when FlashAttention is not installed, but the HF environment remains separate from the Rust runtime claim: it is only the truth-source generator for the comparison JSON.
 
 ## Latest Validation
 
-The issue #275 refresh was rerun on 2026-06-08 with DeepSeek-V2-Lite snapshot `604d5664dddd88a0433dbae533b7fe9472482de0`, `prompt="Hello"`, `output_len=16`, and 2x RTX 5090. HF, host-staged, and NCCL were dumped from the same model directory and compared with `--require-all-exact`.
+The issue #276 refresh was rerun on 2026-06-10 with DeepSeek-V2-Lite snapshot `604d5664dddd88a0433dbae533b7fe9472482de0`, `prompt="Hello"`, `output_len=16`, and 2x RTX 5090. HF, host-staged, and NCCL were dumped from the same model directory and compared with `--require-all-exact`. The Rust path loaded NCCL `2.30.7+cuda12.9` from the Python CUDA wheel path because the system NCCL `2.25.1+cuda12.8` failed the init smoke on this Blackwell host before model-level validation.
 
 - HF / host-staged / NCCL comparison: `all_token_text_exact`.
 - Token SHA256: `4fb4c8825fe4d2c4a1d966da25c259abdf675f4de4548daa5d41aea7dfe30225`.
@@ -147,7 +147,15 @@ The issue #275 refresh was rerun on 2026-06-08 with DeepSeek-V2-Lite snapshot `6
 - Generated text: `, I am a 19 year old girl from the UK. I am`.
 - Candidate NCCL attribution: `gpu_timing.sample_count=8384`, `failure_count=0`.
 
-The candidate readiness report still has `full_decode_capture_ready=false`. Compared with the same-host baseline attribution, it removed `nccl_contribution_accumulation_on_host`, `nccl_combine_h2d_contribution_copy`, `nccl_combine_allocates_per_call`, `nccl_combine_syncs_rank_streams`, and `nccl_combine_d2h_result_copy`. The remaining blockers are `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, and `nccl_expert_accumulation_host_directed`.
+The candidate readiness report still has `full_decode_capture_ready=false`. Compared with the issue #275 candidate, it removes the dense-exchange allocation/sync blockers. The remaining blockers are `nccl_route_iteration_on_host` and `nccl_expert_accumulation_host_directed`.
+
+Current NCCL attribution for the issue #276 gate:
+
+| Batch | GPU event samples | GPU failures | NCCL exchange/combine calls | Route counters | Readiness blockers |
+| ---: | ---: | ---: | --- | --- | --- |
+| 1 | 8384 | 0 | `416 / 416` | `local=1284`, `remote=1212`, `combine=2496` | `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` |
+| 4 | 23996 | 0 | `494 / 494` | `local=5136`, `remote=4848`, `combine=9984` | `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` |
+| 8 | 44812 | 0 | `598 / 598` | `local=10272`, `remote=9696`, `combine=19968` | `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` |
 
 The previous A800 strict same-host accuracy gate was rerun on 2026-06-04 with DeepSeek-V2-Lite snapshot `604d5664dddd88a0433dbae533b7fe9472482de0`, `prompt="Hello"`, `output_len=16`, and 2x A800-SXM4-80GB. The token/text oracle is confirmed by a real HF `AutoModelForCausalLM.generate(..., do_sample=false, use_cache=true)` run on the same model directory as the Rust E2E gate.
 
@@ -158,7 +166,7 @@ The Rust E2E accepts the known HF-confirmed RTX 5090 and A800 hash pairs for thi
 - Text SHA256: `4aaafbe4b3a46bc5b9ab5ea8d09d5fad71225006c2e234e87a928e3265b387c6`.
 - Generated text: `, I am a 20 year old female and I have been having a`.
 
-The graph-readiness diagnostic was rerun on 2026-06-04 on the same model snapshot and 2x A800-SXM4-80GB:
+The historical graph-readiness diagnostic before the #275/#276 device-scratch work was rerun on 2026-06-04 on the same model snapshot and 2x A800-SXM4-80GB:
 
 - `full_decode_capture_ready=false`;
 - `status=blocked_full_decode_path`;
diff --git a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md
index 7ac35f70..eab6ce9c 100644
--- a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md
+++ b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md
@@ -1,6 +1,6 @@
 # DeepSeek-V2-Lite Device-Resident NCCL Combine
 
-> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Full decode capture is still blocked by dense exchange allocation/sync and host-directed routing.
+> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Issue #276 later removes the dense-exchange allocation/sync blockers; full decode capture is still blocked by host-directed routing/expert accumulation.
 
 Last touched: 2026-06
 
@@ -23,11 +23,11 @@ Last touched: 2026-06
   1. Add a shared CUDA helper that accumulates a bf16 single-token expert output into a f32 device contribution buffer at a selected token row.
   2. Re-export that helper through `openinfer-core::ops`.
   3. Add reusable NCCL combine scratch buffers inside `NaiveNcclEp2Backend`, clear the f32 send scratch per MoE call, accumulate local/remote expert outputs on the owning device, all-reduce device buffers, and cast rank0 f32 result to bf16 on device.
-  4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while remaining host routing and dense-exchange blockers stay explicit.
+  4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while the then-remaining host routing and dense-exchange blockers stay explicit.
   5. Run formatting and local compile gates, then use the provided remote GPU host for the DeepSeek-V2-Lite EP2 exactness and attribution gates.
 - **Risks / open questions**:
   - Device f32 accumulation must preserve the existing expert-id accumulation order before the final bf16 cast.
-  - Dense exchange and host route selection still block full decode CUDA Graph capture; `full_decode_capture_ready` should remain false unless validation proves otherwise.
+  - At the time of issue #275, dense exchange and host route selection still blocked full decode CUDA Graph capture; issue #276 later removed the dense-exchange allocation/sync blockers, but `full_decode_capture_ready` should remain false while host-directed routing/expert accumulation remain.
   - The provided SSH credential should stay local to the validation session and must not be echoed into docs or final output.
 
 ## Execution Log
@@ -97,13 +97,15 @@ cargo clippy --release -p openinfer-deepseek-v2-lite \
 
 Both commands passed on the same remote source copy after syncing the follow-up `host_ops.rs` and `logging.rs` fixes. The clippy command ran without `clippy::manual_midpoint`, `clippy::needless_range_loop`, or `clippy::option_option` allows.
 
-Before/after readiness comparison for the same model snapshot and diagnostic shape:
+Before/after readiness comparison for the same model snapshot and diagnostic shape at issue #275:
 
 | Report | Readiness blockers |
 | --- | --- |
 | Baseline NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_contribution_accumulation_on_host`, `nccl_combine_h2d_contribution_copy`, `nccl_combine_allocates_per_call`, `nccl_combine_syncs_rank_streams`, `nccl_combine_d2h_result_copy` |
 | Candidate NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` |
 
+Current issue #276 readiness removes the dense-exchange allocation/sync blockers as well. The retained NCCL blockers are `nccl_route_iteration_on_host` and `nccl_expert_accumulation_host_directed`.
+
 Removed blockers:
 
 - `nccl_contribution_accumulation_on_host`
@@ -118,4 +120,4 @@ The candidate report replaces the old `nccl_contribution_accumulate` and `nccl_c
 
 The implementation keeps host-staged unchanged as the correctness oracle. The NCCL backend now owns reusable rank0/rank1 f32 send/recv scratch buffers behind `DeviceCombineScratch`; each MoE call clears the f32 send scratch on device, accumulates one-token expert outputs into the owning rank's send scratch with a CUDA helper, runs the f32 NCCL all-reduce, and casts rank0's f32 result back to bf16 on device.
 
-The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so it is not claimed as full CUDA Graph readiness. The remaining readiness blockers are still real and should drive the next slice.
+The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so issue #275 did not claim full CUDA Graph readiness. Issue #276 then moved dense exchange to reusable bf16 scratch; the remaining host-directed readiness blockers are still real and should drive the next slice.
diff --git a/docs/models/deepseek-v2-lite/status.md b/docs/models/deepseek-v2-lite/status.md
index 2c6b1982..4b915f34 100644
--- a/docs/models/deepseek-v2-lite/status.md
+++ b/docs/models/deepseek-v2-lite/status.md
@@ -1,6 +1,6 @@
 # DeepSeek-V2-Lite Status And Benchmark Ledger
 
-> **TL;DR:** DeepSeek-V2-Lite is a feature-gated EP2 correctness and attribution target. HF, host-staged, and NCCL match for the narrow greedy gate; NCCL decode combine now uses reusable device-resident f32 scratch, while dense exchange and host-directed routing still block full decode graph capture. Current batch and vLLM data remain diagnostic and do not claim production serving parity.
+> **TL;DR:** DeepSeek-V2-Lite is a feature-gated EP2 correctness and attribution target. HF, host-staged, and NCCL match for the narrow greedy gate; NCCL decode combine and dense exchange now use reusable device scratch, while host-directed routing/expert accumulation still block full decode graph capture. Current batch and vLLM data remain diagnostic and do not claim production serving parity.
 
 Last touched: 2026-06
 
@@ -14,7 +14,8 @@ Last touched: 2026-06
 | Decode attribution | Available | PR #162 and PR #169 add CPU/GPU attribution, route counts, NCCL counters, CUDA event timing, and optional NVTX correlation. |
 | Direct same-prompt diagnostic batch | Available | PR #184 and PR #196 cover batch sizes `1`, `4`, and `8` for the fixed same-prompt direct path. |
 | Device-resident NCCL combine | Available | Issue #275 keeps NCCL combine contributions/results on reusable f32 device scratch and preserves the HF / host-staged / NCCL exact gate on 2x RTX 5090. |
-| NCCL CUDA Graph readiness | Diagnostic only | The attribution binary now emits `cuda_graph_readiness`. Current NCCL full decode capture is blocked; the optional preallocated f32 NCCL graph smoke captures, replays, and verifies. |
+| Device-resident NCCL dense exchange | Available | Issue #276 keeps dense-exchange rank recv/zero-send buffers on reusable backend-owned bf16 scratch, clears rank1 zero-send every exchange, removes dense-exchange stream sync from the backend call, and preserves HF / host-staged / NCCL exactness on 2x RTX 5090. |
+| NCCL CUDA Graph readiness | Diagnostic only | The attribution binary emits `cuda_graph_readiness`. Current NCCL full decode capture remains blocked by host route iteration and host-directed expert accumulation; the removed dense-exchange allocation/sync blockers should stay absent. |
 | Production continuous batching | Not available | The direct diagnostic batch path is not mixed-request HTTP serving. |
 | vLLM production parity | Not claimed | The manual vLLM snapshot below is for understanding the gap requested in issue #170. |
 
@@ -100,7 +101,7 @@ Do not claim:
 
 Issue #205 records the model roadmap. Maintainer feedback there calls out NCCL plus CUDA Graph as the likely best decode direction, with host staging possibly deprecated later. Treat that as a future direction, not as current evidence.
 
-The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers from the retained 2x RTX 5090 attribution gate, but dense exchange allocation/sync and host-directed routing remain. A basic preallocated f32 NCCL all-reduce smoke captures, replays, and verifies on the retained A800 run, but that proves only the collective smoke shape. It is not full decode CUDA Graph coverage. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate.
+The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers, and issue #276 removed the dense-exchange allocation/sync blockers from the retained 2x RTX 5090 attribution gate. The remaining NCCL blockers are host route iteration and host-directed expert accumulation. The optional f32 NCCL graph smoke is a separate collective-only diagnostic and is not #276 evidence. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate.
 
 The next implementation should be chosen from measured evidence:
 
@@ -108,7 +109,7 @@ The next implementation should be chosen from measured evidence:
    - keep HF / host-staged / NCCL exact before and after;
    - keep host-staged as the correctness baseline while it exists;
    - preserve attribution before and after the change;
-   - attack dense exchange allocation/sync and host route iteration next;
+   - attack host route iteration and host-directed expert accumulation next;
    - avoid broad generic EP or multi-node work;
    - judge issue #170 by whether it reduces NCCL decode overhead and makes the path more graph-friendly.
 
diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs
index b54379aa..f8e407f0 100644
--- a/openinfer-core/src/ops.rs
+++ b/openinfer-core/src/ops.rs
@@ -16,9 +16,9 @@ pub use attention::{
 pub use openinfer_kernels::ops::{
     LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into, add_batch,
     add_batch_into, bf16_hidden_to_f32_into, embedding_decode_into, extract_vec, extract_vec_into,
-    f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm,
-    gemm_into_checked, gemm_per_token, gemv, linear, lora_decode_fused_delta_group3_into,
-    lora_decode_fused_delta_into, pack_lora_b_rows_into,
+    extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into, fused_add_rms_norm_into,
+    gather_hidden_tokens_into, gemm, gemm_into_checked, gemm_per_token, gemv, linear,
+    lora_decode_fused_delta_group3_into, lora_decode_fused_delta_into, pack_lora_b_rows_into,
     qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
     rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
     scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend.rs b/openinfer-deepseek-v2-lite/src/nccl_backend.rs
index 7e2b184d..a8be12fd 100644
--- a/openinfer-deepseek-v2-lite/src/nccl_backend.rs
+++ b/openinfer-deepseek-v2-lite/src/nccl_backend.rs
@@ -1,5 +1,9 @@
 use std::{
+    collections::HashSet,
+    env,
     ffi::{CStr, c_char, c_int, c_void},
+    fs,
+    path::{Path, PathBuf},
     ptr,
     sync::{Arc, Mutex, MutexGuard},
 };
@@ -19,7 +23,7 @@ use half::bf16;
 use libloading::Library;
 use openinfer_core::{
     ops,
-    tensor::{DeviceContext, HiddenStates},
+    tensor::{DeviceContext, HiddenStates, HiddenStatesRef},
 };
 use serde::Serialize;
 
@@ -45,6 +49,7 @@ type NcclGetErrorString = unsafe extern "C" fn(ncclResult_t) -> *const c_char;
 pub(crate) struct NaiveNcclEp2Backend {
     lib: Arc<RawNcclLib>,
     comms: Vec<ncclComm_t>,
+    dense_exchange_scratch: Mutex<DeviceDenseExchangeScratch>,
     combine_scratch: Mutex<DeviceCombineScratch>,
 }
 
@@ -95,6 +100,7 @@ impl NcclGraphSmokeReport {
 
 struct RawNcclLib {
     _library: Library,
+    source: String,
     comm_init_all: NcclCommInitAll,
     comm_count: NcclCommCount,
     comm_cu_device: NcclCommCuDevice,
@@ -105,6 +111,108 @@ struct RawNcclLib {
     get_error_string: NcclGetErrorString,
 }
 
+#[derive(Default)]
+struct DeviceDenseExchangeScratch {
+    hidden_dim: usize,
+    seq_len: usize,
+    rank0_recv: Option<CudaSlice<bf16>>,
+    rank1_send_zero: Option<CudaSlice<bf16>>,
+    rank1_recv: Option<CudaSlice<bf16>>,
+}
+
+impl DeviceDenseExchangeScratch {
+    fn ensure(
+        &mut self,
+        rank0: &DeviceContext,
+        rank1: &DeviceContext,
+        hidden_dim: usize,
+        seq_len: usize,
+    ) -> Result<()> {
+        let elems = dense_exchange_elems(hidden_dim, seq_len)?;
+        if self.hidden_dim == hidden_dim
+            && self.seq_len == seq_len
+            && self
+                .rank0_recv
+                .as_ref()
+                .is_some_and(|buf| buf.len() >= elems)
+            && self
+                .rank1_send_zero
+                .as_ref()
+                .is_some_and(|buf| buf.len() >= elems)
+            && self
+                .rank1_recv
+                .as_ref()
+                .is_some_and(|buf| buf.len() >= elems)
+        {
+            return Ok(());
+        }
+
+        activate(rank0)?;
+        drop(self.rank0_recv.take());
+        let rank0_recv = rank0.stream.alloc_zeros::<bf16>(elems)?;
+        activate(rank1)?;
+        drop(self.rank1_send_zero.take());
+        drop(self.rank1_recv.take());
+        let rank1_send_zero = rank1.stream.alloc_zeros::<bf16>(elems)?;
+        let rank1_recv = rank1.stream.alloc_zeros::<bf16>(elems)?;
+
+        self.hidden_dim = hidden_dim;
+        self.seq_len = seq_len;
+        self.rank0_recv = Some(rank0_recv);
+        self.rank1_send_zero = Some(rank1_send_zero);
+        self.rank1_recv = Some(rank1_recv);
+        Ok(())
+    }
+
+    fn ensure_shape(&self, hidden_dim: usize, seq_len: usize) -> Result<usize> {
+        let elems = dense_exchange_elems(hidden_dim, seq_len)?;
+        ensure!(
+            self.hidden_dim == hidden_dim && self.seq_len == seq_len,
+            "DeepSeek-V2-Lite NCCL dense exchange scratch shape mismatch: scratch=[{}, {}], requested=[{}, {}]",
+            self.hidden_dim,
+            self.seq_len,
+            hidden_dim,
+            seq_len
+        );
+        ensure!(
+            self.rank0_recv
+                .as_ref()
+                .is_some_and(|buf| buf.len() >= elems)
+                && self
+                    .rank1_send_zero
+                    .as_ref()
+                    .is_some_and(|buf| buf.len() >= elems)
+                && self
+                    .rank1_recv
+                    .as_ref()
+                    .is_some_and(|buf| buf.len() >= elems),
+            "DeepSeek-V2-Lite NCCL dense exchange scratch is not initialized for {elems} elements"
+        );
+        Ok(elems)
+    }
+
+    fn rank1_hidden_ref(&self) -> Result<HiddenStatesRef<'_>> {
+        Ok(HiddenStatesRef {
+            data: self
+                .rank1_recv
+                .as_ref()
+                .context("DeepSeek-V2-Lite NCCL rank1 dense exchange recv scratch is missing")?,
+            hidden_dim: self.hidden_dim,
+            seq_len: self.seq_len,
+        })
+    }
+}
+
+pub(crate) struct DenseExchangeOutput<'a> {
+    scratch: MutexGuard<'a, DeviceDenseExchangeScratch>,
+}
+
+impl DenseExchangeOutput<'_> {
+    pub(crate) fn rank1_hidden(&self) -> Result<HiddenStatesRef<'_>> {
+        self.scratch.rank1_hidden_ref()
+    }
+}
+
 #[derive(Default)]
 struct DeviceCombineScratch {
     hidden_dim: usize,
@@ -233,6 +341,7 @@ impl NaiveNcclEp2Backend {
         let backend = Self {
             lib,
             comms,
+            dense_exchange_scratch: Mutex::new(DeviceDenseExchangeScratch::default()),
             combine_scratch: Mutex::new(DeviceCombineScratch::default()),
         };
         backend.validate_communicators(&ordinals)?;
@@ -245,45 +354,64 @@ impl NaiveNcclEp2Backend {
         rank0: &DeviceContext,
         rank1: &DeviceContext,
         input: &HiddenStates,
-    ) -> Result<HiddenStates> {
+    ) -> Result<DenseExchangeOutput<'_>> {
         ensure!(
             input.hidden_dim > 0 && input.seq_len > 0,
             "DeepSeek-V2-Lite NCCL dense hidden exchange requires non-empty hidden states"
         );
-        activate(rank0)?;
-        let mut rank0_recv = HiddenStates::zeros(rank0, input.hidden_dim, input.seq_len)?;
+        let mut scratch = self.dense_exchange_scratch()?;
+        let elems = dense_exchange_elems(input.hidden_dim, input.seq_len)?;
+        scratch.ensure(rank0, rank1, input.hidden_dim, input.seq_len)?;
         activate(rank1)?;
-        let rank1_send = HiddenStates::zeros(rank1, input.hidden_dim, input.seq_len)?;
-        let mut rank1_recv = HiddenStates::zeros(rank1, input.hidden_dim, input.seq_len)?;
+        rank1
+            .stream
+            .memset_zeros(scratch.rank1_send_zero.as_mut().context(
+                "DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch is missing",
+            )?)
+            .context("clear DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch")?;
+        scratch.ensure_shape(input.hidden_dim, input.seq_len)?;
+
+        let DeviceDenseExchangeScratch {
+            rank0_recv,
+            rank1_send_zero,
+            rank1_recv,
+            ..
+        } = &mut *scratch;
+        let rank0_recv = rank0_recv
+            .as_mut()
+            .context("DeepSeek-V2-Lite NCCL rank0 dense exchange recv scratch is missing")?;
+        let rank1_send_zero = rank1_send_zero
+            .as_ref()
+            .context("DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch is missing")?;
+        let rank1_recv = rank1_recv
+            .as_mut()
+            .context("DeepSeek-V2-Lite NCCL rank1 dense exchange recv scratch is missing")?;
 
         // Correctness-first dense exchange: rank0 contributes the hidden state
         // and rank1 contributes zeros. This makes rank0 hidden visible on rank1
         // without pretending to be sparse routed dispatch.
-        let count = input.hidden_dim * input.seq_len;
         self.grouped("DeepSeek-V2-Lite NCCL dense hidden all-reduce", || {
             activate(rank0)?;
             self.all_reduce_bf16(
                 0,
                 &input.data,
-                &mut rank0_recv.data,
-                count,
+                rank0_recv,
+                elems,
                 rank0.stream.cu_stream(),
                 "DeepSeek-V2-Lite NCCL dense hidden rank0 all-reduce",
             )?;
             activate(rank1)?;
             self.all_reduce_bf16(
                 1,
-                &rank1_send.data,
-                &mut rank1_recv.data,
-                count,
+                rank1_send_zero,
+                rank1_recv,
+                elems,
                 rank1.stream.cu_stream(),
                 "DeepSeek-V2-Lite NCCL dense hidden rank1 all-reduce",
             )?;
             Ok(())
         })?;
-        rank0.sync()?;
-        rank1.sync()?;
-        Ok(rank1_recv)
+        Ok(DenseExchangeOutput { scratch })
     }
 
     pub(crate) fn clear_device_combine(
@@ -739,6 +867,12 @@ impl NaiveNcclEp2Backend {
             .map_err(|_| anyhow::anyhow!("DeepSeek-V2-Lite NCCL device combine scratch poisoned"))
     }
 
+    fn dense_exchange_scratch(&self) -> Result<MutexGuard<'_, DeviceDenseExchangeScratch>> {
+        self.dense_exchange_scratch
+            .lock()
+            .map_err(|_| anyhow::anyhow!("DeepSeek-V2-Lite NCCL dense exchange scratch poisoned"))
+    }
+
     fn grouped(&self, context: &str, f: impl FnOnce() -> Result<()>) -> Result<()> {
         let start = unsafe {
             // SAFETY: NCCL group state is process-global and entered/exited on
@@ -769,6 +903,18 @@ fn combine_elems(hidden_dim: usize, seq_len: usize) -> Result<usize> {
     })
 }
 
+fn dense_exchange_elems(hidden_dim: usize, seq_len: usize) -> Result<usize> {
+    ensure!(
+        hidden_dim > 0 && seq_len > 0,
+        "DeepSeek-V2-Lite NCCL dense exchange requires non-empty shape, got hidden_dim={hidden_dim}, seq_len={seq_len}"
+    );
+    hidden_dim.checked_mul(seq_len).with_context(|| {
+        format!(
+            "DeepSeek-V2-Lite NCCL dense exchange shape overflow: hidden_dim={hidden_dim}, seq_len={seq_len}"
+        )
+    })
+}
+
 fn cleanup_capture(ctx: &DeviceContext, capture_started: bool) {
     if capture_started {
         let _ = activate(ctx);
@@ -896,19 +1042,19 @@ impl Drop for NaiveNcclEp2Backend {
 impl RawNcclLib {
     fn load() -> Result<Self> {
         let mut tried = Vec::new();
-        for candidate in ["libnccl.so.2", "libnccl.so"] {
-            tried.push(candidate);
+        for candidate in nccl_library_candidates() {
+            tried.push(candidate.clone());
             let Ok(library) = (unsafe {
                 // SAFETY: Loading NCCL is required to create the selected
                 // runtime backend. All symbols are validated immediately below.
-                Library::new(candidate)
+                Library::new(&candidate)
             }) else {
                 continue;
             };
             return unsafe {
                 // SAFETY: The library is kept alive inside `RawNcclLib`; copied
                 // function pointers do not outlive it.
-                Self::from_library(library)
+                Self::from_library(library, candidate.clone())
             }
             .with_context(|| format!("load DeepSeek-V2-Lite NCCL backend from {candidate}"));
         }
@@ -918,7 +1064,7 @@ impl RawNcclLib {
         )
     }
 
-    unsafe fn from_library(library: Library) -> Result<Self> {
+    unsafe fn from_library(library: Library, source: String) -> Result<Self> {
         Ok(Self {
             comm_init_all: unsafe { load_symbol(&library, b"ncclCommInitAll\0")? },
             comm_count: unsafe { load_symbol(&library, b"ncclCommCount\0")? },
@@ -928,6 +1074,7 @@ impl RawNcclLib {
             group_end: unsafe { load_symbol(&library, b"ncclGroupEnd\0")? },
             all_reduce: unsafe { load_symbol(&library, b"ncclAllReduce\0")? },
             get_error_string: unsafe { load_symbol(&library, b"ncclGetErrorString\0")? },
+            source,
             _library: library,
         })
     }
@@ -946,7 +1093,10 @@ impl RawNcclLib {
                 CStr::from_ptr(ptr).to_string_lossy().into_owned()
             }
         };
-        bail!("{context} failed: {message} ({status:?})")
+        bail!(
+            "{context} failed with NCCL library {}: {message} ({status:?})",
+            self.source
+        )
     }
 
     fn query_comm_count(&self, comm: ncclComm_t, context: &str) -> Result<c_int> {
@@ -972,6 +1122,142 @@ impl RawNcclLib {
     }
 }
 
+fn nccl_library_candidates() -> Vec<String> {
+    let mut candidates = Vec::new();
+    let mut seen = HashSet::new();
+
+    add_env_file_candidates(&mut candidates, &mut seen, "OPENINFER_NCCL_LIB");
+    add_env_dir_candidates(&mut candidates, &mut seen, "OPENINFER_NCCL_LIB_DIR");
+    add_env_dir_candidates(&mut candidates, &mut seen, "OPENINFER_NCCL_LIBRARY_PATH");
+    for lib_dir in nccl_python_wheel_lib_dirs() {
+        add_nccl_dir_candidates(&mut candidates, &mut seen, &lib_dir);
+    }
+
+    add_candidate(&mut candidates, &mut seen, "libnccl.so.2".to_string());
+    add_candidate(&mut candidates, &mut seen, "libnccl.so".to_string());
+    candidates
+}
+
+fn add_env_file_candidates(candidates: &mut Vec<String>, seen: &mut HashSet<String>, key: &str) {
+    let Ok(value) = env::var(key) else {
+        return;
+    };
+    for path in env::split_paths(&value) {
+        add_candidate(candidates, seen, path.to_string_lossy().into_owned());
+    }
+}
+
+fn add_env_dir_candidates(candidates: &mut Vec<String>, seen: &mut HashSet<String>, key: &str) {
+    let Ok(value) = env::var(key) else {
+        return;
+    };
+    for dir in env::split_paths(&value) {
+        add_nccl_dir_candidates(candidates, seen, &dir);
+    }
+}
+
+fn add_nccl_dir_candidates(candidates: &mut Vec<String>, seen: &mut HashSet<String>, dir: &Path) {
+    add_candidate(
+        candidates,
+        seen,
+        dir.join("libnccl.so.2").to_string_lossy().into_owned(),
+    );
+    add_candidate(
+        candidates,
+        seen,
+        dir.join("libnccl.so").to_string_lossy().into_owned(),
+    );
+}
+
+fn add_candidate(candidates: &mut Vec<String>, seen: &mut HashSet<String>, candidate: String) {
+    if !candidate.is_empty() && seen.insert(candidate.clone()) {
+        candidates.push(candidate);
+    }
+}
+
+fn nccl_python_wheel_lib_dirs() -> Vec<PathBuf> {
+    python_env_roots()
+        .into_iter()
+        .flat_map(|root| nccl_python_wheel_lib_dirs_from_root(&root))
+        .collect()
+}
+
+fn python_env_roots() -> Vec<PathBuf> {
+    let mut roots = Vec::new();
+    let mut seen = HashSet::new();
+
+    for key in ["OPENINFER_NCCL_PYTHON", "OPENINFER_TRITON_PYTHON"] {
+        if let Ok(value) = env::var(key) {
+            add_python_env_root(&mut roots, &mut seen, Path::new(&value));
+        }
+    }
+    for key in ["VIRTUAL_ENV", "CONDA_PREFIX"] {
+        if let Ok(value) = env::var(key) {
+            add_pathbuf_once(&mut roots, &mut seen, PathBuf::from(value));
+        }
+    }
+    roots
+}
+
+fn add_python_env_root(roots: &mut Vec<PathBuf>, seen: &mut HashSet<PathBuf>, python: &Path) {
+    if python.is_dir() {
+        add_pathbuf_once(roots, seen, python.to_path_buf());
+        return;
+    }
+    if let Some(parent) = python.parent()
+        && parent.file_name().is_some_and(|name| name == "bin")
+        && let Some(root) = parent.parent()
+    {
+        add_pathbuf_once(roots, seen, root.to_path_buf());
+    }
+}
+
+fn add_pathbuf_once(paths: &mut Vec<PathBuf>, seen: &mut HashSet<PathBuf>, path: PathBuf) {
+    if !path.as_os_str().is_empty() && seen.insert(path.clone()) {
+        paths.push(path);
+    }
+}
+
+fn nccl_python_wheel_lib_dirs_from_root(root: &Path) -> Vec<PathBuf> {
+    let mut dirs = Vec::new();
+    let mut seen = HashSet::new();
+    add_python_wheel_lib_dir(
+        &mut dirs,
+        &mut seen,
+        root.join("site-packages/nvidia/nccl/lib"),
+    );
+
+    let lib_root = root.join("lib");
+    if let Ok(entries) = fs::read_dir(&lib_root) {
+        for entry in entries.flatten() {
+            let path = entry.path();
+            let Some(name) = path.file_name().and_then(|name| name.to_str()) else {
+                continue;
+            };
+            if name.starts_with("python") {
+                add_python_wheel_lib_dir(
+                    &mut dirs,
+                    &mut seen,
+                    path.join("site-packages/nvidia/nccl/lib"),
+                );
+            }
+        }
+    }
+
+    add_python_wheel_lib_dir(
+        &mut dirs,
+        &mut seen,
+        root.join("Lib/site-packages/nvidia/nccl/lib"),
+    );
+    dirs
+}
+
+fn add_python_wheel_lib_dir(dirs: &mut Vec<PathBuf>, seen: &mut HashSet<PathBuf>, dir: PathBuf) {
+    if dir.join("libnccl.so.2").exists() && seen.insert(dir.clone()) {
+        dirs.push(dir);
+    }
+}
+
 unsafe fn load_symbol<T: Copy>(library: &Library, symbol: &'static [u8]) -> Result<T> {
     unsafe { library.get::<T>(symbol) }
         .map(|symbol| *symbol)
@@ -982,3 +1268,39 @@ unsafe fn load_symbol<T: Copy>(library: &Library, symbol: &'static [u8]) -> Resu
             )
         })
 }
+
+#[cfg(test)]
+mod nccl_loader_tests {
+    use std::{
+        fs,
+        time::{SystemTime, UNIX_EPOCH},
+    };
+
+    use super::*;
+
+    #[test]
+    fn finds_nccl_python_wheel_lib_dir_from_python_executable() {
+        let unique = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("system time before epoch")
+            .as_nanos();
+        let root = env::temp_dir().join(format!(
+            "openinfer-nccl-wheel-test-{}-{unique}",
+            std::process::id()
+        ));
+        let python_dir = root.join("bin");
+        let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib");
+        fs::create_dir_all(&python_dir).expect("create python bin dir");
+        fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir");
+        fs::write(wheel_dir.join("libnccl.so.2"), []).expect("create fake NCCL lib marker");
+
+        let mut roots = Vec::new();
+        let mut seen = HashSet::new();
+        add_python_env_root(&mut roots, &mut seen, &python_dir.join("python"));
+
+        assert_eq!(roots, vec![root.clone()]);
+        assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]);
+
+        fs::remove_dir_all(root).expect("remove temp root");
+    }
+}
diff --git a/openinfer-deepseek-v2-lite/src/runtime/moe.rs b/openinfer-deepseek-v2-lite/src/runtime/moe.rs
index 828369a3..aa6dfe18 100644
--- a/openinfer-deepseek-v2-lite/src/runtime/moe.rs
+++ b/openinfer-deepseek-v2-lite/src/runtime/moe.rs
@@ -1,6 +1,9 @@
 use anyhow::{Result, bail};
 use half::bf16;
-use openinfer_core::{ops, tensor::HiddenStates};
+use openinfer_core::{
+    ops,
+    tensor::{HiddenStates, HiddenStatesRef},
+};
 
 use super::{DeepSeekV2LiteEp2Generator, backend::EpBackendRuntime};
 use crate::{
@@ -235,6 +238,7 @@ impl DeepSeekV2LiteEp2Generator {
             token_index,
             || nccl.dense_all_reduce_rank0_hidden_to_rank1(&self.rank0.ctx, &self.rank1.ctx, input),
         )?;
+        let rank1_hidden = rank1_input.rank1_hidden()?;
         attribution.record_gpu_pair_result(
             &self.rank0.ctx,
             &self.rank1.ctx,
@@ -271,7 +275,14 @@ impl DeepSeekV2LiteEp2Generator {
                             || format!("layer.{layer_idx}.nccl.local_expert"),
                             Some(layer_idx),
                             token_index,
-                            || expert_forward_device(&self.rank0.ctx, expert, input, token),
+                            || {
+                                expert_forward_device(
+                                    &self.rank0.ctx,
+                                    expert,
+                                    input.as_ref(),
+                                    token,
+                                )
+                            },
                         )?
                     }
                     1 => {
@@ -284,7 +295,7 @@ impl DeepSeekV2LiteEp2Generator {
                             || format!("layer.{layer_idx}.nccl.remote_expert"),
                             Some(layer_idx),
                             token_index,
-                            || expert_forward_device(&self.rank1.ctx, expert, &rank1_input, token),
+                            || expert_forward_device(&self.rank1.ctx, expert, rank1_hidden, token),
                         )?
                     }
                     other => {
@@ -377,11 +388,11 @@ impl DeepSeekV2LiteEp2Generator {
 fn expert_forward_device(
     ctx: &openinfer_core::tensor::DeviceContext,
     expert: &ExpertMlp,
-    input: &HiddenStates,
+    input: HiddenStatesRef<'_>,
     token_idx: usize,
 ) -> Result<HiddenStates> {
     activate(ctx)?;
-    let token = ops::extract_vec(ctx, input, token_idx)?;
+    let token = ops::extract_vec_ref(ctx, input, token_idx)?;
     let token_hidden = HiddenStates {
         hidden_dim: token.len,
         seq_len: 1,
diff --git a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs
index e9bb8133..4881015d 100644
--- a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs
+++ b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs
@@ -90,16 +90,6 @@ fn decode_graph_blockers(backend: EpBackendKind) -> Vec<DecodeGraphBlocker> {
             },
         ],
         EpBackendKind::Nccl => vec![
-            DecodeGraphBlocker {
-                id: "nccl_dense_exchange_allocates_per_call",
-                source: "nccl_backend.rs::dense_all_reduce_rank0_hidden_to_rank1",
-                reason: "rank0/rank1 receive and zero-send buffers are allocated inside each dense exchange",
-            },
-            DecodeGraphBlocker {
-                id: "nccl_dense_exchange_syncs_rank_streams",
-                source: "nccl_backend.rs::dense_all_reduce_rank0_hidden_to_rank1",
-                reason: "both rank streams are synchronized after every dense exchange",
-            },
             DecodeGraphBlocker {
                 id: "nccl_route_iteration_on_host",
                 source: "runtime/moe.rs::moe_forward_nccl",
@@ -113,3 +103,22 @@ fn decode_graph_blockers(backend: EpBackendKind) -> Vec<DecodeGraphBlocker> {
         ],
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn nccl_readiness_reports_only_remaining_graph_blockers() {
+        let blockers = decode_graph_blockers(EpBackendKind::Nccl);
+        let ids: Vec<_> = blockers.iter().map(|blocker| blocker.id).collect();
+
+        assert_eq!(
+            ids,
+            vec![
+                "nccl_route_iteration_on_host",
+                "nccl_expert_accumulation_host_directed",
+            ]
+        );
+    }
+}
diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs
index ffbc6f2c..722edeac 100644
--- a/openinfer-kernels/src/ops.rs
+++ b/openinfer-kernels/src/ops.rs
@@ -23,10 +23,11 @@ pub use deepep::{
 };
 pub use elementwise::{
     accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, bf16_hidden_to_f32_into,
-    extract_vec, extract_vec_into, f32_to_bf16_hidden_into, gather_hidden_tokens_into,
-    repeat_f32_for_reduce_scatter_into, scale_f32_in_place, scaled_add_batch_into,
-    scaled_add_rows_indexed_into, scaled_add_rows_into, scaled_add_rows_token_range_into,
-    silu_mul_batch, silu_mul_batch_into, silu_mul_fused_batch_into, write_vec_into,
+    extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into,
+    gather_hidden_tokens_into, repeat_f32_for_reduce_scatter_into, scale_f32_in_place,
+    scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
+    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into,
+    silu_mul_fused_batch_into, write_vec_into,
 };
 pub use embedding::{embedding_batch, embedding_batch_vocab_shard, embedding_decode_into};
 #[cfg(feature = "kimi-k2")]
diff --git a/openinfer-kernels/src/ops/elementwise.rs b/openinfer-kernels/src/ops/elementwise.rs
index 177b2df1..e9cbdb2b 100644
--- a/openinfer-kernels/src/ops/elementwise.rs
+++ b/openinfer-kernels/src/ops/elementwise.rs
@@ -2,7 +2,7 @@ use anyhow::{Result, anyhow};
 use cudarc::driver::{CudaSlice, DevicePtr, DevicePtrMut};
 
 use crate::ffi;
-use crate::tensor::{DeviceContext, DeviceVec, HiddenStates};
+use crate::tensor::{DeviceContext, DeviceVec, HiddenStates, HiddenStatesRef};
 
 /// Batched element-wise add: out = a + b (same shape HiddenStates)
 pub fn add_batch(ctx: &DeviceContext, a: &HiddenStates, b: &HiddenStates) -> Result<HiddenStates> {
@@ -477,10 +477,19 @@ pub fn extract_vec(
     ctx: &DeviceContext,
     batch: &HiddenStates,
     token_idx: usize,
+) -> Result<DeviceVec> {
+    extract_vec_ref(ctx, batch.as_ref(), token_idx)
+}
+
+/// Extract a single token's vector from a borrowed HiddenStates batch.
+pub fn extract_vec_ref(
+    ctx: &DeviceContext,
+    batch: HiddenStatesRef<'_>,
+    token_idx: usize,
 ) -> Result<DeviceVec> {
     let len = batch.hidden_dim;
     let mut out = DeviceVec::zeros(ctx, len)?;
-    extract_vec_into(ctx, batch, token_idx, &mut out)?;
+    extract_vec_ref_into(ctx, batch, token_idx, &mut out)?;
     Ok(out)
 }
 
@@ -490,6 +499,16 @@ pub fn extract_vec_into(
     batch: &HiddenStates,
     token_idx: usize,
     out: &mut DeviceVec,
+) -> Result<()> {
+    extract_vec_ref_into(ctx, batch.as_ref(), token_idx, out)
+}
+
+/// Copy one column from a borrowed `batch` into a pre-allocated `out`.
+pub fn extract_vec_ref_into(
+    ctx: &DeviceContext,
+    batch: HiddenStatesRef<'_>,
+    token_idx: usize,
+    out: &mut DeviceVec,
 ) -> Result<()> {
     let offset = token_idx * batch.hidden_dim;
     let len = batch.hidden_dim;

From 867069292b01b638d4074e2882210d8a567dcb0f Mon Sep 17 00:00:00 2001
From: CAICAIIs <3360776475@qq.com>
Date: Wed, 10 Jun 2026 22:39:12 +0800
Subject: [PATCH 2/3] chore(dsv2lite): tighten nccl dense exchange follow-up

---
 docs/index.md                                 |  2 +-
 .../decode-attribution-gate.md                |  2 +-
 .../device-resident-nccl-combine.md           | 10 +--
 docs/models/deepseek-v2-lite/status.md        |  4 +-
 .../src/nccl_backend.rs                       | 74 ++-----------------
 .../src/nccl_backend/tests.rs                 | 33 +++++++++
 .../src/runtime/readiness.rs                  | 22 +-----
 .../src/runtime/readiness/tests.rs            | 15 ++++
 8 files changed, 65 insertions(+), 97 deletions(-)
 create mode 100644 openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs
 create mode 100644 openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs

diff --git a/docs/index.md b/docs/index.md
index 39fc5e3d..22ce75fc 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -66,7 +66,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
 | `models/deepseek-v2-lite/hf-accuracy-gate.md` | DeepSeek-V2-Lite EP2 HF accuracy gate after PR #149/#150: HF incremental greedy, host-staged EP2, and NCCL EP2 are token/text exact for `Hello`, output_len=16. |
 | `models/deepseek-v2-lite/decode-attribution-gate.md` | DeepSeek-V2-Lite EP2 decode attribution gate for `Hello`/16-token batch sizes 1/4/8: structured JSON with accuracy hashes, CPU-side timing, selected CUDA event/NVTX attribution, host-staged/NCCL EP counts, and explicit no-throughput claim boundary. |
 | `models/deepseek-v2-lite/source-layout.md` | DeepSeek-V2-Lite runtime layout refactor: `runtime.rs` split by responsibility, HF/host-staged/NCCL EP2 E2E exact on 2x RTX 5090; NCCL CUDA Graph smoke remains a diagnostic blocker on that host, independent of the passed correctness gate. |
-| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine uses reusable device-resident f32 scratch; after issue #276, dense exchange also uses reusable scratch, so remaining graph blockers are host-directed routing/expert accumulation. |
+| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine uses reusable device-resident f32 scratch; current NCCL graph-readiness blockers live in `status.md`. |
 
 ## models / kimi-k2
 
diff --git a/docs/models/deepseek-v2-lite/decode-attribution-gate.md b/docs/models/deepseek-v2-lite/decode-attribution-gate.md
index e4adf037..769e350f 100644
--- a/docs/models/deepseek-v2-lite/decode-attribution-gate.md
+++ b/docs/models/deepseek-v2-lite/decode-attribution-gate.md
@@ -71,7 +71,7 @@ python tools/accuracy/compare_dsv2_lite_ep2_outputs.py \
   --require-all-exact
 ```
 
-Then collect attribution for the same two openinfer backends. Use `--batch-size 1` for the original single-row gate, and `--batch-size 4` / `--batch-size 8` for the true-batch benchmark attribution shape:
+Then collect attribution for the same two openinfer backends. Use `--batch-size 1` for the original single-row gate, and `--batch-size 4` / `--batch-size 8` for the true-batch benchmark attribution shape. If the NCCL runtime should come from a Python CUDA wheel rather than the system install, set `OPENINFER_NCCL_PYTHON` or `OPENINFER_TRITON_PYTHON` to that environment's Python for the attribution command too.
 
 ```bash
 cargo run --release -p openinfer-deepseek-v2-lite \
diff --git a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md
index eab6ce9c..c2cd140a 100644
--- a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md
+++ b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md
@@ -1,6 +1,6 @@
 # DeepSeek-V2-Lite Device-Resident NCCL Combine
 
-> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Issue #276 later removes the dense-exchange allocation/sync blockers; full decode capture is still blocked by host-directed routing/expert accumulation.
+> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Current NCCL graph-readiness blockers live in `status.md`.
 
 Last touched: 2026-06
 
@@ -23,11 +23,11 @@ Last touched: 2026-06
   1. Add a shared CUDA helper that accumulates a bf16 single-token expert output into a f32 device contribution buffer at a selected token row.
   2. Re-export that helper through `openinfer-core::ops`.
   3. Add reusable NCCL combine scratch buffers inside `NaiveNcclEp2Backend`, clear the f32 send scratch per MoE call, accumulate local/remote expert outputs on the owning device, all-reduce device buffers, and cast rank0 f32 result to bf16 on device.
-  4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while the then-remaining host routing and dense-exchange blockers stay explicit.
+  4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while the remaining host routing and dense-exchange blockers stay explicit.
   5. Run formatting and local compile gates, then use the provided remote GPU host for the DeepSeek-V2-Lite EP2 exactness and attribution gates.
 - **Risks / open questions**:
   - Device f32 accumulation must preserve the existing expert-id accumulation order before the final bf16 cast.
-  - At the time of issue #275, dense exchange and host route selection still blocked full decode CUDA Graph capture; issue #276 later removed the dense-exchange allocation/sync blockers, but `full_decode_capture_ready` should remain false while host-directed routing/expert accumulation remain.
+  - Dense exchange and host route selection still blocked full decode CUDA Graph capture at the time of issue #275; keep the current blocker list in `status.md`.
   - The provided SSH credential should stay local to the validation session and must not be echoed into docs or final output.
 
 ## Execution Log
@@ -104,8 +104,6 @@ Before/after readiness comparison for the same model snapshot and diagnostic sha
 | Baseline NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_contribution_accumulation_on_host`, `nccl_combine_h2d_contribution_copy`, `nccl_combine_allocates_per_call`, `nccl_combine_syncs_rank_streams`, `nccl_combine_d2h_result_copy` |
 | Candidate NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` |
 
-Current issue #276 readiness removes the dense-exchange allocation/sync blockers as well. The retained NCCL blockers are `nccl_route_iteration_on_host` and `nccl_expert_accumulation_host_directed`.
-
 Removed blockers:
 
 - `nccl_contribution_accumulation_on_host`
@@ -120,4 +118,4 @@ The candidate report replaces the old `nccl_contribution_accumulate` and `nccl_c
 
 The implementation keeps host-staged unchanged as the correctness oracle. The NCCL backend now owns reusable rank0/rank1 f32 send/recv scratch buffers behind `DeviceCombineScratch`; each MoE call clears the f32 send scratch on device, accumulates one-token expert outputs into the owning rank's send scratch with a CUDA helper, runs the f32 NCCL all-reduce, and casts rank0's f32 result back to bf16 on device.
 
-The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so issue #275 did not claim full CUDA Graph readiness. Issue #276 then moved dense exchange to reusable bf16 scratch; the remaining host-directed readiness blockers are still real and should drive the next slice.
+The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so issue #275 did not claim full CUDA Graph readiness. The current blocker list should stay in `status.md`.
diff --git a/docs/models/deepseek-v2-lite/status.md b/docs/models/deepseek-v2-lite/status.md
index 4b915f34..99850528 100644
--- a/docs/models/deepseek-v2-lite/status.md
+++ b/docs/models/deepseek-v2-lite/status.md
@@ -14,7 +14,7 @@ Last touched: 2026-06
 | Decode attribution | Available | PR #162 and PR #169 add CPU/GPU attribution, route counts, NCCL counters, CUDA event timing, and optional NVTX correlation. |
 | Direct same-prompt diagnostic batch | Available | PR #184 and PR #196 cover batch sizes `1`, `4`, and `8` for the fixed same-prompt direct path. |
 | Device-resident NCCL combine | Available | Issue #275 keeps NCCL combine contributions/results on reusable f32 device scratch and preserves the HF / host-staged / NCCL exact gate on 2x RTX 5090. |
-| Device-resident NCCL dense exchange | Available | Issue #276 keeps dense-exchange rank recv/zero-send buffers on reusable backend-owned bf16 scratch, clears rank1 zero-send every exchange, removes dense-exchange stream sync from the backend call, and preserves HF / host-staged / NCCL exactness on 2x RTX 5090. |
+| Device-resident NCCL dense exchange | Available | Issue #276 reuses backend-owned bf16 dense-exchange scratch, clears rank1 zero-send every exchange, removes dense-exchange stream sync from the backend call, and preserves HF / host-staged / NCCL exactness on 2x RTX 5090. |
 | NCCL CUDA Graph readiness | Diagnostic only | The attribution binary emits `cuda_graph_readiness`. Current NCCL full decode capture remains blocked by host route iteration and host-directed expert accumulation; the removed dense-exchange allocation/sync blockers should stay absent. |
 | Production continuous batching | Not available | The direct diagnostic batch path is not mixed-request HTTP serving. |
 | vLLM production parity | Not claimed | The manual vLLM snapshot below is for understanding the gap requested in issue #170. |
@@ -101,7 +101,7 @@ Do not claim:
 
 Issue #205 records the model roadmap. Maintainer feedback there calls out NCCL plus CUDA Graph as the likely best decode direction, with host staging possibly deprecated later. Treat that as a future direction, not as current evidence.
 
-The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers, and issue #276 removed the dense-exchange allocation/sync blockers from the retained 2x RTX 5090 attribution gate. The remaining NCCL blockers are host route iteration and host-directed expert accumulation. The optional f32 NCCL graph smoke is a separate collective-only diagnostic and is not #276 evidence. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate.
+The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers, and issue #276 removed the dense-exchange allocation/sync blockers from the retained 2x RTX 5090 attribution gate. Those removed dense-exchange blockers are absent from the current readiness report. The remaining NCCL blockers are host route iteration and host-directed expert accumulation. The optional f32 NCCL graph smoke is a separate collective-only diagnostic and is not #276 evidence. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate.
 
 The next implementation should be chosen from measured evidence:
 
diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend.rs b/openinfer-deepseek-v2-lite/src/nccl_backend.rs
index a8be12fd..6042989a 100644
--- a/openinfer-deepseek-v2-lite/src/nccl_backend.rs
+++ b/openinfer-deepseek-v2-lite/src/nccl_backend.rs
@@ -29,6 +29,9 @@ use serde::Serialize;
 
 use crate::device::activate;
 
+#[cfg(test)]
+mod tests;
+
 type NcclCommInitAll = unsafe extern "C" fn(*mut ncclComm_t, c_int, *const c_int) -> ncclResult_t;
 type NcclCommCount = unsafe extern "C" fn(ncclComm_t, *mut c_int) -> ncclResult_t;
 type NcclCommCuDevice = unsafe extern "C" fn(ncclComm_t, *mut c_int) -> ncclResult_t;
@@ -127,7 +130,7 @@ impl DeviceDenseExchangeScratch {
         rank1: &DeviceContext,
         hidden_dim: usize,
         seq_len: usize,
-    ) -> Result<()> {
+    ) -> Result<usize> {
         let elems = dense_exchange_elems(hidden_dim, seq_len)?;
         if self.hidden_dim == hidden_dim
             && self.seq_len == seq_len
@@ -144,7 +147,7 @@ impl DeviceDenseExchangeScratch {
                 .as_ref()
                 .is_some_and(|buf| buf.len() >= elems)
         {
-            return Ok(());
+            return Ok(elems);
         }
 
         activate(rank0)?;
@@ -161,33 +164,6 @@ impl DeviceDenseExchangeScratch {
         self.rank0_recv = Some(rank0_recv);
         self.rank1_send_zero = Some(rank1_send_zero);
         self.rank1_recv = Some(rank1_recv);
-        Ok(())
-    }
-
-    fn ensure_shape(&self, hidden_dim: usize, seq_len: usize) -> Result<usize> {
-        let elems = dense_exchange_elems(hidden_dim, seq_len)?;
-        ensure!(
-            self.hidden_dim == hidden_dim && self.seq_len == seq_len,
-            "DeepSeek-V2-Lite NCCL dense exchange scratch shape mismatch: scratch=[{}, {}], requested=[{}, {}]",
-            self.hidden_dim,
-            self.seq_len,
-            hidden_dim,
-            seq_len
-        );
-        ensure!(
-            self.rank0_recv
-                .as_ref()
-                .is_some_and(|buf| buf.len() >= elems)
-                && self
-                    .rank1_send_zero
-                    .as_ref()
-                    .is_some_and(|buf| buf.len() >= elems)
-                && self
-                    .rank1_recv
-                    .as_ref()
-                    .is_some_and(|buf| buf.len() >= elems),
-            "DeepSeek-V2-Lite NCCL dense exchange scratch is not initialized for {elems} elements"
-        );
         Ok(elems)
     }
 
@@ -360,8 +336,7 @@ impl NaiveNcclEp2Backend {
             "DeepSeek-V2-Lite NCCL dense hidden exchange requires non-empty hidden states"
         );
         let mut scratch = self.dense_exchange_scratch()?;
-        let elems = dense_exchange_elems(input.hidden_dim, input.seq_len)?;
-        scratch.ensure(rank0, rank1, input.hidden_dim, input.seq_len)?;
+        let elems = scratch.ensure(rank0, rank1, input.hidden_dim, input.seq_len)?;
         activate(rank1)?;
         rank1
             .stream
@@ -369,7 +344,6 @@ impl NaiveNcclEp2Backend {
                 "DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch is missing",
             )?)
             .context("clear DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch")?;
-        scratch.ensure_shape(input.hidden_dim, input.seq_len)?;
 
         let DeviceDenseExchangeScratch {
             rank0_recv,
@@ -1268,39 +1242,3 @@ unsafe fn load_symbol<T: Copy>(library: &Library, symbol: &'static [u8]) -> Resu
             )
         })
 }
-
-#[cfg(test)]
-mod nccl_loader_tests {
-    use std::{
-        fs,
-        time::{SystemTime, UNIX_EPOCH},
-    };
-
-    use super::*;
-
-    #[test]
-    fn finds_nccl_python_wheel_lib_dir_from_python_executable() {
-        let unique = SystemTime::now()
-            .duration_since(UNIX_EPOCH)
-            .expect("system time before epoch")
-            .as_nanos();
-        let root = env::temp_dir().join(format!(
-            "openinfer-nccl-wheel-test-{}-{unique}",
-            std::process::id()
-        ));
-        let python_dir = root.join("bin");
-        let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib");
-        fs::create_dir_all(&python_dir).expect("create python bin dir");
-        fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir");
-        fs::write(wheel_dir.join("libnccl.so.2"), []).expect("create fake NCCL lib marker");
-
-        let mut roots = Vec::new();
-        let mut seen = HashSet::new();
-        add_python_env_root(&mut roots, &mut seen, &python_dir.join("python"));
-
-        assert_eq!(roots, vec![root.clone()]);
-        assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]);
-
-        fs::remove_dir_all(root).expect("remove temp root");
-    }
-}
diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs
new file mode 100644
index 00000000..505d6e49
--- /dev/null
+++ b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs
@@ -0,0 +1,33 @@
+use std::{
+    collections::HashSet,
+    env, fs,
+    time::{SystemTime, UNIX_EPOCH},
+};
+
+use super::{add_python_env_root, nccl_python_wheel_lib_dirs_from_root};
+
+#[test]
+fn finds_nccl_python_wheel_lib_dir_from_python_executable() {
+    let unique = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .expect("system time before epoch")
+        .as_nanos();
+    let root = env::temp_dir().join(format!(
+        "openinfer-nccl-wheel-test-{}-{unique}",
+        std::process::id()
+    ));
+    let python_dir = root.join("bin");
+    let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib");
+    fs::create_dir_all(&python_dir).expect("create python bin dir");
+    fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir");
+    fs::write(wheel_dir.join("libnccl.so.2"), []).expect("create fake NCCL lib marker");
+
+    let mut roots = Vec::new();
+    let mut seen = HashSet::new();
+    add_python_env_root(&mut roots, &mut seen, &python_dir.join("python"));
+
+    assert_eq!(roots, vec![root.clone()]);
+    assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]);
+
+    fs::remove_dir_all(root).expect("remove temp root");
+}
diff --git a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs
index 4881015d..7311902c 100644
--- a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs
+++ b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs
@@ -9,6 +9,9 @@ use super::{
     },
 };
 
+#[cfg(test)]
+mod tests;
+
 impl DeepSeekV2LiteEp2Generator {
     pub fn decode_graph_readiness_report(
         &self,
@@ -103,22 +106,3 @@ fn decode_graph_blockers(backend: EpBackendKind) -> Vec<DecodeGraphBlocker> {
         ],
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn nccl_readiness_reports_only_remaining_graph_blockers() {
-        let blockers = decode_graph_blockers(EpBackendKind::Nccl);
-        let ids: Vec<_> = blockers.iter().map(|blocker| blocker.id).collect();
-
-        assert_eq!(
-            ids,
-            vec![
-                "nccl_route_iteration_on_host",
-                "nccl_expert_accumulation_host_directed",
-            ]
-        );
-    }
-}
diff --git a/openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs b/openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs
new file mode 100644
index 00000000..25dc47de
--- /dev/null
+++ b/openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs
@@ -0,0 +1,15 @@
+use super::{EpBackendKind, decode_graph_blockers};
+
+#[test]
+fn nccl_readiness_reports_only_remaining_graph_blockers() {
+    let blockers = decode_graph_blockers(EpBackendKind::Nccl);
+    let ids: Vec<_> = blockers.iter().map(|blocker| blocker.id).collect();
+
+    assert_eq!(
+        ids,
+        vec![
+            "nccl_route_iteration_on_host",
+            "nccl_expert_accumulation_host_directed",
+        ]
+    );
+}

From c205c4e8483a7dd03b82f8b6679c276731ddf49d Mon Sep 17 00:00:00 2001
From: CAICAIIs <3360776475@qq.com>
Date: Thu, 11 Jun 2026 01:42:32 +0800
Subject: [PATCH 3/3] fix(dsv2lite): address nccl review follow-ups

---
 .../src/nccl_backend.rs                       |  6 ++++-
 .../src/nccl_backend/tests.rs                 | 19 ++++++++++++++++
 openinfer-kernels/src/ops/elementwise.rs      | 22 ++++++++++++++++++-
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend.rs b/openinfer-deepseek-v2-lite/src/nccl_backend.rs
index 6042989a..28b2d393 100644
--- a/openinfer-deepseek-v2-lite/src/nccl_backend.rs
+++ b/openinfer-deepseek-v2-lite/src/nccl_backend.rs
@@ -1227,11 +1227,15 @@ fn nccl_python_wheel_lib_dirs_from_root(root: &Path) -> Vec<PathBuf> {
 }
 
 fn add_python_wheel_lib_dir(dirs: &mut Vec<PathBuf>, seen: &mut HashSet<PathBuf>, dir: PathBuf) {
-    if dir.join("libnccl.so.2").exists() && seen.insert(dir.clone()) {
+    if nccl_lib_dir_exists(&dir) && seen.insert(dir.clone()) {
         dirs.push(dir);
     }
 }
 
+fn nccl_lib_dir_exists(dir: &Path) -> bool {
+    dir.join("libnccl.so.2").exists() || dir.join("libnccl.so").exists()
+}
+
 unsafe fn load_symbol<T: Copy>(library: &Library, symbol: &'static [u8]) -> Result<T> {
     unsafe { library.get::<T>(symbol) }
         .map(|symbol| *symbol)
diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs
index 505d6e49..5f0dc832 100644
--- a/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs
+++ b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs
@@ -31,3 +31,22 @@ fn finds_nccl_python_wheel_lib_dir_from_python_executable() {
 
     fs::remove_dir_all(root).expect("remove temp root");
 }
+
+#[test]
+fn finds_nccl_python_wheel_lib_dir_with_unversioned_soname() {
+    let unique = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .expect("system time before epoch")
+        .as_nanos();
+    let root = env::temp_dir().join(format!(
+        "openinfer-nccl-wheel-unversioned-test-{}-{unique}",
+        std::process::id()
+    ));
+    let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib");
+    fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir");
+    fs::write(wheel_dir.join("libnccl.so"), []).expect("create fake NCCL lib marker");
+
+    assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]);
+
+    fs::remove_dir_all(root).expect("remove temp root");
+}
diff --git a/openinfer-kernels/src/ops/elementwise.rs b/openinfer-kernels/src/ops/elementwise.rs
index e9cbdb2b..1e651e97 100644
--- a/openinfer-kernels/src/ops/elementwise.rs
+++ b/openinfer-kernels/src/ops/elementwise.rs
@@ -510,9 +510,14 @@ pub fn extract_vec_ref_into(
     token_idx: usize,
     out: &mut DeviceVec,
 ) -> Result<()> {
-    let offset = token_idx * batch.hidden_dim;
     let len = batch.hidden_dim;
     anyhow::ensure!(out.len == len, "extract_vec_into len mismatch");
+    anyhow::ensure!(
+        token_idx < batch.seq_len,
+        "extract_vec_into token index {token_idx} out of bounds for seq_len {}",
+        batch.seq_len
+    );
+    let offset = token_idx * batch.hidden_dim;
     let src_view = batch.data.slice(offset..offset + len);
     ctx.stream
         .memcpy_dtod(&src_view, &mut out.data)
@@ -606,4 +611,19 @@ mod tests {
         }
         Ok(())
     }
+
+    #[test]
+    fn extract_vec_ref_rejects_out_of_bounds_token() -> Result<()> {
+        let ctx = DeviceContext::new()?;
+        let hidden = hidden_from_host(&ctx, &[bf16::from_f32(1.0), bf16::from_f32(2.0)], 2, 1)?;
+        let mut out = DeviceVec::zeros(&ctx, 2)?;
+
+        let err = extract_vec_ref_into(&ctx, hidden.as_ref(), 1, &mut out).unwrap_err();
+
+        assert!(
+            err.to_string().contains("out of bounds"),
+            "unexpected error: {err}"
+        );
+        Ok(())
+    }
 }