From a68171e0042204b51e66ea2fbc7f03ad82f2248b Mon Sep 17 00:00:00 2001 From: CAICAIIs <3360776475@qq.com> Date: Wed, 10 Jun 2026 20:37:45 +0800 Subject: [PATCH 1/3] fix(dsv2lite): reuse nccl dense exchange scratch --- docs/index.md | 2 +- .../decode-attribution-gate.md | 16 +- .../device-resident-nccl-combine.md | 12 +- docs/models/deepseek-v2-lite/status.md | 9 +- openinfer-core/src/ops.rs | 6 +- .../src/nccl_backend.rs | 364 +++++++++++++++++- openinfer-deepseek-v2-lite/src/runtime/moe.rs | 21 +- .../src/runtime/readiness.rs | 29 +- openinfer-kernels/src/ops.rs | 9 +- openinfer-kernels/src/ops/elementwise.rs | 23 +- 10 files changed, 432 insertions(+), 59 deletions(-) diff --git a/docs/index.md b/docs/index.md index 328acac7..39fc5e3d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -66,7 +66,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | `models/deepseek-v2-lite/hf-accuracy-gate.md` | DeepSeek-V2-Lite EP2 HF accuracy gate after PR #149/#150: HF incremental greedy, host-staged EP2, and NCCL EP2 are token/text exact for `Hello`, output_len=16. | | `models/deepseek-v2-lite/decode-attribution-gate.md` | DeepSeek-V2-Lite EP2 decode attribution gate for `Hello`/16-token batch sizes 1/4/8: structured JSON with accuracy hashes, CPU-side timing, selected CUDA event/NVTX attribution, host-staged/NCCL EP counts, and explicit no-throughput claim boundary. | | `models/deepseek-v2-lite/source-layout.md` | DeepSeek-V2-Lite runtime layout refactor: `runtime.rs` split by responsibility, HF/host-staged/NCCL EP2 E2E exact on 2x RTX 5090; NCCL CUDA Graph smoke remains a diagnostic blocker on that host, independent of the passed correctness gate. | -| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine now uses reusable device-resident f32 scratch, HF/host-staged/NCCL exact on 2x RTX 5090, and remaining graph blockers are dense exchange plus host-directed routing. | +| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine uses reusable device-resident f32 scratch; after issue #276, dense exchange also uses reusable scratch, so remaining graph blockers are host-directed routing/expert accumulation. | ## models / kimi-k2 diff --git a/docs/models/deepseek-v2-lite/decode-attribution-gate.md b/docs/models/deepseek-v2-lite/decode-attribution-gate.md index 5f3d207a..e4adf037 100644 --- a/docs/models/deepseek-v2-lite/decode-attribution-gate.md +++ b/docs/models/deepseek-v2-lite/decode-attribution-gate.md @@ -133,13 +133,13 @@ ncclMaxSharedMem 82240 exceeds device/fn maxSharedMem 79856 NCCL WARN Cuda failure 1 'invalid argument' ``` -Use a newer NCCL runtime through the normal library path if the system runtime fails this way. The project code path should not change just to work around local NCCL installation age. +The NCCL loader now tries explicit overrides first (`OPENINFER_NCCL_LIB`, then `OPENINFER_NCCL_LIB_DIR` / `OPENINFER_NCCL_LIBRARY_PATH`), then Python wheel NCCL directories discoverable from `OPENINFER_NCCL_PYTHON`, `OPENINFER_TRITON_PYTHON`, `VIRTUAL_ENV`, or `CONDA_PREFIX`, and finally the system `libnccl.so.2` / `libnccl.so`. This keeps the code path unchanged while avoiding a stale system NCCL when the validation environment already has a newer CUDA wheel runtime. The HF oracle needs a Python environment that can load DeepSeek-V2-Lite with `trust_remote_code=True`. The helper script tolerates the model file's optional `flash_attn` import check when FlashAttention is not installed, but the HF environment remains separate from the Rust runtime claim: it is only the truth-source generator for the comparison JSON. ## Latest Validation -The issue #275 refresh was rerun on 2026-06-08 with DeepSeek-V2-Lite snapshot `604d5664dddd88a0433dbae533b7fe9472482de0`, `prompt="Hello"`, `output_len=16`, and 2x RTX 5090. HF, host-staged, and NCCL were dumped from the same model directory and compared with `--require-all-exact`. +The issue #276 refresh was rerun on 2026-06-10 with DeepSeek-V2-Lite snapshot `604d5664dddd88a0433dbae533b7fe9472482de0`, `prompt="Hello"`, `output_len=16`, and 2x RTX 5090. HF, host-staged, and NCCL were dumped from the same model directory and compared with `--require-all-exact`. The Rust path loaded NCCL `2.30.7+cuda12.9` from the Python CUDA wheel path because the system NCCL `2.25.1+cuda12.8` failed the init smoke on this Blackwell host before model-level validation. - HF / host-staged / NCCL comparison: `all_token_text_exact`. - Token SHA256: `4fb4c8825fe4d2c4a1d966da25c259abdf675f4de4548daa5d41aea7dfe30225`. @@ -147,7 +147,15 @@ The issue #275 refresh was rerun on 2026-06-08 with DeepSeek-V2-Lite snapshot `6 - Generated text: `, I am a 19 year old girl from the UK. I am`. - Candidate NCCL attribution: `gpu_timing.sample_count=8384`, `failure_count=0`. -The candidate readiness report still has `full_decode_capture_ready=false`. Compared with the same-host baseline attribution, it removed `nccl_contribution_accumulation_on_host`, `nccl_combine_h2d_contribution_copy`, `nccl_combine_allocates_per_call`, `nccl_combine_syncs_rank_streams`, and `nccl_combine_d2h_result_copy`. The remaining blockers are `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, and `nccl_expert_accumulation_host_directed`. +The candidate readiness report still has `full_decode_capture_ready=false`. Compared with the issue #275 candidate, it removes the dense-exchange allocation/sync blockers. The remaining blockers are `nccl_route_iteration_on_host` and `nccl_expert_accumulation_host_directed`. + +Current NCCL attribution for the issue #276 gate: + +| Batch | GPU event samples | GPU failures | NCCL exchange/combine calls | Route counters | Readiness blockers | +| ---: | ---: | ---: | --- | --- | --- | +| 1 | 8384 | 0 | `416 / 416` | `local=1284`, `remote=1212`, `combine=2496` | `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` | +| 4 | 23996 | 0 | `494 / 494` | `local=5136`, `remote=4848`, `combine=9984` | `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` | +| 8 | 44812 | 0 | `598 / 598` | `local=10272`, `remote=9696`, `combine=19968` | `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` | The previous A800 strict same-host accuracy gate was rerun on 2026-06-04 with DeepSeek-V2-Lite snapshot `604d5664dddd88a0433dbae533b7fe9472482de0`, `prompt="Hello"`, `output_len=16`, and 2x A800-SXM4-80GB. The token/text oracle is confirmed by a real HF `AutoModelForCausalLM.generate(..., do_sample=false, use_cache=true)` run on the same model directory as the Rust E2E gate. @@ -158,7 +166,7 @@ The Rust E2E accepts the known HF-confirmed RTX 5090 and A800 hash pairs for thi - Text SHA256: `4aaafbe4b3a46bc5b9ab5ea8d09d5fad71225006c2e234e87a928e3265b387c6`. - Generated text: `, I am a 20 year old female and I have been having a`. -The graph-readiness diagnostic was rerun on 2026-06-04 on the same model snapshot and 2x A800-SXM4-80GB: +The historical graph-readiness diagnostic before the #275/#276 device-scratch work was rerun on 2026-06-04 on the same model snapshot and 2x A800-SXM4-80GB: - `full_decode_capture_ready=false`; - `status=blocked_full_decode_path`; diff --git a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md index 7ac35f70..eab6ce9c 100644 --- a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md +++ b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md @@ -1,6 +1,6 @@ # DeepSeek-V2-Lite Device-Resident NCCL Combine -> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Full decode capture is still blocked by dense exchange allocation/sync and host-directed routing. +> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Issue #276 later removes the dense-exchange allocation/sync blockers; full decode capture is still blocked by host-directed routing/expert accumulation. Last touched: 2026-06 @@ -23,11 +23,11 @@ Last touched: 2026-06 1. Add a shared CUDA helper that accumulates a bf16 single-token expert output into a f32 device contribution buffer at a selected token row. 2. Re-export that helper through `openinfer-core::ops`. 3. Add reusable NCCL combine scratch buffers inside `NaiveNcclEp2Backend`, clear the f32 send scratch per MoE call, accumulate local/remote expert outputs on the owning device, all-reduce device buffers, and cast rank0 f32 result to bf16 on device. - 4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while remaining host routing and dense-exchange blockers stay explicit. + 4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while the then-remaining host routing and dense-exchange blockers stay explicit. 5. Run formatting and local compile gates, then use the provided remote GPU host for the DeepSeek-V2-Lite EP2 exactness and attribution gates. - **Risks / open questions**: - Device f32 accumulation must preserve the existing expert-id accumulation order before the final bf16 cast. - - Dense exchange and host route selection still block full decode CUDA Graph capture; `full_decode_capture_ready` should remain false unless validation proves otherwise. + - At the time of issue #275, dense exchange and host route selection still blocked full decode CUDA Graph capture; issue #276 later removed the dense-exchange allocation/sync blockers, but `full_decode_capture_ready` should remain false while host-directed routing/expert accumulation remain. - The provided SSH credential should stay local to the validation session and must not be echoed into docs or final output. ## Execution Log @@ -97,13 +97,15 @@ cargo clippy --release -p openinfer-deepseek-v2-lite \ Both commands passed on the same remote source copy after syncing the follow-up `host_ops.rs` and `logging.rs` fixes. The clippy command ran without `clippy::manual_midpoint`, `clippy::needless_range_loop`, or `clippy::option_option` allows. -Before/after readiness comparison for the same model snapshot and diagnostic shape: +Before/after readiness comparison for the same model snapshot and diagnostic shape at issue #275: | Report | Readiness blockers | | --- | --- | | Baseline NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_contribution_accumulation_on_host`, `nccl_combine_h2d_contribution_copy`, `nccl_combine_allocates_per_call`, `nccl_combine_syncs_rank_streams`, `nccl_combine_d2h_result_copy` | | Candidate NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` | +Current issue #276 readiness removes the dense-exchange allocation/sync blockers as well. The retained NCCL blockers are `nccl_route_iteration_on_host` and `nccl_expert_accumulation_host_directed`. + Removed blockers: - `nccl_contribution_accumulation_on_host` @@ -118,4 +120,4 @@ The candidate report replaces the old `nccl_contribution_accumulate` and `nccl_c The implementation keeps host-staged unchanged as the correctness oracle. The NCCL backend now owns reusable rank0/rank1 f32 send/recv scratch buffers behind `DeviceCombineScratch`; each MoE call clears the f32 send scratch on device, accumulates one-token expert outputs into the owning rank's send scratch with a CUDA helper, runs the f32 NCCL all-reduce, and casts rank0's f32 result back to bf16 on device. -The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so it is not claimed as full CUDA Graph readiness. The remaining readiness blockers are still real and should drive the next slice. +The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so issue #275 did not claim full CUDA Graph readiness. Issue #276 then moved dense exchange to reusable bf16 scratch; the remaining host-directed readiness blockers are still real and should drive the next slice. diff --git a/docs/models/deepseek-v2-lite/status.md b/docs/models/deepseek-v2-lite/status.md index 2c6b1982..4b915f34 100644 --- a/docs/models/deepseek-v2-lite/status.md +++ b/docs/models/deepseek-v2-lite/status.md @@ -1,6 +1,6 @@ # DeepSeek-V2-Lite Status And Benchmark Ledger -> **TL;DR:** DeepSeek-V2-Lite is a feature-gated EP2 correctness and attribution target. HF, host-staged, and NCCL match for the narrow greedy gate; NCCL decode combine now uses reusable device-resident f32 scratch, while dense exchange and host-directed routing still block full decode graph capture. Current batch and vLLM data remain diagnostic and do not claim production serving parity. +> **TL;DR:** DeepSeek-V2-Lite is a feature-gated EP2 correctness and attribution target. HF, host-staged, and NCCL match for the narrow greedy gate; NCCL decode combine and dense exchange now use reusable device scratch, while host-directed routing/expert accumulation still block full decode graph capture. Current batch and vLLM data remain diagnostic and do not claim production serving parity. Last touched: 2026-06 @@ -14,7 +14,8 @@ Last touched: 2026-06 | Decode attribution | Available | PR #162 and PR #169 add CPU/GPU attribution, route counts, NCCL counters, CUDA event timing, and optional NVTX correlation. | | Direct same-prompt diagnostic batch | Available | PR #184 and PR #196 cover batch sizes `1`, `4`, and `8` for the fixed same-prompt direct path. | | Device-resident NCCL combine | Available | Issue #275 keeps NCCL combine contributions/results on reusable f32 device scratch and preserves the HF / host-staged / NCCL exact gate on 2x RTX 5090. | -| NCCL CUDA Graph readiness | Diagnostic only | The attribution binary now emits `cuda_graph_readiness`. Current NCCL full decode capture is blocked; the optional preallocated f32 NCCL graph smoke captures, replays, and verifies. | +| Device-resident NCCL dense exchange | Available | Issue #276 keeps dense-exchange rank recv/zero-send buffers on reusable backend-owned bf16 scratch, clears rank1 zero-send every exchange, removes dense-exchange stream sync from the backend call, and preserves HF / host-staged / NCCL exactness on 2x RTX 5090. | +| NCCL CUDA Graph readiness | Diagnostic only | The attribution binary emits `cuda_graph_readiness`. Current NCCL full decode capture remains blocked by host route iteration and host-directed expert accumulation; the removed dense-exchange allocation/sync blockers should stay absent. | | Production continuous batching | Not available | The direct diagnostic batch path is not mixed-request HTTP serving. | | vLLM production parity | Not claimed | The manual vLLM snapshot below is for understanding the gap requested in issue #170. | @@ -100,7 +101,7 @@ Do not claim: Issue #205 records the model roadmap. Maintainer feedback there calls out NCCL plus CUDA Graph as the likely best decode direction, with host staging possibly deprecated later. Treat that as a future direction, not as current evidence. -The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers from the retained 2x RTX 5090 attribution gate, but dense exchange allocation/sync and host-directed routing remain. A basic preallocated f32 NCCL all-reduce smoke captures, replays, and verifies on the retained A800 run, but that proves only the collective smoke shape. It is not full decode CUDA Graph coverage. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate. +The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers, and issue #276 removed the dense-exchange allocation/sync blockers from the retained 2x RTX 5090 attribution gate. The remaining NCCL blockers are host route iteration and host-directed expert accumulation. The optional f32 NCCL graph smoke is a separate collective-only diagnostic and is not #276 evidence. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate. The next implementation should be chosen from measured evidence: @@ -108,7 +109,7 @@ The next implementation should be chosen from measured evidence: - keep HF / host-staged / NCCL exact before and after; - keep host-staged as the correctness baseline while it exists; - preserve attribution before and after the change; - - attack dense exchange allocation/sync and host route iteration next; + - attack host route iteration and host-directed expert accumulation next; - avoid broad generic EP or multi-node work; - judge issue #170 by whether it reduces NCCL decode overhead and makes the path more graph-friendly. diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs index b54379aa..f8e407f0 100644 --- a/openinfer-core/src/ops.rs +++ b/openinfer-core/src/ops.rs @@ -16,9 +16,9 @@ pub use attention::{ pub use openinfer_kernels::ops::{ LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, bf16_hidden_to_f32_into, embedding_decode_into, extract_vec, extract_vec_into, - f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm, - gemm_into_checked, gemm_per_token, gemv, linear, lora_decode_fused_delta_group3_into, - lora_decode_fused_delta_into, pack_lora_b_rows_into, + extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into, fused_add_rms_norm_into, + gather_hidden_tokens_into, gemm, gemm_into_checked, gemm_per_token, gemv, linear, + lora_decode_fused_delta_group3_into, lora_decode_fused_delta_into, pack_lora_b_rows_into, qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into, rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place, scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into, diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend.rs b/openinfer-deepseek-v2-lite/src/nccl_backend.rs index 7e2b184d..a8be12fd 100644 --- a/openinfer-deepseek-v2-lite/src/nccl_backend.rs +++ b/openinfer-deepseek-v2-lite/src/nccl_backend.rs @@ -1,5 +1,9 @@ use std::{ + collections::HashSet, + env, ffi::{CStr, c_char, c_int, c_void}, + fs, + path::{Path, PathBuf}, ptr, sync::{Arc, Mutex, MutexGuard}, }; @@ -19,7 +23,7 @@ use half::bf16; use libloading::Library; use openinfer_core::{ ops, - tensor::{DeviceContext, HiddenStates}, + tensor::{DeviceContext, HiddenStates, HiddenStatesRef}, }; use serde::Serialize; @@ -45,6 +49,7 @@ type NcclGetErrorString = unsafe extern "C" fn(ncclResult_t) -> *const c_char; pub(crate) struct NaiveNcclEp2Backend { lib: Arc, comms: Vec, + dense_exchange_scratch: Mutex, combine_scratch: Mutex, } @@ -95,6 +100,7 @@ impl NcclGraphSmokeReport { struct RawNcclLib { _library: Library, + source: String, comm_init_all: NcclCommInitAll, comm_count: NcclCommCount, comm_cu_device: NcclCommCuDevice, @@ -105,6 +111,108 @@ struct RawNcclLib { get_error_string: NcclGetErrorString, } +#[derive(Default)] +struct DeviceDenseExchangeScratch { + hidden_dim: usize, + seq_len: usize, + rank0_recv: Option>, + rank1_send_zero: Option>, + rank1_recv: Option>, +} + +impl DeviceDenseExchangeScratch { + fn ensure( + &mut self, + rank0: &DeviceContext, + rank1: &DeviceContext, + hidden_dim: usize, + seq_len: usize, + ) -> Result<()> { + let elems = dense_exchange_elems(hidden_dim, seq_len)?; + if self.hidden_dim == hidden_dim + && self.seq_len == seq_len + && self + .rank0_recv + .as_ref() + .is_some_and(|buf| buf.len() >= elems) + && self + .rank1_send_zero + .as_ref() + .is_some_and(|buf| buf.len() >= elems) + && self + .rank1_recv + .as_ref() + .is_some_and(|buf| buf.len() >= elems) + { + return Ok(()); + } + + activate(rank0)?; + drop(self.rank0_recv.take()); + let rank0_recv = rank0.stream.alloc_zeros::(elems)?; + activate(rank1)?; + drop(self.rank1_send_zero.take()); + drop(self.rank1_recv.take()); + let rank1_send_zero = rank1.stream.alloc_zeros::(elems)?; + let rank1_recv = rank1.stream.alloc_zeros::(elems)?; + + self.hidden_dim = hidden_dim; + self.seq_len = seq_len; + self.rank0_recv = Some(rank0_recv); + self.rank1_send_zero = Some(rank1_send_zero); + self.rank1_recv = Some(rank1_recv); + Ok(()) + } + + fn ensure_shape(&self, hidden_dim: usize, seq_len: usize) -> Result { + let elems = dense_exchange_elems(hidden_dim, seq_len)?; + ensure!( + self.hidden_dim == hidden_dim && self.seq_len == seq_len, + "DeepSeek-V2-Lite NCCL dense exchange scratch shape mismatch: scratch=[{}, {}], requested=[{}, {}]", + self.hidden_dim, + self.seq_len, + hidden_dim, + seq_len + ); + ensure!( + self.rank0_recv + .as_ref() + .is_some_and(|buf| buf.len() >= elems) + && self + .rank1_send_zero + .as_ref() + .is_some_and(|buf| buf.len() >= elems) + && self + .rank1_recv + .as_ref() + .is_some_and(|buf| buf.len() >= elems), + "DeepSeek-V2-Lite NCCL dense exchange scratch is not initialized for {elems} elements" + ); + Ok(elems) + } + + fn rank1_hidden_ref(&self) -> Result> { + Ok(HiddenStatesRef { + data: self + .rank1_recv + .as_ref() + .context("DeepSeek-V2-Lite NCCL rank1 dense exchange recv scratch is missing")?, + hidden_dim: self.hidden_dim, + seq_len: self.seq_len, + }) + } +} + +pub(crate) struct DenseExchangeOutput<'a> { + scratch: MutexGuard<'a, DeviceDenseExchangeScratch>, +} + +impl DenseExchangeOutput<'_> { + pub(crate) fn rank1_hidden(&self) -> Result> { + self.scratch.rank1_hidden_ref() + } +} + #[derive(Default)] struct DeviceCombineScratch { hidden_dim: usize, @@ -233,6 +341,7 @@ impl NaiveNcclEp2Backend { let backend = Self { lib, comms, + dense_exchange_scratch: Mutex::new(DeviceDenseExchangeScratch::default()), combine_scratch: Mutex::new(DeviceCombineScratch::default()), }; backend.validate_communicators(&ordinals)?; @@ -245,45 +354,64 @@ impl NaiveNcclEp2Backend { rank0: &DeviceContext, rank1: &DeviceContext, input: &HiddenStates, - ) -> Result { + ) -> Result> { ensure!( input.hidden_dim > 0 && input.seq_len > 0, "DeepSeek-V2-Lite NCCL dense hidden exchange requires non-empty hidden states" ); - activate(rank0)?; - let mut rank0_recv = HiddenStates::zeros(rank0, input.hidden_dim, input.seq_len)?; + let mut scratch = self.dense_exchange_scratch()?; + let elems = dense_exchange_elems(input.hidden_dim, input.seq_len)?; + scratch.ensure(rank0, rank1, input.hidden_dim, input.seq_len)?; activate(rank1)?; - let rank1_send = HiddenStates::zeros(rank1, input.hidden_dim, input.seq_len)?; - let mut rank1_recv = HiddenStates::zeros(rank1, input.hidden_dim, input.seq_len)?; + rank1 + .stream + .memset_zeros(scratch.rank1_send_zero.as_mut().context( + "DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch is missing", + )?) + .context("clear DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch")?; + scratch.ensure_shape(input.hidden_dim, input.seq_len)?; + + let DeviceDenseExchangeScratch { + rank0_recv, + rank1_send_zero, + rank1_recv, + .. + } = &mut *scratch; + let rank0_recv = rank0_recv + .as_mut() + .context("DeepSeek-V2-Lite NCCL rank0 dense exchange recv scratch is missing")?; + let rank1_send_zero = rank1_send_zero + .as_ref() + .context("DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch is missing")?; + let rank1_recv = rank1_recv + .as_mut() + .context("DeepSeek-V2-Lite NCCL rank1 dense exchange recv scratch is missing")?; // Correctness-first dense exchange: rank0 contributes the hidden state // and rank1 contributes zeros. This makes rank0 hidden visible on rank1 // without pretending to be sparse routed dispatch. - let count = input.hidden_dim * input.seq_len; self.grouped("DeepSeek-V2-Lite NCCL dense hidden all-reduce", || { activate(rank0)?; self.all_reduce_bf16( 0, &input.data, - &mut rank0_recv.data, - count, + rank0_recv, + elems, rank0.stream.cu_stream(), "DeepSeek-V2-Lite NCCL dense hidden rank0 all-reduce", )?; activate(rank1)?; self.all_reduce_bf16( 1, - &rank1_send.data, - &mut rank1_recv.data, - count, + rank1_send_zero, + rank1_recv, + elems, rank1.stream.cu_stream(), "DeepSeek-V2-Lite NCCL dense hidden rank1 all-reduce", )?; Ok(()) })?; - rank0.sync()?; - rank1.sync()?; - Ok(rank1_recv) + Ok(DenseExchangeOutput { scratch }) } pub(crate) fn clear_device_combine( @@ -739,6 +867,12 @@ impl NaiveNcclEp2Backend { .map_err(|_| anyhow::anyhow!("DeepSeek-V2-Lite NCCL device combine scratch poisoned")) } + fn dense_exchange_scratch(&self) -> Result> { + self.dense_exchange_scratch + .lock() + .map_err(|_| anyhow::anyhow!("DeepSeek-V2-Lite NCCL dense exchange scratch poisoned")) + } + fn grouped(&self, context: &str, f: impl FnOnce() -> Result<()>) -> Result<()> { let start = unsafe { // SAFETY: NCCL group state is process-global and entered/exited on @@ -769,6 +903,18 @@ fn combine_elems(hidden_dim: usize, seq_len: usize) -> Result { }) } +fn dense_exchange_elems(hidden_dim: usize, seq_len: usize) -> Result { + ensure!( + hidden_dim > 0 && seq_len > 0, + "DeepSeek-V2-Lite NCCL dense exchange requires non-empty shape, got hidden_dim={hidden_dim}, seq_len={seq_len}" + ); + hidden_dim.checked_mul(seq_len).with_context(|| { + format!( + "DeepSeek-V2-Lite NCCL dense exchange shape overflow: hidden_dim={hidden_dim}, seq_len={seq_len}" + ) + }) +} + fn cleanup_capture(ctx: &DeviceContext, capture_started: bool) { if capture_started { let _ = activate(ctx); @@ -896,19 +1042,19 @@ impl Drop for NaiveNcclEp2Backend { impl RawNcclLib { fn load() -> Result { let mut tried = Vec::new(); - for candidate in ["libnccl.so.2", "libnccl.so"] { - tried.push(candidate); + for candidate in nccl_library_candidates() { + tried.push(candidate.clone()); let Ok(library) = (unsafe { // SAFETY: Loading NCCL is required to create the selected // runtime backend. All symbols are validated immediately below. - Library::new(candidate) + Library::new(&candidate) }) else { continue; }; return unsafe { // SAFETY: The library is kept alive inside `RawNcclLib`; copied // function pointers do not outlive it. - Self::from_library(library) + Self::from_library(library, candidate.clone()) } .with_context(|| format!("load DeepSeek-V2-Lite NCCL backend from {candidate}")); } @@ -918,7 +1064,7 @@ impl RawNcclLib { ) } - unsafe fn from_library(library: Library) -> Result { + unsafe fn from_library(library: Library, source: String) -> Result { Ok(Self { comm_init_all: unsafe { load_symbol(&library, b"ncclCommInitAll\0")? }, comm_count: unsafe { load_symbol(&library, b"ncclCommCount\0")? }, @@ -928,6 +1074,7 @@ impl RawNcclLib { group_end: unsafe { load_symbol(&library, b"ncclGroupEnd\0")? }, all_reduce: unsafe { load_symbol(&library, b"ncclAllReduce\0")? }, get_error_string: unsafe { load_symbol(&library, b"ncclGetErrorString\0")? }, + source, _library: library, }) } @@ -946,7 +1093,10 @@ impl RawNcclLib { CStr::from_ptr(ptr).to_string_lossy().into_owned() } }; - bail!("{context} failed: {message} ({status:?})") + bail!( + "{context} failed with NCCL library {}: {message} ({status:?})", + self.source + ) } fn query_comm_count(&self, comm: ncclComm_t, context: &str) -> Result { @@ -972,6 +1122,142 @@ impl RawNcclLib { } } +fn nccl_library_candidates() -> Vec { + let mut candidates = Vec::new(); + let mut seen = HashSet::new(); + + add_env_file_candidates(&mut candidates, &mut seen, "OPENINFER_NCCL_LIB"); + add_env_dir_candidates(&mut candidates, &mut seen, "OPENINFER_NCCL_LIB_DIR"); + add_env_dir_candidates(&mut candidates, &mut seen, "OPENINFER_NCCL_LIBRARY_PATH"); + for lib_dir in nccl_python_wheel_lib_dirs() { + add_nccl_dir_candidates(&mut candidates, &mut seen, &lib_dir); + } + + add_candidate(&mut candidates, &mut seen, "libnccl.so.2".to_string()); + add_candidate(&mut candidates, &mut seen, "libnccl.so".to_string()); + candidates +} + +fn add_env_file_candidates(candidates: &mut Vec, seen: &mut HashSet, key: &str) { + let Ok(value) = env::var(key) else { + return; + }; + for path in env::split_paths(&value) { + add_candidate(candidates, seen, path.to_string_lossy().into_owned()); + } +} + +fn add_env_dir_candidates(candidates: &mut Vec, seen: &mut HashSet, key: &str) { + let Ok(value) = env::var(key) else { + return; + }; + for dir in env::split_paths(&value) { + add_nccl_dir_candidates(candidates, seen, &dir); + } +} + +fn add_nccl_dir_candidates(candidates: &mut Vec, seen: &mut HashSet, dir: &Path) { + add_candidate( + candidates, + seen, + dir.join("libnccl.so.2").to_string_lossy().into_owned(), + ); + add_candidate( + candidates, + seen, + dir.join("libnccl.so").to_string_lossy().into_owned(), + ); +} + +fn add_candidate(candidates: &mut Vec, seen: &mut HashSet, candidate: String) { + if !candidate.is_empty() && seen.insert(candidate.clone()) { + candidates.push(candidate); + } +} + +fn nccl_python_wheel_lib_dirs() -> Vec { + python_env_roots() + .into_iter() + .flat_map(|root| nccl_python_wheel_lib_dirs_from_root(&root)) + .collect() +} + +fn python_env_roots() -> Vec { + let mut roots = Vec::new(); + let mut seen = HashSet::new(); + + for key in ["OPENINFER_NCCL_PYTHON", "OPENINFER_TRITON_PYTHON"] { + if let Ok(value) = env::var(key) { + add_python_env_root(&mut roots, &mut seen, Path::new(&value)); + } + } + for key in ["VIRTUAL_ENV", "CONDA_PREFIX"] { + if let Ok(value) = env::var(key) { + add_pathbuf_once(&mut roots, &mut seen, PathBuf::from(value)); + } + } + roots +} + +fn add_python_env_root(roots: &mut Vec, seen: &mut HashSet, python: &Path) { + if python.is_dir() { + add_pathbuf_once(roots, seen, python.to_path_buf()); + return; + } + if let Some(parent) = python.parent() + && parent.file_name().is_some_and(|name| name == "bin") + && let Some(root) = parent.parent() + { + add_pathbuf_once(roots, seen, root.to_path_buf()); + } +} + +fn add_pathbuf_once(paths: &mut Vec, seen: &mut HashSet, path: PathBuf) { + if !path.as_os_str().is_empty() && seen.insert(path.clone()) { + paths.push(path); + } +} + +fn nccl_python_wheel_lib_dirs_from_root(root: &Path) -> Vec { + let mut dirs = Vec::new(); + let mut seen = HashSet::new(); + add_python_wheel_lib_dir( + &mut dirs, + &mut seen, + root.join("site-packages/nvidia/nccl/lib"), + ); + + let lib_root = root.join("lib"); + if let Ok(entries) = fs::read_dir(&lib_root) { + for entry in entries.flatten() { + let path = entry.path(); + let Some(name) = path.file_name().and_then(|name| name.to_str()) else { + continue; + }; + if name.starts_with("python") { + add_python_wheel_lib_dir( + &mut dirs, + &mut seen, + path.join("site-packages/nvidia/nccl/lib"), + ); + } + } + } + + add_python_wheel_lib_dir( + &mut dirs, + &mut seen, + root.join("Lib/site-packages/nvidia/nccl/lib"), + ); + dirs +} + +fn add_python_wheel_lib_dir(dirs: &mut Vec, seen: &mut HashSet, dir: PathBuf) { + if dir.join("libnccl.so.2").exists() && seen.insert(dir.clone()) { + dirs.push(dir); + } +} + unsafe fn load_symbol(library: &Library, symbol: &'static [u8]) -> Result { unsafe { library.get::(symbol) } .map(|symbol| *symbol) @@ -982,3 +1268,39 @@ unsafe fn load_symbol(library: &Library, symbol: &'static [u8]) -> Resu ) }) } + +#[cfg(test)] +mod nccl_loader_tests { + use std::{ + fs, + time::{SystemTime, UNIX_EPOCH}, + }; + + use super::*; + + #[test] + fn finds_nccl_python_wheel_lib_dir_from_python_executable() { + let unique = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time before epoch") + .as_nanos(); + let root = env::temp_dir().join(format!( + "openinfer-nccl-wheel-test-{}-{unique}", + std::process::id() + )); + let python_dir = root.join("bin"); + let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib"); + fs::create_dir_all(&python_dir).expect("create python bin dir"); + fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir"); + fs::write(wheel_dir.join("libnccl.so.2"), []).expect("create fake NCCL lib marker"); + + let mut roots = Vec::new(); + let mut seen = HashSet::new(); + add_python_env_root(&mut roots, &mut seen, &python_dir.join("python")); + + assert_eq!(roots, vec![root.clone()]); + assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]); + + fs::remove_dir_all(root).expect("remove temp root"); + } +} diff --git a/openinfer-deepseek-v2-lite/src/runtime/moe.rs b/openinfer-deepseek-v2-lite/src/runtime/moe.rs index 828369a3..aa6dfe18 100644 --- a/openinfer-deepseek-v2-lite/src/runtime/moe.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/moe.rs @@ -1,6 +1,9 @@ use anyhow::{Result, bail}; use half::bf16; -use openinfer_core::{ops, tensor::HiddenStates}; +use openinfer_core::{ + ops, + tensor::{HiddenStates, HiddenStatesRef}, +}; use super::{DeepSeekV2LiteEp2Generator, backend::EpBackendRuntime}; use crate::{ @@ -235,6 +238,7 @@ impl DeepSeekV2LiteEp2Generator { token_index, || nccl.dense_all_reduce_rank0_hidden_to_rank1(&self.rank0.ctx, &self.rank1.ctx, input), )?; + let rank1_hidden = rank1_input.rank1_hidden()?; attribution.record_gpu_pair_result( &self.rank0.ctx, &self.rank1.ctx, @@ -271,7 +275,14 @@ impl DeepSeekV2LiteEp2Generator { || format!("layer.{layer_idx}.nccl.local_expert"), Some(layer_idx), token_index, - || expert_forward_device(&self.rank0.ctx, expert, input, token), + || { + expert_forward_device( + &self.rank0.ctx, + expert, + input.as_ref(), + token, + ) + }, )? } 1 => { @@ -284,7 +295,7 @@ impl DeepSeekV2LiteEp2Generator { || format!("layer.{layer_idx}.nccl.remote_expert"), Some(layer_idx), token_index, - || expert_forward_device(&self.rank1.ctx, expert, &rank1_input, token), + || expert_forward_device(&self.rank1.ctx, expert, rank1_hidden, token), )? } other => { @@ -377,11 +388,11 @@ impl DeepSeekV2LiteEp2Generator { fn expert_forward_device( ctx: &openinfer_core::tensor::DeviceContext, expert: &ExpertMlp, - input: &HiddenStates, + input: HiddenStatesRef<'_>, token_idx: usize, ) -> Result { activate(ctx)?; - let token = ops::extract_vec(ctx, input, token_idx)?; + let token = ops::extract_vec_ref(ctx, input, token_idx)?; let token_hidden = HiddenStates { hidden_dim: token.len, seq_len: 1, diff --git a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs index e9bb8133..4881015d 100644 --- a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs @@ -90,16 +90,6 @@ fn decode_graph_blockers(backend: EpBackendKind) -> Vec { }, ], EpBackendKind::Nccl => vec![ - DecodeGraphBlocker { - id: "nccl_dense_exchange_allocates_per_call", - source: "nccl_backend.rs::dense_all_reduce_rank0_hidden_to_rank1", - reason: "rank0/rank1 receive and zero-send buffers are allocated inside each dense exchange", - }, - DecodeGraphBlocker { - id: "nccl_dense_exchange_syncs_rank_streams", - source: "nccl_backend.rs::dense_all_reduce_rank0_hidden_to_rank1", - reason: "both rank streams are synchronized after every dense exchange", - }, DecodeGraphBlocker { id: "nccl_route_iteration_on_host", source: "runtime/moe.rs::moe_forward_nccl", @@ -113,3 +103,22 @@ fn decode_graph_blockers(backend: EpBackendKind) -> Vec { ], } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn nccl_readiness_reports_only_remaining_graph_blockers() { + let blockers = decode_graph_blockers(EpBackendKind::Nccl); + let ids: Vec<_> = blockers.iter().map(|blocker| blocker.id).collect(); + + assert_eq!( + ids, + vec![ + "nccl_route_iteration_on_host", + "nccl_expert_accumulation_host_directed", + ] + ); + } +} diff --git a/openinfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs index ffbc6f2c..722edeac 100644 --- a/openinfer-kernels/src/ops.rs +++ b/openinfer-kernels/src/ops.rs @@ -23,10 +23,11 @@ pub use deepep::{ }; pub use elementwise::{ accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, bf16_hidden_to_f32_into, - extract_vec, extract_vec_into, f32_to_bf16_hidden_into, gather_hidden_tokens_into, - repeat_f32_for_reduce_scatter_into, scale_f32_in_place, scaled_add_batch_into, - scaled_add_rows_indexed_into, scaled_add_rows_into, scaled_add_rows_token_range_into, - silu_mul_batch, silu_mul_batch_into, silu_mul_fused_batch_into, write_vec_into, + extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into, f32_to_bf16_hidden_into, + gather_hidden_tokens_into, repeat_f32_for_reduce_scatter_into, scale_f32_in_place, + scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into, + scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, + silu_mul_fused_batch_into, write_vec_into, }; pub use embedding::{embedding_batch, embedding_batch_vocab_shard, embedding_decode_into}; #[cfg(feature = "kimi-k2")] diff --git a/openinfer-kernels/src/ops/elementwise.rs b/openinfer-kernels/src/ops/elementwise.rs index 177b2df1..e9cbdb2b 100644 --- a/openinfer-kernels/src/ops/elementwise.rs +++ b/openinfer-kernels/src/ops/elementwise.rs @@ -2,7 +2,7 @@ use anyhow::{Result, anyhow}; use cudarc::driver::{CudaSlice, DevicePtr, DevicePtrMut}; use crate::ffi; -use crate::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use crate::tensor::{DeviceContext, DeviceVec, HiddenStates, HiddenStatesRef}; /// Batched element-wise add: out = a + b (same shape HiddenStates) pub fn add_batch(ctx: &DeviceContext, a: &HiddenStates, b: &HiddenStates) -> Result { @@ -477,10 +477,19 @@ pub fn extract_vec( ctx: &DeviceContext, batch: &HiddenStates, token_idx: usize, +) -> Result { + extract_vec_ref(ctx, batch.as_ref(), token_idx) +} + +/// Extract a single token's vector from a borrowed HiddenStates batch. +pub fn extract_vec_ref( + ctx: &DeviceContext, + batch: HiddenStatesRef<'_>, + token_idx: usize, ) -> Result { let len = batch.hidden_dim; let mut out = DeviceVec::zeros(ctx, len)?; - extract_vec_into(ctx, batch, token_idx, &mut out)?; + extract_vec_ref_into(ctx, batch, token_idx, &mut out)?; Ok(out) } @@ -490,6 +499,16 @@ pub fn extract_vec_into( batch: &HiddenStates, token_idx: usize, out: &mut DeviceVec, +) -> Result<()> { + extract_vec_ref_into(ctx, batch.as_ref(), token_idx, out) +} + +/// Copy one column from a borrowed `batch` into a pre-allocated `out`. +pub fn extract_vec_ref_into( + ctx: &DeviceContext, + batch: HiddenStatesRef<'_>, + token_idx: usize, + out: &mut DeviceVec, ) -> Result<()> { let offset = token_idx * batch.hidden_dim; let len = batch.hidden_dim; From 867069292b01b638d4074e2882210d8a567dcb0f Mon Sep 17 00:00:00 2001 From: CAICAIIs <3360776475@qq.com> Date: Wed, 10 Jun 2026 22:39:12 +0800 Subject: [PATCH 2/3] chore(dsv2lite): tighten nccl dense exchange follow-up --- docs/index.md | 2 +- .../decode-attribution-gate.md | 2 +- .../device-resident-nccl-combine.md | 10 +-- docs/models/deepseek-v2-lite/status.md | 4 +- .../src/nccl_backend.rs | 74 ++----------------- .../src/nccl_backend/tests.rs | 33 +++++++++ .../src/runtime/readiness.rs | 22 +----- .../src/runtime/readiness/tests.rs | 15 ++++ 8 files changed, 65 insertions(+), 97 deletions(-) create mode 100644 openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs create mode 100644 openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs diff --git a/docs/index.md b/docs/index.md index 39fc5e3d..22ce75fc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -66,7 +66,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | `models/deepseek-v2-lite/hf-accuracy-gate.md` | DeepSeek-V2-Lite EP2 HF accuracy gate after PR #149/#150: HF incremental greedy, host-staged EP2, and NCCL EP2 are token/text exact for `Hello`, output_len=16. | | `models/deepseek-v2-lite/decode-attribution-gate.md` | DeepSeek-V2-Lite EP2 decode attribution gate for `Hello`/16-token batch sizes 1/4/8: structured JSON with accuracy hashes, CPU-side timing, selected CUDA event/NVTX attribution, host-staged/NCCL EP counts, and explicit no-throughput claim boundary. | | `models/deepseek-v2-lite/source-layout.md` | DeepSeek-V2-Lite runtime layout refactor: `runtime.rs` split by responsibility, HF/host-staged/NCCL EP2 E2E exact on 2x RTX 5090; NCCL CUDA Graph smoke remains a diagnostic blocker on that host, independent of the passed correctness gate. | -| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine uses reusable device-resident f32 scratch; after issue #276, dense exchange also uses reusable scratch, so remaining graph blockers are host-directed routing/expert accumulation. | +| `models/deepseek-v2-lite/device-resident-nccl-combine.md` | Issue #275 record: NCCL decode combine uses reusable device-resident f32 scratch; current NCCL graph-readiness blockers live in `status.md`. | ## models / kimi-k2 diff --git a/docs/models/deepseek-v2-lite/decode-attribution-gate.md b/docs/models/deepseek-v2-lite/decode-attribution-gate.md index e4adf037..769e350f 100644 --- a/docs/models/deepseek-v2-lite/decode-attribution-gate.md +++ b/docs/models/deepseek-v2-lite/decode-attribution-gate.md @@ -71,7 +71,7 @@ python tools/accuracy/compare_dsv2_lite_ep2_outputs.py \ --require-all-exact ``` -Then collect attribution for the same two openinfer backends. Use `--batch-size 1` for the original single-row gate, and `--batch-size 4` / `--batch-size 8` for the true-batch benchmark attribution shape: +Then collect attribution for the same two openinfer backends. Use `--batch-size 1` for the original single-row gate, and `--batch-size 4` / `--batch-size 8` for the true-batch benchmark attribution shape. If the NCCL runtime should come from a Python CUDA wheel rather than the system install, set `OPENINFER_NCCL_PYTHON` or `OPENINFER_TRITON_PYTHON` to that environment's Python for the attribution command too. ```bash cargo run --release -p openinfer-deepseek-v2-lite \ diff --git a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md index eab6ce9c..c2cd140a 100644 --- a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md +++ b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md @@ -1,6 +1,6 @@ # DeepSeek-V2-Lite Device-Resident NCCL Combine -> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Issue #276 later removes the dense-exchange allocation/sync blockers; full decode capture is still blocked by host-directed routing/expert accumulation. +> **TL;DR:** Issue #275 moves the NCCL decode combine path to reusable device-resident f32 scratch buffers. The retained `Hello` / 16-token gate stays HF / host-staged / NCCL exact, and the readiness report no longer lists the old combine H2D/D2H/allocation/sync blockers. Current NCCL graph-readiness blockers live in `status.md`. Last touched: 2026-06 @@ -23,11 +23,11 @@ Last touched: 2026-06 1. Add a shared CUDA helper that accumulates a bf16 single-token expert output into a f32 device contribution buffer at a selected token row. 2. Re-export that helper through `openinfer-core::ops`. 3. Add reusable NCCL combine scratch buffers inside `NaiveNcclEp2Backend`, clear the f32 send scratch per MoE call, accumulate local/remote expert outputs on the owning device, all-reduce device buffers, and cast rank0 f32 result to bf16 on device. - 4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while the then-remaining host routing and dense-exchange blockers stay explicit. + 4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while the remaining host routing and dense-exchange blockers stay explicit. 5. Run formatting and local compile gates, then use the provided remote GPU host for the DeepSeek-V2-Lite EP2 exactness and attribution gates. - **Risks / open questions**: - Device f32 accumulation must preserve the existing expert-id accumulation order before the final bf16 cast. - - At the time of issue #275, dense exchange and host route selection still blocked full decode CUDA Graph capture; issue #276 later removed the dense-exchange allocation/sync blockers, but `full_decode_capture_ready` should remain false while host-directed routing/expert accumulation remain. + - Dense exchange and host route selection still blocked full decode CUDA Graph capture at the time of issue #275; keep the current blocker list in `status.md`. - The provided SSH credential should stay local to the validation session and must not be echoed into docs or final output. ## Execution Log @@ -104,8 +104,6 @@ Before/after readiness comparison for the same model snapshot and diagnostic sha | Baseline NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_contribution_accumulation_on_host`, `nccl_combine_h2d_contribution_copy`, `nccl_combine_allocates_per_call`, `nccl_combine_syncs_rank_streams`, `nccl_combine_d2h_result_copy` | | Candidate NCCL attribution | `nccl_dense_exchange_allocates_per_call`, `nccl_dense_exchange_syncs_rank_streams`, `nccl_route_iteration_on_host`, `nccl_expert_accumulation_host_directed` | -Current issue #276 readiness removes the dense-exchange allocation/sync blockers as well. The retained NCCL blockers are `nccl_route_iteration_on_host` and `nccl_expert_accumulation_host_directed`. - Removed blockers: - `nccl_contribution_accumulation_on_host` @@ -120,4 +118,4 @@ The candidate report replaces the old `nccl_contribution_accumulate` and `nccl_c The implementation keeps host-staged unchanged as the correctness oracle. The NCCL backend now owns reusable rank0/rank1 f32 send/recv scratch buffers behind `DeviceCombineScratch`; each MoE call clears the f32 send scratch on device, accumulates one-token expert outputs into the owning rank's send scratch with a CUDA helper, runs the f32 NCCL all-reduce, and casts rank0's f32 result back to bf16 on device. -The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so issue #275 did not claim full CUDA Graph readiness. Issue #276 then moved dense exchange to reusable bf16 scratch; the remaining host-directed readiness blockers are still real and should drive the next slice. +The final bf16 `HiddenStates` returned to the model is still allocated per combine call. That allocation is outside the removed NCCL contribution/result round trip, so issue #275 did not claim full CUDA Graph readiness. The current blocker list should stay in `status.md`. diff --git a/docs/models/deepseek-v2-lite/status.md b/docs/models/deepseek-v2-lite/status.md index 4b915f34..99850528 100644 --- a/docs/models/deepseek-v2-lite/status.md +++ b/docs/models/deepseek-v2-lite/status.md @@ -14,7 +14,7 @@ Last touched: 2026-06 | Decode attribution | Available | PR #162 and PR #169 add CPU/GPU attribution, route counts, NCCL counters, CUDA event timing, and optional NVTX correlation. | | Direct same-prompt diagnostic batch | Available | PR #184 and PR #196 cover batch sizes `1`, `4`, and `8` for the fixed same-prompt direct path. | | Device-resident NCCL combine | Available | Issue #275 keeps NCCL combine contributions/results on reusable f32 device scratch and preserves the HF / host-staged / NCCL exact gate on 2x RTX 5090. | -| Device-resident NCCL dense exchange | Available | Issue #276 keeps dense-exchange rank recv/zero-send buffers on reusable backend-owned bf16 scratch, clears rank1 zero-send every exchange, removes dense-exchange stream sync from the backend call, and preserves HF / host-staged / NCCL exactness on 2x RTX 5090. | +| Device-resident NCCL dense exchange | Available | Issue #276 reuses backend-owned bf16 dense-exchange scratch, clears rank1 zero-send every exchange, removes dense-exchange stream sync from the backend call, and preserves HF / host-staged / NCCL exactness on 2x RTX 5090. | | NCCL CUDA Graph readiness | Diagnostic only | The attribution binary emits `cuda_graph_readiness`. Current NCCL full decode capture remains blocked by host route iteration and host-directed expert accumulation; the removed dense-exchange allocation/sync blockers should stay absent. | | Production continuous batching | Not available | The direct diagnostic batch path is not mixed-request HTTP serving. | | vLLM production parity | Not claimed | The manual vLLM snapshot below is for understanding the gap requested in issue #170. | @@ -101,7 +101,7 @@ Do not claim: Issue #205 records the model roadmap. Maintainer feedback there calls out NCCL plus CUDA Graph as the likely best decode direction, with host staging possibly deprecated later. Treat that as a future direction, not as current evidence. -The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers, and issue #276 removed the dense-exchange allocation/sync blockers from the retained 2x RTX 5090 attribution gate. The remaining NCCL blockers are host route iteration and host-directed expert accumulation. The optional f32 NCCL graph smoke is a separate collective-only diagnostic and is not #276 evidence. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate. +The current graph-readiness diagnostic is intentionally fail-closed: `full_decode_capture_ready=false` for NCCL. Issue #275 removed the old NCCL combine H2D/D2H/allocation/sync blockers, and issue #276 removed the dense-exchange allocation/sync blockers from the retained 2x RTX 5090 attribution gate. Those removed dense-exchange blockers are absent from the current readiness report. The remaining NCCL blockers are host route iteration and host-directed expert accumulation. The optional f32 NCCL graph smoke is a separate collective-only diagnostic and is not #276 evidence. HF, host-staged, and NCCL remain token/text exact for the narrow greedy gate. The next implementation should be chosen from measured evidence: diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend.rs b/openinfer-deepseek-v2-lite/src/nccl_backend.rs index a8be12fd..6042989a 100644 --- a/openinfer-deepseek-v2-lite/src/nccl_backend.rs +++ b/openinfer-deepseek-v2-lite/src/nccl_backend.rs @@ -29,6 +29,9 @@ use serde::Serialize; use crate::device::activate; +#[cfg(test)] +mod tests; + type NcclCommInitAll = unsafe extern "C" fn(*mut ncclComm_t, c_int, *const c_int) -> ncclResult_t; type NcclCommCount = unsafe extern "C" fn(ncclComm_t, *mut c_int) -> ncclResult_t; type NcclCommCuDevice = unsafe extern "C" fn(ncclComm_t, *mut c_int) -> ncclResult_t; @@ -127,7 +130,7 @@ impl DeviceDenseExchangeScratch { rank1: &DeviceContext, hidden_dim: usize, seq_len: usize, - ) -> Result<()> { + ) -> Result { let elems = dense_exchange_elems(hidden_dim, seq_len)?; if self.hidden_dim == hidden_dim && self.seq_len == seq_len @@ -144,7 +147,7 @@ impl DeviceDenseExchangeScratch { .as_ref() .is_some_and(|buf| buf.len() >= elems) { - return Ok(()); + return Ok(elems); } activate(rank0)?; @@ -161,33 +164,6 @@ impl DeviceDenseExchangeScratch { self.rank0_recv = Some(rank0_recv); self.rank1_send_zero = Some(rank1_send_zero); self.rank1_recv = Some(rank1_recv); - Ok(()) - } - - fn ensure_shape(&self, hidden_dim: usize, seq_len: usize) -> Result { - let elems = dense_exchange_elems(hidden_dim, seq_len)?; - ensure!( - self.hidden_dim == hidden_dim && self.seq_len == seq_len, - "DeepSeek-V2-Lite NCCL dense exchange scratch shape mismatch: scratch=[{}, {}], requested=[{}, {}]", - self.hidden_dim, - self.seq_len, - hidden_dim, - seq_len - ); - ensure!( - self.rank0_recv - .as_ref() - .is_some_and(|buf| buf.len() >= elems) - && self - .rank1_send_zero - .as_ref() - .is_some_and(|buf| buf.len() >= elems) - && self - .rank1_recv - .as_ref() - .is_some_and(|buf| buf.len() >= elems), - "DeepSeek-V2-Lite NCCL dense exchange scratch is not initialized for {elems} elements" - ); Ok(elems) } @@ -360,8 +336,7 @@ impl NaiveNcclEp2Backend { "DeepSeek-V2-Lite NCCL dense hidden exchange requires non-empty hidden states" ); let mut scratch = self.dense_exchange_scratch()?; - let elems = dense_exchange_elems(input.hidden_dim, input.seq_len)?; - scratch.ensure(rank0, rank1, input.hidden_dim, input.seq_len)?; + let elems = scratch.ensure(rank0, rank1, input.hidden_dim, input.seq_len)?; activate(rank1)?; rank1 .stream @@ -369,7 +344,6 @@ impl NaiveNcclEp2Backend { "DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch is missing", )?) .context("clear DeepSeek-V2-Lite NCCL rank1 dense exchange zero-send scratch")?; - scratch.ensure_shape(input.hidden_dim, input.seq_len)?; let DeviceDenseExchangeScratch { rank0_recv, @@ -1268,39 +1242,3 @@ unsafe fn load_symbol(library: &Library, symbol: &'static [u8]) -> Resu ) }) } - -#[cfg(test)] -mod nccl_loader_tests { - use std::{ - fs, - time::{SystemTime, UNIX_EPOCH}, - }; - - use super::*; - - #[test] - fn finds_nccl_python_wheel_lib_dir_from_python_executable() { - let unique = SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("system time before epoch") - .as_nanos(); - let root = env::temp_dir().join(format!( - "openinfer-nccl-wheel-test-{}-{unique}", - std::process::id() - )); - let python_dir = root.join("bin"); - let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib"); - fs::create_dir_all(&python_dir).expect("create python bin dir"); - fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir"); - fs::write(wheel_dir.join("libnccl.so.2"), []).expect("create fake NCCL lib marker"); - - let mut roots = Vec::new(); - let mut seen = HashSet::new(); - add_python_env_root(&mut roots, &mut seen, &python_dir.join("python")); - - assert_eq!(roots, vec![root.clone()]); - assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]); - - fs::remove_dir_all(root).expect("remove temp root"); - } -} diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs new file mode 100644 index 00000000..505d6e49 --- /dev/null +++ b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs @@ -0,0 +1,33 @@ +use std::{ + collections::HashSet, + env, fs, + time::{SystemTime, UNIX_EPOCH}, +}; + +use super::{add_python_env_root, nccl_python_wheel_lib_dirs_from_root}; + +#[test] +fn finds_nccl_python_wheel_lib_dir_from_python_executable() { + let unique = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time before epoch") + .as_nanos(); + let root = env::temp_dir().join(format!( + "openinfer-nccl-wheel-test-{}-{unique}", + std::process::id() + )); + let python_dir = root.join("bin"); + let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib"); + fs::create_dir_all(&python_dir).expect("create python bin dir"); + fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir"); + fs::write(wheel_dir.join("libnccl.so.2"), []).expect("create fake NCCL lib marker"); + + let mut roots = Vec::new(); + let mut seen = HashSet::new(); + add_python_env_root(&mut roots, &mut seen, &python_dir.join("python")); + + assert_eq!(roots, vec![root.clone()]); + assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]); + + fs::remove_dir_all(root).expect("remove temp root"); +} diff --git a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs index 4881015d..7311902c 100644 --- a/openinfer-deepseek-v2-lite/src/runtime/readiness.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs @@ -9,6 +9,9 @@ use super::{ }, }; +#[cfg(test)] +mod tests; + impl DeepSeekV2LiteEp2Generator { pub fn decode_graph_readiness_report( &self, @@ -103,22 +106,3 @@ fn decode_graph_blockers(backend: EpBackendKind) -> Vec { ], } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn nccl_readiness_reports_only_remaining_graph_blockers() { - let blockers = decode_graph_blockers(EpBackendKind::Nccl); - let ids: Vec<_> = blockers.iter().map(|blocker| blocker.id).collect(); - - assert_eq!( - ids, - vec![ - "nccl_route_iteration_on_host", - "nccl_expert_accumulation_host_directed", - ] - ); - } -} diff --git a/openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs b/openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs new file mode 100644 index 00000000..25dc47de --- /dev/null +++ b/openinfer-deepseek-v2-lite/src/runtime/readiness/tests.rs @@ -0,0 +1,15 @@ +use super::{EpBackendKind, decode_graph_blockers}; + +#[test] +fn nccl_readiness_reports_only_remaining_graph_blockers() { + let blockers = decode_graph_blockers(EpBackendKind::Nccl); + let ids: Vec<_> = blockers.iter().map(|blocker| blocker.id).collect(); + + assert_eq!( + ids, + vec![ + "nccl_route_iteration_on_host", + "nccl_expert_accumulation_host_directed", + ] + ); +} From c205c4e8483a7dd03b82f8b6679c276731ddf49d Mon Sep 17 00:00:00 2001 From: CAICAIIs <3360776475@qq.com> Date: Thu, 11 Jun 2026 01:42:32 +0800 Subject: [PATCH 3/3] fix(dsv2lite): address nccl review follow-ups --- .../src/nccl_backend.rs | 6 ++++- .../src/nccl_backend/tests.rs | 19 ++++++++++++++++ openinfer-kernels/src/ops/elementwise.rs | 22 ++++++++++++++++++- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend.rs b/openinfer-deepseek-v2-lite/src/nccl_backend.rs index 6042989a..28b2d393 100644 --- a/openinfer-deepseek-v2-lite/src/nccl_backend.rs +++ b/openinfer-deepseek-v2-lite/src/nccl_backend.rs @@ -1227,11 +1227,15 @@ fn nccl_python_wheel_lib_dirs_from_root(root: &Path) -> Vec { } fn add_python_wheel_lib_dir(dirs: &mut Vec, seen: &mut HashSet, dir: PathBuf) { - if dir.join("libnccl.so.2").exists() && seen.insert(dir.clone()) { + if nccl_lib_dir_exists(&dir) && seen.insert(dir.clone()) { dirs.push(dir); } } +fn nccl_lib_dir_exists(dir: &Path) -> bool { + dir.join("libnccl.so.2").exists() || dir.join("libnccl.so").exists() +} + unsafe fn load_symbol(library: &Library, symbol: &'static [u8]) -> Result { unsafe { library.get::(symbol) } .map(|symbol| *symbol) diff --git a/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs index 505d6e49..5f0dc832 100644 --- a/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs +++ b/openinfer-deepseek-v2-lite/src/nccl_backend/tests.rs @@ -31,3 +31,22 @@ fn finds_nccl_python_wheel_lib_dir_from_python_executable() { fs::remove_dir_all(root).expect("remove temp root"); } + +#[test] +fn finds_nccl_python_wheel_lib_dir_with_unversioned_soname() { + let unique = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time before epoch") + .as_nanos(); + let root = env::temp_dir().join(format!( + "openinfer-nccl-wheel-unversioned-test-{}-{unique}", + std::process::id() + )); + let wheel_dir = root.join("lib/python3.11/site-packages/nvidia/nccl/lib"); + fs::create_dir_all(&wheel_dir).expect("create NCCL wheel dir"); + fs::write(wheel_dir.join("libnccl.so"), []).expect("create fake NCCL lib marker"); + + assert_eq!(nccl_python_wheel_lib_dirs_from_root(&root), vec![wheel_dir]); + + fs::remove_dir_all(root).expect("remove temp root"); +} diff --git a/openinfer-kernels/src/ops/elementwise.rs b/openinfer-kernels/src/ops/elementwise.rs index e9cbdb2b..1e651e97 100644 --- a/openinfer-kernels/src/ops/elementwise.rs +++ b/openinfer-kernels/src/ops/elementwise.rs @@ -510,9 +510,14 @@ pub fn extract_vec_ref_into( token_idx: usize, out: &mut DeviceVec, ) -> Result<()> { - let offset = token_idx * batch.hidden_dim; let len = batch.hidden_dim; anyhow::ensure!(out.len == len, "extract_vec_into len mismatch"); + anyhow::ensure!( + token_idx < batch.seq_len, + "extract_vec_into token index {token_idx} out of bounds for seq_len {}", + batch.seq_len + ); + let offset = token_idx * batch.hidden_dim; let src_view = batch.data.slice(offset..offset + len); ctx.stream .memcpy_dtod(&src_view, &mut out.data) @@ -606,4 +611,19 @@ mod tests { } Ok(()) } + + #[test] + fn extract_vec_ref_rejects_out_of_bounds_token() -> Result<()> { + let ctx = DeviceContext::new()?; + let hidden = hidden_from_host(&ctx, &[bf16::from_f32(1.0), bf16::from_f32(2.0)], 2, 1)?; + let mut out = DeviceVec::zeros(&ctx, 2)?; + + let err = extract_vec_ref_into(&ctx, hidden.as_ref(), 1, &mut out).unwrap_err(); + + assert!( + err.to_string().contains("out of bounds"), + "unexpected error: {err}" + ); + Ok(()) + } }