diff --git a/openinfer-server/Cargo.toml b/openinfer-server/Cargo.toml index 43b23fc6..6a0e58c7 100644 --- a/openinfer-server/Cargo.toml +++ b/openinfer-server/Cargo.toml @@ -16,7 +16,7 @@ path = "src/main.rs" [[bin]] name = "bench_serving" -path = "src/bin/bench_serving.rs" +path = "src/bin/bench_serving/main.rs" [dependencies] openinfer-core = { workspace = true } diff --git a/openinfer-server/src/bin/bench_serving.rs b/openinfer-server/src/bin/bench_serving.rs deleted file mode 100644 index 310fb66d..00000000 --- a/openinfer-server/src/bin/bench_serving.rs +++ /dev/null @@ -1,2415 +0,0 @@ -//! In-process inference benchmark CLI. -//! -//! Usage: -//! cargo run -r --bin bench_serving -- [GLOBAL_OPTIONS] [OPTIONS] -//! -//! Examples: -//! cargo run -r --bin bench_serving -- request --prompt "Tell me a story" --output-len 128 -//! cargo run -r --bin bench_serving -- request --prompt-len 512 --output-len 64 -//! cargo run -r --bin bench_serving -- matrix --prompt-lens 32,128,512 --output-lens 32,128 -//! 
cargo run -r --bin bench_serving -- curve --prompt-len 1024 --output-len 256 --window 32 - -use std::fmt::Write as _; -use std::fs; -use std::io::{IsTerminal, stdout}; -use std::path::{Path, PathBuf}; -use std::thread; -use std::time::{Duration, Instant}; - -use anyhow::{Context, Result, ensure}; -use clap::{Args as ClapArgs, Parser, Subcommand, ValueEnum}; -use comfy_table::modifiers::UTF8_ROUND_CORNERS; -use comfy_table::presets::{ASCII_FULL_CONDENSED, UTF8_FULL_CONDENSED}; -use comfy_table::{Cell, CellAlignment, Table}; -use cudarc::driver::Profiler; -use cudarc::runtime::result::device as cuda_device; -use log::{debug, info}; -use openinfer::logging; -use openinfer::sampler::SamplingParams; -use openinfer::scheduler::{SchedulerHandle, SchedulerRequest, TokenEvent}; -use openinfer::server_engine::{ModelType, detect_model_type}; -use openinfer_core::engine::{EngineLoadOptions, EpBackend}; -#[cfg(feature = "kimi-k2")] -use openinfer_core::parallel::ParallelConfig; -use openinfer_vllm_support::load_tokenizer as load_vllm_tokenizer; -use rand::RngExt; -use rand::SeedableRng; -use rand::rngs::StdRng; -use serde::{Deserialize, Serialize}; -use tokio::sync::mpsc; -use vllm_text::tokenizer::DynTokenizer; - -const SNAPSHOT_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../bench_snapshots"); -const SNAPSHOT_PREFILL_OUTPUT_LEN: usize = 1; -const SNAPSHOT_DECODE_PROMPT_LEN: usize = 1024; -const SNAPSHOT_DECODE_OUTPUT_LEN: usize = 256; - -fn snapshot_prefill_prompt_len(model_type: ModelType) -> usize { - match model_type { - // Kimi serves TP1/DP8, where the PPLX fabric buffers cap prompts at - // 2048 tokens (full-lifetime KV cap is 8192) — probe the largest - // prompt the serving shape admits. 
- #[cfg(feature = "kimi-k2")] - ModelType::KimiK2 => 2_048, - _ => 10_000, - } -} -const REGRESSION_TPOT_PCT: f64 = 2.0; -const REGRESSION_TTFT_PCT: f64 = 3.0; - -const DEFAULT_MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); -const DEFAULT_REQUEST_PROMPT: &str = "Tell me a story"; -const DEFAULT_CURVE_PROMPT_LEN: usize = 512; -const SYNTHETIC_PATTERN: &str = "token_id = 100 + (idx % 1000)"; -const TOP_LEVEL_EXAMPLES: &str = "\ -Examples: - cargo run -r --bin bench_serving -- request - cargo run -r --bin bench_serving -- request --prompt \"Tell me a story about Rust\" --output-len 128 - cargo run -r --bin bench_serving -- request --prompt-len 512 --output-len 64 - cargo run -r --bin bench_serving -- matrix --prompt-lens 32,128,512,2048 --output-lens 32,128,256 - cargo run -r --bin bench_serving -- curve --prompt-len 1024 --output-len 256 --window 32 - cargo run -r --bin bench_serving -- --format json --out bench.json request --prompt-len 512 --output-len 64 - cargo run -r --bin bench_serving -- snapshot - cargo run -r --bin bench_serving -- compare bench_snapshots/rtx-5070-ti/qwen3-4b.json"; -const REQUEST_EXAMPLES: &str = "\ -Examples: - cargo run -r --bin bench_serving -- request - cargo run -r --bin bench_serving -- request --prompt \"Tell me a story about Rust\" --output-len 128 - cargo run -r --bin bench_serving -- request --prompt-file prompts/story.txt --output-len 128 - cargo run -r --bin bench_serving -- request --prompt-len 512 --output-len 64 --warmup 3 --iters 10"; -const MATRIX_EXAMPLES: &str = "\ -Examples: - cargo run -r --bin bench_serving -- matrix - cargo run -r --bin bench_serving -- matrix --prompt-lens 32,128,512,2048 --output-lens 32,128,256 - cargo run -r --bin bench_serving -- --format json --out matrix.json matrix --prompt-lens 128,512 --output-lens 64,256"; -const CURVE_EXAMPLES: &str = "\ -Examples: - cargo run -r --bin bench_serving -- curve - cargo run -r --bin bench_serving -- curve --prompt-len 1024 
--output-len 256 --window 32 - cargo run -r --bin bench_serving -- curve --prompt \"Summarize KV cache behavior\" --output-len 128 --window 16"; -const SNAPSHOT_EXAMPLES: &str = "\ -Examples: - cargo run -r --bin bench_serving -- snapshot - cargo run -r --bin bench_serving -- snapshot --warmup 3 --iters 10"; -const COMPARE_EXAMPLES: &str = "\ -Examples: - cargo run -r --bin bench_serving -- compare bench_snapshots/rtx-5070-ti/qwen3-4b.json - cargo run -r --bin bench_serving -- compare bench_snapshots/rtx-5070-ti/qwen3-4b.json --baseline HEAD~3"; - -#[derive(Debug, Clone, Copy, ValueEnum)] -enum OutputFormat { - Text, - Json, -} - -#[derive(Debug, Clone, Copy, ValueEnum)] -enum CliEpBackend { - Nccl, - #[value(name = "deepep")] - DeepEp, -} - -impl From for EpBackend { - fn from(value: CliEpBackend) -> Self { - match value { - CliEpBackend::Nccl => Self::Nccl, - CliEpBackend::DeepEp => Self::DeepEp, - } - } -} - -#[derive(Debug, Subcommand)] -enum Command { - /// Measure one request shape end-to-end. - #[command(after_help = REQUEST_EXAMPLES)] - Request(RequestArgs), - /// Sweep prompt_len x output_len and summarize each cell. - #[command(after_help = MATRIX_EXAMPLES)] - Matrix(MatrixArgs), - /// Measure TPOT as context grows during decode. - #[command(after_help = CURVE_EXAMPLES)] - Curve(CurveArgs), - /// Run standard profiles and write a regression-trackable snapshot. - #[command(after_help = SNAPSHOT_EXAMPLES)] - Snapshot(SnapshotArgs), - /// Compare a snapshot against its git baseline. 
- #[command(after_help = COMPARE_EXAMPLES)] - Compare(CompareArgs), -} - -#[derive(Parser, Debug)] -#[command( - name = "bench_serving", - about = "openinfer in-process inference benchmark", - after_help = TOP_LEVEL_EXAMPLES -)] -struct Cli { - /// Model directory (contains config.json, tokenizer, safetensors) - #[arg(long, default_value = DEFAULT_MODEL_PATH)] - model_path: String, - - /// Enable CUDA graph on decode path - #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] - cuda_graph: bool, - - /// Render result to terminal as text or structured JSON - #[arg(long, default_value = "text")] - format: OutputFormat, - - /// Optional label to tag this benchmark run - #[arg(long)] - label: Option, - - /// Optional output path for the rendered report - #[arg(long)] - out: Option, - - /// Capture only measured iterations for nsys `-c cudaProfilerApi` - #[arg(long, default_value_t = false)] - cuda_profiler_capture: bool, - - /// Tensor-parallel world size for Kimi-K2 - #[arg(long, default_value_t = 1)] - tp_size: usize, - - /// Data-parallel world size for Kimi-K2 - #[arg(long, default_value_t = 8)] - dp_size: usize, - - /// Expert-parallel backend for Kimi-K2 (TP1/DP8 requires deepep; TP8/DP1 requires nccl) - #[arg(long, default_value = "deepep")] - ep_backend: CliEpBackend, - - #[command(subcommand)] - command: Command, -} - -#[derive(Debug, Clone, ClapArgs)] -struct PromptInputArgs { - /// Inline prompt text - #[arg(long, conflicts_with_all = ["prompt_file", "prompt_len"])] - prompt: Option, - - /// Read prompt text from file - #[arg(long, conflicts_with_all = ["prompt", "prompt_len"])] - prompt_file: Option, - - /// Use a synthetic prompt with exactly this many token ids - #[arg(long, conflicts_with_all = ["prompt", "prompt_file"])] - prompt_len: Option, -} - -#[derive(Debug, Clone, ClapArgs)] -struct RunArgs { - /// Warmup iterations - #[arg(long, default_value_t = 5)] - warmup: usize, - - /// Measured iterations - #[arg(long, default_value_t = 20)] 
- iters: usize, - - /// RNG seed (matters once sampling becomes non-greedy) - #[arg(long, default_value_t = 42)] - seed: u64, -} - -#[derive(Debug, ClapArgs)] -struct RequestArgs { - #[command(flatten)] - prompt_input: PromptInputArgs, - - /// Max generated tokens - #[arg(long, default_value_t = 64)] - output_len: usize, - - /// Number of concurrent requests per measured iteration - #[arg(long, default_value_t = 1)] - concurrency: usize, - - /// Number of *distinct* synthetic prompts to tile across the concurrent - /// batch (0 = one per request, fully diverse). `1` makes every concurrent - /// request identical, which collapses MoE routing onto a narrow expert set - /// and under-measures decode TPOT — sweep this to quantify the - /// routing-diversity → TPOT curve (see the MoE bench-diversity lesson). - #[arg(long, default_value_t = 0)] - distinct_prompts: usize, - - #[command(flatten)] - run: RunArgs, -} - -#[derive(Debug, ClapArgs)] -struct MatrixArgs { - /// Synthetic prompt lengths to sweep - #[arg(long, value_delimiter = ',', default_value = "32,128,512,2048")] - prompt_lens: Vec, - - /// Output lengths to sweep - #[arg(long, value_delimiter = ',', default_value = "32,128,256")] - output_lens: Vec, - - #[command(flatten)] - run: RunArgs, -} - -#[derive(Debug, ClapArgs)] -struct CurveArgs { - #[command(flatten)] - prompt_input: PromptInputArgs, - - /// Max generated tokens - #[arg(long, default_value_t = 256)] - output_len: usize, - - /// Group decode positions into windows of this size - #[arg(long, default_value_t = 32)] - window: usize, - - #[command(flatten)] - run: RunArgs, -} - -#[derive(Debug, ClapArgs)] -struct SnapshotArgs { - #[command(flatten)] - run: RunArgs, -} - -#[derive(Debug, ClapArgs)] -struct CompareArgs { - /// Path to snapshot JSON file - path: String, - - /// Git ref to compare against - #[arg(long, default_value = "HEAD")] - baseline: String, -} - -#[derive(Debug, Clone, Serialize)] -struct RunInfo { - command: &'static str, - 
model_path: String, - model_type: String, - cuda_graph: bool, - load_ms: f64, - label: Option, -} - -#[derive(Debug, Clone, Serialize)] -struct PromptDescriptor { - source: String, - prompt_tokens: usize, - prompt_preview: Option, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct DurationStats { - avg_ms: f64, - p50_ms: f64, - p95_ms: f64, - p99_ms: f64, - max_ms: f64, - samples: usize, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct CountStats { - min: usize, - max: usize, - avg: f64, - samples: usize, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct GeneratedTokenTrace { - hash: String, - prefix: Vec, - len: usize, -} - -#[derive(Debug, Clone, Serialize)] -struct RequestWorkload { - prompt: PromptDescriptor, - output_len: usize, - concurrency: usize, - warmup: usize, - iters: usize, - seed: u64, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct RequestMetrics { - ttft_ms: DurationStats, - first_decode_step_ms: Option, - steady_tpot_ms: Option, - e2e_ms: DurationStats, - generated_tokens: CountStats, - #[serde(default)] - generated_token_traces: Vec, - request_tok_s: Option, - decode_tok_s: Option, -} - -#[derive(Debug, Clone, Serialize)] -struct RequestIterationTiming { - index: usize, - ttft_ms: f64, - first_decode_step_ms: Option, - steady_tpot_ms: Option, - e2e_ms: f64, - generated_tokens: usize, - generated_token_trace: GeneratedTokenTrace, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct SnapshotProfile { - prompt_len: usize, - output_len: usize, - metrics: RequestMetrics, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct SnapshotReport { - commit: String, - date: String, - model: String, - gpu: String, - /// Parallel layout the snapshot was measured under (e.g. "tp1-dp8-deepep"). - /// Absent in snapshots that predate multi-GPU model lines. 
- #[serde(default, skip_serializing_if = "Option::is_none")] - parallel: Option, - prefill_heavy: SnapshotProfile, - decode_heavy: SnapshotProfile, -} - -#[derive(Debug, Clone, Serialize)] -struct RequestReport { - run: RunInfo, - workload: RequestWorkload, - metrics: RequestMetrics, - iterations: Vec, -} - -#[derive(Debug, Clone, Serialize)] -struct MatrixWorkload { - prompt_lens: Vec, - output_lens: Vec, - warmup: usize, - iters: usize, - seed: u64, - synthetic_pattern: &'static str, -} - -#[derive(Debug, Clone, Serialize)] -struct MatrixCell { - prompt_len: usize, - output_len: usize, - ttft_ms: DurationStats, - e2e_ms: DurationStats, - first_decode_step_ms: Option, - steady_tpot_ms: Option, - generated_tokens: CountStats, - request_tok_s: Option, - decode_tok_s: Option, -} - -#[derive(Debug, Clone, Serialize)] -struct MatrixReport { - run: RunInfo, - workload: MatrixWorkload, - cells: Vec, -} - -#[derive(Debug, Clone, Serialize)] -struct CurveWorkload { - prompt: PromptDescriptor, - output_len: usize, - window: usize, - warmup: usize, - iters: usize, - seed: u64, -} - -#[derive(Debug, Clone, Serialize)] -struct CurveWindow { - ctx_start: usize, - ctx_end: usize, - tpot_ms: DurationStats, - decode_tok_s: Option, -} - -#[derive(Debug, Clone, Serialize)] -struct CurveReport { - run: RunInfo, - workload: CurveWorkload, - windows: Vec, -} - -#[derive(Debug, Clone, Serialize)] -#[serde(tag = "kind", rename_all = "snake_case")] -enum BenchReport { - Request(Box), - Matrix(MatrixReport), - Curve(CurveReport), -} - -fn dur_ms(d: Duration) -> f64 { - d.as_secs_f64() * 1000.0 -} - -fn percentiles(sorted: &[Duration]) -> (Duration, Duration, Duration, Duration, Duration) { - assert!(!sorted.is_empty()); - let n = sorted.len(); - let sum: Duration = sorted.iter().sum(); - let avg = sum / n as u32; - let p = |pct: f64| sorted[((pct / 100.0) * (n - 1) as f64).round() as usize]; - (avg, p(50.0), p(95.0), p(99.0), sorted[n - 1]) -} - -fn summarize_durations(samples: 
&[Duration]) -> DurationStats { - let mut sorted = samples.to_vec(); - sorted.sort(); - let (avg, p50, p95, p99, max) = percentiles(&sorted); - DurationStats { - avg_ms: dur_ms(avg), - p50_ms: dur_ms(p50), - p95_ms: dur_ms(p95), - p99_ms: dur_ms(p99), - max_ms: dur_ms(max), - samples: sorted.len(), - } -} - -fn summarize_counts(samples: &[usize]) -> CountStats { - assert!(!samples.is_empty()); - let min = *samples.iter().min().unwrap(); - let max = *samples.iter().max().unwrap(); - let sum: usize = samples.iter().sum(); - CountStats { - min, - max, - avg: sum as f64 / samples.len() as f64, - samples: samples.len(), - } -} - -fn aggregate_tok_s(tokens: usize, total: Duration) -> Option { - if tokens == 0 || total.is_zero() { - None - } else { - Some(tokens as f64 / total.as_secs_f64()) - } -} - -fn generated_token_hash(tokens: &[u32]) -> String { - let mut hash = 0xcbf2_9ce4_8422_2325_u64; - for token in tokens { - for byte in token.to_le_bytes() { - hash ^= u64::from(byte); - hash = hash.wrapping_mul(0x0100_0000_01b3); - } - } - format!("{hash:016x}") -} - -fn generated_token_trace(tokens: &[u32]) -> GeneratedTokenTrace { - GeneratedTokenTrace { - hash: generated_token_hash(tokens), - prefix: tokens.iter().copied().take(16).collect(), - len: tokens.len(), - } -} - -fn new_table() -> Table { - let mut table = Table::new(); - if stdout().is_terminal() { - table.load_preset(UTF8_FULL_CONDENSED); - table.apply_modifier(UTF8_ROUND_CORNERS); - } else { - table.load_preset(ASCII_FULL_CONDENSED); - } - table -} - -fn key_cell(label: impl Into) -> Cell { - Cell::new(label.into()) -} - -fn value_cell(value: impl Into) -> Cell { - Cell::new(value.into()) -} - -fn numeric_cell(value: impl Into) -> Cell { - Cell::new(value.into()).set_alignment(CellAlignment::Right) -} - -fn format_rate(value: Option) -> String { - value.map_or_else(|| "-".to_string(), |v| format!("{v:.2}")) -} - -fn format_duration_ms(value: f64) -> String { - format!("{value:.2}") -} - -fn 
format_count_avg(value: f64) -> String { - format!("{value:.2}") -} - -fn push_table(out: &mut String, table: &Table) { - out.push_str(&table.to_string()); - out.push('\n'); -} - -fn render_run_summary(report: &RunInfo) -> Table { - let mut table = new_table(); - table.add_row(vec![ - key_cell("model"), - value_cell(format!("{} ({})", report.model_path, report.model_type)), - ]); - table.add_row(vec![ - key_cell("cuda_graph"), - value_cell(report.cuda_graph.to_string()), - ]); - table.add_row(vec![ - key_cell("load_ms"), - numeric_cell(format_duration_ms(report.load_ms)), - ]); - if let Some(label) = &report.label { - table.add_row(vec![key_cell("label"), value_cell(label.clone())]); - } - table -} - -fn render_request_meta(report: &RequestReport) -> Table { - let mut table = render_run_summary(&report.run); - table.add_row(vec![ - key_cell("prompt_source"), - value_cell(report.workload.prompt.source.clone()), - ]); - table.add_row(vec![ - key_cell("prompt_tokens"), - numeric_cell(report.workload.prompt.prompt_tokens.to_string()), - ]); - if let Some(preview) = &report.workload.prompt.prompt_preview { - table.add_row(vec![ - key_cell("prompt"), - value_cell(format!("\"{preview}\"")), - ]); - } - table.add_row(vec![ - key_cell("output_len"), - numeric_cell(report.workload.output_len.to_string()), - ]); - table.add_row(vec![ - key_cell("warmup / iters"), - value_cell(format!( - "{} / {}", - report.workload.warmup, report.workload.iters - )), - ]); - table.add_row(vec![ - key_cell("seed"), - numeric_cell(report.workload.seed.to_string()), - ]); - table -} - -fn render_duration_table(rows: Vec<(String, DurationStats)>) -> Table { - let mut table = new_table(); - table.set_header(vec![ - Cell::new("metric"), - Cell::new("avg_ms").set_alignment(CellAlignment::Right), - Cell::new("p50_ms").set_alignment(CellAlignment::Right), - Cell::new("p95_ms").set_alignment(CellAlignment::Right), - Cell::new("p99_ms").set_alignment(CellAlignment::Right), - 
Cell::new("max_ms").set_alignment(CellAlignment::Right), - Cell::new("samples").set_alignment(CellAlignment::Right), - ]); - for (label, stats) in rows { - table.add_row(vec![ - key_cell(label), - numeric_cell(format_duration_ms(stats.avg_ms)), - numeric_cell(format_duration_ms(stats.p50_ms)), - numeric_cell(format_duration_ms(stats.p95_ms)), - numeric_cell(format_duration_ms(stats.p99_ms)), - numeric_cell(format_duration_ms(stats.max_ms)), - numeric_cell(stats.samples.to_string()), - ]); - } - table -} - -fn render_request_summary(report: &RequestReport) -> Table { - let mut table = new_table(); - table.set_header(vec![ - Cell::new("metric"), - Cell::new("value").set_alignment(CellAlignment::Right), - ]); - table.add_row(vec![ - key_cell("generated_tokens_avg"), - numeric_cell(format_count_avg(report.metrics.generated_tokens.avg)), - ]); - table.add_row(vec![ - key_cell("generated_tokens_min"), - numeric_cell(report.metrics.generated_tokens.min.to_string()), - ]); - table.add_row(vec![ - key_cell("generated_tokens_max"), - numeric_cell(report.metrics.generated_tokens.max.to_string()), - ]); - table.add_row(vec![ - key_cell("generated_token_runs"), - numeric_cell(report.metrics.generated_tokens.samples.to_string()), - ]); - table.add_row(vec![ - key_cell("request_tok_s"), - numeric_cell(format_rate(report.metrics.request_tok_s)), - ]); - table.add_row(vec![ - key_cell("decode_tok_s"), - numeric_cell(format_rate(report.metrics.decode_tok_s)), - ]); - table -} - -fn render_matrix_meta(report: &MatrixReport) -> Table { - let mut table = render_run_summary(&report.run); - table.add_row(vec![ - key_cell("prompt_lens"), - value_cell( - report - .workload - .prompt_lens - .iter() - .map(std::string::ToString::to_string) - .collect::>() - .join(","), - ), - ]); - table.add_row(vec![ - key_cell("output_lens"), - value_cell( - report - .workload - .output_lens - .iter() - .map(std::string::ToString::to_string) - .collect::>() - .join(","), - ), - ]); - table.add_row(vec![ - 
key_cell("synthetic_pattern"), - value_cell(report.workload.synthetic_pattern), - ]); - table.add_row(vec![ - key_cell("warmup / iters"), - value_cell(format!( - "{} / {}", - report.workload.warmup, report.workload.iters - )), - ]); - table.add_row(vec![ - key_cell("seed"), - numeric_cell(report.workload.seed.to_string()), - ]); - table -} - -fn render_matrix_table(report: &MatrixReport) -> Table { - let mut table = new_table(); - table.set_header(vec![ - Cell::new("prompt_tok").set_alignment(CellAlignment::Right), - Cell::new("output_tok").set_alignment(CellAlignment::Right), - Cell::new("ttft_avg").set_alignment(CellAlignment::Right), - Cell::new("ttft_p95").set_alignment(CellAlignment::Right), - Cell::new("e2e_avg").set_alignment(CellAlignment::Right), - Cell::new("req_tok/s").set_alignment(CellAlignment::Right), - Cell::new("decode_tok/s").set_alignment(CellAlignment::Right), - Cell::new("gen_avg").set_alignment(CellAlignment::Right), - ]); - for cell in &report.cells { - table.add_row(vec![ - numeric_cell(cell.prompt_len.to_string()), - numeric_cell(cell.output_len.to_string()), - numeric_cell(format_duration_ms(cell.ttft_ms.avg_ms)), - numeric_cell(format_duration_ms(cell.ttft_ms.p95_ms)), - numeric_cell(format_duration_ms(cell.e2e_ms.avg_ms)), - numeric_cell(format_rate(cell.request_tok_s)), - numeric_cell(format_rate(cell.decode_tok_s)), - numeric_cell(format_count_avg(cell.generated_tokens.avg)), - ]); - } - table -} - -fn render_curve_meta(report: &CurveReport) -> Table { - let mut table = render_run_summary(&report.run); - table.add_row(vec![ - key_cell("prompt_source"), - value_cell(report.workload.prompt.source.clone()), - ]); - table.add_row(vec![ - key_cell("prompt_tokens"), - numeric_cell(report.workload.prompt.prompt_tokens.to_string()), - ]); - if let Some(preview) = &report.workload.prompt.prompt_preview { - table.add_row(vec![ - key_cell("prompt"), - value_cell(format!("\"{preview}\"")), - ]); - } - table.add_row(vec![ - key_cell("output_len"), 
- numeric_cell(report.workload.output_len.to_string()), - ]); - table.add_row(vec![ - key_cell("window"), - numeric_cell(report.workload.window.to_string()), - ]); - table.add_row(vec![ - key_cell("warmup / iters"), - value_cell(format!( - "{} / {}", - report.workload.warmup, report.workload.iters - )), - ]); - table.add_row(vec![ - key_cell("seed"), - numeric_cell(report.workload.seed.to_string()), - ]); - table -} - -fn render_curve_table(report: &CurveReport) -> Table { - let mut table = new_table(); - table.set_header(vec![ - Cell::new("ctx_range"), - Cell::new("avg_ms").set_alignment(CellAlignment::Right), - Cell::new("p50_ms").set_alignment(CellAlignment::Right), - Cell::new("p95_ms").set_alignment(CellAlignment::Right), - Cell::new("p99_ms").set_alignment(CellAlignment::Right), - Cell::new("tok/s").set_alignment(CellAlignment::Right), - Cell::new("samples").set_alignment(CellAlignment::Right), - ]); - for window in &report.windows { - table.add_row(vec![ - value_cell(format!("{}-{}", window.ctx_start, window.ctx_end)), - numeric_cell(format_duration_ms(window.tpot_ms.avg_ms)), - numeric_cell(format_duration_ms(window.tpot_ms.p50_ms)), - numeric_cell(format_duration_ms(window.tpot_ms.p95_ms)), - numeric_cell(format_duration_ms(window.tpot_ms.p99_ms)), - numeric_cell(format_rate(window.decode_tok_s)), - numeric_cell(window.tpot_ms.samples.to_string()), - ]); - } - table -} - -fn truncate_preview(text: &str, limit: usize) -> String { - let one_line = text.replace('\n', "\\n"); - if one_line.chars().count() <= limit { - return one_line; - } - let mut truncated = String::new(); - for ch in one_line.chars().take(limit) { - truncated.push(ch); - } - truncated.push_str("..."); - truncated -} - -fn synthetic_prompt_tokens(len: usize) -> Vec { - (0..len).map(|i| ((i % 1000) + 100) as u32).collect() -} - -/// Token-id bounds for synthetic concurrent prompts: above the low special -/// tokens and well under the smallest supported vocab (DeepSeek-V2-Lite ≈ -/// 102 400), 
so every drawn id is an ordinary token on any model line. -const SYNTHETIC_TOKEN_LO: u32 = 100; -const SYNTHETIC_TOKEN_HI: u32 = 100_000; - -/// One synthetic prompt of `len` random tokens, seeded per request so the -/// concurrent decode streams diverge. Identical concurrent prompts route a MoE -/// batch onto a narrow expert set, packing the Marlin expert GEMM into fat -/// tiles and under-measuring decode TPOT by ~7–15% (measured on Kimi-K2 via a -/// `--distinct-prompts` sweep; the bench trap behind the misread #225 "+51% -/// HTTP" gap). Distinct prompts exercise realistic broad expert routing. See -/// docs/lessons/moe-bench-prompt-diversity.md. -fn synthetic_random_prompt(len: usize, seed: u64, request_idx: usize) -> Vec { - let mut rng = - StdRng::seed_from_u64(seed ^ (request_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)); - (0..len) - .map(|_| rng.random_range(SYNTHETIC_TOKEN_LO..SYNTHETIC_TOKEN_HI)) - .collect() -} - -#[derive(Debug, Clone)] -struct PromptSpec { - descriptor: PromptDescriptor, - tokens: Vec, -} - -fn resolve_prompt_input( - args: &PromptInputArgs, - tokenizer: &DynTokenizer, - default_text: Option<&str>, - default_prompt_len: Option, -) -> Result { - match (&args.prompt, &args.prompt_file, args.prompt_len) { - (Some(prompt), None, None) => Ok(PromptSpec { - descriptor: PromptDescriptor { - source: "text".to_string(), - prompt_tokens: tokenizer.encode(prompt, false)?.len(), - prompt_preview: Some(truncate_preview(prompt, 96)), - }, - tokens: tokenizer.encode(prompt, false)?, - }), - (None, Some(path), None) => { - let prompt = fs::read_to_string(path) - .with_context(|| format!("failed to read prompt file: {path}"))?; - let tokens = tokenizer.encode(&prompt, false)?; - Ok(PromptSpec { - descriptor: PromptDescriptor { - source: format!("file:{path}"), - prompt_tokens: tokens.len(), - prompt_preview: Some(truncate_preview(&prompt, 96)), - }, - tokens, - }) - } - (None, None, Some(prompt_len)) => { - ensure!(prompt_len > 0, "--prompt-len 
must be > 0"); - Ok(PromptSpec { - descriptor: PromptDescriptor { - source: format!("synthetic:{SYNTHETIC_PATTERN}"), - prompt_tokens: prompt_len, - prompt_preview: None, - }, - tokens: synthetic_prompt_tokens(prompt_len), - }) - } - (None, None, None) => { - if let Some(prompt) = default_text { - let tokens = tokenizer.encode(prompt, false)?; - Ok(PromptSpec { - descriptor: PromptDescriptor { - source: "text".to_string(), - prompt_tokens: tokens.len(), - prompt_preview: Some(truncate_preview(prompt, 96)), - }, - tokens, - }) - } else if let Some(prompt_len) = default_prompt_len { - Ok(PromptSpec { - descriptor: PromptDescriptor { - source: format!("synthetic:{SYNTHETIC_PATTERN}"), - prompt_tokens: prompt_len, - prompt_preview: None, - }, - tokens: synthetic_prompt_tokens(prompt_len), - }) - } else { - unreachable!("default prompt source must be provided"); - } - } - _ => unreachable!("clap enforces prompt input conflicts"), - } -} - -struct GenTimings { - ttft: Duration, - tbt: Vec, - total: Duration, - emitted_tokens: usize, - generated_tokens: Vec, - decode_tokens_for_rate: usize, - decode_time_for_rate: Duration, -} - -trait BenchModel { - fn validate_concurrency(&self, concurrency: usize) -> Result<()> { - ensure!(concurrency > 0, "--concurrency must be > 0"); - Ok(()) - } - - fn timed_generation( - &mut self, - prompt_tokens: &[u32], - max_new_tokens: usize, - sampling: &SamplingParams, - rng: &mut StdRng, - ) -> GenTimings; - - /// Run one request per prompt; the slice length is the concurrency. Each - /// prompt is independent, so MoE models must be handed *distinct* prompts - /// to exercise realistic expert routing (see `synthetic_random_prompt`). 
- fn timed_generation_batch( - &mut self, - prompts: &[Vec], - max_new_tokens: usize, - sampling: &SamplingParams, - rng: &mut StdRng, - ) -> Vec { - prompts - .iter() - .map(|prompt| self.timed_generation(prompt, max_new_tokens, sampling, rng)) - .collect() - } -} - -fn run_timed(prompt_tokens: &[u32], max_new_tokens: usize, mut generate: F) -> GenTimings -where - F: FnMut(&[u32], usize, &mut dyn FnMut(u32) -> bool) -> Result<()>, -{ - let start = Instant::now(); - let mut first_at: Option = None; - let mut prev_at: Option = None; - let mut emitted_tokens = 0usize; - let mut tbt = Vec::with_capacity(max_new_tokens.saturating_sub(1)); - let mut generated_tokens = Vec::with_capacity(max_new_tokens); - - generate(prompt_tokens, max_new_tokens, &mut |tok| { - let now = Instant::now(); - emitted_tokens += 1; - generated_tokens.push(tok); - if first_at.is_none() { - first_at = Some(now); - } else if let Some(prev) = prev_at { - tbt.push(now - prev); - } - prev_at = Some(now); - true - }) - .expect("generation failed"); - - let total = start.elapsed(); - let ttft = first_at.map_or(total, |t| t - start); - let decode_tokens_for_rate = emitted_tokens.saturating_sub(1); - let decode_time_for_rate = tbt.iter().copied().sum(); - GenTimings { - ttft, - tbt, - total, - emitted_tokens, - generated_tokens, - decode_tokens_for_rate, - decode_time_for_rate, - } -} - -struct SchedulerBenchModel { - handle: SchedulerHandle, -} - -impl BenchModel for SchedulerBenchModel { - fn timed_generation( - &mut self, - prompt_tokens: &[u32], - max_new_tokens: usize, - sampling: &SamplingParams, - _rng: &mut StdRng, - ) -> GenTimings { - run_timed(prompt_tokens, max_new_tokens, |toks, n, cb| { - let (token_tx, mut token_rx) = mpsc::unbounded_channel(); - self.handle - .submit(SchedulerRequest { - request_id: None, - queued_at_unix_s: None, - prompt_tokens: toks.to_vec(), - params: SamplingParams { - temperature: sampling.temperature, - top_k: sampling.top_k, - top_p: sampling.top_p, - 
ignore_eos: sampling.ignore_eos, - }, - max_tokens: n, - lora_adapter: None, - token_tx, - logprobs: 0, - echo: false, - }) - .map_err(|e| anyhow::anyhow!("scheduler submit failed: {e}"))?; - - loop { - match token_rx.blocking_recv() { - Some(TokenEvent::Token { id, .. }) => { - if !cb(id) { - break; - } - } - Some(TokenEvent::PromptTokens { .. } | TokenEvent::Scheduled { .. }) => {} - Some(TokenEvent::Finished { .. }) => break, - Some(TokenEvent::Error { message, .. }) => { - anyhow::bail!("scheduler request failed: {message}"); - } - Some(TokenEvent::Rejected { message, .. }) => { - anyhow::bail!("scheduler request rejected: {message}"); - } - None => anyhow::bail!("scheduler channel closed"), - } - } - - Ok(()) - }) - } - - fn timed_generation_batch( - &mut self, - prompts: &[Vec], - max_new_tokens: usize, - sampling: &SamplingParams, - _rng: &mut StdRng, - ) -> Vec { - let mut workers = Vec::with_capacity(prompts.len()); - for (idx, prompt) in prompts.iter().enumerate() { - let handle = self.handle.clone(); - let prompt_tokens = prompt.clone(); - let sampling = *sampling; - workers.push(thread::spawn(move || { - run_timed(&prompt_tokens, max_new_tokens, |toks, n, cb| { - let (token_tx, mut token_rx) = mpsc::unbounded_channel(); - handle - .submit(SchedulerRequest { - request_id: Some(format!("bench-serving-{idx}")), - queued_at_unix_s: None, - prompt_tokens: toks.to_vec(), - params: SamplingParams { - temperature: sampling.temperature, - top_k: sampling.top_k, - top_p: sampling.top_p, - ignore_eos: sampling.ignore_eos, - }, - max_tokens: n, - lora_adapter: None, - token_tx, - logprobs: 0, - echo: false, - }) - .map_err(|e| anyhow::anyhow!("scheduler submit failed: {e}"))?; - - loop { - match token_rx.blocking_recv() { - Some(TokenEvent::Token { id, .. }) => { - if !cb(id) { - break; - } - } - Some( - TokenEvent::PromptTokens { .. } | TokenEvent::Scheduled { .. }, - ) => {} - Some(TokenEvent::Finished { .. }) => break, - Some(TokenEvent::Error { message, .. 
}) => { - anyhow::bail!("scheduler request failed: {message}"); - } - Some(TokenEvent::Rejected { message, .. }) => { - anyhow::bail!("scheduler request rejected: {message}"); - } - None => anyhow::bail!("scheduler channel closed"), - } - } - - Ok(()) - }) - })); - } - - workers - .into_iter() - .map(|worker| worker.join().expect("bench request worker panicked")) - .collect() - } -} - -#[cfg(feature = "deepseek-v2-lite")] -struct DeepSeekV2LiteBenchModel { - generator: openinfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator, -} - -#[cfg(feature = "deepseek-v2-lite")] -impl BenchModel for DeepSeekV2LiteBenchModel { - fn validate_concurrency(&self, concurrency: usize) -> Result<()> { - ensure!( - concurrency > 0 && concurrency <= 8, - "DeepSeek-V2-Lite direct benchmark supports --concurrency 1..=8; concurrency=1 is the single-row control and >1 uses the narrow same-prompt batched decode path, got {concurrency}" - ); - Ok(()) - } - - fn timed_generation( - &mut self, - prompt_tokens: &[u32], - max_new_tokens: usize, - sampling: &SamplingParams, - _rng: &mut StdRng, - ) -> GenTimings { - assert_dsv2_lite_sampling_contract(sampling); - let (result, attribution) = self - .generator - .generate_greedy_with_attribution(prompt_tokens, max_new_tokens, sampling.ignore_eos) - .expect("DeepSeek-V2-Lite generation failed"); - timings_from_dsv2_lite_attribution( - result.tokens, - max_new_tokens, - attribution.total_generation_us(), - attribution.prefill_next_token_us(), - attribution.per_token_decode_us(), - ) - } - - fn timed_generation_batch( - &mut self, - prompts: &[Vec], - max_new_tokens: usize, - sampling: &SamplingParams, - _rng: &mut StdRng, - ) -> Vec { - assert_dsv2_lite_sampling_contract(sampling); - if prompts.len() == 1 { - return vec![self.timed_generation(&prompts[0], max_new_tokens, sampling, _rng)]; - } - - // This generator drives a narrow same-prompt batched decode kernel: - // every row shares `prompts[0]`. 
Distinct per-request prompts are a - // scheduler-path concern; this microbench takes one prompt by design. - let result = self - .generator - .generate_greedy_batch_same_prompt_with_timings( - &prompts[0], - prompts.len(), - max_new_tokens, - sampling.ignore_eos, - ) - .expect("DeepSeek-V2-Lite batched generation failed"); - timings_from_dsv2_lite_batched_generation(result, max_new_tokens) - } -} - -#[cfg(feature = "deepseek-v2-lite")] -fn assert_dsv2_lite_sampling_contract(sampling: &SamplingParams) { - assert!( - sampling.ignore_eos, - "DeepSeek-V2-Lite direct attribution benchmark requires ignore_eos=true so output_len maps to an exact generated-token count" - ); - assert!( - (sampling.temperature <= 0.0 || sampling.top_k == 1) && sampling.top_p >= 1.0, - "DeepSeek-V2-Lite direct attribution benchmark supports greedy decoding only; requested temperature={}, top_k={}, top_p={}", - sampling.temperature, - sampling.top_k, - sampling.top_p - ); -} - -#[cfg(feature = "deepseek-v2-lite")] -fn timings_from_dsv2_lite_attribution( - generated_token_ids: Vec, - expected_generated_tokens: usize, - total_generation_us: u64, - prefill_next_token_us: Option, - per_token_decode_us: &[u64], -) -> GenTimings { - // This bench helper intentionally panics on corrupted attribution data rather - // than synthesizing a result. The surrounding trait does not carry errors, - // and emitting bogus TPOT would be worse than aborting the benchmark. 
- let emitted_tokens = generated_token_ids.len(); - assert_eq!( - emitted_tokens, expected_generated_tokens, - "DeepSeek-V2-Lite generated token count mismatch: got {} tokens for requested output_len={}", - emitted_tokens, expected_generated_tokens - ); - let expected_decode_steps = expected_generated_tokens.saturating_sub(1); - assert_eq!( - per_token_decode_us.len(), - expected_decode_steps, - "DeepSeek-V2-Lite timing count mismatch: got {} decode samples for {} generated tokens", - per_token_decode_us.len(), - emitted_tokens - ); - assert!( - total_generation_us > 0, - "DeepSeek-V2-Lite total generation timing is zero; refusing to report TPOT" - ); - if emitted_tokens > 0 { - assert!( - prefill_next_token_us.is_some_and(|us| us > 0), - "DeepSeek-V2-Lite TTFT timing is missing or zero; refusing to report TPOT" - ); - } - if expected_decode_steps > 0 { - assert!( - per_token_decode_us.iter().all(|us| *us > 0), - "DeepSeek-V2-Lite decode timing contains a zero-duration sample; refusing to report TPOT" - ); - } - let tbt: Vec<_> = per_token_decode_us - .iter() - .map(|us| Duration::from_micros(*us)) - .collect(); - let decode_time_for_rate = tbt.iter().copied().sum(); - GenTimings { - ttft: Duration::from_micros(prefill_next_token_us.unwrap_or(total_generation_us)), - tbt, - total: Duration::from_micros(total_generation_us), - emitted_tokens, - generated_tokens: generated_token_ids, - decode_tokens_for_rate: emitted_tokens.saturating_sub(1), - decode_time_for_rate, - } -} - -#[cfg(feature = "deepseek-v2-lite")] -fn timings_from_dsv2_lite_batched_generation( - result: openinfer_deepseek_v2_lite::BatchedGenerationResult, - expected_generated_tokens: usize, -) -> Vec { - let batch_size = result.tokens.len(); - assert!( - batch_size > 0, - "DeepSeek-V2-Lite batch result must contain at least one row" - ); - assert_eq!( - result.prefill_next_token_us.len(), - batch_size, - "DeepSeek-V2-Lite batch result TTFT count mismatch" - ); - assert!( - result.total_generation_us > 
0, - "DeepSeek-V2-Lite batch total generation timing is zero; refusing to report TPOT" - ); - assert!( - result.prefill_next_token_us.iter().all(|us| *us > 0), - "DeepSeek-V2-Lite batch TTFT timing contains a zero-duration sample; refusing to report TPOT" - ); - let expected_decode_steps = expected_generated_tokens.saturating_sub(1); - assert_eq!( - result.per_token_decode_us.len(), - expected_decode_steps, - "DeepSeek-V2-Lite batch timing count mismatch: got {} decode samples for {} generated tokens", - result.per_token_decode_us.len(), - expected_generated_tokens - ); - if expected_decode_steps > 0 { - assert!( - result.per_token_decode_us.iter().all(|us| *us > 0), - "DeepSeek-V2-Lite batch decode timing contains a zero-duration sample; refusing to report TPOT" - ); - } - - let tbt: Vec<_> = result - .per_token_decode_us - .iter() - .map(|us| Duration::from_micros(*us)) - .collect(); - let decode_time_for_rate: Duration = tbt.iter().copied().sum(); - let decode_tokens_for_rate = batch_size * expected_decode_steps; - - result - .tokens - .into_iter() - .zip(result.prefill_next_token_us) - .enumerate() - .map(|(idx, (generated_token_ids, prefill_us))| { - let emitted_tokens = generated_token_ids.len(); - assert_eq!( - emitted_tokens, expected_generated_tokens, - "DeepSeek-V2-Lite batch row {idx} generated token count mismatch: got {} tokens for requested output_len={}", - emitted_tokens, expected_generated_tokens - ); - GenTimings { - ttft: Duration::from_micros(prefill_us), - tbt: tbt.clone(), - total: Duration::from_micros(result.total_generation_us), - emitted_tokens, - generated_tokens: generated_token_ids, - decode_tokens_for_rate: if idx == 0 { decode_tokens_for_rate } else { 0 }, - decode_time_for_rate: if idx == 0 { - decode_time_for_rate - } else { - Duration::ZERO - }, - } - }) - .collect() -} - -fn command_seed(cli: &Cli) -> u64 { - match &cli.command { - Command::Request(args) => args.run.seed, - Command::Matrix(args) => args.run.seed, - 
Command::Curve(args) => args.run.seed, - Command::Snapshot(args) => args.run.seed, - Command::Compare(_) => 42, - } -} - -#[cfg(feature = "kimi-k2")] -fn kimi_parallel_config(tp_size: usize, dp_size: usize) -> Result { - ensure!(tp_size > 0, "--tp-size must be positive"); - ensure!(dp_size > 0, "--dp-size must be positive"); - Ok(ParallelConfig::new(tp_size, dp_size)) -} - -fn normalize_sizes(values: &[usize], flag: &str) -> Result> { - ensure!(!values.is_empty(), "{flag} must not be empty"); - ensure!(values.iter().all(|v| *v > 0), "{flag} values must be > 0"); - let mut normalized = values.to_vec(); - normalized.sort_unstable(); - normalized.dedup(); - Ok(normalized) -} - -fn validate_run_args(args: &RunArgs) -> Result<()> { - ensure!(args.iters > 0, "--iters must be > 0"); - Ok(()) -} - -fn measure_timings( - model: &mut dyn BenchModel, - prompts: &[Vec], - output_len: usize, - run: &RunArgs, - cuda_profiler_capture: bool, -) -> Result> { - ensure!(output_len > 0, "--output-len must be > 0"); - ensure!(!prompts.is_empty(), "concurrency must be > 0"); - model.validate_concurrency(prompts.len())?; - validate_run_args(run)?; - - let sampling = SamplingParams { - ignore_eos: true, - ..SamplingParams::default() - }; - let mut rng = StdRng::seed_from_u64(run.seed); - - for _ in 0..run.warmup { - let _ = model.timed_generation_batch(prompts, output_len, &sampling, &mut rng); - } - - let profiler = if cuda_profiler_capture { - info!( - "Starting CUDA profiler capture around {} measured iterations", - run.iters - ); - cuda_device::set(0).context("failed to set CUDA device before profiler capture")?; - Some(Profiler::new().context("failed to start CUDA profiler capture")?) 
- } else { - None - }; - - let mut timings = Vec::with_capacity(run.iters * prompts.len()); - for _ in 0..run.iters { - timings.extend(model.timed_generation_batch(prompts, output_len, &sampling, &mut rng)); - } - drop(profiler); - Ok(timings) -} - -fn build_request_metrics(timings: &[GenTimings]) -> RequestMetrics { - let ttfts: Vec = timings.iter().map(|t| t.ttft).collect(); - let e2e: Vec = timings.iter().map(|t| t.total).collect(); - let first_steps: Vec = timings - .iter() - .filter_map(|t| t.tbt.first().copied()) - .collect(); - let steady: Vec = timings - .iter() - .flat_map(|t| t.tbt.iter().skip(1).copied()) - .collect(); - let generated: Vec = timings.iter().map(|t| t.emitted_tokens).collect(); - let generated_token_traces: Vec = timings - .iter() - .map(|timing| generated_token_trace(&timing.generated_tokens)) - .collect(); - - let total_emitted: usize = timings.iter().map(|t| t.emitted_tokens).sum(); - let total_request_time: Duration = timings.iter().map(|t| t.total).sum(); - let total_decode_steps: usize = timings.iter().map(|t| t.decode_tokens_for_rate).sum(); - let total_decode_time: Duration = timings.iter().map(|t| t.decode_time_for_rate).sum(); - - RequestMetrics { - ttft_ms: summarize_durations(&ttfts), - first_decode_step_ms: (!first_steps.is_empty()).then(|| summarize_durations(&first_steps)), - steady_tpot_ms: (!steady.is_empty()).then(|| summarize_durations(&steady)), - e2e_ms: summarize_durations(&e2e), - generated_tokens: summarize_counts(&generated), - generated_token_traces, - request_tok_s: aggregate_tok_s(total_emitted, total_request_time), - decode_tok_s: aggregate_tok_s(total_decode_steps, total_decode_time), - } -} - -fn build_request_iterations(timings: &[GenTimings]) -> Vec { - timings - .iter() - .enumerate() - .map(|(index, timing)| { - let steady: Vec = timing.tbt.iter().skip(1).copied().collect(); - RequestIterationTiming { - index, - ttft_ms: dur_ms(timing.ttft), - first_decode_step_ms: timing.tbt.first().copied().map(dur_ms), 
- steady_tpot_ms: (!steady.is_empty()).then(|| summarize_durations(&steady)), - e2e_ms: dur_ms(timing.total), - generated_tokens: timing.emitted_tokens, - generated_token_trace: generated_token_trace(&timing.generated_tokens), - } - }) - .collect() -} - -fn run_info( - cli: &Cli, - command: &'static str, - model_type: ModelType, - load_ms: f64, - cuda_graph: bool, -) -> RunInfo { - RunInfo { - command, - model_path: cli.model_path.clone(), - model_type: format!("{model_type:?}"), - cuda_graph, - load_ms, - label: cli.label.clone(), - } -} - -fn bench_request( - model: &mut dyn BenchModel, - tokenizer: &DynTokenizer, - cli: &Cli, - model_type: ModelType, - load_ms: f64, - cuda_graph: bool, - args: &RequestArgs, -) -> Result { - let mut prompt = resolve_prompt_input( - &args.prompt_input, - tokenizer, - Some(DEFAULT_REQUEST_PROMPT), - None, - )?; - // A `--prompt-len` workload is synthetic: give every concurrent request a - // distinct seeded-random prompt so the decode streams diverge and MoE - // routing is realistic. An explicit `--prompt`/`--prompt-file` (or the - // default text) is the caller's chosen prompt and is replicated as-is. - let synthetic = args.prompt_input.prompt_len.is_some(); - let prompts: Vec> = if synthetic { - // 0 = one distinct prompt per request (fully diverse). Otherwise tile - // `distinct` unique prompts across the batch: idx → idx % distinct. 
- let distinct = if args.distinct_prompts == 0 { - args.concurrency - } else { - args.distinct_prompts.min(args.concurrency) - }; - prompt.descriptor.source = format!( - "synthetic-random[{SYNTHETIC_TOKEN_LO}..{SYNTHETIC_TOKEN_HI}) seed={} distinct={distinct}/{}", - args.run.seed, args.concurrency - ); - (0..args.concurrency) - .map(|idx| synthetic_random_prompt(prompt.tokens.len(), args.run.seed, idx % distinct)) - .collect() - } else { - vec![prompt.tokens.clone(); args.concurrency] - }; - info!( - "Starting request benchmark: prompt_tokens={} output_len={} concurrency={} warmup={} iters={} seed={} source={}", - prompt.descriptor.prompt_tokens, - args.output_len, - args.concurrency, - args.run.warmup, - args.run.iters, - args.run.seed, - prompt.descriptor.source, - ); - let timings = measure_timings( - model, - &prompts, - args.output_len, - &args.run, - cli.cuda_profiler_capture, - )?; - Ok(BenchReport::Request(Box::new(RequestReport { - run: run_info(cli, "request", model_type, load_ms, cuda_graph), - workload: RequestWorkload { - prompt: prompt.descriptor, - output_len: args.output_len, - concurrency: args.concurrency, - warmup: args.run.warmup, - iters: args.run.iters, - seed: args.run.seed, - }, - metrics: build_request_metrics(&timings), - iterations: build_request_iterations(&timings), - }))) -} - -fn bench_matrix( - model: &mut dyn BenchModel, - cli: &Cli, - model_type: ModelType, - load_ms: f64, - cuda_graph: bool, - args: &MatrixArgs, -) -> Result { - validate_run_args(&args.run)?; - let prompt_lens = normalize_sizes(&args.prompt_lens, "--prompt-lens")?; - let output_lens = normalize_sizes(&args.output_lens, "--output-lens")?; - info!( - "Starting matrix benchmark: prompt_lens={:?} output_lens={:?} warmup={} iters={} seed={}", - prompt_lens, output_lens, args.run.warmup, args.run.iters, args.run.seed - ); - - let mut cells = Vec::with_capacity(prompt_lens.len() * output_lens.len()); - for &prompt_len in &prompt_lens { - let prompt_tokens = 
synthetic_prompt_tokens(prompt_len); - for &output_len in &output_lens { - debug!( - "Running matrix cell: prompt_len={} output_len={}", - prompt_len, output_len - ); - let timings = measure_timings( - model, - std::slice::from_ref(&prompt_tokens), - output_len, - &args.run, - cli.cuda_profiler_capture, - )?; - let metrics = build_request_metrics(&timings); - cells.push(MatrixCell { - prompt_len, - output_len, - ttft_ms: metrics.ttft_ms, - e2e_ms: metrics.e2e_ms, - first_decode_step_ms: metrics.first_decode_step_ms, - steady_tpot_ms: metrics.steady_tpot_ms, - generated_tokens: metrics.generated_tokens, - request_tok_s: metrics.request_tok_s, - decode_tok_s: metrics.decode_tok_s, - }); - } - } - - Ok(BenchReport::Matrix(MatrixReport { - run: run_info(cli, "matrix", model_type, load_ms, cuda_graph), - workload: MatrixWorkload { - prompt_lens, - output_lens, - warmup: args.run.warmup, - iters: args.run.iters, - seed: args.run.seed, - synthetic_pattern: SYNTHETIC_PATTERN, - }, - cells, - })) -} - -fn bench_curve( - model: &mut dyn BenchModel, - tokenizer: &DynTokenizer, - cli: &Cli, - model_type: ModelType, - load_ms: f64, - cuda_graph: bool, - args: &CurveArgs, -) -> Result { - ensure!(args.window > 0, "--window must be > 0"); - ensure!(args.output_len >= 2, "--output-len must be >= 2 for curve"); - - let prompt = resolve_prompt_input( - &args.prompt_input, - tokenizer, - None, - Some(DEFAULT_CURVE_PROMPT_LEN), - )?; - info!( - "Starting curve benchmark: prompt_tokens={} output_len={} window={} warmup={} iters={} seed={}", - prompt.descriptor.prompt_tokens, - args.output_len, - args.window, - args.run.warmup, - args.run.iters, - args.run.seed - ); - let timings = measure_timings( - model, - std::slice::from_ref(&prompt.tokens), - args.output_len, - &args.run, - cli.cuda_profiler_capture, - )?; - - let mut tbt_by_pos: Vec> = Vec::new(); - for timing in &timings { - for (idx, &duration) in timing.tbt.iter().enumerate() { - if idx >= tbt_by_pos.len() { - 
tbt_by_pos.push(Vec::with_capacity(args.run.iters)); - } - tbt_by_pos[idx].push(duration); - } - } - - let mut windows = Vec::new(); - let mut pos = 0usize; - while pos < tbt_by_pos.len() { - let end = (pos + args.window).min(tbt_by_pos.len()); - let mut samples = Vec::new(); - for bucket in &tbt_by_pos[pos..end] { - samples.extend_from_slice(bucket); - } - if !samples.is_empty() { - let stats = summarize_durations(&samples); - windows.push(CurveWindow { - ctx_start: prompt.descriptor.prompt_tokens + pos + 1, - ctx_end: prompt.descriptor.prompt_tokens + end, - decode_tok_s: (stats.avg_ms > 0.0).then(|| 1000.0 / stats.avg_ms), - tpot_ms: stats, - }); - } - pos = end; - } - - Ok(BenchReport::Curve(CurveReport { - run: run_info(cli, "curve", model_type, load_ms, cuda_graph), - workload: CurveWorkload { - prompt: prompt.descriptor, - output_len: args.output_len, - window: args.window, - warmup: args.run.warmup, - iters: args.run.iters, - seed: args.run.seed, - }, - windows, - })) -} - -fn render_text(report: &BenchReport) -> String { - let mut out = String::new(); - match report { - BenchReport::Request(report) => { - let _ = writeln!(out, "bench_serving request\n"); - push_table(&mut out, &render_request_meta(report)); - out.push('\n'); - push_table( - &mut out, - &render_duration_table( - std::iter::once(("ttft_ms".to_string(), report.metrics.ttft_ms.clone())) - .chain( - report - .metrics - .first_decode_step_ms - .clone() - .into_iter() - .map(|stats| ("first_decode_step_ms".to_string(), stats)), - ) - .chain( - report - .metrics - .steady_tpot_ms - .clone() - .into_iter() - .map(|stats| ("steady_tpot_ms".to_string(), stats)), - ) - .chain(std::iter::once(( - "e2e_ms".to_string(), - report.metrics.e2e_ms.clone(), - ))) - .collect(), - ), - ); - out.push('\n'); - push_table(&mut out, &render_request_summary(report)); - } - BenchReport::Matrix(report) => { - let _ = writeln!(out, "bench_serving matrix\n"); - push_table(&mut out, &render_matrix_meta(report)); - 
out.push('\n'); - push_table(&mut out, &render_matrix_table(report)); - } - BenchReport::Curve(report) => { - let _ = writeln!(out, "bench_serving curve\n"); - push_table(&mut out, &render_curve_meta(report)); - out.push('\n'); - push_table(&mut out, &render_curve_table(report)); - } - } - out -} - -fn emit_report(cli: &Cli, report: &BenchReport) -> Result<()> { - let rendered = match cli.format { - OutputFormat::Text => render_text(report), - OutputFormat::Json => serde_json::to_string_pretty(report)?, - }; - - if let Some(path) = &cli.out { - fs::write(path, &rendered).with_context(|| format!("failed to write report to {path}"))?; - info!("Wrote benchmark report to {}", path); - } - - println!("{rendered}"); - Ok(()) -} - -fn run_command( - cli: &Cli, - model_type: ModelType, - load_ms: f64, - cuda_graph: bool, - model: &mut dyn BenchModel, - tokenizer: &DynTokenizer, -) -> Result { - match &cli.command { - Command::Request(args) => { - bench_request(model, tokenizer, cli, model_type, load_ms, cuda_graph, args) - } - Command::Matrix(args) => bench_matrix(model, cli, model_type, load_ms, cuda_graph, args), - Command::Curve(args) => { - bench_curve(model, tokenizer, cli, model_type, load_ms, cuda_graph, args) - } - Command::Snapshot(_) | Command::Compare(_) => unreachable!(), - } -} - -// --------------------------------------------------------------------------- -// Snapshot / Compare -// --------------------------------------------------------------------------- - -fn shell_output(program: &str, args: &[&str]) -> Option { - std::process::Command::new(program) - .args(args) - .output() - .ok() - .filter(|o| o.status.success()) - .and_then(|o| String::from_utf8(o.stdout).ok()) - .map(|s| s.trim().to_string()) -} - -fn git_short_commit() -> String { - shell_output("git", &["rev-parse", "--short", "HEAD"]).unwrap_or_else(|| "unknown".into()) -} - -fn gpu_name() -> String { - shell_output( - "nvidia-smi", - &["--query-gpu=name", "--format=csv,noheader", "--id=0"], - ) 
- .unwrap_or_else(|| "unknown".into()) -} - -/// Produce a filesystem-safe slug from a GPU name string. -/// -/// `"NVIDIA GeForce RTX 5070 Ti"` → `"rtx-5070-ti"` -fn gpu_slug_from(name: &str) -> String { - let stripped = name - .strip_prefix("NVIDIA GeForce ") - .or_else(|| name.strip_prefix("NVIDIA ")) - .unwrap_or(name); - stripped - .to_lowercase() - .chars() - .map(|c| { - if c.is_alphanumeric() || c == '-' { - c - } else { - '-' - } - }) - .collect::() - .split('-') - .filter(|s| !s.is_empty()) - .collect::>() - .join("-") -} - -fn today_date() -> String { - shell_output("date", &["+%Y-%m-%d"]).unwrap_or_else(|| "unknown".into()) -} - -fn model_display_name(model_path: &str) -> String { - Path::new(model_path) - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or("unknown") - .to_string() -} - -fn delta_pct(current: f64, baseline: f64) -> f64 { - if baseline == 0.0 { - return 0.0; - } - (current - baseline) / baseline * 100.0 -} - -fn format_delta(pct: f64) -> String { - if pct >= 0.0 { - format!("+{pct:.1}%") - } else { - format!("{pct:.1}%") - } -} - -fn run_snapshot( - model: &mut dyn BenchModel, - cli: &Cli, - model_type: ModelType, - args: &SnapshotArgs, -) -> Result<()> { - let prefill_prompt_len = snapshot_prefill_prompt_len(model_type); - - info!("Running prefill-heavy ({prefill_prompt_len},{SNAPSHOT_PREFILL_OUTPUT_LEN})"); - let prefill_tokens = synthetic_prompt_tokens(prefill_prompt_len); - let prefill_timings = measure_timings( - model, - std::slice::from_ref(&prefill_tokens), - SNAPSHOT_PREFILL_OUTPUT_LEN, - &args.run, - cli.cuda_profiler_capture, - )?; - let prefill_metrics = build_request_metrics(&prefill_timings); - - info!("Running decode-heavy ({SNAPSHOT_DECODE_PROMPT_LEN},{SNAPSHOT_DECODE_OUTPUT_LEN})"); - let decode_tokens = synthetic_prompt_tokens(SNAPSHOT_DECODE_PROMPT_LEN); - let decode_timings = measure_timings( - model, - std::slice::from_ref(&decode_tokens), - SNAPSHOT_DECODE_OUTPUT_LEN, - &args.run, - cli.cuda_profiler_capture, - 
)?; - let decode_metrics = build_request_metrics(&decode_timings); - - let model_name = model_display_name(&cli.model_path); - let gpu = gpu_name(); - let parallel = match model_type { - #[cfg(feature = "kimi-k2")] - ModelType::KimiK2 => Some(format!( - "tp{}-dp{}-{}", - cli.tp_size, - cli.dp_size, - format!("{:?}", cli.ep_backend).to_lowercase() - )), - _ => None, - }; - let report = SnapshotReport { - commit: git_short_commit(), - date: today_date(), - model: model_name.clone(), - gpu: gpu.clone(), - parallel, - prefill_heavy: SnapshotProfile { - prompt_len: prefill_prompt_len, - output_len: SNAPSHOT_PREFILL_OUTPUT_LEN, - metrics: prefill_metrics, - }, - decode_heavy: SnapshotProfile { - prompt_len: SNAPSHOT_DECODE_PROMPT_LEN, - output_len: SNAPSHOT_DECODE_OUTPUT_LEN, - metrics: decode_metrics, - }, - }; - - let dir = Path::new(SNAPSHOT_DIR).join(gpu_slug_from(&gpu)); - fs::create_dir_all(&dir)?; - let filename = model_name.to_lowercase(); - let path = dir.join(format!("{filename}.json")); - let snapshot_json = serde_json::to_string_pretty(&report)?; - fs::write(&path, format!("{snapshot_json}\n"))?; - - println!("{}", render_snapshot_text(&report, &path)); - Ok(()) -} - -fn render_snapshot_text(report: &SnapshotReport, path: &Path) -> String { - let mut out = String::new(); - let _ = writeln!(out, "bench_serving snapshot\n"); - let _ = writeln!(out, "model: {}", report.model); - let _ = writeln!(out, "gpu: {}", report.gpu); - if let Some(parallel) = &report.parallel { - let _ = writeln!(out, "shape: {parallel}"); - } - let _ = writeln!(out, "commit: {}\n", report.commit); - let _ = writeln!( - out, - "prefill_heavy ({},{}):", - report.prefill_heavy.prompt_len, report.prefill_heavy.output_len - ); - let _ = writeln!( - out, - " TTFT p50={:.2}ms p99={:.2}ms", - report.prefill_heavy.metrics.ttft_ms.p50_ms, report.prefill_heavy.metrics.ttft_ms.p99_ms - ); - let _ = writeln!( - out, - "\ndecode_heavy ({},{}):", - report.decode_heavy.prompt_len, 
report.decode_heavy.output_len - ); - if let Some(tpot) = &report.decode_heavy.metrics.steady_tpot_ms { - let _ = writeln!( - out, - " TPOT p50={:.2}ms p99={:.2}ms", - tpot.p50_ms, tpot.p99_ms - ); - } - let _ = writeln!(out, "\nwritten to {}", path.display()); - out -} - -fn run_compare(args: &CompareArgs) -> Result<()> { - let current_content = fs::read_to_string(&args.path).with_context(|| { - format!( - "snapshot not found: {}\nrun `bench_serving snapshot` first", - args.path - ) - })?; - let current: SnapshotReport = - serde_json::from_str(¤t_content).context("failed to parse current snapshot")?; - - // Resolve repo-root-relative path for git show - let abs_path = fs::canonicalize(&args.path)?; - let toplevel = - shell_output("git", &["rev-parse", "--show-toplevel"]).context("not a git repository")?; - let root = PathBuf::from(&toplevel); - let rel_path = abs_path - .strip_prefix(&root) - .context("snapshot file is outside the git repository")?; - - let git_output = std::process::Command::new("git") - .args(["show", &format!("{}:{}", args.baseline, rel_path.display())]) - .output() - .context("failed to run git show")?; - - if !git_output.status.success() { - anyhow::bail!( - "no baseline at {}:{}\ncommit the current snapshot to establish a baseline", - args.baseline, - rel_path.display() - ); - } - - let baseline: SnapshotReport = - serde_json::from_slice(&git_output.stdout).context("failed to parse baseline snapshot")?; - - // Guard against comparing snapshots with different profile shapes - ensure!( - current.prefill_heavy.prompt_len == baseline.prefill_heavy.prompt_len - && current.prefill_heavy.output_len == baseline.prefill_heavy.output_len - && current.decode_heavy.prompt_len == baseline.decode_heavy.prompt_len - && current.decode_heavy.output_len == baseline.decode_heavy.output_len, - "profile shape mismatch: current ({},{}) + ({},{}) vs baseline ({},{}) + ({},{})\n\ - the snapshot profiles were changed — re-baseline by committing a fresh snapshot", - 
current.prefill_heavy.prompt_len, - current.prefill_heavy.output_len, - current.decode_heavy.prompt_len, - current.decode_heavy.output_len, - baseline.prefill_heavy.prompt_len, - baseline.prefill_heavy.output_len, - baseline.decode_heavy.prompt_len, - baseline.decode_heavy.output_len, - ); - println!("{}", render_comparison(¤t, &baseline, &args.baseline)); - Ok(()) -} - -fn render_comparison( - current: &SnapshotReport, - baseline: &SnapshotReport, - ref_name: &str, -) -> String { - let mut out = String::new(); - let _ = writeln!(out, "bench_serving compare\n"); - let _ = writeln!( - out, - "comparing {} (working tree) vs {} ({ref_name})\n", - current.commit, baseline.commit - ); - - let mut table = new_table(); - table.set_header(vec![ - Cell::new("metric"), - Cell::new("current").set_alignment(CellAlignment::Right), - Cell::new("baseline").set_alignment(CellAlignment::Right), - Cell::new("delta").set_alignment(CellAlignment::Right), - ]); - - let pf = ¤t.prefill_heavy; - let pf_b = &baseline.prefill_heavy; - let pf_label = format!("({},{})", pf.prompt_len, pf.output_len); - - for (stat, cur, base) in [ - ( - "p50", - pf.metrics.ttft_ms.p50_ms, - pf_b.metrics.ttft_ms.p50_ms, - ), - ( - "p99", - pf.metrics.ttft_ms.p99_ms, - pf_b.metrics.ttft_ms.p99_ms, - ), - ] { - table.add_row(vec![ - key_cell(format!("TTFT {stat} {pf_label}")), - numeric_cell(format!("{cur:.2}ms")), - numeric_cell(format!("{base:.2}ms")), - numeric_cell(format_delta(delta_pct(cur, base))), - ]); - } - - let dc_label = format!( - "({},{})", - current.decode_heavy.prompt_len, current.decode_heavy.output_len - ); - if let (Some(cur_tpot), Some(base_tpot)) = ( - ¤t.decode_heavy.metrics.steady_tpot_ms, - &baseline.decode_heavy.metrics.steady_tpot_ms, - ) { - for (stat, cur, base) in [ - ("p50", cur_tpot.p50_ms, base_tpot.p50_ms), - ("p99", cur_tpot.p99_ms, base_tpot.p99_ms), - ] { - table.add_row(vec![ - key_cell(format!("TPOT {stat} {dc_label}")), - numeric_cell(format!("{cur:.2}ms")), - 
numeric_cell(format!("{base:.2}ms")), - numeric_cell(format_delta(delta_pct(cur, base))), - ]); - } - } - - push_table(&mut out, &table); - - // Regression check - let mut regressions = Vec::new(); - let ttft_d = delta_pct( - current.prefill_heavy.metrics.ttft_ms.p50_ms, - baseline.prefill_heavy.metrics.ttft_ms.p50_ms, - ); - if ttft_d > REGRESSION_TTFT_PCT { - regressions.push(format!( - "TTFT p50 {ttft_d:+.1}% > {REGRESSION_TTFT_PCT}% threshold" - )); - } - if let (Some(cur), Some(base)) = ( - ¤t.decode_heavy.metrics.steady_tpot_ms, - &baseline.decode_heavy.metrics.steady_tpot_ms, - ) { - let tpot_d = delta_pct(cur.p50_ms, base.p50_ms); - if tpot_d > REGRESSION_TPOT_PCT { - regressions.push(format!( - "TPOT p50 {tpot_d:+.1}% > {REGRESSION_TPOT_PCT}% threshold" - )); - } - } - - out.push('\n'); - if regressions.is_empty() { - let _ = writeln!( - out, - "no regression detected (threshold: TPOT >{REGRESSION_TPOT_PCT}%, TTFT >{REGRESSION_TTFT_PCT}%)" - ); - } else { - let _ = writeln!(out, "REGRESSION DETECTED:"); - for r in ®ressions { - let _ = writeln!(out, " {r}"); - } - } - - out -} - -fn dispatch( - cli: &Cli, - model_type: ModelType, - load_ms: f64, - cuda_graph: bool, - model: &mut dyn BenchModel, - tokenizer: &DynTokenizer, -) -> Result<()> { - if let Command::Snapshot(args) = &cli.command { - run_snapshot(model, cli, model_type, args) - } else { - let report = run_command(cli, model_type, load_ms, cuda_graph, model, tokenizer)?; - emit_report(cli, &report) - } -} - -fn main() -> Result<()> { - logging::init_default(); - - let cli = Cli::parse(); - - // Compare needs no model loading - if let Command::Compare(ref args) = cli.command { - return run_compare(args); - } - - debug!( - "bench_serving starting: command={} model_path={} cuda_graph={} format={:?}", - match &cli.command { - Command::Request(_) => "request", - Command::Matrix(_) => "matrix", - Command::Curve(_) => "curve", - Command::Snapshot(_) => "snapshot", - Command::Compare(_) => "compare", - }, - 
cli.model_path, - cli.cuda_graph, - cli.format - ); - let model_type = detect_model_type(&cli.model_path) - .with_context(|| format!("failed to detect model type from {}", cli.model_path))?; - debug!("Detected model type: {:?}", model_type); - let load_start = Instant::now(); - - match model_type { - #[cfg(feature = "deepseek-v2-lite")] - ModelType::DeepSeekV2Lite => { - let generator = openinfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator::load( - Path::new(&cli.model_path), - EngineLoadOptions { - enable_cuda_graph: false, - enable_prefill_profile: false, - device_ordinals: vec![0, 1], - parallel_config: None, - ep_backend: EpBackend::Nccl, - seed: command_seed(&cli), - }, - )?; - let tokenizer = load_vllm_tokenizer(&cli.model_path)?; - let load_ms = dur_ms(load_start.elapsed()); - let mut bench = DeepSeekV2LiteBenchModel { generator }; - dispatch(&cli, model_type, load_ms, false, &mut bench, &tokenizer) - } - #[cfg(feature = "deepseek-v4")] - ModelType::DeepSeekV4 => { - let handle = openinfer_deepseek_v4::start_engine( - Path::new(&cli.model_path), - EngineLoadOptions { - enable_cuda_graph: false, - enable_prefill_profile: false, - device_ordinals: (0..8).collect(), - parallel_config: None, - ep_backend: EpBackend::Nccl, - seed: command_seed(&cli), - }, - )?; - let tokenizer = load_vllm_tokenizer(&cli.model_path)?; - let load_ms = dur_ms(load_start.elapsed()); - let mut bench = SchedulerBenchModel { handle }; - dispatch(&cli, model_type, load_ms, false, &mut bench, &tokenizer) - } - #[cfg(feature = "kimi-k2")] - ModelType::KimiK2 => { - let parallel = kimi_parallel_config(cli.tp_size, cli.dp_size)?; - let handle = openinfer_kimi_k2::start_engine( - Path::new(&cli.model_path), - EngineLoadOptions { - enable_cuda_graph: cli.cuda_graph, - enable_prefill_profile: false, - device_ordinals: (0..parallel.ep_world()).collect(), - parallel_config: Some(parallel), - ep_backend: cli.ep_backend.into(), - seed: command_seed(&cli), - }, - )?; - let tokenizer = 
load_vllm_tokenizer(&cli.model_path)?; - let load_ms = dur_ms(load_start.elapsed()); - let mut bench = SchedulerBenchModel { handle }; - dispatch( - &cli, - model_type, - load_ms, - cli.cuda_graph, - &mut bench, - &tokenizer, - ) - } - #[cfg(feature = "qwen3-4b")] - ModelType::Qwen3 => { - let handle = openinfer_qwen3_4b::start_engine( - Path::new(&cli.model_path), - EngineLoadOptions { - enable_cuda_graph: cli.cuda_graph, - enable_prefill_profile: false, - device_ordinals: vec![0], - parallel_config: None, - ep_backend: EpBackend::Nccl, - seed: command_seed(&cli), - }, - )?; - let tokenizer = load_vllm_tokenizer(&cli.model_path)?; - let load_ms = dur_ms(load_start.elapsed()); - let mut bench = SchedulerBenchModel { handle }; - dispatch( - &cli, - model_type, - load_ms, - cli.cuda_graph, - &mut bench, - &tokenizer, - ) - } - #[cfg(feature = "qwen35-4b")] - ModelType::Qwen35 => { - let handle = openinfer_qwen35_4b::start_engine_with_capacity( - Path::new(&cli.model_path), - EngineLoadOptions { - enable_cuda_graph: cli.cuda_graph, - enable_prefill_profile: false, - device_ordinals: vec![0], - parallel_config: None, - ep_backend: EpBackend::Nccl, - seed: command_seed(&cli), - }, - 4, - )?; - let tokenizer = load_vllm_tokenizer(&cli.model_path)?; - let load_ms = dur_ms(load_start.elapsed()); - let mut bench = SchedulerBenchModel { handle }; - dispatch( - &cli, - model_type, - load_ms, - cli.cuda_graph, - &mut bench, - &tokenizer, - ) - } - } -} - -#[cfg(all(test, feature = "deepseek-v2-lite"))] -mod tests { - use super::*; - - #[test] - fn dsv2_lite_sampling_contract_accepts_bench_params() { - let sampling = SamplingParams { - ignore_eos: true, - ..SamplingParams::default() - }; - - assert_dsv2_lite_sampling_contract(&sampling); - } - - #[test] - #[should_panic(expected = "supports greedy decoding only")] - fn dsv2_lite_sampling_contract_rejects_non_greedy_params() { - let sampling = SamplingParams { - temperature: 0.8, - top_k: -1, - top_p: 0.95, - ignore_eos: true, - 
}; - - assert_dsv2_lite_sampling_contract(&sampling); - } - - #[test] - #[should_panic(expected = "requires ignore_eos=true")] - fn dsv2_lite_sampling_contract_rejects_eos_enabled_params() { - let sampling = SamplingParams { - ignore_eos: false, - ..SamplingParams::default() - }; - - assert_dsv2_lite_sampling_contract(&sampling); - } - - #[test] - fn dsv2_lite_attribution_timings_preserve_decode_steps() { - let timings = timings_from_dsv2_lite_attribution( - vec![11, 304, 608], - 3, - 60_000, - Some(20_000), - &[19_000, 18_000], - ); - - assert_eq!(timings.ttft, Duration::from_micros(20_000)); - assert_eq!( - timings.tbt, - vec![Duration::from_micros(19_000), Duration::from_micros(18_000)] - ); - assert_eq!(timings.total, Duration::from_micros(60_000)); - assert_eq!(timings.emitted_tokens, 3); - assert_eq!(timings.generated_tokens, vec![11, 304, 608]); - assert_eq!(timings.decode_tokens_for_rate, 2); - assert_eq!(timings.decode_time_for_rate, Duration::from_micros(37_000)); - } - - #[test] - fn dsv2_lite_batched_timings_use_shared_decode_time_for_rate() { - let timings = timings_from_dsv2_lite_batched_generation( - openinfer_deepseek_v2_lite::BatchedGenerationResult { - tokens: vec![vec![11, 304, 608], vec![11, 304, 608]], - prefill_next_token_us: vec![20_000, 21_000], - per_token_decode_us: vec![19_000, 18_000], - total_generation_us: 80_000, - stats: openinfer_deepseek_v2_lite::GenerationStats::default(), - }, - 3, - ); - - assert_eq!(timings.len(), 2); - assert_eq!(timings[0].decode_tokens_for_rate, 4); - assert_eq!( - timings[0].decode_time_for_rate, - Duration::from_micros(37_000) - ); - assert_eq!(timings[1].decode_tokens_for_rate, 0); - assert_eq!(timings[1].decode_time_for_rate, Duration::ZERO); - - let metrics = build_request_metrics(&timings); - assert_eq!(metrics.steady_tpot_ms.unwrap().p50_ms, 18.0); - assert!( - metrics.decode_tok_s.unwrap() > 100.0, - "batched decode tok/s should use one shared step duration instead of duplicating it per row" - ); - } 
- - #[test] - #[should_panic(expected = "timing count mismatch")] - fn dsv2_lite_attribution_timings_fail_on_missing_decode_samples() { - let _ = timings_from_dsv2_lite_attribution( - vec![11, 304, 608], - 3, - 60_000, - Some(20_000), - &[19_000], - ); - } - - #[test] - #[should_panic(expected = "generated token count mismatch")] - fn dsv2_lite_attribution_timings_fail_on_short_generation() { - let _ = - timings_from_dsv2_lite_attribution(vec![11, 304], 3, 60_000, Some(20_000), &[19_000]); - } - - #[test] - #[should_panic(expected = "zero-duration")] - fn dsv2_lite_attribution_timings_fail_on_zero_decode_samples() { - let _ = timings_from_dsv2_lite_attribution(vec![11, 304], 2, 60_000, Some(20_000), &[0]); - } - - #[test] - #[should_panic(expected = "total generation timing is zero")] - fn dsv2_lite_attribution_timings_fail_on_zero_total_generation() { - let _ = timings_from_dsv2_lite_attribution(vec![11, 304], 2, 0, Some(20_000), &[19_000]); - } - - #[test] - #[should_panic(expected = "TTFT timing is missing or zero")] - fn dsv2_lite_attribution_timings_fail_on_missing_ttft() { - let _ = timings_from_dsv2_lite_attribution(vec![11, 304], 2, 60_000, None, &[19_000]); - } - - #[test] - #[should_panic(expected = "TTFT timing is missing or zero")] - fn dsv2_lite_attribution_timings_fail_on_zero_ttft() { - let _ = timings_from_dsv2_lite_attribution(vec![11, 304], 2, 60_000, Some(0), &[19_000]); - } -} diff --git a/openinfer-server/src/bin/bench_serving/cli.rs b/openinfer-server/src/bin/bench_serving/cli.rs new file mode 100644 index 00000000..da9d1202 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/cli.rs @@ -0,0 +1,231 @@ +//! CLI surface: global options, subcommands, and per-command argument structs. 
+ +use clap::{Args as ClapArgs, Parser, Subcommand, ValueEnum}; +use openinfer_core::engine::EpBackend; + +pub(crate) const DEFAULT_MODEL_PATH: &str = + concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); +pub(crate) const TOP_LEVEL_EXAMPLES: &str = "\ +Examples: + cargo run -r --bin bench_serving -- request + cargo run -r --bin bench_serving -- request --prompt \"Tell me a story about Rust\" --output-len 128 + cargo run -r --bin bench_serving -- request --prompt-len 512 --output-len 64 + cargo run -r --bin bench_serving -- matrix --prompt-lens 32,128,512,2048 --output-lens 32,128,256 + cargo run -r --bin bench_serving -- curve --prompt-len 1024 --output-len 256 --window 32 + cargo run -r --bin bench_serving -- --format json --out bench.json request --prompt-len 512 --output-len 64 + cargo run -r --bin bench_serving -- snapshot + cargo run -r --bin bench_serving -- compare bench_snapshots/rtx-5070-ti/qwen3-4b.json"; +pub(crate) const REQUEST_EXAMPLES: &str = "\ +Examples: + cargo run -r --bin bench_serving -- request + cargo run -r --bin bench_serving -- request --prompt \"Tell me a story about Rust\" --output-len 128 + cargo run -r --bin bench_serving -- request --prompt-file prompts/story.txt --output-len 128 + cargo run -r --bin bench_serving -- request --prompt-len 512 --output-len 64 --warmup 3 --iters 10"; +pub(crate) const MATRIX_EXAMPLES: &str = "\ +Examples: + cargo run -r --bin bench_serving -- matrix + cargo run -r --bin bench_serving -- matrix --prompt-lens 32,128,512,2048 --output-lens 32,128,256 + cargo run -r --bin bench_serving -- --format json --out matrix.json matrix --prompt-lens 128,512 --output-lens 64,256"; +pub(crate) const CURVE_EXAMPLES: &str = "\ +Examples: + cargo run -r --bin bench_serving -- curve + cargo run -r --bin bench_serving -- curve --prompt-len 1024 --output-len 256 --window 32 + cargo run -r --bin bench_serving -- curve --prompt \"Summarize KV cache behavior\" --output-len 128 --window 16"; +pub(crate) const 
SNAPSHOT_EXAMPLES: &str = "\ +Examples: + cargo run -r --bin bench_serving -- snapshot + cargo run -r --bin bench_serving -- snapshot --warmup 3 --iters 10"; +pub(crate) const COMPARE_EXAMPLES: &str = "\ +Examples: + cargo run -r --bin bench_serving -- compare bench_snapshots/rtx-5070-ti/qwen3-4b.json + cargo run -r --bin bench_serving -- compare bench_snapshots/rtx-5070-ti/qwen3-4b.json --baseline HEAD~3"; + +#[derive(Debug, Clone, Copy, ValueEnum)] +pub(crate) enum OutputFormat { + Text, + Json, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +pub(crate) enum CliEpBackend { + Nccl, + #[value(name = "deepep")] + DeepEp, +} + +impl From<CliEpBackend> for EpBackend { + fn from(value: CliEpBackend) -> Self { + match value { + CliEpBackend::Nccl => Self::Nccl, + CliEpBackend::DeepEp => Self::DeepEp, + } + } +} + +#[derive(Debug, Subcommand)] +pub(crate) enum Command { + /// Measure one request shape end-to-end. + #[command(after_help = REQUEST_EXAMPLES)] + Request(RequestArgs), + /// Sweep prompt_len x output_len and summarize each cell. + #[command(after_help = MATRIX_EXAMPLES)] + Matrix(MatrixArgs), + /// Measure TPOT as context grows during decode. + #[command(after_help = CURVE_EXAMPLES)] + Curve(CurveArgs), + /// Run standard profiles and write a regression-trackable snapshot. + #[command(after_help = SNAPSHOT_EXAMPLES)] + Snapshot(SnapshotArgs), + /// Compare a snapshot against its git baseline. 
+ #[command(after_help = COMPARE_EXAMPLES)] + Compare(CompareArgs), +} + +#[derive(Parser, Debug)] +#[command( + name = "bench_serving", + about = "openinfer in-process inference benchmark", + after_help = TOP_LEVEL_EXAMPLES +)] +pub(crate) struct Cli { + /// Model directory (contains config.json, tokenizer, safetensors) + #[arg(long, default_value = DEFAULT_MODEL_PATH)] + pub(crate) model_path: String, + + /// Enable CUDA graph on decode path + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + pub(crate) cuda_graph: bool, + + /// Render result to terminal as text or structured JSON + #[arg(long, default_value = "text")] + pub(crate) format: OutputFormat, + + /// Optional label to tag this benchmark run + #[arg(long)] + pub(crate) label: Option<String>, + + /// Optional output path for the rendered report + #[arg(long)] + pub(crate) out: Option<String>, + + /// Capture only measured iterations for nsys `-c cudaProfilerApi` + #[arg(long, default_value_t = false)] + pub(crate) cuda_profiler_capture: bool, + + /// Tensor-parallel world size for Kimi-K2 + #[arg(long, default_value_t = 1)] + pub(crate) tp_size: usize, + + /// Data-parallel world size for Kimi-K2 + #[arg(long, default_value_t = 8)] + pub(crate) dp_size: usize, + + /// Expert-parallel backend for Kimi-K2 (TP1/DP8 requires deepep; TP8/DP1 requires nccl) + #[arg(long, default_value = "deepep")] + pub(crate) ep_backend: CliEpBackend, + + #[command(subcommand)] + pub(crate) command: Command, +} + +#[derive(Debug, Clone, ClapArgs)] +pub(crate) struct PromptInputArgs { + /// Inline prompt text + #[arg(long, conflicts_with_all = ["prompt_file", "prompt_len"])] + pub(crate) prompt: Option<String>, + + /// Read prompt text from file + #[arg(long, conflicts_with_all = ["prompt", "prompt_len"])] + pub(crate) prompt_file: Option<String>, + + /// Use a synthetic prompt with exactly this many token ids + #[arg(long, conflicts_with_all = ["prompt", "prompt_file"])] + pub(crate) prompt_len: Option<usize>, +} + +#[derive(Debug, Clone, 
ClapArgs)] +pub(crate) struct RunArgs { + /// Warmup iterations + #[arg(long, default_value_t = 5)] + pub(crate) warmup: usize, + + /// Measured iterations + #[arg(long, default_value_t = 20)] + pub(crate) iters: usize, + + /// RNG seed (matters once sampling becomes non-greedy) + #[arg(long, default_value_t = 42)] + pub(crate) seed: u64, +} + +#[derive(Debug, ClapArgs)] +pub(crate) struct RequestArgs { + #[command(flatten)] + pub(crate) prompt_input: PromptInputArgs, + + /// Max generated tokens + #[arg(long, default_value_t = 64)] + pub(crate) output_len: usize, + + /// Number of concurrent requests per measured iteration + #[arg(long, default_value_t = 1)] + pub(crate) concurrency: usize, + + /// Number of *distinct* synthetic prompts to tile across the concurrent + /// batch (0 = one per request, fully diverse). `1` makes every concurrent + /// request identical, which collapses MoE routing onto a narrow expert set + /// and under-measures decode TPOT — sweep this to quantify the + /// routing-diversity → TPOT curve (see the MoE bench-diversity lesson). 
+ #[arg(long, default_value_t = 0)] + pub(crate) distinct_prompts: usize, + + #[command(flatten)] + pub(crate) run: RunArgs, +} + +#[derive(Debug, ClapArgs)] +pub(crate) struct MatrixArgs { + /// Synthetic prompt lengths to sweep + #[arg(long, value_delimiter = ',', default_value = "32,128,512,2048")] + pub(crate) prompt_lens: Vec<usize>, + + /// Output lengths to sweep + #[arg(long, value_delimiter = ',', default_value = "32,128,256")] + pub(crate) output_lens: Vec<usize>, + + #[command(flatten)] + pub(crate) run: RunArgs, +} + +#[derive(Debug, ClapArgs)] +pub(crate) struct CurveArgs { + #[command(flatten)] + pub(crate) prompt_input: PromptInputArgs, + + /// Max generated tokens + #[arg(long, default_value_t = 256)] + pub(crate) output_len: usize, + + /// Group decode positions into windows of this size + #[arg(long, default_value_t = 32)] + pub(crate) window: usize, + + #[command(flatten)] + pub(crate) run: RunArgs, +} + +#[derive(Debug, ClapArgs)] +pub(crate) struct SnapshotArgs { + #[command(flatten)] + pub(crate) run: RunArgs, +} + +#[derive(Debug, ClapArgs)] +pub(crate) struct CompareArgs { + /// Path to snapshot JSON file + pub(crate) path: String, + + /// Git ref to compare against + #[arg(long, default_value = "HEAD")] + pub(crate) baseline: String, +} diff --git a/openinfer-server/src/bin/bench_serving/exec.rs b/openinfer-server/src/bin/bench_serving/exec.rs new file mode 100644 index 00000000..7453e4b9 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/exec.rs @@ -0,0 +1,408 @@ +//! Generation timing harness: the BenchModel trait, the timed-run loop, +//! the scheduler stream helper, and the per-model bench adapters. 
+ +use std::thread; +use std::time::{Duration, Instant}; + +use anyhow::{Result, ensure}; +use rand::rngs::StdRng; +use tokio::sync::mpsc; + +use openinfer::sampler::SamplingParams; +use openinfer::scheduler::{SchedulerHandle, SchedulerRequest, TokenEvent}; + +pub(crate) struct GenTimings { + pub(crate) ttft: Duration, + pub(crate) tbt: Vec<Duration>, + pub(crate) total: Duration, + pub(crate) emitted_tokens: usize, + pub(crate) generated_tokens: Vec<u32>, + pub(crate) decode_tokens_for_rate: usize, + pub(crate) decode_time_for_rate: Duration, +} + +pub(crate) trait BenchModel { + fn validate_concurrency(&self, concurrency: usize) -> Result<()> { + ensure!(concurrency > 0, "--concurrency must be > 0"); + Ok(()) + } + + fn timed_generation( + &mut self, + prompt_tokens: &[u32], + max_new_tokens: usize, + sampling: &SamplingParams, + rng: &mut StdRng, + ) -> GenTimings; + + /// Run one request per prompt; the slice length is the concurrency. Each + /// prompt is independent, so MoE models must be handed *distinct* prompts + /// to exercise realistic expert routing (see `synthetic_random_prompt`). 
+ fn timed_generation_batch( + &mut self, + prompts: &[Vec<u32>], + max_new_tokens: usize, + sampling: &SamplingParams, + rng: &mut StdRng, + ) -> Vec<GenTimings> { + prompts + .iter() + .map(|prompt| self.timed_generation(prompt, max_new_tokens, sampling, rng)) + .collect() + } +} + +pub(crate) fn run_timed<F>( + prompt_tokens: &[u32], + max_new_tokens: usize, + mut generate: F, +) -> GenTimings +where + F: FnMut(&[u32], usize, &mut dyn FnMut(u32) -> bool) -> Result<()>, +{ + let start = Instant::now(); + let mut first_at: Option<Instant> = None; + let mut prev_at: Option<Instant> = None; + let mut emitted_tokens = 0usize; + let mut tbt = Vec::with_capacity(max_new_tokens.saturating_sub(1)); + let mut generated_tokens = Vec::with_capacity(max_new_tokens); + + generate(prompt_tokens, max_new_tokens, &mut |tok| { + let now = Instant::now(); + emitted_tokens += 1; + generated_tokens.push(tok); + if first_at.is_none() { + first_at = Some(now); + } else if let Some(prev) = prev_at { + tbt.push(now - prev); + } + prev_at = Some(now); + true + }) + .expect("generation failed"); + + let total = start.elapsed(); + let ttft = first_at.map_or(total, |t| t - start); + let decode_tokens_for_rate = emitted_tokens.saturating_sub(1); + let decode_time_for_rate = tbt.iter().copied().sum(); + GenTimings { + ttft, + tbt, + total, + emitted_tokens, + generated_tokens, + decode_tokens_for_rate, + decode_time_for_rate, + } +} + +/// Submit a single request to the scheduler and drain its token stream, +/// invoking `on_token` for each generated token id. Returns when the request +/// finishes, `on_token` returns false (early stop), or an error/closed event +/// arrives. Owns its args and borrows the handle so it composes inside a +/// `thread::spawn(move)` worker with a cloned `SchedulerHandle`. 
+pub(crate) fn run_scheduler_stream( + handle: &SchedulerHandle, + request_id: Option<String>, + prompt_tokens: Vec<u32>, + params: SamplingParams, + max_tokens: usize, + mut on_token: impl FnMut(u32) -> bool, +) -> Result<()> { + let (token_tx, mut token_rx) = mpsc::unbounded_channel(); + handle + .submit(SchedulerRequest { + request_id, + queued_at_unix_s: None, + prompt_tokens, + params, + max_tokens, + lora_adapter: None, + token_tx, + logprobs: 0, + echo: false, + }) + .map_err(|e| anyhow::anyhow!("scheduler submit failed: {e}"))?; + + loop { + match token_rx.blocking_recv() { + Some(TokenEvent::Token { id, .. }) => { + if !on_token(id) { + return Ok(()); + } + } + Some(TokenEvent::PromptTokens { .. } | TokenEvent::Scheduled { .. }) => {} + Some(TokenEvent::Finished { .. }) => return Ok(()), + Some(TokenEvent::Error { message, .. }) => { + anyhow::bail!("scheduler request failed: {message}"); + } + Some(TokenEvent::Rejected { message, .. }) => { + anyhow::bail!("scheduler request rejected: {message}"); + } + None => anyhow::bail!("scheduler channel closed"), + } + } +} + +pub(crate) struct SchedulerBenchModel { + pub(crate) handle: SchedulerHandle, +} + +impl BenchModel for SchedulerBenchModel { + fn timed_generation( + &mut self, + prompt_tokens: &[u32], + max_new_tokens: usize, + sampling: &SamplingParams, + _rng: &mut StdRng, + ) -> GenTimings { + run_timed(prompt_tokens, max_new_tokens, |toks, n, cb| { + run_scheduler_stream(&self.handle, None, toks.to_vec(), *sampling, n, |id| cb(id))?; + Ok(()) + }) + } + + fn timed_generation_batch( + &mut self, + prompts: &[Vec<u32>], + max_new_tokens: usize, + sampling: &SamplingParams, + _rng: &mut StdRng, + ) -> Vec<GenTimings> { + let mut workers = Vec::with_capacity(prompts.len()); + for (idx, prompt) in prompts.iter().enumerate() { + let handle = self.handle.clone(); + let prompt_tokens = prompt.clone(); + let sampling = *sampling; + workers.push(thread::spawn(move || { + run_timed(&prompt_tokens, max_new_tokens, |toks, n, cb| { + 
run_scheduler_stream( + &handle, + Some(format!("bench-serving-{idx}")), + toks.to_vec(), + sampling, + n, + |id| cb(id), + )?; + Ok(()) + }) + })); + } + + workers + .into_iter() + .map(|worker| worker.join().expect("bench request worker panicked")) + .collect() + } +} + +#[cfg(feature = "deepseek-v2-lite")] +pub(crate) struct DeepSeekV2LiteBenchModel { + pub(crate) generator: openinfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator, +} + +#[cfg(feature = "deepseek-v2-lite")] +impl BenchModel for DeepSeekV2LiteBenchModel { + fn validate_concurrency(&self, concurrency: usize) -> Result<()> { + ensure!( + concurrency > 0 && concurrency <= 8, + "DeepSeek-V2-Lite direct benchmark supports --concurrency 1..=8; concurrency=1 is the single-row control and >1 uses the narrow same-prompt batched decode path, got {concurrency}" + ); + Ok(()) + } + + fn timed_generation( + &mut self, + prompt_tokens: &[u32], + max_new_tokens: usize, + sampling: &SamplingParams, + _rng: &mut StdRng, + ) -> GenTimings { + assert_dsv2_lite_sampling_contract(sampling); + let (result, attribution) = self + .generator + .generate_greedy_with_attribution(prompt_tokens, max_new_tokens, sampling.ignore_eos) + .expect("DeepSeek-V2-Lite generation failed"); + timings_from_dsv2_lite_attribution( + result.tokens, + max_new_tokens, + attribution.total_generation_us(), + attribution.prefill_next_token_us(), + attribution.per_token_decode_us(), + ) + } + + fn timed_generation_batch( + &mut self, + prompts: &[Vec<u32>], + max_new_tokens: usize, + sampling: &SamplingParams, + _rng: &mut StdRng, + ) -> Vec<GenTimings> { + assert_dsv2_lite_sampling_contract(sampling); + if prompts.len() == 1 { + return vec![self.timed_generation(&prompts[0], max_new_tokens, sampling, _rng)]; + } + + // This generator drives a narrow same-prompt batched decode kernel: + // every row shares `prompts[0]`. Distinct per-request prompts are a + // scheduler-path concern; this microbench takes one prompt by design. 
+ let result = self + .generator + .generate_greedy_batch_same_prompt_with_timings( + &prompts[0], + prompts.len(), + max_new_tokens, + sampling.ignore_eos, + ) + .expect("DeepSeek-V2-Lite batched generation failed"); + timings_from_dsv2_lite_batched_generation(result, max_new_tokens) + } +} + +#[cfg(feature = "deepseek-v2-lite")] +pub(crate) fn assert_dsv2_lite_sampling_contract(sampling: &SamplingParams) { + assert!( + sampling.ignore_eos, + "DeepSeek-V2-Lite direct attribution benchmark requires ignore_eos=true so output_len maps to an exact generated-token count" + ); + assert!( + (sampling.temperature <= 0.0 || sampling.top_k == 1) && sampling.top_p >= 1.0, + "DeepSeek-V2-Lite direct attribution benchmark supports greedy decoding only; requested temperature={}, top_k={}, top_p={}", + sampling.temperature, + sampling.top_k, + sampling.top_p + ); +} + +#[cfg(feature = "deepseek-v2-lite")] +pub(crate) fn timings_from_dsv2_lite_attribution( + generated_token_ids: Vec<u32>, + expected_generated_tokens: usize, + total_generation_us: u64, + prefill_next_token_us: Option<u64>, + per_token_decode_us: &[u64], +) -> GenTimings { + // This bench helper intentionally panics on corrupted attribution data rather + // than synthesizing a result. The surrounding trait does not carry errors, + // and emitting bogus TPOT would be worse than aborting the benchmark. 
+ let emitted_tokens = generated_token_ids.len(); + assert_eq!( + emitted_tokens, expected_generated_tokens, + "DeepSeek-V2-Lite generated token count mismatch: got {} tokens for requested output_len={}", + emitted_tokens, expected_generated_tokens + ); + let expected_decode_steps = expected_generated_tokens.saturating_sub(1); + assert_eq!( + per_token_decode_us.len(), + expected_decode_steps, + "DeepSeek-V2-Lite timing count mismatch: got {} decode samples for {} generated tokens", + per_token_decode_us.len(), + emitted_tokens + ); + assert!( + total_generation_us > 0, + "DeepSeek-V2-Lite total generation timing is zero; refusing to report TPOT" + ); + if emitted_tokens > 0 { + assert!( + prefill_next_token_us.is_some_and(|us| us > 0), + "DeepSeek-V2-Lite TTFT timing is missing or zero; refusing to report TPOT" + ); + } + if expected_decode_steps > 0 { + assert!( + per_token_decode_us.iter().all(|us| *us > 0), + "DeepSeek-V2-Lite decode timing contains a zero-duration sample; refusing to report TPOT" + ); + } + let tbt: Vec<_> = per_token_decode_us + .iter() + .map(|us| Duration::from_micros(*us)) + .collect(); + let decode_time_for_rate = tbt.iter().copied().sum(); + GenTimings { + ttft: Duration::from_micros(prefill_next_token_us.unwrap_or(total_generation_us)), + tbt, + total: Duration::from_micros(total_generation_us), + emitted_tokens, + generated_tokens: generated_token_ids, + decode_tokens_for_rate: emitted_tokens.saturating_sub(1), + decode_time_for_rate, + } +} + +#[cfg(feature = "deepseek-v2-lite")] +pub(crate) fn timings_from_dsv2_lite_batched_generation( + result: openinfer_deepseek_v2_lite::BatchedGenerationResult, + expected_generated_tokens: usize, +) -> Vec<GenTimings> { + let batch_size = result.tokens.len(); + assert!( + batch_size > 0, + "DeepSeek-V2-Lite batch result must contain at least one row" + ); + assert_eq!( + result.prefill_next_token_us.len(), + batch_size, + "DeepSeek-V2-Lite batch result TTFT count mismatch" + ); + assert!( + 
result.total_generation_us > 0, + "DeepSeek-V2-Lite batch total generation timing is zero; refusing to report TPOT" + ); + assert!( + result.prefill_next_token_us.iter().all(|us| *us > 0), + "DeepSeek-V2-Lite batch TTFT timing contains a zero-duration sample; refusing to report TPOT" + ); + let expected_decode_steps = expected_generated_tokens.saturating_sub(1); + assert_eq!( + result.per_token_decode_us.len(), + expected_decode_steps, + "DeepSeek-V2-Lite batch timing count mismatch: got {} decode samples for {} generated tokens", + result.per_token_decode_us.len(), + expected_generated_tokens + ); + if expected_decode_steps > 0 { + assert!( + result.per_token_decode_us.iter().all(|us| *us > 0), + "DeepSeek-V2-Lite batch decode timing contains a zero-duration sample; refusing to report TPOT" + ); + } + + let tbt: Vec<_> = result + .per_token_decode_us + .iter() + .map(|us| Duration::from_micros(*us)) + .collect(); + let decode_time_for_rate: Duration = tbt.iter().copied().sum(); + let decode_tokens_for_rate = batch_size * expected_decode_steps; + + result + .tokens + .into_iter() + .zip(result.prefill_next_token_us) + .enumerate() + .map(|(idx, (generated_token_ids, prefill_us))| { + let emitted_tokens = generated_token_ids.len(); + assert_eq!( + emitted_tokens, expected_generated_tokens, + "DeepSeek-V2-Lite batch row {idx} generated token count mismatch: got {} tokens for requested output_len={}", + emitted_tokens, expected_generated_tokens + ); + GenTimings { + ttft: Duration::from_micros(prefill_us), + tbt: tbt.clone(), + total: Duration::from_micros(result.total_generation_us), + emitted_tokens, + generated_tokens: generated_token_ids, + decode_tokens_for_rate: if idx == 0 { decode_tokens_for_rate } else { 0 }, + decode_time_for_rate: if idx == 0 { + decode_time_for_rate + } else { + Duration::ZERO + }, + } + }) + .collect() +} diff --git a/openinfer-server/src/bin/bench_serving/main.rs b/openinfer-server/src/bin/bench_serving/main.rs new file mode 100644 index 
00000000..9b81fda3 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/main.rs @@ -0,0 +1,335 @@ +//! In-process inference benchmark CLI. +//! +//! Usage: +//! cargo run -r --bin bench_serving -- [GLOBAL_OPTIONS] <COMMAND> [OPTIONS] +//! +//! Examples: +//! cargo run -r --bin bench_serving -- request --prompt "Tell me a story" --output-len 128 +//! cargo run -r --bin bench_serving -- request --prompt-len 512 --output-len 64 +//! cargo run -r --bin bench_serving -- matrix --prompt-lens 32,128,512 --output-lens 32,128 +//! cargo run -r --bin bench_serving -- curve --prompt-len 1024 --output-len 256 --window 32 + +use std::path::Path; +use std::time::Instant; + +use anyhow::{Context, Result}; +use clap::Parser; +use log::debug; +use openinfer::logging; +use openinfer::scheduler::SchedulerHandle; +use openinfer::server_engine::{ModelType, detect_model_type}; +use openinfer_core::engine::{EngineLoadOptions, EpBackend}; +#[cfg(feature = "kimi-k2")] +use openinfer_core::parallel::ParallelConfig; +use openinfer_vllm_support::load_tokenizer as load_vllm_tokenizer; +use vllm_text::tokenizer::DynTokenizer; + +mod cli; +mod exec; +mod metrics; +mod prompt; +mod render; +mod report; +mod runners; +mod snapshot; +use cli::*; +use exec::*; +use metrics::*; +use runners::*; +use snapshot::*; + +fn command_seed(cli: &Cli) -> u64 { + match &cli.command { + Command::Request(args) => args.run.seed, + Command::Matrix(args) => args.run.seed, + Command::Curve(args) => args.run.seed, + Command::Snapshot(args) => args.run.seed, + Command::Compare(_) => 42, + } +} + +#[cfg(feature = "kimi-k2")] +fn kimi_parallel_config(tp_size: usize, dp_size: usize) -> Result<ParallelConfig> { + anyhow::ensure!(tp_size > 0, "--tp-size must be positive"); + anyhow::ensure!(dp_size > 0, "--dp-size must be positive"); + Ok(ParallelConfig::new(tp_size, dp_size)) +} + +fn dispatch( + cli: &Cli, + model_type: ModelType, + load_ms: f64, + cuda_graph: bool, + model: &mut dyn BenchModel, + tokenizer: &DynTokenizer, +) -> Result<()> { 
+ if let Command::Snapshot(args) = &cli.command { + run_snapshot(model, cli, model_type, args) + } else { + let report = run_command(cli, model_type, load_ms, cuda_graph, model, tokenizer)?; + emit_report(cli, &report) + } +} + +fn main() -> Result<()> { + logging::init_default(); + + let cli = Cli::parse(); + + // Compare needs no model loading + if let Command::Compare(ref args) = cli.command { + return run_compare(args); + } + + debug!( + "bench_serving starting: command={} model_path={} cuda_graph={} format={:?}", + match &cli.command { + Command::Request(_) => "request", + Command::Matrix(_) => "matrix", + Command::Curve(_) => "curve", + Command::Snapshot(_) => "snapshot", + Command::Compare(_) => "compare", + }, + cli.model_path, + cli.cuda_graph, + cli.format + ); + let model_type = detect_model_type(&cli.model_path) + .with_context(|| format!("failed to detect model type from {}", cli.model_path))?; + debug!("Detected model type: {:?}", model_type); + let load_start = Instant::now(); + + // Shared tail for every scheduler-backed model: load the tokenizer, stamp + // the elapsed load time, wrap the handle, and dispatch. The per-model arms + // below differ only in how they construct the engine handle. + let finish = |handle: SchedulerHandle, cuda_graph: bool| -> Result<()> { + let tokenizer = load_vllm_tokenizer(&cli.model_path)?; + let load_ms = dur_ms(load_start.elapsed()); + let mut bench = SchedulerBenchModel { handle }; + dispatch( + &cli, model_type, load_ms, cuda_graph, &mut bench, &tokenizer, + ) + }; + + match model_type { + #[cfg(feature = "deepseek-v2-lite")] + ModelType::DeepSeekV2Lite => { + // Distinct bench type (not scheduler-backed), so it keeps its own tail. 
+ let generator = openinfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator::load( + Path::new(&cli.model_path), + EngineLoadOptions { + enable_cuda_graph: false, + enable_prefill_profile: false, + device_ordinals: vec![0, 1], + parallel_config: None, + ep_backend: EpBackend::Nccl, + seed: command_seed(&cli), + }, + )?; + let tokenizer = load_vllm_tokenizer(&cli.model_path)?; + let load_ms = dur_ms(load_start.elapsed()); + let mut bench = DeepSeekV2LiteBenchModel { generator }; + dispatch(&cli, model_type, load_ms, false, &mut bench, &tokenizer) + } + #[cfg(feature = "deepseek-v4")] + ModelType::DeepSeekV4 => { + let handle = openinfer_deepseek_v4::start_engine( + Path::new(&cli.model_path), + EngineLoadOptions { + enable_cuda_graph: false, + enable_prefill_profile: false, + device_ordinals: (0..8).collect(), + parallel_config: None, + ep_backend: EpBackend::Nccl, + seed: command_seed(&cli), + }, + )?; + finish(handle, false) + } + #[cfg(feature = "kimi-k2")] + ModelType::KimiK2 => { + let parallel = kimi_parallel_config(cli.tp_size, cli.dp_size)?; + let handle = openinfer_kimi_k2::start_engine( + Path::new(&cli.model_path), + EngineLoadOptions { + enable_cuda_graph: cli.cuda_graph, + enable_prefill_profile: false, + device_ordinals: (0..parallel.ep_world()).collect(), + parallel_config: Some(parallel), + ep_backend: cli.ep_backend.into(), + seed: command_seed(&cli), + }, + )?; + finish(handle, cli.cuda_graph) + } + #[cfg(feature = "qwen3-4b")] + ModelType::Qwen3 => { + let handle = openinfer_qwen3_4b::start_engine( + Path::new(&cli.model_path), + EngineLoadOptions { + enable_cuda_graph: cli.cuda_graph, + enable_prefill_profile: false, + device_ordinals: vec![0], + parallel_config: None, + ep_backend: EpBackend::Nccl, + seed: command_seed(&cli), + }, + )?; + finish(handle, cli.cuda_graph) + } + #[cfg(feature = "qwen35-4b")] + ModelType::Qwen35 => { + let handle = openinfer_qwen35_4b::start_engine_with_capacity( + Path::new(&cli.model_path), + EngineLoadOptions { + 
enable_cuda_graph: cli.cuda_graph, + enable_prefill_profile: false, + device_ordinals: vec![0], + parallel_config: None, + ep_backend: EpBackend::Nccl, + seed: command_seed(&cli), + }, + 4, + )?; + finish(handle, cli.cuda_graph) + } + } +} + +#[cfg(all(test, feature = "deepseek-v2-lite"))] +mod tests { + use std::time::Duration; + + use openinfer::sampler::SamplingParams; + + use super::*; + + #[test] + fn dsv2_lite_sampling_contract_accepts_bench_params() { + let sampling = SamplingParams { + ignore_eos: true, + ..SamplingParams::default() + }; + + assert_dsv2_lite_sampling_contract(&sampling); + } + + #[test] + #[should_panic(expected = "supports greedy decoding only")] + fn dsv2_lite_sampling_contract_rejects_non_greedy_params() { + let sampling = SamplingParams { + temperature: 0.8, + top_k: -1, + top_p: 0.95, + ignore_eos: true, + }; + + assert_dsv2_lite_sampling_contract(&sampling); + } + + #[test] + #[should_panic(expected = "requires ignore_eos=true")] + fn dsv2_lite_sampling_contract_rejects_eos_enabled_params() { + let sampling = SamplingParams { + ignore_eos: false, + ..SamplingParams::default() + }; + + assert_dsv2_lite_sampling_contract(&sampling); + } + + #[test] + fn dsv2_lite_attribution_timings_preserve_decode_steps() { + let timings = timings_from_dsv2_lite_attribution( + vec![11, 304, 608], + 3, + 60_000, + Some(20_000), + &[19_000, 18_000], + ); + + assert_eq!(timings.ttft, Duration::from_micros(20_000)); + assert_eq!( + timings.tbt, + vec![Duration::from_micros(19_000), Duration::from_micros(18_000)] + ); + assert_eq!(timings.total, Duration::from_micros(60_000)); + assert_eq!(timings.emitted_tokens, 3); + assert_eq!(timings.generated_tokens, vec![11, 304, 608]); + assert_eq!(timings.decode_tokens_for_rate, 2); + assert_eq!(timings.decode_time_for_rate, Duration::from_micros(37_000)); + } + + #[test] + fn dsv2_lite_batched_timings_use_shared_decode_time_for_rate() { + let timings = timings_from_dsv2_lite_batched_generation( + 
openinfer_deepseek_v2_lite::BatchedGenerationResult { + tokens: vec![vec![11, 304, 608], vec![11, 304, 608]], + prefill_next_token_us: vec![20_000, 21_000], + per_token_decode_us: vec![19_000, 18_000], + total_generation_us: 80_000, + stats: openinfer_deepseek_v2_lite::GenerationStats::default(), + }, + 3, + ); + + assert_eq!(timings.len(), 2); + assert_eq!(timings[0].decode_tokens_for_rate, 4); + assert_eq!( + timings[0].decode_time_for_rate, + Duration::from_micros(37_000) + ); + assert_eq!(timings[1].decode_tokens_for_rate, 0); + assert_eq!(timings[1].decode_time_for_rate, Duration::ZERO); + + let metrics = build_request_metrics(&timings); + assert_eq!(metrics.steady_tpot_ms.unwrap().p50_ms, 18.0); + assert!( + metrics.decode_tok_s.unwrap() > 100.0, + "batched decode tok/s should use one shared step duration instead of duplicating it per row" + ); + } + + #[test] + #[should_panic(expected = "timing count mismatch")] + fn dsv2_lite_attribution_timings_fail_on_missing_decode_samples() { + let _ = timings_from_dsv2_lite_attribution( + vec![11, 304, 608], + 3, + 60_000, + Some(20_000), + &[19_000], + ); + } + + #[test] + #[should_panic(expected = "generated token count mismatch")] + fn dsv2_lite_attribution_timings_fail_on_short_generation() { + let _ = + timings_from_dsv2_lite_attribution(vec![11, 304], 3, 60_000, Some(20_000), &[19_000]); + } + + #[test] + #[should_panic(expected = "zero-duration")] + fn dsv2_lite_attribution_timings_fail_on_zero_decode_samples() { + let _ = timings_from_dsv2_lite_attribution(vec![11, 304], 2, 60_000, Some(20_000), &[0]); + } + + #[test] + #[should_panic(expected = "total generation timing is zero")] + fn dsv2_lite_attribution_timings_fail_on_zero_total_generation() { + let _ = timings_from_dsv2_lite_attribution(vec![11, 304], 2, 0, Some(20_000), &[19_000]); + } + + #[test] + #[should_panic(expected = "TTFT timing is missing or zero")] + fn dsv2_lite_attribution_timings_fail_on_missing_ttft() { + let _ = 
timings_from_dsv2_lite_attribution(vec![11, 304], 2, 60_000, None, &[19_000]); + } + + #[test] + #[should_panic(expected = "TTFT timing is missing or zero")] + fn dsv2_lite_attribution_timings_fail_on_zero_ttft() { + let _ = timings_from_dsv2_lite_attribution(vec![11, 304], 2, 60_000, Some(0), &[19_000]); + } +} diff --git a/openinfer-server/src/bin/bench_serving/metrics.rs b/openinfer-server/src/bin/bench_serving/metrics.rs new file mode 100644 index 00000000..98a1ab02 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/metrics.rs @@ -0,0 +1,74 @@ +//! Numeric summarization: percentiles, duration/count stats, token traces. + +use std::time::Duration; + +use crate::report::{CountStats, DurationStats, GeneratedTokenTrace}; + +pub(crate) fn dur_ms(d: Duration) -> f64 { + d.as_secs_f64() * 1000.0 +} + +pub(crate) fn percentiles( + sorted: &[Duration], +) -> (Duration, Duration, Duration, Duration, Duration) { + assert!(!sorted.is_empty()); + let n = sorted.len(); + let sum: Duration = sorted.iter().sum(); + let avg = sum / n as u32; + let p = |pct: f64| sorted[((pct / 100.0) * (n - 1) as f64).round() as usize]; + (avg, p(50.0), p(95.0), p(99.0), sorted[n - 1]) +} + +pub(crate) fn summarize_durations(samples: &[Duration]) -> DurationStats { + let mut sorted = samples.to_vec(); + sorted.sort(); + let (avg, p50, p95, p99, max) = percentiles(&sorted); + DurationStats { + avg_ms: dur_ms(avg), + p50_ms: dur_ms(p50), + p95_ms: dur_ms(p95), + p99_ms: dur_ms(p99), + max_ms: dur_ms(max), + samples: sorted.len(), + } +} + +pub(crate) fn summarize_counts(samples: &[usize]) -> CountStats { + assert!(!samples.is_empty()); + let min = *samples.iter().min().unwrap(); + let max = *samples.iter().max().unwrap(); + let sum: usize = samples.iter().sum(); + CountStats { + min, + max, + avg: sum as f64 / samples.len() as f64, + samples: samples.len(), + } +} + +pub(crate) fn aggregate_tok_s(tokens: usize, total: Duration) -> Option { + if tokens == 0 || total.is_zero() { + None + } 
else { + Some(tokens as f64 / total.as_secs_f64()) + } +} + +pub(crate) fn generated_token_hash(tokens: &[u32]) -> String { + let mut hash = 0xcbf2_9ce4_8422_2325_u64; + for token in tokens { + for byte in token.to_le_bytes() { + hash ^= u64::from(byte); + hash = hash.wrapping_mul(0x0100_0000_01b3); + } + } + format!("{hash:016x}") +} + +pub(crate) fn generated_token_trace(tokens: &[u32]) -> GeneratedTokenTrace { + GeneratedTokenTrace { + hash: generated_token_hash(tokens), + prefix: tokens.iter().copied().take(16).collect(), + len: tokens.len(), + } +} diff --git a/openinfer-server/src/bin/bench_serving/prompt.rs b/openinfer-server/src/bin/bench_serving/prompt.rs new file mode 100644 index 00000000..a8ae5fb6 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/prompt.rs @@ -0,0 +1,124 @@ +//! Prompt resolution: inline/file/synthetic inputs and per-request salting. + +use std::fs; + +use anyhow::{Context, Result, ensure}; +use rand::rngs::StdRng; +use rand::{RngExt, SeedableRng}; +use vllm_text::tokenizer::DynTokenizer; + +use crate::cli::PromptInputArgs; +use crate::report::PromptDescriptor; + +pub(crate) const SYNTHETIC_PATTERN: &str = "token_id = 100 + (idx % 1000)"; + +pub(crate) fn truncate_preview(text: &str, limit: usize) -> String { + let one_line = text.replace('\n', "\\n"); + if one_line.chars().count() <= limit { + return one_line; + } + let mut truncated = String::new(); + for ch in one_line.chars().take(limit) { + truncated.push(ch); + } + truncated.push_str("..."); + truncated +} + +pub(crate) fn synthetic_prompt_tokens(len: usize) -> Vec { + (0..len).map(|i| ((i % 1000) + 100) as u32).collect() +} + +/// Token-id bounds for synthetic concurrent prompts: above the low special +/// tokens and well under the smallest supported vocab (DeepSeek-V2-Lite ≈ +/// 102 400), so every drawn id is an ordinary token on any model line. 
+pub(crate) const SYNTHETIC_TOKEN_LO: u32 = 100; +pub(crate) const SYNTHETIC_TOKEN_HI: u32 = 100_000; + +/// One synthetic prompt of `len` random tokens, seeded per request so the +/// concurrent decode streams diverge. Identical concurrent prompts route a MoE +/// batch onto a narrow expert set, packing the Marlin expert GEMM into fat +/// tiles and under-measuring decode TPOT by ~7–15% (measured on Kimi-K2 via a +/// `--distinct-prompts` sweep; the bench trap behind the misread #225 "+51% +/// HTTP" gap). Distinct prompts exercise realistic broad expert routing. See +/// docs/lessons/moe-bench-prompt-diversity.md. +pub(crate) fn synthetic_random_prompt(len: usize, seed: u64, request_idx: usize) -> Vec { + let mut rng = + StdRng::seed_from_u64(seed ^ (request_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)); + (0..len) + .map(|_| rng.random_range(SYNTHETIC_TOKEN_LO..SYNTHETIC_TOKEN_HI)) + .collect() +} + +#[derive(Debug, Clone)] +pub(crate) struct PromptSpec { + pub(crate) descriptor: PromptDescriptor, + pub(crate) tokens: Vec, +} + +pub(crate) fn resolve_prompt_input( + args: &PromptInputArgs, + tokenizer: &DynTokenizer, + default_text: Option<&str>, + default_prompt_len: Option, +) -> Result { + match (&args.prompt, &args.prompt_file, args.prompt_len) { + (Some(prompt), None, None) => Ok(PromptSpec { + descriptor: PromptDescriptor { + source: "text".to_string(), + prompt_tokens: tokenizer.encode(prompt, false)?.len(), + prompt_preview: Some(truncate_preview(prompt, 96)), + }, + tokens: tokenizer.encode(prompt, false)?, + }), + (None, Some(path), None) => { + let prompt = fs::read_to_string(path) + .with_context(|| format!("failed to read prompt file: {path}"))?; + let tokens = tokenizer.encode(&prompt, false)?; + Ok(PromptSpec { + descriptor: PromptDescriptor { + source: format!("file:{path}"), + prompt_tokens: tokens.len(), + prompt_preview: Some(truncate_preview(&prompt, 96)), + }, + tokens, + }) + } + (None, None, Some(prompt_len)) => { + ensure!(prompt_len > 
0, "--prompt-len must be > 0"); + Ok(PromptSpec { + descriptor: PromptDescriptor { + source: format!("synthetic:{SYNTHETIC_PATTERN}"), + prompt_tokens: prompt_len, + prompt_preview: None, + }, + tokens: synthetic_prompt_tokens(prompt_len), + }) + } + (None, None, None) => { + if let Some(prompt) = default_text { + let tokens = tokenizer.encode(prompt, false)?; + Ok(PromptSpec { + descriptor: PromptDescriptor { + source: "text".to_string(), + prompt_tokens: tokens.len(), + prompt_preview: Some(truncate_preview(prompt, 96)), + }, + tokens, + }) + } else if let Some(prompt_len) = default_prompt_len { + Ok(PromptSpec { + descriptor: PromptDescriptor { + source: format!("synthetic:{SYNTHETIC_PATTERN}"), + prompt_tokens: prompt_len, + prompt_preview: None, + }, + tokens: synthetic_prompt_tokens(prompt_len), + }) + } else { + unreachable!("default prompt source must be provided"); + } + } + _ => unreachable!("clap enforces prompt input conflicts"), + } +} diff --git a/openinfer-server/src/bin/bench_serving/render.rs b/openinfer-server/src/bin/bench_serving/render.rs new file mode 100644 index 00000000..cad733e9 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/render.rs @@ -0,0 +1,295 @@ +//! comfy_table renderers for the text report format. 
+ +use std::io::{IsTerminal, stdout}; + +use comfy_table::modifiers::UTF8_ROUND_CORNERS; +use comfy_table::presets::{ASCII_FULL_CONDENSED, UTF8_FULL_CONDENSED}; +use comfy_table::{Cell, CellAlignment, Table}; + +use crate::report::{CurveReport, DurationStats, MatrixReport, RequestReport, RunInfo}; + +pub(crate) fn new_table() -> Table { + let mut table = Table::new(); + if stdout().is_terminal() { + table.load_preset(UTF8_FULL_CONDENSED); + table.apply_modifier(UTF8_ROUND_CORNERS); + } else { + table.load_preset(ASCII_FULL_CONDENSED); + } + table +} + +pub(crate) fn key_cell(label: impl Into) -> Cell { + Cell::new(label.into()) +} + +pub(crate) fn value_cell(value: impl Into) -> Cell { + Cell::new(value.into()) +} + +pub(crate) fn numeric_cell(value: impl Into) -> Cell { + Cell::new(value.into()).set_alignment(CellAlignment::Right) +} + +pub(crate) fn format_rate(value: Option) -> String { + value.map_or_else(|| "-".to_string(), |v| format!("{v:.2}")) +} + +pub(crate) fn format_duration_ms(value: f64) -> String { + format!("{value:.2}") +} + +pub(crate) fn format_count_avg(value: f64) -> String { + format!("{value:.2}") +} + +pub(crate) fn push_table(out: &mut String, table: &Table) { + out.push_str(&table.to_string()); + out.push('\n'); +} + +pub(crate) fn render_run_summary(report: &RunInfo) -> Table { + let mut table = new_table(); + table.add_row(vec![ + key_cell("model"), + value_cell(format!("{} ({})", report.model_path, report.model_type)), + ]); + table.add_row(vec![ + key_cell("cuda_graph"), + value_cell(report.cuda_graph.to_string()), + ]); + table.add_row(vec![ + key_cell("load_ms"), + numeric_cell(format_duration_ms(report.load_ms)), + ]); + if let Some(label) = &report.label { + table.add_row(vec![key_cell("label"), value_cell(label.clone())]); + } + table +} + +pub(crate) fn render_request_meta(report: &RequestReport) -> Table { + let mut table = render_run_summary(&report.run); + table.add_row(vec![ + key_cell("prompt_source"), + 
value_cell(report.workload.prompt.source.clone()), + ]); + table.add_row(vec![ + key_cell("prompt_tokens"), + numeric_cell(report.workload.prompt.prompt_tokens.to_string()), + ]); + if let Some(preview) = &report.workload.prompt.prompt_preview { + table.add_row(vec![ + key_cell("prompt"), + value_cell(format!("\"{preview}\"")), + ]); + } + table.add_row(vec![ + key_cell("output_len"), + numeric_cell(report.workload.output_len.to_string()), + ]); + table.add_row(vec![ + key_cell("warmup / iters"), + value_cell(format!( + "{} / {}", + report.workload.warmup, report.workload.iters + )), + ]); + table.add_row(vec![ + key_cell("seed"), + numeric_cell(report.workload.seed.to_string()), + ]); + table +} + +pub(crate) fn render_duration_table(rows: Vec<(String, DurationStats)>) -> Table { + let mut table = new_table(); + table.set_header(vec![ + Cell::new("metric"), + Cell::new("avg_ms").set_alignment(CellAlignment::Right), + Cell::new("p50_ms").set_alignment(CellAlignment::Right), + Cell::new("p95_ms").set_alignment(CellAlignment::Right), + Cell::new("p99_ms").set_alignment(CellAlignment::Right), + Cell::new("max_ms").set_alignment(CellAlignment::Right), + Cell::new("samples").set_alignment(CellAlignment::Right), + ]); + for (label, stats) in rows { + table.add_row(vec![ + key_cell(label), + numeric_cell(format_duration_ms(stats.avg_ms)), + numeric_cell(format_duration_ms(stats.p50_ms)), + numeric_cell(format_duration_ms(stats.p95_ms)), + numeric_cell(format_duration_ms(stats.p99_ms)), + numeric_cell(format_duration_ms(stats.max_ms)), + numeric_cell(stats.samples.to_string()), + ]); + } + table +} + +pub(crate) fn render_request_summary(report: &RequestReport) -> Table { + let mut table = new_table(); + table.set_header(vec![ + Cell::new("metric"), + Cell::new("value").set_alignment(CellAlignment::Right), + ]); + table.add_row(vec![ + key_cell("generated_tokens_avg"), + numeric_cell(format_count_avg(report.metrics.generated_tokens.avg)), + ]); + table.add_row(vec![ + 
key_cell("generated_tokens_min"), + numeric_cell(report.metrics.generated_tokens.min.to_string()), + ]); + table.add_row(vec![ + key_cell("generated_tokens_max"), + numeric_cell(report.metrics.generated_tokens.max.to_string()), + ]); + table.add_row(vec![ + key_cell("generated_token_runs"), + numeric_cell(report.metrics.generated_tokens.samples.to_string()), + ]); + table.add_row(vec![ + key_cell("request_tok_s"), + numeric_cell(format_rate(report.metrics.request_tok_s)), + ]); + table.add_row(vec![ + key_cell("decode_tok_s"), + numeric_cell(format_rate(report.metrics.decode_tok_s)), + ]); + table +} + +pub(crate) fn render_matrix_meta(report: &MatrixReport) -> Table { + let mut table = render_run_summary(&report.run); + table.add_row(vec![ + key_cell("prompt_lens"), + value_cell( + report + .workload + .prompt_lens + .iter() + .map(std::string::ToString::to_string) + .collect::>() + .join(","), + ), + ]); + table.add_row(vec![ + key_cell("output_lens"), + value_cell( + report + .workload + .output_lens + .iter() + .map(std::string::ToString::to_string) + .collect::>() + .join(","), + ), + ]); + table.add_row(vec![ + key_cell("synthetic_pattern"), + value_cell(report.workload.synthetic_pattern), + ]); + table.add_row(vec![ + key_cell("warmup / iters"), + value_cell(format!( + "{} / {}", + report.workload.warmup, report.workload.iters + )), + ]); + table.add_row(vec![ + key_cell("seed"), + numeric_cell(report.workload.seed.to_string()), + ]); + table +} + +pub(crate) fn render_matrix_table(report: &MatrixReport) -> Table { + let mut table = new_table(); + table.set_header(vec![ + Cell::new("prompt_tok").set_alignment(CellAlignment::Right), + Cell::new("output_tok").set_alignment(CellAlignment::Right), + Cell::new("ttft_avg").set_alignment(CellAlignment::Right), + Cell::new("ttft_p95").set_alignment(CellAlignment::Right), + Cell::new("e2e_avg").set_alignment(CellAlignment::Right), + Cell::new("req_tok/s").set_alignment(CellAlignment::Right), + 
Cell::new("decode_tok/s").set_alignment(CellAlignment::Right), + Cell::new("gen_avg").set_alignment(CellAlignment::Right), + ]); + for cell in &report.cells { + table.add_row(vec![ + numeric_cell(cell.prompt_len.to_string()), + numeric_cell(cell.output_len.to_string()), + numeric_cell(format_duration_ms(cell.ttft_ms.avg_ms)), + numeric_cell(format_duration_ms(cell.ttft_ms.p95_ms)), + numeric_cell(format_duration_ms(cell.e2e_ms.avg_ms)), + numeric_cell(format_rate(cell.request_tok_s)), + numeric_cell(format_rate(cell.decode_tok_s)), + numeric_cell(format_count_avg(cell.generated_tokens.avg)), + ]); + } + table +} + +pub(crate) fn render_curve_meta(report: &CurveReport) -> Table { + let mut table = render_run_summary(&report.run); + table.add_row(vec![ + key_cell("prompt_source"), + value_cell(report.workload.prompt.source.clone()), + ]); + table.add_row(vec![ + key_cell("prompt_tokens"), + numeric_cell(report.workload.prompt.prompt_tokens.to_string()), + ]); + if let Some(preview) = &report.workload.prompt.prompt_preview { + table.add_row(vec![ + key_cell("prompt"), + value_cell(format!("\"{preview}\"")), + ]); + } + table.add_row(vec![ + key_cell("output_len"), + numeric_cell(report.workload.output_len.to_string()), + ]); + table.add_row(vec![ + key_cell("window"), + numeric_cell(report.workload.window.to_string()), + ]); + table.add_row(vec![ + key_cell("warmup / iters"), + value_cell(format!( + "{} / {}", + report.workload.warmup, report.workload.iters + )), + ]); + table.add_row(vec![ + key_cell("seed"), + numeric_cell(report.workload.seed.to_string()), + ]); + table +} + +pub(crate) fn render_curve_table(report: &CurveReport) -> Table { + let mut table = new_table(); + table.set_header(vec![ + Cell::new("ctx_range"), + Cell::new("avg_ms").set_alignment(CellAlignment::Right), + Cell::new("p50_ms").set_alignment(CellAlignment::Right), + Cell::new("p95_ms").set_alignment(CellAlignment::Right), + Cell::new("p99_ms").set_alignment(CellAlignment::Right), + 
Cell::new("tok/s").set_alignment(CellAlignment::Right), + Cell::new("samples").set_alignment(CellAlignment::Right), + ]); + for window in &report.windows { + table.add_row(vec![ + value_cell(format!("{}-{}", window.ctx_start, window.ctx_end)), + numeric_cell(format_duration_ms(window.tpot_ms.avg_ms)), + numeric_cell(format_duration_ms(window.tpot_ms.p50_ms)), + numeric_cell(format_duration_ms(window.tpot_ms.p95_ms)), + numeric_cell(format_duration_ms(window.tpot_ms.p99_ms)), + numeric_cell(format_rate(window.decode_tok_s)), + numeric_cell(window.tpot_ms.samples.to_string()), + ]); + } + table +} diff --git a/openinfer-server/src/bin/bench_serving/report.rs b/openinfer-server/src/bin/bench_serving/report.rs new file mode 100644 index 00000000..d6598a67 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/report.rs @@ -0,0 +1,171 @@ +//! Serializable report and metric types emitted by the benchmark runners. + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct RunInfo { + pub(crate) command: &'static str, + pub(crate) model_path: String, + pub(crate) model_type: String, + pub(crate) cuda_graph: bool, + pub(crate) load_ms: f64, + pub(crate) label: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct PromptDescriptor { + pub(crate) source: String, + pub(crate) prompt_tokens: usize, + pub(crate) prompt_preview: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct DurationStats { + pub(crate) avg_ms: f64, + pub(crate) p50_ms: f64, + pub(crate) p95_ms: f64, + pub(crate) p99_ms: f64, + pub(crate) max_ms: f64, + pub(crate) samples: usize, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct CountStats { + pub(crate) min: usize, + pub(crate) max: usize, + pub(crate) avg: f64, + pub(crate) samples: usize, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct GeneratedTokenTrace { + pub(crate) hash: String, + pub(crate) prefix: Vec, + pub(crate) 
len: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct RequestWorkload { + pub(crate) prompt: PromptDescriptor, + pub(crate) output_len: usize, + pub(crate) concurrency: usize, + pub(crate) warmup: usize, + pub(crate) iters: usize, + pub(crate) seed: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct RequestMetrics { + pub(crate) ttft_ms: DurationStats, + pub(crate) first_decode_step_ms: Option, + pub(crate) steady_tpot_ms: Option, + pub(crate) e2e_ms: DurationStats, + pub(crate) generated_tokens: CountStats, + #[serde(default)] + pub(crate) generated_token_traces: Vec, + pub(crate) request_tok_s: Option, + pub(crate) decode_tok_s: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct RequestIterationTiming { + pub(crate) index: usize, + pub(crate) ttft_ms: f64, + pub(crate) first_decode_step_ms: Option, + pub(crate) steady_tpot_ms: Option, + pub(crate) e2e_ms: f64, + pub(crate) generated_tokens: usize, + pub(crate) generated_token_trace: GeneratedTokenTrace, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct SnapshotProfile { + pub(crate) prompt_len: usize, + pub(crate) output_len: usize, + pub(crate) metrics: RequestMetrics, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct SnapshotReport { + pub(crate) commit: String, + pub(crate) date: String, + pub(crate) model: String, + pub(crate) gpu: String, + /// Parallel layout the snapshot was measured under (e.g. "tp1-dp8-deepep"). + /// Absent in snapshots that predate multi-GPU model lines. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) parallel: Option, + pub(crate) prefill_heavy: SnapshotProfile, + pub(crate) decode_heavy: SnapshotProfile, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct RequestReport { + pub(crate) run: RunInfo, + pub(crate) workload: RequestWorkload, + pub(crate) metrics: RequestMetrics, + pub(crate) iterations: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct MatrixWorkload { + pub(crate) prompt_lens: Vec, + pub(crate) output_lens: Vec, + pub(crate) warmup: usize, + pub(crate) iters: usize, + pub(crate) seed: u64, + pub(crate) synthetic_pattern: &'static str, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct MatrixCell { + pub(crate) prompt_len: usize, + pub(crate) output_len: usize, + pub(crate) ttft_ms: DurationStats, + pub(crate) e2e_ms: DurationStats, + pub(crate) first_decode_step_ms: Option, + pub(crate) steady_tpot_ms: Option, + pub(crate) generated_tokens: CountStats, + pub(crate) request_tok_s: Option, + pub(crate) decode_tok_s: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct MatrixReport { + pub(crate) run: RunInfo, + pub(crate) workload: MatrixWorkload, + pub(crate) cells: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct CurveWorkload { + pub(crate) prompt: PromptDescriptor, + pub(crate) output_len: usize, + pub(crate) window: usize, + pub(crate) warmup: usize, + pub(crate) iters: usize, + pub(crate) seed: u64, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct CurveWindow { + pub(crate) ctx_start: usize, + pub(crate) ctx_end: usize, + pub(crate) tpot_ms: DurationStats, + pub(crate) decode_tok_s: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct CurveReport { + pub(crate) run: RunInfo, + pub(crate) workload: CurveWorkload, + pub(crate) windows: Vec, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub(crate) enum BenchReport { + Request(Box), + 
Matrix(MatrixReport), + Curve(CurveReport), +} diff --git a/openinfer-server/src/bin/bench_serving/runners.rs b/openinfer-server/src/bin/bench_serving/runners.rs new file mode 100644 index 00000000..b3458cc4 --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/runners.rs @@ -0,0 +1,452 @@ +//! Benchmark drivers for the request / matrix / curve commands: +//! timing collection, metric assembly, and report emission. + +use std::fmt::Write as _; +use std::fs; +use std::time::Duration; + +use anyhow::{Context, Result, ensure}; +use cudarc::driver::Profiler; +use cudarc::runtime::result::device as cuda_device; +use log::{debug, info}; +use openinfer::sampler::SamplingParams; +use openinfer::server_engine::ModelType; +use rand::SeedableRng; +use rand::rngs::StdRng; +use vllm_text::tokenizer::DynTokenizer; + +use crate::cli::*; +use crate::exec::*; +use crate::metrics::*; +use crate::prompt::*; +use crate::render::*; +use crate::report::*; + +pub(crate) const DEFAULT_REQUEST_PROMPT: &str = "Tell me a story"; +pub(crate) const DEFAULT_CURVE_PROMPT_LEN: usize = 512; + +pub(crate) fn normalize_sizes(values: &[usize], flag: &str) -> Result> { + ensure!(!values.is_empty(), "{flag} must not be empty"); + ensure!(values.iter().all(|v| *v > 0), "{flag} values must be > 0"); + let mut normalized = values.to_vec(); + normalized.sort_unstable(); + normalized.dedup(); + Ok(normalized) +} + +pub(crate) fn validate_run_args(args: &RunArgs) -> Result<()> { + ensure!(args.iters > 0, "--iters must be > 0"); + Ok(()) +} + +pub(crate) fn measure_timings( + model: &mut dyn BenchModel, + prompts: &[Vec], + output_len: usize, + run: &RunArgs, + cuda_profiler_capture: bool, +) -> Result> { + ensure!(output_len > 0, "--output-len must be > 0"); + ensure!(!prompts.is_empty(), "concurrency must be > 0"); + model.validate_concurrency(prompts.len())?; + validate_run_args(run)?; + + let sampling = SamplingParams { + ignore_eos: true, + ..SamplingParams::default() + }; + let mut rng = 
StdRng::seed_from_u64(run.seed); + + for _ in 0..run.warmup { + let _ = model.timed_generation_batch(prompts, output_len, &sampling, &mut rng); + } + + let profiler = if cuda_profiler_capture { + info!( + "Starting CUDA profiler capture around {} measured iterations", + run.iters + ); + cuda_device::set(0).context("failed to set CUDA device before profiler capture")?; + Some(Profiler::new().context("failed to start CUDA profiler capture")?) + } else { + None + }; + + let mut timings = Vec::with_capacity(run.iters * prompts.len()); + for _ in 0..run.iters { + timings.extend(model.timed_generation_batch(prompts, output_len, &sampling, &mut rng)); + } + drop(profiler); + Ok(timings) +} + +pub(crate) fn build_request_metrics(timings: &[GenTimings]) -> RequestMetrics { + let ttfts: Vec = timings.iter().map(|t| t.ttft).collect(); + let e2e: Vec = timings.iter().map(|t| t.total).collect(); + let first_steps: Vec = timings + .iter() + .filter_map(|t| t.tbt.first().copied()) + .collect(); + let steady: Vec = timings + .iter() + .flat_map(|t| t.tbt.iter().skip(1).copied()) + .collect(); + let generated: Vec = timings.iter().map(|t| t.emitted_tokens).collect(); + let generated_token_traces: Vec = timings + .iter() + .map(|timing| generated_token_trace(&timing.generated_tokens)) + .collect(); + + let total_emitted: usize = timings.iter().map(|t| t.emitted_tokens).sum(); + let total_request_time: Duration = timings.iter().map(|t| t.total).sum(); + let total_decode_steps: usize = timings.iter().map(|t| t.decode_tokens_for_rate).sum(); + let total_decode_time: Duration = timings.iter().map(|t| t.decode_time_for_rate).sum(); + + RequestMetrics { + ttft_ms: summarize_durations(&ttfts), + first_decode_step_ms: (!first_steps.is_empty()).then(|| summarize_durations(&first_steps)), + steady_tpot_ms: (!steady.is_empty()).then(|| summarize_durations(&steady)), + e2e_ms: summarize_durations(&e2e), + generated_tokens: summarize_counts(&generated), + generated_token_traces, + request_tok_s: 
aggregate_tok_s(total_emitted, total_request_time), + decode_tok_s: aggregate_tok_s(total_decode_steps, total_decode_time), + } +} + +pub(crate) fn build_request_iterations(timings: &[GenTimings]) -> Vec { + timings + .iter() + .enumerate() + .map(|(index, timing)| { + let steady: Vec = timing.tbt.iter().skip(1).copied().collect(); + RequestIterationTiming { + index, + ttft_ms: dur_ms(timing.ttft), + first_decode_step_ms: timing.tbt.first().copied().map(dur_ms), + steady_tpot_ms: (!steady.is_empty()).then(|| summarize_durations(&steady)), + e2e_ms: dur_ms(timing.total), + generated_tokens: timing.emitted_tokens, + generated_token_trace: generated_token_trace(&timing.generated_tokens), + } + }) + .collect() +} + +pub(crate) fn run_info( + cli: &Cli, + command: &'static str, + model_type: ModelType, + load_ms: f64, + cuda_graph: bool, +) -> RunInfo { + RunInfo { + command, + model_path: cli.model_path.clone(), + model_type: format!("{model_type:?}"), + cuda_graph, + load_ms, + label: cli.label.clone(), + } +} + +pub(crate) fn bench_request( + model: &mut dyn BenchModel, + tokenizer: &DynTokenizer, + cli: &Cli, + model_type: ModelType, + load_ms: f64, + cuda_graph: bool, + args: &RequestArgs, +) -> Result { + let mut prompt = resolve_prompt_input( + &args.prompt_input, + tokenizer, + Some(DEFAULT_REQUEST_PROMPT), + None, + )?; + // A `--prompt-len` workload is synthetic: give every concurrent request a + // distinct seeded-random prompt so the decode streams diverge and MoE + // routing is realistic. An explicit `--prompt`/`--prompt-file` (or the + // default text) is the caller's chosen prompt and is replicated as-is. + let synthetic = args.prompt_input.prompt_len.is_some(); + let prompts: Vec> = if synthetic { + // 0 = one distinct prompt per request (fully diverse). Otherwise tile + // `distinct` unique prompts across the batch: idx → idx % distinct. 
+ let distinct = if args.distinct_prompts == 0 { + args.concurrency + } else { + args.distinct_prompts.min(args.concurrency) + }; + prompt.descriptor.source = format!( + "synthetic-random[{SYNTHETIC_TOKEN_LO}..{SYNTHETIC_TOKEN_HI}) seed={} distinct={distinct}/{}", + args.run.seed, args.concurrency + ); + (0..args.concurrency) + .map(|idx| synthetic_random_prompt(prompt.tokens.len(), args.run.seed, idx % distinct)) + .collect() + } else { + vec![prompt.tokens.clone(); args.concurrency] + }; + info!( + "Starting request benchmark: prompt_tokens={} output_len={} concurrency={} warmup={} iters={} seed={} source={}", + prompt.descriptor.prompt_tokens, + args.output_len, + args.concurrency, + args.run.warmup, + args.run.iters, + args.run.seed, + prompt.descriptor.source, + ); + let timings = measure_timings( + model, + &prompts, + args.output_len, + &args.run, + cli.cuda_profiler_capture, + )?; + Ok(BenchReport::Request(Box::new(RequestReport { + run: run_info(cli, "request", model_type, load_ms, cuda_graph), + workload: RequestWorkload { + prompt: prompt.descriptor, + output_len: args.output_len, + concurrency: args.concurrency, + warmup: args.run.warmup, + iters: args.run.iters, + seed: args.run.seed, + }, + metrics: build_request_metrics(&timings), + iterations: build_request_iterations(&timings), + }))) +} + +pub(crate) fn bench_matrix( + model: &mut dyn BenchModel, + cli: &Cli, + model_type: ModelType, + load_ms: f64, + cuda_graph: bool, + args: &MatrixArgs, +) -> Result { + validate_run_args(&args.run)?; + let prompt_lens = normalize_sizes(&args.prompt_lens, "--prompt-lens")?; + let output_lens = normalize_sizes(&args.output_lens, "--output-lens")?; + info!( + "Starting matrix benchmark: prompt_lens={:?} output_lens={:?} warmup={} iters={} seed={}", + prompt_lens, output_lens, args.run.warmup, args.run.iters, args.run.seed + ); + + let mut cells = Vec::with_capacity(prompt_lens.len() * output_lens.len()); + for &prompt_len in &prompt_lens { + let prompt_tokens = 
synthetic_prompt_tokens(prompt_len); + for &output_len in &output_lens { + debug!( + "Running matrix cell: prompt_len={} output_len={}", + prompt_len, output_len + ); + let timings = measure_timings( + model, + std::slice::from_ref(&prompt_tokens), + output_len, + &args.run, + cli.cuda_profiler_capture, + )?; + let metrics = build_request_metrics(&timings); + cells.push(MatrixCell { + prompt_len, + output_len, + ttft_ms: metrics.ttft_ms, + e2e_ms: metrics.e2e_ms, + first_decode_step_ms: metrics.first_decode_step_ms, + steady_tpot_ms: metrics.steady_tpot_ms, + generated_tokens: metrics.generated_tokens, + request_tok_s: metrics.request_tok_s, + decode_tok_s: metrics.decode_tok_s, + }); + } + } + + Ok(BenchReport::Matrix(MatrixReport { + run: run_info(cli, "matrix", model_type, load_ms, cuda_graph), + workload: MatrixWorkload { + prompt_lens, + output_lens, + warmup: args.run.warmup, + iters: args.run.iters, + seed: args.run.seed, + synthetic_pattern: SYNTHETIC_PATTERN, + }, + cells, + })) +} + +pub(crate) fn bench_curve( + model: &mut dyn BenchModel, + tokenizer: &DynTokenizer, + cli: &Cli, + model_type: ModelType, + load_ms: f64, + cuda_graph: bool, + args: &CurveArgs, +) -> Result { + ensure!(args.window > 0, "--window must be > 0"); + ensure!(args.output_len >= 2, "--output-len must be >= 2 for curve"); + + let prompt = resolve_prompt_input( + &args.prompt_input, + tokenizer, + None, + Some(DEFAULT_CURVE_PROMPT_LEN), + )?; + info!( + "Starting curve benchmark: prompt_tokens={} output_len={} window={} warmup={} iters={} seed={}", + prompt.descriptor.prompt_tokens, + args.output_len, + args.window, + args.run.warmup, + args.run.iters, + args.run.seed + ); + let timings = measure_timings( + model, + std::slice::from_ref(&prompt.tokens), + args.output_len, + &args.run, + cli.cuda_profiler_capture, + )?; + + let mut tbt_by_pos: Vec> = Vec::new(); + for timing in &timings { + for (idx, &duration) in timing.tbt.iter().enumerate() { + if idx >= tbt_by_pos.len() { + 
tbt_by_pos.push(Vec::with_capacity(args.run.iters)); + } + tbt_by_pos[idx].push(duration); + } + } + + let mut windows = Vec::new(); + let mut pos = 0usize; + while pos < tbt_by_pos.len() { + let end = (pos + args.window).min(tbt_by_pos.len()); + let mut samples = Vec::new(); + for bucket in &tbt_by_pos[pos..end] { + samples.extend_from_slice(bucket); + } + if !samples.is_empty() { + let stats = summarize_durations(&samples); + windows.push(CurveWindow { + ctx_start: prompt.descriptor.prompt_tokens + pos + 1, + ctx_end: prompt.descriptor.prompt_tokens + end, + decode_tok_s: (stats.avg_ms > 0.0).then(|| 1000.0 / stats.avg_ms), + tpot_ms: stats, + }); + } + pos = end; + } + + Ok(BenchReport::Curve(CurveReport { + run: run_info(cli, "curve", model_type, load_ms, cuda_graph), + workload: CurveWorkload { + prompt: prompt.descriptor, + output_len: args.output_len, + window: args.window, + warmup: args.run.warmup, + iters: args.run.iters, + seed: args.run.seed, + }, + windows, + })) +} + +pub(crate) fn render_text(report: &BenchReport) -> String { + let mut out = String::new(); + match report { + BenchReport::Request(report) => { + let _ = writeln!(out, "bench_serving request\n"); + push_table(&mut out, &render_request_meta(report)); + out.push('\n'); + push_table( + &mut out, + &render_duration_table( + std::iter::once(("ttft_ms".to_string(), report.metrics.ttft_ms.clone())) + .chain( + report + .metrics + .first_decode_step_ms + .clone() + .into_iter() + .map(|stats| ("first_decode_step_ms".to_string(), stats)), + ) + .chain( + report + .metrics + .steady_tpot_ms + .clone() + .into_iter() + .map(|stats| ("steady_tpot_ms".to_string(), stats)), + ) + .chain(std::iter::once(( + "e2e_ms".to_string(), + report.metrics.e2e_ms.clone(), + ))) + .collect(), + ), + ); + out.push('\n'); + push_table(&mut out, &render_request_summary(report)); + } + BenchReport::Matrix(report) => { + let _ = writeln!(out, "bench_serving matrix\n"); + push_table(&mut out, 
&render_matrix_meta(report)); + out.push('\n'); + push_table(&mut out, &render_matrix_table(report)); + } + BenchReport::Curve(report) => { + let _ = writeln!(out, "bench_serving curve\n"); + push_table(&mut out, &render_curve_meta(report)); + out.push('\n'); + push_table(&mut out, &render_curve_table(report)); + } + } + out +} + +pub(crate) fn emit_report(cli: &Cli, report: &BenchReport) -> Result<()> { + let rendered = match cli.format { + OutputFormat::Text => render_text(report), + OutputFormat::Json => serde_json::to_string_pretty(report)?, + }; + + if let Some(path) = &cli.out { + fs::write(path, &rendered).with_context(|| format!("failed to write report to {path}"))?; + info!("Wrote benchmark report to {}", path); + } + + println!("{rendered}"); + Ok(()) +} + +pub(crate) fn run_command( + cli: &Cli, + model_type: ModelType, + load_ms: f64, + cuda_graph: bool, + model: &mut dyn BenchModel, + tokenizer: &DynTokenizer, +) -> Result { + match &cli.command { + Command::Request(args) => { + bench_request(model, tokenizer, cli, model_type, load_ms, cuda_graph, args) + } + Command::Matrix(args) => bench_matrix(model, cli, model_type, load_ms, cuda_graph, args), + Command::Curve(args) => { + bench_curve(model, tokenizer, cli, model_type, load_ms, cuda_graph, args) + } + Command::Snapshot(_) | Command::Compare(_) => unreachable!(), + } +} + +// --------------------------------------------------------------------------- +// Snapshot / Compare +// --------------------------------------------------------------------------- diff --git a/openinfer-server/src/bin/bench_serving/snapshot.rs b/openinfer-server/src/bin/bench_serving/snapshot.rs new file mode 100644 index 00000000..01f75fff --- /dev/null +++ b/openinfer-server/src/bin/bench_serving/snapshot.rs @@ -0,0 +1,379 @@ +//! Snapshot generation and git-baseline comparison: the regression- +//! trackable profiles plus their git/gpu/date provenance helpers. 
+ +use std::fmt::Write as _; +use std::fs; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result, ensure}; +use comfy_table::{Cell, CellAlignment}; +use log::info; +use openinfer::server_engine::ModelType; + +use crate::cli::*; +use crate::exec::*; +use crate::prompt::*; +use crate::render::*; +use crate::report::*; +use crate::runners::*; + +pub(crate) const SNAPSHOT_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../bench_snapshots"); +pub(crate) const SNAPSHOT_PREFILL_OUTPUT_LEN: usize = 1; +pub(crate) const SNAPSHOT_DECODE_PROMPT_LEN: usize = 1024; +pub(crate) const SNAPSHOT_DECODE_OUTPUT_LEN: usize = 256; + +pub(crate) fn snapshot_prefill_prompt_len(model_type: ModelType) -> usize { + match model_type { + // Kimi serves TP1/DP8, where the PPLX fabric buffers cap prompts at + // 2048 tokens (full-lifetime KV cap is 8192) — probe the largest + // prompt the serving shape admits. + #[cfg(feature = "kimi-k2")] + ModelType::KimiK2 => 2_048, + _ => 10_000, + } +} +pub(crate) const REGRESSION_TPOT_PCT: f64 = 2.0; +pub(crate) const REGRESSION_TTFT_PCT: f64 = 3.0; + +pub(crate) fn shell_output(program: &str, args: &[&str]) -> Option { + std::process::Command::new(program) + .args(args) + .output() + .ok() + .filter(|o| o.status.success()) + .and_then(|o| String::from_utf8(o.stdout).ok()) + .map(|s| s.trim().to_string()) +} + +pub(crate) fn git_short_commit() -> String { + shell_output("git", &["rev-parse", "--short", "HEAD"]).unwrap_or_else(|| "unknown".into()) +} + +pub(crate) fn gpu_name() -> String { + shell_output( + "nvidia-smi", + &["--query-gpu=name", "--format=csv,noheader", "--id=0"], + ) + .unwrap_or_else(|| "unknown".into()) +} + +/// Produce a filesystem-safe slug from a GPU name string. 
+/// +/// `"NVIDIA GeForce RTX 5070 Ti"` → `"rtx-5070-ti"` +pub(crate) fn gpu_slug_from(name: &str) -> String { + let stripped = name + .strip_prefix("NVIDIA GeForce ") + .or_else(|| name.strip_prefix("NVIDIA ")) + .unwrap_or(name); + stripped + .to_lowercase() + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '-' { + c + } else { + '-' + } + }) + .collect::() + .split('-') + .filter(|s| !s.is_empty()) + .collect::>() + .join("-") +} + +pub(crate) fn today_date() -> String { + shell_output("date", &["+%Y-%m-%d"]).unwrap_or_else(|| "unknown".into()) +} + +pub(crate) fn model_display_name(model_path: &str) -> String { + Path::new(model_path) + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string() +} + +pub(crate) fn delta_pct(current: f64, baseline: f64) -> f64 { + if baseline == 0.0 { + return 0.0; + } + (current - baseline) / baseline * 100.0 +} + +pub(crate) fn format_delta(pct: f64) -> String { + if pct >= 0.0 { + format!("+{pct:.1}%") + } else { + format!("{pct:.1}%") + } +} + +pub(crate) fn run_snapshot( + model: &mut dyn BenchModel, + cli: &Cli, + model_type: ModelType, + args: &SnapshotArgs, +) -> Result<()> { + let prefill_prompt_len = snapshot_prefill_prompt_len(model_type); + + info!("Running prefill-heavy ({prefill_prompt_len},{SNAPSHOT_PREFILL_OUTPUT_LEN})"); + let prefill_tokens = synthetic_prompt_tokens(prefill_prompt_len); + let prefill_timings = measure_timings( + model, + std::slice::from_ref(&prefill_tokens), + SNAPSHOT_PREFILL_OUTPUT_LEN, + &args.run, + cli.cuda_profiler_capture, + )?; + let prefill_metrics = build_request_metrics(&prefill_timings); + + info!("Running decode-heavy ({SNAPSHOT_DECODE_PROMPT_LEN},{SNAPSHOT_DECODE_OUTPUT_LEN})"); + let decode_tokens = synthetic_prompt_tokens(SNAPSHOT_DECODE_PROMPT_LEN); + let decode_timings = measure_timings( + model, + std::slice::from_ref(&decode_tokens), + SNAPSHOT_DECODE_OUTPUT_LEN, + &args.run, + cli.cuda_profiler_capture, + )?; + let decode_metrics = 
build_request_metrics(&decode_timings); + + let model_name = model_display_name(&cli.model_path); + let gpu = gpu_name(); + let parallel = match model_type { + #[cfg(feature = "kimi-k2")] + ModelType::KimiK2 => Some(format!( + "tp{}-dp{}-{}", + cli.tp_size, + cli.dp_size, + format!("{:?}", cli.ep_backend).to_lowercase() + )), + _ => None, + }; + let report = SnapshotReport { + commit: git_short_commit(), + date: today_date(), + model: model_name.clone(), + gpu: gpu.clone(), + parallel, + prefill_heavy: SnapshotProfile { + prompt_len: prefill_prompt_len, + output_len: SNAPSHOT_PREFILL_OUTPUT_LEN, + metrics: prefill_metrics, + }, + decode_heavy: SnapshotProfile { + prompt_len: SNAPSHOT_DECODE_PROMPT_LEN, + output_len: SNAPSHOT_DECODE_OUTPUT_LEN, + metrics: decode_metrics, + }, + }; + + let dir = Path::new(SNAPSHOT_DIR).join(gpu_slug_from(&gpu)); + fs::create_dir_all(&dir)?; + let filename = model_name.to_lowercase(); + let path = dir.join(format!("{filename}.json")); + let snapshot_json = serde_json::to_string_pretty(&report)?; + fs::write(&path, format!("{snapshot_json}\n"))?; + + println!("{}", render_snapshot_text(&report, &path)); + Ok(()) +} + +pub(crate) fn render_snapshot_text(report: &SnapshotReport, path: &Path) -> String { + let mut out = String::new(); + let _ = writeln!(out, "bench_serving snapshot\n"); + let _ = writeln!(out, "model: {}", report.model); + let _ = writeln!(out, "gpu: {}", report.gpu); + if let Some(parallel) = &report.parallel { + let _ = writeln!(out, "shape: {parallel}"); + } + let _ = writeln!(out, "commit: {}\n", report.commit); + let _ = writeln!( + out, + "prefill_heavy ({},{}):", + report.prefill_heavy.prompt_len, report.prefill_heavy.output_len + ); + let _ = writeln!( + out, + " TTFT p50={:.2}ms p99={:.2}ms", + report.prefill_heavy.metrics.ttft_ms.p50_ms, report.prefill_heavy.metrics.ttft_ms.p99_ms + ); + let _ = writeln!( + out, + "\ndecode_heavy ({},{}):", + report.decode_heavy.prompt_len, report.decode_heavy.output_len + ); + 
if let Some(tpot) = &report.decode_heavy.metrics.steady_tpot_ms { + let _ = writeln!( + out, + " TPOT p50={:.2}ms p99={:.2}ms", + tpot.p50_ms, tpot.p99_ms + ); + } + let _ = writeln!(out, "\nwritten to {}", path.display()); + out +} + +pub(crate) fn run_compare(args: &CompareArgs) -> Result<()> { + let current_content = fs::read_to_string(&args.path).with_context(|| { + format!( + "snapshot not found: {}\nrun `bench_serving snapshot` first", + args.path + ) + })?; + let current: SnapshotReport = + serde_json::from_str(¤t_content).context("failed to parse current snapshot")?; + + // Resolve repo-root-relative path for git show + let abs_path = fs::canonicalize(&args.path)?; + let toplevel = + shell_output("git", &["rev-parse", "--show-toplevel"]).context("not a git repository")?; + let root = PathBuf::from(&toplevel); + let rel_path = abs_path + .strip_prefix(&root) + .context("snapshot file is outside the git repository")?; + + let git_output = std::process::Command::new("git") + .args(["show", &format!("{}:{}", args.baseline, rel_path.display())]) + .output() + .context("failed to run git show")?; + + if !git_output.status.success() { + anyhow::bail!( + "no baseline at {}:{}\ncommit the current snapshot to establish a baseline", + args.baseline, + rel_path.display() + ); + } + + let baseline: SnapshotReport = + serde_json::from_slice(&git_output.stdout).context("failed to parse baseline snapshot")?; + + // Guard against comparing snapshots with different profile shapes + ensure!( + current.prefill_heavy.prompt_len == baseline.prefill_heavy.prompt_len + && current.prefill_heavy.output_len == baseline.prefill_heavy.output_len + && current.decode_heavy.prompt_len == baseline.decode_heavy.prompt_len + && current.decode_heavy.output_len == baseline.decode_heavy.output_len, + "profile shape mismatch: current ({},{}) + ({},{}) vs baseline ({},{}) + ({},{})\n\ + the snapshot profiles were changed — re-baseline by committing a fresh snapshot", + 
current.prefill_heavy.prompt_len, + current.prefill_heavy.output_len, + current.decode_heavy.prompt_len, + current.decode_heavy.output_len, + baseline.prefill_heavy.prompt_len, + baseline.prefill_heavy.output_len, + baseline.decode_heavy.prompt_len, + baseline.decode_heavy.output_len, + ); + println!("{}", render_comparison(¤t, &baseline, &args.baseline)); + Ok(()) +} + +pub(crate) fn render_comparison( + current: &SnapshotReport, + baseline: &SnapshotReport, + ref_name: &str, +) -> String { + let mut out = String::new(); + let _ = writeln!(out, "bench_serving compare\n"); + let _ = writeln!( + out, + "comparing {} (working tree) vs {} ({ref_name})\n", + current.commit, baseline.commit + ); + + let mut table = new_table(); + table.set_header(vec![ + Cell::new("metric"), + Cell::new("current").set_alignment(CellAlignment::Right), + Cell::new("baseline").set_alignment(CellAlignment::Right), + Cell::new("delta").set_alignment(CellAlignment::Right), + ]); + + let pf = ¤t.prefill_heavy; + let pf_b = &baseline.prefill_heavy; + let pf_label = format!("({},{})", pf.prompt_len, pf.output_len); + + for (stat, cur, base) in [ + ( + "p50", + pf.metrics.ttft_ms.p50_ms, + pf_b.metrics.ttft_ms.p50_ms, + ), + ( + "p99", + pf.metrics.ttft_ms.p99_ms, + pf_b.metrics.ttft_ms.p99_ms, + ), + ] { + table.add_row(vec![ + key_cell(format!("TTFT {stat} {pf_label}")), + numeric_cell(format!("{cur:.2}ms")), + numeric_cell(format!("{base:.2}ms")), + numeric_cell(format_delta(delta_pct(cur, base))), + ]); + } + + let dc_label = format!( + "({},{})", + current.decode_heavy.prompt_len, current.decode_heavy.output_len + ); + if let (Some(cur_tpot), Some(base_tpot)) = ( + ¤t.decode_heavy.metrics.steady_tpot_ms, + &baseline.decode_heavy.metrics.steady_tpot_ms, + ) { + for (stat, cur, base) in [ + ("p50", cur_tpot.p50_ms, base_tpot.p50_ms), + ("p99", cur_tpot.p99_ms, base_tpot.p99_ms), + ] { + table.add_row(vec![ + key_cell(format!("TPOT {stat} {dc_label}")), + numeric_cell(format!("{cur:.2}ms")), + 
numeric_cell(format!("{base:.2}ms")), + numeric_cell(format_delta(delta_pct(cur, base))), + ]); + } + } + + push_table(&mut out, &table); + + // Regression check + let mut regressions = Vec::new(); + let ttft_d = delta_pct( + current.prefill_heavy.metrics.ttft_ms.p50_ms, + baseline.prefill_heavy.metrics.ttft_ms.p50_ms, + ); + if ttft_d > REGRESSION_TTFT_PCT { + regressions.push(format!( + "TTFT p50 {ttft_d:+.1}% > {REGRESSION_TTFT_PCT}% threshold" + )); + } + if let (Some(cur), Some(base)) = ( + ¤t.decode_heavy.metrics.steady_tpot_ms, + &baseline.decode_heavy.metrics.steady_tpot_ms, + ) { + let tpot_d = delta_pct(cur.p50_ms, base.p50_ms); + if tpot_d > REGRESSION_TPOT_PCT { + regressions.push(format!( + "TPOT p50 {tpot_d:+.1}% > {REGRESSION_TPOT_PCT}% threshold" + )); + } + } + + out.push('\n'); + if regressions.is_empty() { + let _ = writeln!( + out, + "no regression detected (threshold: TPOT >{REGRESSION_TPOT_PCT}%, TTFT >{REGRESSION_TTFT_PCT}%)" + ); + } else { + let _ = writeln!(out, "REGRESSION DETECTED:"); + for r in ®ressions { + let _ = writeln!(out, " {r}"); + } + } + + out +}