4 changes: 3 additions & 1 deletion mllm/models/llama/modeling_llama.hpp
@@ -377,8 +377,10 @@ class LlamaForCausalLM : public nn::Module, public ARGeneration {
bool tie_word_embeddings_;
bool mask_by_tensor_;

inline nn::AbstractStaticCache& kvCache() { return *kv_cache_; }

private:
std::unique_ptr<nn::AbstractStaticCache> kv_cache_;
};

} // namespace mllm::models::llama
} // namespace mllm::models::llama
48 changes: 41 additions & 7 deletions tools/mllm-llm-benchmark/README.md
@@ -10,7 +10,6 @@ This is a benchmark tool for measuring MLLM model performance, including:
## Build

Build from the mllm_v2 project root directory:

```bash
mkdir -p build && cd build
cmake ..
@@ -20,7 +19,6 @@ make mllm-llm-benchmark
## Usage

### Basic Usage

```bash
./mllm-llm-benchmark \
Collaborator: The README is for developers, not a description of your PR. You should add your part at the top of the original README.md.

-n qwen3-w4a32-kai \
@@ -32,6 +30,47 @@ make mllm-llm-benchmark
-cl 2048
```
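
In addition to the flags shown above, this change adds run-control and CSV-output options in `main.cpp` (`-r|--runs`, `-cs|--cooldown_s`, `-oc|--output_csv`, `-kv|--kv_dtype_bytes`). A minimal sketch of how they combine; the values are illustrative, and the model/config/thread flags are omitted here, so supply them as in the invocation above:
```bash
# -r   number of runs to average                     (default: 3)
# -cs  cooldown in seconds between runs              (default: 5)
# -kv  KV-cache dtype bytes: 1=int8, 2=fp16, 4=fp32  (default: 4)
# -oc  write averaged results to this CSV file       (default: disabled)
./mllm-llm-benchmark \
  -n qwen3-w4a32-kai \
  -pp 64 -tg 128 -cl 2048 \
  -r 3 -cs 5 -kv 2 -oc results.csv
```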

### Context Sweep (New Feature)

For automated benchmarking across different context lengths, use the sweep script:
```bash
cd tools/mllm-llm-benchmark
chmod +x scripts/sweep_context_v2.sh

# Configure paths
export BIN=../../build/bin/mllm-llm-benchmark
export MODEL=/path/to/your-model.mllm
export CFG=/path/to/config.json

# Run sweep
./scripts/sweep_context_v2.sh
```

Results are written to `bench_context/context_sweep_v2.csv`.

**Configuration options** (set via environment variables; see the example after this list):
- `BIN`: Path to benchmark binary (required)
- `MODEL`: Path to model file (required)
- `CFG`: Path to config json (default: `./examples/llama/config_tiny_llama.json`)
- `THREADS`: Number of threads (default: 8)
- `RUNS`: Number of runs to average (default: 1)
- `COOLDOWN`: Seconds to wait between runs (default: 0)
- `CTX_LENS`: Context lengths to test (default: "256 512 1024 2048 4096")
- `TG_DH`: Generation length for `decode_heavy` mode (default: 256)
- `TG_TTFT`: Generation length for `prefill_ttft` mode (default: 2)
- `OUTDIR`: Output directory (default: `bench_context`)
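
For example, a one-off sweep with more runs, a longer cooldown, and a smaller set of context lengths can be launched by overriding the defaults inline (paths are placeholders; this assumes the script reads these values from the environment, as in the `export` example above):
```bash
BIN=../../build/bin/mllm-llm-benchmark \
MODEL=/path/to/your-model.mllm \
CFG=/path/to/config.json \
RUNS=3 COOLDOWN=10 CTX_LENS="1024 4096" \
./scripts/sweep_context_v2.sh
```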

**Test modes:**
- `prefill_ttft`: Measures time to first token (prompt length = CTX_LEN - 2, generates 2 tokens)
- `decode_heavy`: Measures decode throughput (prompt length = CTX_LEN - 256, generates 256 tokens)

In both modes the prompt plus the generated tokens add up to CTX_LEN, so each run exercises the full context window.

### Plot Results

Visualize benchmark results:
```bash
python3 scripts/plot_sweep.py bench_context/context_sweep_v2.csv output_dir/
```

### Parameters

| Parameter | Long Format | Description | Example |
@@ -47,7 +86,6 @@ make mllm-llm-benchmark
### Examples

#### Testing Qwen3-0.6B Model

```bash
./mllm-llm-benchmark \
-n qwen3-w4a32-kai \
@@ -60,7 +98,6 @@ make mllm-llm-benchmark
```

#### Quick Test (Single Configuration)

```bash
./mllm-llm-benchmark \
-n qwen3-w4a32-kai \
@@ -73,7 +110,6 @@ make mllm-llm-benchmark
```

## Output Example

```
MLLM Build Version : abc123def456
ARCH : ARM64
@@ -144,7 +180,6 @@ Each test configuration executes the following steps:
### 1. Create New Benchmark Class

Create `YourModel_Benchmark.hpp` in the `models/` directory:

```cpp
#include "BenchmarkTemplate.hpp"
#include <mllm/models/yourmodel/modeling_yourmodel.hpp>
@@ -178,7 +213,6 @@ class YourModel_Benchmark final : public BenchmarkTemplate {
```

### 2. Register in All.hpp

```cpp
#include "YourModel_Benchmark.hpp"

92 changes: 75 additions & 17 deletions tools/mllm-llm-benchmark/main.cpp
@@ -1,10 +1,13 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include <string>
#include <fstream>
#include <vector>
#include <sstream>
#include <thread>
#include <chrono>
#include <algorithm>

#include <mllm/mllm.hpp>
#include <mllm/utils/Argparse.hpp>
@@ -16,6 +19,9 @@

#include "models/All.hpp"

#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)

MLLM_MAIN({
auto& help = mllm::Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_name = mllm::Argparse::add<std::string>("-n|--model_name").help("Model name");
@@ -25,12 +31,21 @@ MLLM_MAIN({
auto& pp = mllm::Argparse::add<std::string>("-pp|--prompt_length").help("Prompt length");
auto& tg = mllm::Argparse::add<std::string>("-tg|--test_generation_length").help("Test Generation length");
auto& cache_length = mllm::Argparse::add<int32_t>("-cl|--cache_length").help("Cache length");

auto& runs = mllm::Argparse::add<int32_t>("-r|--runs").help("Number of benchmark runs").def(3);
auto& cooldown_s = mllm::Argparse::add<int32_t>("-cs|--cooldown_s").help("Cooldown time between runs in seconds").def(5);
auto& output_csv = mllm::Argparse::add<std::string>("-oc|--output_csv").help("Output results to a CSV file").def("");
auto& schema_version = mllm::Argparse::add<int32_t>("-sv|--schema_version").help("Schema version for output format").def(1);
auto& kv_dtype_bytes =
mllm::Argparse::add<int32_t>("-kv|--kv_dtype_bytes").help("KV cache data type bytes (1: int8, 2: fp16, 4: fp32)").def(4);

mllm::Argparse::parse(argc, argv);

// Print Build Version
mllm::Context::instance().setCpuOpThreads(num_threads.get());
mllm::setMaximumNumThreads((uint32_t)num_threads.get());

mllm::print("MLLM Build Version :", STRINGIFY(MLLM_GIT_COMMIT_HASH));

// Print Device Info
mllm::print("ARCH :", mllm::cpu::CURRENT_ARCH_STRING);
mllm::print("FP16 :", mllm::cpu::hasFP16());
mllm::print("BF16 :", mllm::cpu::hasBF16());
@@ -53,15 +68,31 @@ MLLM_MAIN({
mllm::print("AVX512VL :", mllm::cpu::hasAVX512VL());
mllm::print("FMA :", mllm::cpu::hasFMA());

// Create benchmark
mllm::print("Create Benchmark: ", model_name.get());
auto benchmark = createBenchmark(model_name.get());
MLLM_RT_ASSERT(benchmark != nullptr);

// Print Model Info
int R = runs.get();
if (R <= 0) {
mllm::print("[ERROR] --runs must be > 0, got:", R);
return 1;
}

std::ofstream csv_file;
if (!output_csv.get().empty()) {
csv_file.open(output_csv.get());
if (!csv_file.is_open()) {
mllm::print("[ERROR] Failed to open --output_csv:", output_csv.get());
return 1;
}
csv_file << "schema_version,git_commit,arch,model_name,cache_length,pp,tg,ttft_ms,prefill_speed,decode_speed,prefill_ms,decode_ms_per_"
"tok,kv_est_bytes_pp,kv_est_bytes_final\n";
}

mllm::print("Model Info");
benchmark->init(config_path.get(), model_path.get(), cache_length.get());
benchmark->printModelInfo();
mllm::print("Cache Length :", cache_length.get());

// Warmup run
mllm::print("Warmup Run");
@@ -92,7 +123,7 @@ MLLM_MAIN({
for (size_t i = 0; i < pp_values.size(); ++i) { pp_tg_pairs.emplace_back(pp_values[i], tg_values[i]); }
}

// Actual run for 3 turns and gives avg results. Each turn will sleep for 5 seconds to let the SoC or GPU/NPU cool down.
// Actual benchmark runs: repeat a configurable number of times and average the results
mllm::print("\n========================================");
mllm::print("Starting Benchmark Tests");
mllm::print("========================================\n");
@@ -104,32 +135,33 @@ MLLM_MAIN({
mllm::print(" Generation Length (TG):", tg);
mllm::print("----------------------------------------");

// Storage for results
std::vector<BenchmarkTemplateResult> results;
results.reserve(3);
results.reserve(static_cast<size_t>(R));

for (int i = 0; i < 3; ++i) {
mllm::print(" Run", i + 1, "of 3...");
for (int i = 0; i < R; ++i) {
mllm::print(" Run", i + 1, "of", R, "...");

// Clear cache before each run
benchmark->clear();

// Run benchmark
auto result = benchmark->run(pp, tg);
results.push_back(result);

mllm::print(" TTFT :", result.ttft, "ms");
mllm::print(" Prefill Speed:", result.prefill_speed, "tokens/s");
mllm::print(" Decode Speed :", result.decode_speed, "tokens/s");

// Sleep for 5 seconds between runs to cool down
if (i < 2) {
mllm::print(" Cooling down for 5 seconds...");
std::this_thread::sleep_for(std::chrono::seconds(5));
float prefill_ms = (result.prefill_speed > 0.0f) ? (pp / result.prefill_speed) * 1000.0f : 0.0f;
float decode_ms_per_tok = (result.decode_speed > 0.0f) ? (1.0f / result.decode_speed) * 1000.0f : 0.0f;
mllm::print(" Prefill Latency :", prefill_ms, "ms");
mllm::print(" Decode Latency :", decode_ms_per_tok, "ms");

int cool = cooldown_s.get();
if (i + 1 < R && cool > 0) {
mllm::print(" Cooling down for", cool, "seconds...");
std::this_thread::sleep_for(std::chrono::seconds(cool));
}
}

// Calculate average results
float denom = (R > 0) ? static_cast<float>(R) : 1.0f;
float avg_ttft = 0.0f;
float avg_prefill_speed = 0.0f;
float avg_decode_speed = 0.0f;
@@ -151,9 +183,35 @@ MLLM_MAIN({
mllm::print("Average Prefill Speed:", avg_prefill_speed, "tokens/s");
mllm::print("Average Decode Speed :", avg_decode_speed, "tokens/s");
mllm::print("=====================================\n");

avg_ttft /= denom;
avg_prefill_speed /= denom;
avg_decode_speed /= denom;

float avg_prefill_ms = (avg_prefill_speed > 0.0f) ? (pp / avg_prefill_speed) * 1000.0f : 0.0f;
float avg_decode_ms_per_tok = (avg_decode_speed > 0.0f) ? (1.0f / avg_decode_speed) * 1000.0f : 0.0f;

// KV cache estimate
double kv_est_bytes_pp = 0.0;
double kv_est_bytes_final = 0.0;
if (auto info = benchmark->kvEstimateInfo(); info.has_value()) {
const int32_t bytes_per = kv_dtype_bytes.get(); // 1/2/4
// LLaMA-like KV: 2 * n_layers * n_kv_heads * head_dim * seq_len * bytes
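// e.g. 32 layers, 8 KV heads, head_dim 128, pp = 2048, fp16 (2 bytes):
//      2 * 32 * 8 * 128 * 2048 * 2 bytes ≈ 268 MB for the prompt alone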
kv_est_bytes_pp = 2.0 * info->num_layers * info->num_kv_heads * info->head_dim * (double)pp * bytes_per;
kv_est_bytes_final = 2.0 * info->num_layers * info->num_kv_heads * info->head_dim * (double)(pp + tg) * bytes_per;
}

std::stringstream ss;
ss << schema_version.get() << "," << STRINGIFY(MLLM_GIT_COMMIT_HASH) << "," << mllm::cpu::CURRENT_ARCH_STRING << ","
<< model_name.get() << "," << cache_length.get() << "," << pp << "," << tg << "," << avg_ttft << "," << avg_prefill_speed << "," << avg_decode_speed
<< "," << avg_prefill_ms << "," << avg_decode_ms_per_tok << "," << kv_est_bytes_pp << "," << kv_est_bytes_final;

if (csv_file.is_open()) { csv_file << ss.str() << std::endl; }
}

mllm::print("\n========================================");
mllm::print("Benchmark Tests Completed");
mllm::print("========================================");

if (csv_file.is_open()) { csv_file.close(); }
})
19 changes: 16 additions & 3 deletions tools/mllm-llm-benchmark/models/All.hpp
@@ -4,20 +4,33 @@

#include <memory>
#include <algorithm>
#include <string>
#include <cctype>

#include "Qwen3_W4A32_KAI.hpp"
#include "BenchmarkTemplate.hpp"
#include "Qwen3_W4A32_KAI.hpp"
#include "Llama.hpp"

std::shared_ptr<BenchmarkTemplate> createBenchmark(const std::string& model_name) {
inline std::shared_ptr<BenchmarkTemplate> createBenchmark(const std::string& model_name) {
auto tolower = [](const std::string& str) {
std::string result = str;
std::transform(result.begin(), result.end(), result.begin(), ::tolower);
// unsigned char cast to avoid UB
std::transform(result.begin(), result.end(), result.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
return result;
};

auto normalized_model_name = tolower(model_name);

if (normalized_model_name.find("qwen3") != std::string::npos && normalized_model_name.find("w4a32") != std::string::npos
&& normalized_model_name.find("kai") != std::string::npos) {
return std::make_shared<Qwen3_W4A32_KAI_Benchmark>();
}

if (normalized_model_name.find("llama") != std::string::npos || normalized_model_name.find("tinyllama") != std::string::npos
|| normalized_model_name.find("tiny_llama") != std::string::npos) {
return std::make_shared<Llama_Benchmark>();
}

return nullptr;
}
25 changes: 18 additions & 7 deletions tools/mllm-llm-benchmark/models/BenchmarkTemplate.hpp
@@ -3,19 +3,27 @@
#pragma once

#include <string>
#include <optional>
#include <cstdint>

/**
* @brief Benchmark result structure
*/
struct BenchmarkTemplateResult {
float ttft; ///< Time To First Token in milliseconds
float prefill_speed; ///< Prefill phase speed in tokens/s
float decode_speed; ///< Decode phase speed in tokens/s
float ttft; ///< Time To First Token in milliseconds
float prefill_speed; ///< Prefill phase speed in tokens/s
float decode_speed; ///< Decode phase speed in tokens/s
};

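// Geometry used to estimate the KV cache footprint; main.cpp computes
// 2 * num_layers * num_kv_heads * head_dim * seq_len * bytes_per_element.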
struct KVCacheEstimateInfo {
int32_t num_layers = 0;
int32_t num_kv_heads = 0;
int32_t head_dim = 0; // hidden_size / num_attention_heads
};

/**
Collaborator: Do not delete all comments here!

* @brief Base class for benchmark templates
*
*
* All model benchmark implementations should inherit from this class and implement its pure virtual functions.
*/
class BenchmarkTemplate {
@@ -32,21 +40,21 @@ class BenchmarkTemplate {

/**
* @brief Print model information
*
*
* Should output model key parameters such as number of layers, hidden size, attention heads, etc.
*/
virtual void printModelInfo() = 0;

/**
* @brief Warmup run
*
*
* Run the model once with small-scale input to ensure the model enters a stable state.
*/
virtual void warmup() = 0;

/**
* @brief Clear cache
*
*
* Clear KV cache and performance counters to prepare for the next test.
*/
virtual void clear() = 0;
@@ -58,4 +66,7 @@ class BenchmarkTemplate {
* @return Test results
*/
virtual BenchmarkTemplateResult run(int32_t pp, int32_t tg) = 0;

/**
 * @brief KV cache size estimation
 *
 * @return Geometry for estimating the KV cache size, or std::nullopt if the model does not provide it
 */
virtual std::optional<KVCacheEstimateInfo> kvEstimateInfo() const { return std::nullopt; }
};