diff --git a/README.md b/README.md
index 7356742c..4e7a0ad8 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ mllm
## Latest News
+- [2026 May 02] 🔥🔥🔥 MLLM now supports the Ascend NPU backend, with ATB graph execution and Qwen3 W8A8 inference.
- [2026 Mar 18] 🔥🔥🔥 `pymllm` now supports CUDA on Jetson Orin and Jetson Thor devices (experimental; still under active development).
- [2026 Feb 03] 🔥🔥🔥 MLLM Qnn AOT Support for Full Graph Execution on NPU! [Quick Start](https://ubiquitouslearning.github.io/mllm/qnn_backend/aot_execute.html), [Technical Report](https://chenghuawang.github.io/News/2026-01-29-mllm-qnn-aot-support-en/)
- [2025 Nov 27] Android Demo Update: Enabled stable Qwen3 and DeepSeek-OCR streaming on Android via a novel In-App Go Server Architecture.
@@ -75,7 +76,7 @@ The mllm framework integrates seamlessly with popular community frameworks' chec
| Model(v2) | CPU | Hexagon NPU INT8 | Ascend NPU |
|-----------------------------------------------------------------------------|------|-----------------------|------------|
-| [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | ✔️ W8A8 |
+| [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | [✔️ W8A8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-W8A8-Ascend) |
| [Qwen3-1.7B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-1.7B-w4a8-i8mm-kai) | [W4A16-SM8650](https://modelscope.cn/models/mllmTeam/Qwen3-1.7B-Qnn-AOT-SM8650/) | |
| [Qwen3-4B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai) | | |
| [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai) | | |
diff --git a/examples/qwen_ascend/main.cpp b/examples/qwen_ascend/main.cpp
index 9099d18b..d5abd622 100644
--- a/examples/qwen_ascend/main.cpp
+++ b/examples/qwen_ascend/main.cpp
@@ -3,13 +3,39 @@
#include <chrono>
#include <iostream>
+#include <cstdio>
+#include <iterator>
+#include <string>
#include <vector>
+#include <utf8.h>
#include <mllm/mllm.hpp>
#include <mllm/models/qwen_ascend/modeling_qwen_ascend.hpp>
#include <mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp>
using mllm::Argparse;
+namespace {
+
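+// Token-by-token decoding can split a multi-byte UTF-8 character across steps.
+// Return the longest valid UTF-8 prefix of `pending_text` and keep the
+// remaining (possibly incomplete) bytes buffered for a later step.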
+std::string takeValidUtf8Prefix(std::string& pending_text) {
+ auto invalid = utf8::find_invalid(pending_text.begin(), pending_text.end());
+ if (invalid == pending_text.begin()) {
+ return {};
+ }
+
+ if (invalid == pending_text.end()) {
+ std::string ready_text;
+ ready_text.swap(pending_text);
+ return ready_text;
+ }
+
+  auto ready_bytes = static_cast<std::string::size_type>(std::distance(pending_text.begin(), invalid));
+ auto ready_text = pending_text.substr(0, ready_bytes);
+ pending_text.erase(0, ready_bytes);
+ return ready_text;
+}
+
+} // namespace
+
MLLM_MAIN({
auto& help = Argparse::add("-h|--help").help("Show help message");
auto& model_path = Argparse::add("-m|--model_path").help("Model path").required(true);
@@ -194,17 +220,27 @@ MLLM_MAIN({
msg.prompt = prompt_text;
auto inputs = tokenizer.convertMessage(msg);
- // Clear KV cache before generation
+ // Run a prefill warmup outside ARGeneration timing so first-use Ascend
+ // graph/runtime setup is not counted as the measured prefill time.
model.clearCache();
+ fmt::print("\nWarming up prefill path...\n");
+ (void)model.forward(inputs, {});
+ // Keep RoPE cache warmed, but reset KV state for the measured generation.
+ model.kvCache().clearCache();
fmt::print("\nAnswer:\n");
auto chat_start = std::chrono::high_resolution_clock::now();
  std::vector<int64_t> generated_ids;
- // Use streaming generation with the ARGeneration chat interface
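+  // Stream tokens via the ARGeneration chat interface, buffering decoded bytes
+  // until they form complete UTF-8 sequences before printing.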
+ std::string pending_text;
for (auto& step : model.chat(inputs)) {
generated_ids.push_back(step.cur_token_id);
- std::wcout << tokenizer.detokenize(step.cur_token_id) << std::flush;
+ pending_text += tokenizer.decode({step.cur_token_id});
+ auto ready_text = takeValidUtf8Prefix(pending_text);
+ if (!ready_text.empty()) {
+ fmt::print("{}", ready_text);
+ std::fflush(stdout);
+ }
// Stop if we've reached max_new_tokens
    if (static_cast<int64_t>(generated_ids.size()) >= gen_max_new_tokens) {
if (step.current_step > 0) {
@@ -213,7 +249,10 @@ MLLM_MAIN({
break;
}
}
- std::wcout << std::endl;
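+  // Flush any trailing bytes that never formed a complete UTF-8 sequence.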
+ if (!pending_text.empty()) {
+ fmt::print("{}", pending_text);
+ }
+ fmt::print("\n");
auto chat_end = std::chrono::high_resolution_clock::now();
  auto chat_ms = std::chrono::duration_cast<std::chrono::milliseconds>(chat_end - chat_start).count();
diff --git a/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp b/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp
index e9306002..eb7bc2f3 100644
--- a/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp
+++ b/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp
@@ -22,7 +22,7 @@ struct QwenAscendMessage {
class QwenAscendTokenizer final : public mllm::preprocessor::AutoTokenizer {
public:
explicit QwenAscendTokenizer(const std::string& file_path) {
- preprocessor::initLocal();
+ preprocessor::initLocal("C.UTF-8");
preprocessor::makeBytes2UnicodeMap(bytes_2_unicode_dict_);
for (auto& kv : bytes_2_unicode_dict_) { bytes_2_unicode_dict_inverse_.insert({kv.second, kv.first}); }
bpe_.initFromSentencePieceJson(file_path);