3 changes: 2 additions & 1 deletion README.md
@@ -17,6 +17,7 @@ mllm

## Latest News

- [2026 May 02] 🔥🔥🔥 MLLM now supports the Ascend NPU backend, with ATB graph execution and Qwen3 W8A8 inference on Ascend devices.
- [2026 Mar 18] 🔥🔥🔥 `pymllm` now supports CUDA on Jetson Orin and Jetson Thor devices (experimental; still under active development).
- [2026 Feb 03] 🔥🔥🔥 MLLM Qnn AOT Support for Full Graph Execution on NPU! [Quick Start](https://ubiquitouslearning.github.io/mllm/qnn_backend/aot_execute.html), [Technical Report](https://chenghuawang.github.io/News/2026-01-29-mllm-qnn-aot-support-en/)
- [2025 Nov 27] Android Demo Update: Enabled stable Qwen3 and DeepSeek-OCR streaming on Android via a novel In-App Go Server Architecture.
@@ -75,7 +76,7 @@ The mllm framework integrates seamlessly with popular community frameworks' chec

| Model(v2) | CPU | Hexagon NPU <br> INT8 | Ascend NPU |
|-----------------------------------------------------------------------------|------|-----------------------|------------|
- | [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | ✔️ W8A8 |
+ | [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | [✔️ W8A8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-W8A8-Ascend) |
| [Qwen3-1.7B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-1.7B-w4a8-i8mm-kai) | [W4A16-SM8650](https://modelscope.cn/models/mllmTeam/Qwen3-1.7B-Qnn-AOT-SM8650/) | |
| [Qwen3-4B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai) | | |
| [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai) | | |
47 changes: 43 additions & 4 deletions examples/qwen_ascend/main.cpp
@@ -3,13 +3,39 @@

#include <iostream>
#include <clocale>
#include <cstdio>
#include <iterator>
#include <string>
#include <fmt/core.h>
#include <utfcpp/utf8.h>
#include <mllm/mllm.hpp>
#include <mllm/models/qwen_ascend/modeling_qwen_ascend.hpp>
#include <mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp>

using mllm::Argparse;

namespace {

std::string takeValidUtf8Prefix(std::string& pending_text) {
  auto invalid = utf8::find_invalid(pending_text.begin(), pending_text.end());
  if (invalid == pending_text.begin()) {
    return {};
  }

  if (invalid == pending_text.end()) {
    std::string ready_text;
    ready_text.swap(pending_text);
    return ready_text;
  }

  auto ready_bytes = static_cast<size_t>(std::distance(pending_text.begin(), invalid));
  auto ready_text = pending_text.substr(0, ready_bytes);
  pending_text.erase(0, ready_bytes);
  return ready_text;
}
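The helper leans on utf8::find_invalid to split the buffer. The same split can be sketched in standard C++ without utfcpp; this is a simplified illustration (it checks only lead/continuation byte patterns, not overlongs or surrogates), not the library's exact behavior:

```cpp
#include <cstddef>
#include <string>

// Number of bytes a UTF-8 sequence should have, judged by its lead byte;
// returns 0 for an impossible lead (a bare continuation byte or 0xF8..0xFF).
static std::size_t utf8SeqLen(unsigned char lead) {
  if (lead < 0x80) return 1;
  if ((lead & 0xE0) == 0xC0) return 2;
  if ((lead & 0xF0) == 0xE0) return 3;
  if ((lead & 0xF8) == 0xF0) return 4;
  return 0;
}

// Split off the longest prefix of `pending` made of complete UTF-8 sequences,
// leaving any trailing incomplete sequence buffered for the next call.
// Mirrors the role utf8::find_invalid plays in the helper above.
std::string takeValidUtf8Prefix(std::string& pending) {
  std::size_t i = 0;
  while (i < pending.size()) {
    const std::size_t n = utf8SeqLen(static_cast<unsigned char>(pending[i]));
    if (n == 0) break;                  // impossible lead byte: stop here
    if (i + n > pending.size()) break;  // incomplete tail: keep it buffered
    bool ok = true;
    for (std::size_t k = 1; k < n; ++k) {  // continuation bytes are 10xxxxxx
      if ((static_cast<unsigned char>(pending[i + k]) & 0xC0) != 0x80) { ok = false; break; }
    }
    if (!ok) break;
    i += n;
  }
  std::string ready = pending.substr(0, i);
  pending.erase(0, i);
  return ready;
}
```

Feeding it "A" plus the first two bytes of a three-byte code point returns "A" and keeps the partial sequence buffered; appending the final byte releases the full code point.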
Comment on lines +19 to +35
Contributor
⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🌐 Web query:

In utf8cpp, what does utf8::find_invalid(begin, end) return for (1) an incomplete multi-byte sequence at buffer start and (2) an irrecoverably invalid leading byte?

💡 Result:

For utf8::find_invalid(begin, end), the function returns an iterator pointing to the first invalid octet; if none is found it returns end. It scans from the start, calling utf8::internal::validate_next(result, end), and immediately returns result when the returned error code is not UTF8_OK. [1]

(1) Incomplete multi-byte sequence at buffer start: it returns begin (the iterator to the first byte of the incomplete sequence), since any non-UTF8_OK code (including INCOMPLETE_SEQUENCE) causes find_invalid to return the current iterator. [1][2]

(2) Irrecoverably invalid leading byte: it returns an iterator pointing to that invalid leading byte (the first byte whose validate_next call fails with an error such as INVALID_LEAD). [1][2]

Handle irrecoverable leading bytes to avoid stalled output and an invalid tail flush.

When utf8::find_invalid detects an invalid byte at the buffer start, the helper returns an empty string without consuming it. An irrecoverably invalid leading byte (e.g., a malformed UTF-8 start octet) will therefore block progress indefinitely: the next iteration finds the same byte, returns empty again, and loops without advancing. Additionally, the final flush at lines 252-254 prints pending_text directly without validation, so invalid UTF-8 can reach the output even though the streaming loop uses the validated takeValidUtf8Prefix.

Discard irrecoverable leading bytes that exceed the maximum UTF-8 sequence length (4 bytes), and apply takeValidUtf8Prefix to the tail flush so the output is always valid UTF-8.

Proposed fix
 std::string takeValidUtf8Prefix(std::string& pending_text) {
+  if (pending_text.empty()) {
+    return {};
+  }
   auto invalid = utf8::find_invalid(pending_text.begin(), pending_text.end());
   if (invalid == pending_text.begin()) {
-    return {};
+    // Keep short prefixes that may become valid with future bytes.
+    // UTF-8 max code point width is 4 bytes; longer invalid-at-begin likely means malformed lead.
+    if (pending_text.size() <= 4) {
+      return {};
+    }
+    pending_text.erase(0, 1);
+    return {};
   }
@@
-      if (!pending_text.empty()) {
-        fmt::print("{}", pending_text);
-      }
+      auto tail = takeValidUtf8Prefix(pending_text);
+      if (!tail.empty()) {
+        fmt::print("{}", tail);
+      }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@examples/qwen_ascend/main.cpp` around lines 19 - 35, The helper
takeValidUtf8Prefix currently returns empty when an invalid byte is at the
buffer start and never consumes it; update takeValidUtf8Prefix to detect
irrecoverable leading bytes (e.g., a start octet that would require >4
continuation bytes or otherwise cannot form a valid UTF‑8 sequence) and
consume/discard one invalid byte so progress continues, while still returning
empty when nothing valid precedes it; also ensure the final flush path uses
takeValidUtf8Prefix on pending_text (instead of printing pending_text directly)
so only validated UTF‑8 is emitted. Reference: function takeValidUtf8Prefix and
the final tail-flush that prints pending_text.
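The stall the review describes can also be broken with a small standalone guard. This sketch is a variant of the proposed fix: instead of keeping short prefixes that may still complete, it unconditionally discards bytes that can never begin a UTF-8 sequence (bare continuation bytes 0x80-0xBF and the invalid leads 0xF8-0xFF), which is sufficient to guarantee forward progress:

```cpp
#include <cstddef>
#include <string>

// Drop leading bytes that can never start a valid UTF-8 sequence, so a
// streaming loop does not re-check the same irrecoverable byte forever.
// Returns how many bytes were discarded.
std::size_t dropIrrecoverableLead(std::string& pending) {
  std::size_t dropped = 0;
  while (!pending.empty()) {
    const auto lead = static_cast<unsigned char>(pending.front());
    const bool valid_lead = lead < 0x80 || (lead & 0xE0) == 0xC0 ||
                            (lead & 0xF0) == 0xE0 || (lead & 0xF8) == 0xF0;
    if (valid_lead) break;  // a plausible lead byte: leave it buffered
    pending.erase(0, 1);    // irrecoverable: consume it so we make progress
    ++dropped;
  }
  return dropped;
}
```

Calling this before the validity split ensures the buffer head is always either empty or a plausible sequence start; a truncated-but-recoverable tail is still left in place for the next token.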


} // namespace

MLLM_MAIN({
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model path").required(true);
@@ -194,17 +220,27 @@ MLLM_MAIN({
msg.prompt = prompt_text;
auto inputs = tokenizer.convertMessage(msg);

- // Clear KV cache before generation
+ // Run a prefill warmup outside ARGeneration timing so first-use Ascend
+ // graph/runtime setup is not counted as the measured prefill time.
model.clearCache();
+ fmt::print("\nWarming up prefill path...\n");
+ (void)model.forward(inputs, {});
+ // Keep RoPE cache warmed, but reset KV state for the measured generation.
+ model.kvCache().clearCache();

fmt::print("\nAnswer:\n");
auto chat_start = std::chrono::high_resolution_clock::now();

std::vector<int64_t> generated_ids;
// Use streaming generation with the ARGeneration chat interface
std::string pending_text;
for (auto& step : model.chat(inputs)) {
generated_ids.push_back(step.cur_token_id);
- std::wcout << tokenizer.detokenize(step.cur_token_id) << std::flush;
+ pending_text += tokenizer.decode({step.cur_token_id});
+ auto ready_text = takeValidUtf8Prefix(pending_text);
+ if (!ready_text.empty()) {
+ fmt::print("{}", ready_text);
+ std::fflush(stdout);
+ }
// Stop if we've reached max_new_tokens
if (static_cast<int>(generated_ids.size()) >= gen_max_new_tokens) {
if (step.current_step > 0) {
@@ -213,7 +249,10 @@ MLLM_MAIN({
break;
}
}
- std::wcout << std::endl;
+ if (!pending_text.empty()) {
+ fmt::print("{}", pending_text);
+ }
+ fmt::print("\n");

auto chat_end = std::chrono::high_resolution_clock::now();
auto chat_ms = std::chrono::duration_cast<std::chrono::milliseconds>(chat_end - chat_start).count();
2 changes: 1 addition & 1 deletion mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp
@@ -22,7 +22,7 @@ struct QwenAscendMessage {
class QwenAscendTokenizer final : public mllm::preprocessor::AutoTokenizer {
public:
explicit QwenAscendTokenizer(const std::string& file_path) {
- preprocessor::initLocal();
+ preprocessor::initLocal("C.UTF-8");
preprocessor::makeBytes2UnicodeMap(bytes_2_unicode_dict_);
for (auto& kv : bytes_2_unicode_dict_) { bytes_2_unicode_dict_inverse_.insert({kv.second, kv.first}); }
bpe_.initFromSentencePieceJson(file_path);
Expand Down
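The one-line tokenizer change pins the locale explicitly rather than relying on the environment. As a hypothetical sketch of what an initLocal("C.UTF-8")-style call might do underneath (mllm's actual preprocessor implementation may differ), it amounts to setting a UTF-8 C locale with a fallback chain, since the name "C.UTF-8" exists on glibc/musl systems but not everywhere:

```cpp
#include <clocale>
#include <string>

// Try to pin the process locale to a UTF-8 "C" locale; fall back to the
// environment locale, then to plain "C" (which is guaranteed to exist).
// Returns the locale name that was actually activated.
std::string initUtf8Locale() {
  if (const char* loc = std::setlocale(LC_ALL, "C.UTF-8")) return loc;  // preferred
  if (const char* loc = std::setlocale(LC_ALL, "")) return loc;         // environment
  return std::setlocale(LC_ALL, "C");                                   // last resort
}
```

Pinning the locale this way keeps byte-oriented I/O and ctype behavior predictable for the BPE byte-to-unicode mapping, regardless of the user's LANG/LC_* settings.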