diff --git a/README.md b/README.md
index 7356742c..4e7a0ad8 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ mllm
 ## Latest News
 
+- [2026 May 02] 🔥🔥🔥 MLLM now supports the Ascend NPU backend, with ATB graph execution and Qwen3 W8A8 inference on Ascend devices.
 - [2026 Mar 18] 🔥🔥🔥 `pymllm` now supports CUDA on Jetson Orin and Jetson Thor devices (experimental; still under active development).
 - [2026 Feb 03] 🔥🔥🔥 MLLM Qnn AOT Support for Full Graph Execution on NPU! [Quick Start](https://ubiquitouslearning.github.io/mllm/qnn_backend/aot_execute.html), [Technical Report](https://chenghuawang.github.io/News/2026-01-29-mllm-qnn-aot-support-en/)
 - [2025 Nov 27] Android Demo Update: Enabled stable Qwen3 and DeepSeek-OCR streaming on Android via a novel In-App Go Server Architecture.
@@ -75,7 +76,7 @@ The mllm framework integrates seamlessly with popular community frameworks' chec
 
 | Model(v2) | CPU | Hexagon NPU<br>INT8 | Ascend NPU |
 |-----------------------------------------------------------------------------|------|-----------------------|------------|
-| [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | ✔️ W8A8 |
+| [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | [✔️ W8A8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-W8A8-Ascend) |
 | [Qwen3-1.7B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-1.7B-w4a8-i8mm-kai) | [W4A16-SM8650](https://modelscope.cn/models/mllmTeam/Qwen3-1.7B-Qnn-AOT-SM8650/) | |
 | [Qwen3-4B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai) | | |
 | [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai) | | |
diff --git a/examples/qwen_ascend/main.cpp b/examples/qwen_ascend/main.cpp
index 9099d18b..d5abd622 100644
--- a/examples/qwen_ascend/main.cpp
+++ b/examples/qwen_ascend/main.cpp
@@ -3,13 +3,39 @@
 #include
 #include
+#include
+#include
+#include
 #include
+#include
 
 #include
 #include
 #include
 
 using mllm::Argparse;
 
+namespace {
+
+std::string takeValidUtf8Prefix(std::string& pending_text) {
+  auto invalid = utf8::find_invalid(pending_text.begin(), pending_text.end());
+  if (invalid == pending_text.begin()) {
+    return {};
+  }
+
+  if (invalid == pending_text.end()) {
+    std::string ready_text;
+    ready_text.swap(pending_text);
+    return ready_text;
+  }
+
+  auto ready_bytes = static_cast<std::size_t>(std::distance(pending_text.begin(), invalid));
+  auto ready_text = pending_text.substr(0, ready_bytes);
+  pending_text.erase(0, ready_bytes);
+  return ready_text;
+}
+
+}  // namespace
+
 MLLM_MAIN({
   auto& help = Argparse::add("-h|--help").help("Show help message");
   auto& model_path = Argparse::add("-m|--model_path").help("Model path").required(true);
@@ -194,17 +220,27 @@ MLLM_MAIN({
   msg.prompt = prompt_text;
   auto inputs = tokenizer.convertMessage(msg);
 
-  // Clear KV cache before generation
+  // Run a prefill warmup outside ARGeneration timing so first-use Ascend
+  // graph/runtime setup is not counted as the measured prefill time.
   model.clearCache();
+  fmt::print("\nWarming up prefill path...\n");
+  (void)model.forward(inputs, {});
+  // Keep RoPE cache warmed, but reset KV state for the measured generation.
+  model.kvCache().clearCache();
 
   fmt::print("\nAnswer:\n");
   auto chat_start = std::chrono::high_resolution_clock::now();
   std::vector<int64_t> generated_ids;
-  // Use streaming generation with the ARGeneration chat interface
+  std::string pending_text;
   for (auto& step : model.chat(inputs)) {
     generated_ids.push_back(step.cur_token_id);
-    std::wcout << tokenizer.detokenize(step.cur_token_id) << std::flush;
+    pending_text += tokenizer.decode({step.cur_token_id});
+    auto ready_text = takeValidUtf8Prefix(pending_text);
+    if (!ready_text.empty()) {
+      fmt::print("{}", ready_text);
+      std::fflush(stdout);
+    }
 
     // Stop if we've reached max_new_tokens
     if (static_cast<int64_t>(generated_ids.size()) >= gen_max_new_tokens) {
       if (step.current_step > 0) {
@@ -213,7 +249,10 @@ MLLM_MAIN({
       break;
     }
   }
-  std::wcout << std::endl;
+  if (!pending_text.empty()) {
+    fmt::print("{}", pending_text);
+  }
+  fmt::print("\n");
 
   auto chat_end = std::chrono::high_resolution_clock::now();
   auto chat_ms = std::chrono::duration_cast<std::chrono::milliseconds>(chat_end - chat_start).count();
diff --git a/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp b/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp
index e9306002..eb7bc2f3 100644
--- a/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp
+++ b/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp
@@ -22,7 +22,7 @@ struct QwenAscendMessage {
 class QwenAscendTokenizer final : public mllm::preprocessor::AutoTokenizer {
  public:
   explicit QwenAscendTokenizer(const std::string& file_path) {
-    preprocessor::initLocal();
+    preprocessor::initLocal("C.UTF-8");
     preprocessor::makeBytes2UnicodeMap(bytes_2_unicode_dict_);
     for (auto& kv : bytes_2_unicode_dict_) { bytes_2_unicode_dict_inverse_.insert({kv.second, kv.first}); }
     bpe_.initFromSentencePieceJson(file_path);
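For reference, the streaming loop added in examples/qwen_ascend/main.cpp buffers decoded bytes in pending_text and prints only the longest valid UTF-8 prefix, so a multi-byte character whose bytes arrive across two decode steps is never written to the terminal half-finished. The sketch below is a self-contained illustration of that idea, not the repository's code: it replaces utfcpp's utf8::find_invalid used in the diff with an inlined structural prefix scan (validUtf8PrefixLength, an illustrative name; it does not reject overlong encodings), so it compiles on its own.

```cpp
// Standalone sketch of UTF-8-safe incremental printing (not mllm's code).
#include <cstddef>
#include <cstdio>
#include <string>

// Number of leading bytes of `s` that form complete UTF-8 sequences.
static std::size_t validUtf8PrefixLength(const std::string& s) {
  std::size_t i = 0;
  while (i < s.size()) {
    const unsigned char b = static_cast<unsigned char>(s[i]);
    std::size_t len = 0;
    if (b < 0x80) {
      len = 1;  // ASCII
    } else if ((b >> 5) == 0x06) {
      len = 2;  // 110xxxxx
    } else if ((b >> 4) == 0x0E) {
      len = 3;  // 1110xxxx
    } else if ((b >> 3) == 0x1E) {
      len = 4;  // 11110xxx
    } else {
      return i;  // not a valid lead byte; stop before it
    }
    if (i + len > s.size()) return i;  // sequence not fully buffered yet
    for (std::size_t k = 1; k < len; ++k) {
      if ((static_cast<unsigned char>(s[i + k]) >> 6) != 0x02) return i;  // bad continuation byte
    }
    i += len;
  }
  return i;
}

// Same contract as the helper in the diff: return the printable prefix and
// keep any trailing partial character buffered in `pending`.
static std::string takeValidUtf8Prefix(std::string& pending) {
  const std::size_t n = validUtf8PrefixLength(pending);
  std::string ready = pending.substr(0, n);
  pending.erase(0, n);
  return ready;
}

int main() {
  // Simulate a decoder that emits the bytes of "你好" one at a time, as a
  // streaming tokenizer might when a token boundary splits a character.
  const std::string bytes = "\xE4\xBD\xA0\xE5\xA5\xBD";  // UTF-8 for U+4F60 U+597D
  std::string pending;
  for (char c : bytes) {
    pending += c;
    const std::string ready = takeValidUtf8Prefix(pending);
    if (!ready.empty()) {
      std::fputs(ready.c_str(), stdout);
      std::fflush(stdout);  // print eagerly, as the streaming chat loop does
    }
  }
  if (!pending.empty()) std::fputs(pending.c_str(), stdout);  // drain leftovers at end of stream
  std::fputc('\n', stdout);
  return 0;
}
```

Run on its own, the program prints "你好" even though each byte is appended to the buffer separately, which is the behavior the diff's takeValidUtf8Prefix gives the token-by-token chat loop.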
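The tokenizer change swaps the default preprocessor::initLocal() for an explicit initLocal("C.UTF-8"), pinning a UTF-8 locale by name rather than inheriting whatever the process environment provides. initLocal's internals are not part of this diff; the snippet below is only a generic illustration, under that assumption, of what requesting a named UTF-8 locale with a fallback typically looks like at the C standard library level.

```cpp
// Generic sketch, not mllm's preprocessor::initLocal: ask for "C.UTF-8" and
// fall back to the environment-configured locale if that name is unavailable.
#include <clocale>
#include <cstdio>

int main() {
  if (std::setlocale(LC_ALL, "C.UTF-8") == nullptr) {
    std::setlocale(LC_ALL, "");  // "" selects the locale from the environment (LANG/LC_*)
  }
  std::printf("active locale: %s\n", std::setlocale(LC_ALL, nullptr));  // query the current setting
  return 0;
}
```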