Added Qwen3.5 0.8B #657

base: main
Changes from 1 commit
CMakeLists.txt (new file, +3 lines)

```cmake
add_executable(mllm-qwen3-5-runner main.cpp)
target_link_libraries(mllm-qwen3-5-runner PRIVATE MllmRT MllmCPUBackend)
target_include_directories(mllm-qwen3-5-runner PRIVATE ${MLLM_INCLUDE_DIR})
```
main.cpp (new file, +76 lines)

```cpp
#include <iostream>
#include <fmt/core.h>
#include <mllm/mllm.hpp>
#include <mllm/models/qwen3_5/modeling_qwen3_5.hpp>
#include <mllm/models/qwen3_5/tokenization_qwen3_5.hpp>
#include <mllm/utils/AnyValue.hpp>

using mllm::Argparse;

MLLM_MAIN({
  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
  auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model path").required(true);
  auto& model_version = Argparse::add<std::string>("-mv|--model_version").help("Model version").required(true);
  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer_path").help("Tokenizer directory").required(true);
  auto& config_path = Argparse::add<std::string>("-c|--config_path").help("Config path").required(true);

  Argparse::parse(argc, argv);

#ifdef MLLM_PERFETTO_ENABLE
  mllm::perf::start();
#endif

  mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1;
  if (model_version.get() == "v1") {
    file_version = mllm::ModelFileVersion::kV1;
  } else if (model_version.get() == "v2") {
    file_version = mllm::ModelFileVersion::kV2;
  }

  if (help.isSet()) {
    Argparse::printHelp();
    mllm::shutdownContext();
    return 0;
  }

  {
    auto cfg = mllm::models::qwen3_5::Qwen3_5Config(config_path.get());
    auto tokenizer = mllm::models::qwen3_5::Qwen3_5Tokenizer(tokenizer_path.get());
    auto model = mllm::models::qwen3_5::Qwen3_5ForCausalLM(cfg);

    fmt::print("Qwen3.5 0.8B: {} layers ({} full attention + {} GDN)\n",
               cfg.num_hidden_layers, cfg.numFullAttentionLayers(), cfg.numGDNLayers());

    auto param = mllm::load(model_path.get(), file_version);
    model.load(param);

    fmt::print("\n{:*^60}\n", " Qwen3.5 Interactive CLI ");
    fmt::print("Enter 'exit' or 'quit' to end the session\n\n");

    std::string prompt_text;

    fmt::print("Prompt text (or 'exit/quit'): ");
    std::getline(std::cin, prompt_text);

    try {
      fmt::print("Processing...\n");
      auto inputs = tokenizer.convertMessage({.prompt = prompt_text});

      fmt::print("\nResponse: ");

      for (auto& step : model.chat(inputs)) { std::wcout << tokenizer.detokenize(step.cur_token_id) << std::flush; }

      fmt::print("\n{}\n", std::string(60, '-'));
    } catch (const std::exception& e) { fmt::print("\nError: {}\n{}\n", e.what(), std::string(60, '-')); }

    model.perfSummary();
  }

#ifdef MLLM_PERFETTO_ENABLE
  mllm::perf::stop();
  mllm::perf::saveReport("qwen3_5.perf");
#endif

  mllm::print("\n");
  mllm::memoryReport();
})
```
mllm/backends/qnn/aot/visitor/RoPE.cpp (new file, +73 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.
//
// Lowers RoPE (Rotary Position Embedding) to the custom HTP op from LLaMAPackage.
// The custom op signature: RoPE(input, sin, cos, h_cnt; pose_type) → output
// It supports partial rotation natively via the HVX kernel.

#include "mllm/utils/Common.hpp"
#include "mllm/core/aops/RoPEOp.hpp"
#include "mllm/compile/ir/linalg/Op.hpp"
#include "mllm/compile/ir/builtin/Attribute.hpp"
#include "mllm/compile/ir/tensor/Value.hpp"
#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
#include "mllm/backends/qnn/aot/visitor/RoPE.hpp"
#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"

namespace mllm::qnn::aot {

bool QnnAOTRoPEPattern::isMatch(const mllm::ir::op_ptr_t& op) {
  return op->isa_<mllm::ir::linalg::RoPEOp>() && (op->getAttr("using_qnn") != nullptr);
}

bool QnnAOTRoPEPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) {
  auto env = AOTCompileContext::getInstance().getEnv();

  auto rope_op = op->cast_<mllm::ir::linalg::RoPEOp>();
  if (!rope_op) {
    MLLM_ERROR("Failed to cast to linalg::RoPEOp");
    return false;
  }

  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_graph_name"));
  auto qnn_graph_name = op->getAttr("qnn_graph_name")->cast_<ir::StrAttr>()->data();
  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_context_name"));
  auto qnn_context_name = op->getAttr("qnn_context_name")->cast_<ir::StrAttr>()->data();

  auto a = rope_op->getAOp();
  auto rope_aop = dynamic_cast<mllm::aops::RoPEOp*>(a);
  if (!rope_aop) {
    MLLM_ERROR("Failed to cast to aops::RoPEOp");
    return false;
  }

  // RoPE inputs: x, sin, cos
  auto inputs_it = op->inputs().begin();
  auto i_0 = (*inputs_it)->cast_<ir::tensor::TensorValue>();                 // input tensor
  auto i_sin = (*std::next(inputs_it))->cast_<ir::tensor::TensorValue>();    // sin embeddings
  auto i_cos = (*std::next(inputs_it, 2))->cast_<ir::tensor::TensorValue>(); // cos embeddings

  // RoPE output
  auto o_0 = op->outputs().front()->cast_<ir::tensor::TensorValue>();
```
Review comment on lines +45 to +52 (Contributor):

Validate RoPE input/output arity before dereferencing iterators. Line 47, Line 48, and Line 51 assume 3 inputs and 1 output exist. Missing checks here can dereference invalid iterators on malformed IR.

Proposed fix:

```diff
 // RoPE inputs: x, sin, cos
+if (op->inputs().size() < 3 || op->outputs().empty()) {
+  MLLM_ERROR("RoPE lowering expects at least 3 inputs and 1 output");
+  return false;
+}
 auto inputs_it = op->inputs().begin();
 auto i_0 = (*inputs_it)->cast_<ir::tensor::TensorValue>();                 // input tensor
 auto i_sin = (*std::next(inputs_it))->cast_<ir::tensor::TensorValue>();    // sin embeddings
 auto i_cos = (*std::next(inputs_it, 2))->cast_<ir::tensor::TensorValue>(); // cos embeddings
```

As per coding guidelines, "Validate inputs for public APIs and critical internal functions."
RoPE.cpp, continued:

```cpp
  // Create the custom HTP RoPE op from LLaMAPackage
  auto qnn_op_node = QnnAOTNodeOperation::create("RoPE");
  qnn_op_node->setPackageName("LLaMAPackage");

  // pose_type parameter: 0 for standard RoPE
  // The custom HTP op uses this to select between different RoPE variants
  qnn_op_node->emplaceParamScalar(mllm::qnn::QNNParamScalarWrapper::create("pose_type", static_cast<uint32_t>(0)));
```
Review comment on lines +57 to +60 (Contributor):

Verification scripts were run against the repository to (1) check how the custom kernel in mllm/backends/qnn/custom-op-package/LLaMAPackage/src/ops/RoPE.cpp handles `pose_type`, (2) confirm that this lowering hardcodes `pose_type`, and (3) locate where the RoPE variant is represented in the IR/AOp layer (mllm/core/aops/RoPEOp.hpp) so it could be threaded through the lowering.

The hardcoded `pose_type` of 0 selects only the standard RoPE variant; consider deriving the value from the RoPEOp's configuration so the lowering can express the other variants the custom HTP op supports.
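For illustration, that suggestion might look like the sketch below. The `options()` accessor, the `RoPETypes` enum, and the variant-to-`pose_type` mapping are all hypothetical placeholders; the real variant representation lives in mllm/core/aops/RoPEOp.hpp and is not shown in this diff.

```cpp
// Hypothetical sketch: derive pose_type from the RoPE variant instead of
// hardcoding 0. The accessor and enumerator names below are placeholders.
uint32_t pose_type = 0;  // 0 = standard RoPE, per the comment above
switch (rope_aop->options().type) {       // hypothetical accessor
  case mllm::aops::RoPETypes::kStandard:  // hypothetical enumerator
    pose_type = 0;
    break;
  default:
    MLLM_ERROR("Unsupported RoPE variant for the custom HTP kernel");
    return false;
}
qnn_op_node->emplaceParamScalar(
    mllm::qnn::QNNParamScalarWrapper::create("pose_type", pose_type));
```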
RoPE.cpp, continued:

```cpp
  qnn_op_node->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_0))
      ->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_sin))
      ->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_cos))
      ->emplaceOutput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, o_0))
      ->setName(rope_op->getAOp()->getName());

  // Register this op node into one graph.
  env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, qnn_op_node);

  return true;
}

}  // namespace mllm::qnn::aot
```
mllm/backends/qnn/aot/visitor/RoPE.hpp (new file, +25 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#pragma once

#include "mllm/core/OpTypes.hpp"
#include "mllm/compile/ir/Node.hpp"
#include "mllm/backends/qnn/aot/visitor/Base.hpp"

namespace mllm::qnn::aot {

// Lowers RoPE to the custom HTP op from LLaMAPackage.
// The custom op handles partial rotation natively (partial_dimension parameter).
class QnnAOTRoPEPattern : public QnnAOTBasePattern {
 public:
  bool isMatch(const mllm::ir::op_ptr_t& op) override;

  bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) override;

  static inline std::pair<OpTypes, std::shared_ptr<QnnAOTRoPEPattern>> create() {
    return {OpTypes::kRoPE, std::make_shared<QnnAOTRoPEPattern>()};
  }
};

}  // namespace mllm::qnn::aot
```
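The static `create()` helper returns an `{OpTypes, pattern}` pair, which suggests the lowering pass collects patterns into a type-keyed registry. A minimal sketch of that registration, assuming `OpTypes` is `mllm::OpTypes` and a pass-side map that this diff does not show:

```cpp
#include <memory>
#include <unordered_map>

// Hypothetical pass-side registry; only the create() pairs come from this PR.
using QnnPatternMap =
    std::unordered_map<mllm::OpTypes, std::shared_ptr<mllm::qnn::aot::QnnAOTBasePattern>>;

QnnPatternMap buildQnnAOTPatterns() {
  QnnPatternMap patterns;
  patterns.insert(mllm::qnn::aot::QnnAOTRoPEPattern::create());  // kRoPE lowering
  patterns.insert(mllm::qnn::aot::QnnAOTSiLUPattern::create());  // kSiLU decomposition
  return patterns;
}
```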
mllm/backends/qnn/aot/visitor/SiLU.cpp (new file, +67 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.
//
// SiLU(x) = x * sigmoid(x)
// Decomposed into standard QNN ops: Sigmoid + ElementWiseMultiply

#include "mllm/utils/Common.hpp"
#include "mllm/compile/ir/linalg/Op.hpp"
#include "mllm/compile/ir/builtin/Attribute.hpp"
#include "mllm/compile/ir/tensor/Value.hpp"
#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
#include "mllm/backends/qnn/aot/visitor/SiLU.hpp"
#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"

namespace mllm::qnn::aot {

bool QnnAOTSiLUPattern::isMatch(const mllm::ir::op_ptr_t& op) {
  return op->isa_<mllm::ir::linalg::SiLUOp>() && (op->getAttr("using_qnn") != nullptr);
}

bool QnnAOTSiLUPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) {
  auto env = AOTCompileContext::getInstance().getEnv();

  auto silu_op = op->cast_<mllm::ir::linalg::SiLUOp>();
  if (!silu_op) {
    MLLM_ERROR("Failed to cast to linalg::SiLUOp");
    return false;
  }

  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_graph_name"));
  auto qnn_graph_name = op->getAttr("qnn_graph_name")->cast_<ir::StrAttr>()->data();
  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_context_name"));
  auto qnn_context_name = op->getAttr("qnn_context_name")->cast_<ir::StrAttr>()->data();

  // Input and output tensors
  auto i_0 = op->inputs().front()->cast_<ir::tensor::TensorValue>();
  auto o_0 = op->outputs().front()->cast_<ir::tensor::TensorValue>();

  // Create intermediate tensor for sigmoid output (same shape/dtype as output)
  auto sigmoid_out_tensor = Tensor::empty(o_0->tensor_.shape(), o_0->tensor_.dtype());
  sigmoid_out_tensor.setName(silu_op->getAOp()->getName() + "_sigmoid_out");
  auto sigmoid_out = writer.getContext()->create<ir::tensor::TensorValue>(sigmoid_out_tensor);

  // Copy quantization recipe from output to intermediate if available
  if (op->getAttr("quant_recipe")) {
    sigmoid_out->setAttr("quant_recipe", op->getAttr("quant_recipe"));
  }
```
Review comment on lines +44 to +47 (Contributor):

Verification scripts compared `quant_recipe` handling across the QNN AOT visitors in mllm/backends/qnn/aot/visitor/: the other patterns require the attribute via `MLLM_RETURN_FALSE_IF_NOT`, while this SiLU pattern alone treats it as optional.

Ensure quantization metadata is handled consistently with the other QNN patterns: either make `quant_recipe` required here as well, or document why the Sigmoid intermediate may legitimately lack a recipe.
SiLU.cpp, continued:

```cpp
  // Step 1: Sigmoid(input) → sigmoid_out
  auto sigmoid_node = QnnAOTNodeOperation::create("Sigmoid");
  sigmoid_node->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_0))
      ->emplaceOutput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, sigmoid_out))
      ->setName(silu_op->getAOp()->getName() + "_sigmoid");
  env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, sigmoid_node);

  // Step 2: ElementWiseMultiply(input, sigmoid_out) → output
  auto mul_node = QnnAOTNodeOperation::create("ElementWiseMultiply");
  mul_node->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_0))
      ->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, sigmoid_out))
      ->emplaceOutput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, o_0))
      ->setName(silu_op->getAOp()->getName());
  env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, mul_node);

  return true;
}

}  // namespace mllm::qnn::aot
```
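For reference, the decomposition this pattern implements is

$$\mathrm{SiLU}(x) = x \cdot \sigma(x), \qquad \sigma(x) = \frac{1}{1 + e^{-x}},$$

so the pass emits one `Sigmoid` node computing $\sigma(x)$ and one `ElementWiseMultiply` node for the product, with the intermediate tensor carrying $\sigma(x)$.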
mllm/backends/qnn/aot/visitor/SiLU.hpp (new file, +25 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#pragma once

#include "mllm/core/OpTypes.hpp"
#include "mllm/compile/ir/Node.hpp"
#include "mllm/backends/qnn/aot/visitor/Base.hpp"

namespace mllm::qnn::aot {

// SiLU(x) = x * sigmoid(x)
// Decomposed into two standard QNN ops: Sigmoid + ElementWiseMultiply
class QnnAOTSiLUPattern : public QnnAOTBasePattern {
 public:
  bool isMatch(const mllm::ir::op_ptr_t& op) override;

  bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) override;

  static inline std::pair<OpTypes, std::shared_ptr<QnnAOTSiLUPattern>> create() {
    return {OpTypes::kSiLU, std::make_shared<QnnAOTSiLUPattern>()};
  }
};

}  // namespace mllm::qnn::aot
```
Review comment on main.cpp (Contributor):

Help flag check may fail due to required argument validation. The help flag is checked after `Argparse::parse(argc, argv)` (line 17), but arguments are marked as `required(true)`. If the user runs with just `-h`, the parser may fail before reaching the help check.

Proposed fix: consider checking for help before validating required arguments, or ensuring `Argparse::parse` doesn't error on missing required args when `-h` is present. A common pattern is shown below.
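A minimal sketch of that pattern, scanning raw `argv` for the help flag before `Argparse::parse` enforces required arguments (illustrative only; mllm's `Argparse` may already offer a built-in mechanism for this):

```cpp
#include <cstring>

// Returns true if -h or --help appears anywhere on the command line.
bool wantsHelp(int argc, char** argv) {
  for (int i = 1; i < argc; ++i) {
    if (std::strcmp(argv[i], "-h") == 0 || std::strcmp(argv[i], "--help") == 0) {
      return true;
    }
  }
  return false;
}

// Inside MLLM_MAIN, before Argparse::parse(argc, argv):
//   if (wantsHelp(argc, argv)) {
//     Argparse::printHelp();
//     mllm::shutdownContext();
//     return 0;
//   }
```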