Added Qwen3.5 0.8B #657

base: main
Changes from 1 commit
CMakeLists.txt (new file, +3 lines)

```cmake
add_executable(mllm-qwen3-5-runner main.cpp)
target_link_libraries(mllm-qwen3-5-runner PRIVATE MllmRT MllmCPUBackend)
target_include_directories(mllm-qwen3-5-runner PRIVATE ${MLLM_INCLUDE_DIR})
```
main.cpp (new file, +76 lines)

```cpp
#include <iostream>
#include <fmt/core.h>
#include <mllm/mllm.hpp>
#include <mllm/models/qwen3_5/modeling_qwen3_5.hpp>
#include <mllm/models/qwen3_5/tokenization_qwen3_5.hpp>
#include <mllm/utils/AnyValue.hpp>

using mllm::Argparse;

MLLM_MAIN({
  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
  auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model path").required(true);
  auto& model_version = Argparse::add<std::string>("-mv|--model_version").help("Model version").required(true);
  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer_path").help("Tokenizer directory").required(true);
  auto& config_path = Argparse::add<std::string>("-c|--config_path").help("Config path").required(true);

  Argparse::parse(argc, argv);

#ifdef MLLM_PERFETTO_ENABLE
  mllm::perf::start();
#endif

  mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1;
  if (model_version.get() == "v1") {
    file_version = mllm::ModelFileVersion::kV1;
  } else if (model_version.get() == "v2") {
    file_version = mllm::ModelFileVersion::kV2;
  }

  if (help.isSet()) {
    Argparse::printHelp();
    mllm::shutdownContext();
    return 0;
  }

  {
    auto cfg = mllm::models::qwen3_5::Qwen3_5Config(config_path.get());
    auto tokenizer = mllm::models::qwen3_5::Qwen3_5Tokenizer(tokenizer_path.get());
    auto model = mllm::models::qwen3_5::Qwen3_5ForCausalLM(cfg);

    fmt::print("Qwen3.5 0.8B: {} layers ({} full attention + {} GDN)\n",
               cfg.num_hidden_layers, cfg.numFullAttentionLayers(), cfg.numGDNLayers());

    auto param = mllm::load(model_path.get(), file_version);
    model.load(param);

    fmt::print("\n{:*^60}\n", " Qwen3.5 Interactive CLI ");
    fmt::print("Enter 'exit' or 'quit' to end the session\n\n");

    std::string prompt_text;

    fmt::print("Prompt text (or 'exit/quit'): ");
    std::getline(std::cin, prompt_text);

    try {
      fmt::print("Processing...\n");
      auto inputs = tokenizer.convertMessage({.prompt = prompt_text});

      fmt::print("\nResponse: ");

      for (auto& step : model.chat(inputs)) { std::wcout << tokenizer.detokenize(step.cur_token_id) << std::flush; }

      fmt::print("\n{}\n", std::string(60, '-'));
    } catch (const std::exception& e) { fmt::print("\nError: {}\n{}\n", e.what(), std::string(60, '-')); }

    model.perfSummary();
  }

#ifdef MLLM_PERFETTO_ENABLE
  mllm::perf::stop();
  mllm::perf::saveReport("qwen3_5.perf");
#endif

  mllm::print("\n");
  mllm::memoryReport();
})
```
mllm/backends/qnn/aot/visitor/RoPE.cpp (new file, +73 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.
//
// Lowers RoPE (Rotary Position Embedding) to the custom HTP op from LLaMAPackage.
// The custom op signature: RoPE(input, sin, cos, h_cnt; pose_type) → output
// It supports partial rotation natively via the HVX kernel.

#include "mllm/utils/Common.hpp"
#include "mllm/core/aops/RoPEOp.hpp"
#include "mllm/compile/ir/linalg/Op.hpp"
#include "mllm/compile/ir/builtin/Attribute.hpp"
#include "mllm/compile/ir/tensor/Value.hpp"
#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
#include "mllm/backends/qnn/aot/visitor/RoPE.hpp"
#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"

namespace mllm::qnn::aot {

bool QnnAOTRoPEPattern::isMatch(const mllm::ir::op_ptr_t& op) {
  return op->isa_<mllm::ir::linalg::RoPEOp>() && (op->getAttr("using_qnn") != nullptr);
}

bool QnnAOTRoPEPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) {
  auto env = AOTCompileContext::getInstance().getEnv();

  auto rope_op = op->cast_<mllm::ir::linalg::RoPEOp>();
  if (!rope_op) {
    MLLM_ERROR("Failed to cast to linalg::RoPEOp");
    return false;
  }

  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_graph_name"));
  auto qnn_graph_name = op->getAttr("qnn_graph_name")->cast_<ir::StrAttr>()->data();
  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_context_name"));
  auto qnn_context_name = op->getAttr("qnn_context_name")->cast_<ir::StrAttr>()->data();

  auto a = rope_op->getAOp();
  auto rope_aop = dynamic_cast<mllm::aops::RoPEOp*>(a);
  if (!rope_aop) {
    MLLM_ERROR("Failed to cast to aops::RoPEOp");
    return false;
  }

  // RoPE inputs: x, sin, cos
  auto inputs_it = op->inputs().begin();
  auto i_0 = (*inputs_it)->cast_<ir::tensor::TensorValue>();                 // input tensor
  auto i_sin = (*std::next(inputs_it))->cast_<ir::tensor::TensorValue>();    // sin embeddings
  auto i_cos = (*std::next(inputs_it, 2))->cast_<ir::tensor::TensorValue>(); // cos embeddings

  // RoPE output
  auto o_0 = op->outputs().front()->cast_<ir::tensor::TensorValue>();
```
Review comment on lines +45 to +52 (Contributor):

Validate RoPE input/output arity before dereferencing iterators. Line 47, Line 48, and Line 51 assume 3 inputs and 1 output exist. Missing checks here can dereference invalid iterators on malformed IR.

Proposed fix:

```diff
 // RoPE inputs: x, sin, cos
+if (op->inputs().size() < 3 || op->outputs().empty()) {
+  MLLM_ERROR("RoPE lowering expects at least 3 inputs and 1 output");
+  return false;
+}
 auto inputs_it = op->inputs().begin();
 auto i_0 = (*inputs_it)->cast_<ir::tensor::TensorValue>();                 // input tensor
 auto i_sin = (*std::next(inputs_it))->cast_<ir::tensor::TensorValue>();    // sin embeddings
 auto i_cos = (*std::next(inputs_it, 2))->cast_<ir::tensor::TensorValue>(); // cos embeddings
```

As per coding guidelines, "Validate inputs for public APIs and critical internal functions."
RoPE.cpp, continued:

```cpp
  // Create the custom HTP RoPE op from LLaMAPackage
  auto qnn_op_node = QnnAOTNodeOperation::create("RoPE");
  qnn_op_node->setPackageName("LLaMAPackage");

  // pose_type parameter: 0 for standard RoPE
  // The custom HTP op uses this to select between different RoPE variants
  qnn_op_node->emplaceParamScalar(mllm::qnn::QNNParamScalarWrapper::create("pose_type", static_cast<uint32_t>(0)));
```
Review comment on lines +57 to +60 (Contributor):

Verification scripts were run against the repository to (1) check how the custom kernel in mllm/backends/qnn/custom-op-package/LLaMAPackage/src/ops/RoPE.cpp handles `pose_type`, (2) confirm that this lowering hardcodes `pose_type`, and (3) locate where the RoPE variant is represented in the IR/AOp layer (mllm/core/aops/RoPEOp.hpp) so it could be threaded through the lowering.

The hardcoded `pose_type` of 0 selects only the standard RoPE variant; consider deriving the value from the RoPEOp's configuration so the lowering can express the other variants the custom HTP op supports.
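For illustration, that suggestion might look like the sketch below. The `options()` accessor, the `RoPETypes` enum, and the variant-to-`pose_type` mapping are all hypothetical placeholders; the real variant representation lives in mllm/core/aops/RoPEOp.hpp and is not shown in this diff.

```cpp
// Hypothetical sketch: derive pose_type from the RoPE variant instead of
// hardcoding 0. The accessor and enumerator names below are placeholders.
uint32_t pose_type = 0;  // 0 = standard RoPE, per the comment above
switch (rope_aop->options().type) {       // hypothetical accessor
  case mllm::aops::RoPETypes::kStandard:  // hypothetical enumerator
    pose_type = 0;
    break;
  default:
    MLLM_ERROR("Unsupported RoPE variant for the custom HTP kernel");
    return false;
}
qnn_op_node->emplaceParamScalar(
    mllm::qnn::QNNParamScalarWrapper::create("pose_type", pose_type));
```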
RoPE.cpp, continued:

```cpp
  qnn_op_node->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_0))
      ->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_sin))
      ->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_cos))
      ->emplaceOutput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, o_0))
      ->setName(rope_op->getAOp()->getName());

  // Register this op node into one graph.
  env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, qnn_op_node);

  return true;
}

}  // namespace mllm::qnn::aot
```
mllm/backends/qnn/aot/visitor/RoPE.hpp (new file, +25 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#pragma once

#include "mllm/core/OpTypes.hpp"
#include "mllm/compile/ir/Node.hpp"
#include "mllm/backends/qnn/aot/visitor/Base.hpp"

namespace mllm::qnn::aot {

// Lowers RoPE to the custom HTP op from LLaMAPackage.
// The custom op handles partial rotation natively (partial_dimension parameter).
class QnnAOTRoPEPattern : public QnnAOTBasePattern {
 public:
  bool isMatch(const mllm::ir::op_ptr_t& op) override;

  bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) override;

  static inline std::pair<OpTypes, std::shared_ptr<QnnAOTRoPEPattern>> create() {
    return {OpTypes::kRoPE, std::make_shared<QnnAOTRoPEPattern>()};
  }
};

}  // namespace mllm::qnn::aot
```
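The static `create()` helper returns an `{OpTypes, pattern}` pair, which suggests the lowering pass collects patterns into a type-keyed registry. A minimal sketch of that registration, assuming `OpTypes` is `mllm::OpTypes` and a pass-side map that this diff does not show:

```cpp
#include <memory>
#include <unordered_map>

// Hypothetical pass-side registry; only the create() pairs come from this PR.
using QnnPatternMap =
    std::unordered_map<mllm::OpTypes, std::shared_ptr<mllm::qnn::aot::QnnAOTBasePattern>>;

QnnPatternMap buildQnnAOTPatterns() {
  QnnPatternMap patterns;
  patterns.insert(mllm::qnn::aot::QnnAOTRoPEPattern::create());  // kRoPE lowering
  patterns.insert(mllm::qnn::aot::QnnAOTSiLUPattern::create());  // kSiLU decomposition
  return patterns;
}
```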
mllm/backends/qnn/aot/visitor/SiLU.cpp (new file, +67 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.
//
// SiLU(x) = x * sigmoid(x)
// Decomposed into standard QNN ops: Sigmoid + ElementWiseMultiply

#include "mllm/utils/Common.hpp"
#include "mllm/compile/ir/linalg/Op.hpp"
#include "mllm/compile/ir/builtin/Attribute.hpp"
#include "mllm/compile/ir/tensor/Value.hpp"
#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
#include "mllm/backends/qnn/aot/visitor/SiLU.hpp"
#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"

namespace mllm::qnn::aot {

bool QnnAOTSiLUPattern::isMatch(const mllm::ir::op_ptr_t& op) {
  return op->isa_<mllm::ir::linalg::SiLUOp>() && (op->getAttr("using_qnn") != nullptr);
}

bool QnnAOTSiLUPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) {
  auto env = AOTCompileContext::getInstance().getEnv();

  auto silu_op = op->cast_<mllm::ir::linalg::SiLUOp>();
  if (!silu_op) {
    MLLM_ERROR("Failed to cast to linalg::SiLUOp");
    return false;
  }

  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_graph_name"));
  auto qnn_graph_name = op->getAttr("qnn_graph_name")->cast_<ir::StrAttr>()->data();
  MLLM_RETURN_FALSE_IF_NOT(op->getAttr("qnn_context_name"));
  auto qnn_context_name = op->getAttr("qnn_context_name")->cast_<ir::StrAttr>()->data();

  // Input and output tensors
  auto i_0 = op->inputs().front()->cast_<ir::tensor::TensorValue>();
  auto o_0 = op->outputs().front()->cast_<ir::tensor::TensorValue>();

  // Create intermediate tensor for sigmoid output (same shape/dtype as output)
  auto sigmoid_out_tensor = Tensor::empty(o_0->tensor_.shape(), o_0->tensor_.dtype());
  sigmoid_out_tensor.setName(silu_op->getAOp()->getName() + "_sigmoid_out");
  auto sigmoid_out = writer.getContext()->create<ir::tensor::TensorValue>(sigmoid_out_tensor);

  // Copy quantization recipe from output to intermediate if available
  if (op->getAttr("quant_recipe")) {
    sigmoid_out->setAttr("quant_recipe", op->getAttr("quant_recipe"));
  }
```
Review comment on lines +44 to +47 (Contributor):

Verification scripts compared `quant_recipe` handling across the QNN AOT visitors in mllm/backends/qnn/aot/visitor/: the other patterns require the attribute via `MLLM_RETURN_FALSE_IF_NOT`, while this SiLU pattern alone treats it as optional.

Ensure quantization metadata is handled consistently with the other QNN patterns: either make `quant_recipe` required here as well, or document why the Sigmoid intermediate may legitimately lack a recipe.
SiLU.cpp, continued:

```cpp
  // Step 1: Sigmoid(input) → sigmoid_out
  auto sigmoid_node = QnnAOTNodeOperation::create("Sigmoid");
  sigmoid_node->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_0))
      ->emplaceOutput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, sigmoid_out))
      ->setName(silu_op->getAOp()->getName() + "_sigmoid");
  env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, sigmoid_node);

  // Step 2: ElementWiseMultiply(input, sigmoid_out) → output
  auto mul_node = QnnAOTNodeOperation::create("ElementWiseMultiply");
  mul_node->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, i_0))
      ->emplaceInput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, sigmoid_out))
      ->emplaceOutput(env->captureQnnAOTNodeTensor(qnn_context_name, qnn_graph_name, o_0))
      ->setName(silu_op->getAOp()->getName());
  env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, mul_node);

  return true;
}

}  // namespace mllm::qnn::aot
```
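For reference, the decomposition this pattern implements is

$$\mathrm{SiLU}(x) = x \cdot \sigma(x), \qquad \sigma(x) = \frac{1}{1 + e^{-x}},$$

so the pass emits one `Sigmoid` node computing $\sigma(x)$ and one `ElementWiseMultiply` node for the product, with the intermediate tensor carrying $\sigma(x)$.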
mllm/backends/qnn/aot/visitor/SiLU.hpp (new file, +25 lines)

```cpp
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#pragma once

#include "mllm/core/OpTypes.hpp"
#include "mllm/compile/ir/Node.hpp"
#include "mllm/backends/qnn/aot/visitor/Base.hpp"

namespace mllm::qnn::aot {

// SiLU(x) = x * sigmoid(x)
// Decomposed into two standard QNN ops: Sigmoid + ElementWiseMultiply
class QnnAOTSiLUPattern : public QnnAOTBasePattern {
 public:
  bool isMatch(const mllm::ir::op_ptr_t& op) override;

  bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) override;

  static inline std::pair<OpTypes, std::shared_ptr<QnnAOTSiLUPattern>> create() {
    return {OpTypes::kSiLU, std::make_shared<QnnAOTSiLUPattern>()};
  }
};

}  // namespace mllm::qnn::aot
```
Review comment on main.cpp (Contributor):

Help flag check may fail due to required argument validation. The help flag is checked after `Argparse::parse(argc, argv)` (line 17), but arguments are marked as `required(true)`. If the user runs with just `-h`, the parser may fail before reaching the help check.

Proposed fix: consider checking for help before validating required arguments, or ensuring `Argparse::parse` doesn't error on missing required args when `-h` is present. A common pattern is shown below.
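A minimal sketch of that pattern, scanning raw `argv` for the help flag before `Argparse::parse` enforces required arguments (illustrative only; mllm's `Argparse` may already offer a built-in mechanism for this):

```cpp
#include <cstring>

// Returns true if -h or --help appears anywhere on the command line.
bool wantsHelp(int argc, char** argv) {
  for (int i = 1; i < argc; ++i) {
    if (std::strcmp(argv[i], "-h") == 0 || std::strcmp(argv[i], "--help") == 0) {
      return true;
    }
  }
  return false;
}

// Inside MLLM_MAIN, before Argparse::parse(argc, argv):
//   if (wantsHelp(argc, argv)) {
//     Argparse::printHelp();
//     mllm::shutdownContext();
//     return 0;
//   }
```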