77 changes: 77 additions & 0 deletions doc/en/Hunyuan.md
@@ -0,0 +1,77 @@
# HunYuan Support for KTransformers

## Introduction

### Overview
We are excited to announce that **KTransformers now supports HunYuan models with AMX optimization**.

- **HunYuan-Standard (AMX bf16)**: ~12 TPS **on a dual-socket CPU with a single consumer-grade GPU**, requiring ~441 GB of DRAM. MoE expert computation is accelerated with Intel AMX instructions.

### Model & Resource Links
- *[Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct)*

---

## Installation Guide

### 1. Resource Requirements

| Model | Precision | Experts | DRAM Needed | GPU Memory Needed\* | TPS (approx.) |
| ------------------------- | ---------- | ------- | ----------- | ------------------- | --------------------------------------- |
| HunYuan-Standard | bf16 | 64 | \~441 GB | 14 GB | \~12 TPS |

\* Exact GPU memory depends on sequence length, batch size, and kernels used.
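
Before pulling hundreds of gigabytes of weights, it is worth confirming that the host CPU actually exposes AMX and that enough DRAM is installed. A minimal, Linux-only sanity check (a sketch; it assumes the standard `/proc/cpuinfo` flag names and is not part of KTransformers itself):

```python
# Linux-only sanity check: AMX CPU flags and total DRAM.
import os

with open("/proc/cpuinfo") as f:
    cpuinfo = f.read()

has_amx = any(flag in cpuinfo for flag in ("amx_tile", "amx_bf16", "amx_int8"))
print("AMX supported:", has_amx)

# HunYuan-Standard in bf16 needs roughly 441 GB of DRAM.
total_gib = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") / 2**30
print(f"Total DRAM: {total_gib:.0f} GiB")
```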

### 2. Prepare Models

```bash
# Example: download original safetensors (adjust to your paths/repos)
# (Fill in actual repos/filenames yourself)

# HunYuan-Standard
huggingface-cli download --resume-download tencent/Hunyuan-A13B-Instruct \
  --local-dir ./Hunyuan-A13B-Instruct
```
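
If you prefer to script the download from Python, `huggingface_hub.snapshot_download` does the same thing as the CLI call above (a sketch; the repo ID and local directory mirror the example and may need adjusting for your setup):

```python
# Equivalent of the CLI download above, via the huggingface_hub Python API.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="tencent/Hunyuan-A13B-Instruct",
    local_dir="./Hunyuan-A13B-Instruct",
)
```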

### 3. Install KTransformers

Follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).

```bash
pip install ktransformers # or from source if you need bleeding-edge features
```
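
A quick import check confirms the package is visible from the Python environment you will launch the server with (a minimal sketch; the version attribute may not be exposed in every build):

```python
# Verify the installation from the environment used to run the server.
import ktransformers

print(getattr(ktransformers, "__version__", "installed (no __version__ attribute)"))
```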

### 4. Run HunYuan Inference Server

```bash
# --gguf_path points to the directory that holds the weight files (.gguf or .safetensors)
python ktransformers/server/main.py \
  --port 10002 \
  --model_path /abs/path/to/Hunyuan-A13B-Instruct \
  --model_name Hunyuan-A13B-Instruct \
  --gguf_path /abs/path/to/Hunyuan-weights \
--optimize_config_path ktransformers/optimize/optimize_rules/Hunyuan-serve-amx.yaml \
--max_new_tokens 1024 \
--cache_lens 32768 \
--chunk_size 256 \
--max_batch_size 4 \
--backend_type balance_serve
```
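
Loading ~441 GB of weights can take a while, so it helps to wait until the port actually accepts connections before sending requests. A small helper for that (a sketch; host, port, and timeout match the command above and are assumptions you can adjust):

```python
# Poll the server port until it accepts TCP connections.
import socket
import time

def wait_for_server(host="127.0.0.1", port=10002, timeout=1800):
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=5):
                return True
        except OSError:
            time.sleep(10)
    return False

print("server ready:", wait_for_server())
```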

### 5. Access Server

```bash
curl http://127.0.0.1:10002/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Hunyuan-A13B-Instruct",
"messages": [
{"role": "user", "content": "介绍一下西伯利亚森林猫"}
],
"temperature": 0.7,
"max_tokens": 200,
"stream": false
}'
```
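
The server exposes the OpenAI-compatible chat completions endpoint shown above, so any HTTP client works. A minimal Python equivalent of the curl request (a sketch using `requests`, which you may need to install separately):

```python
# Python equivalent of the curl example above.
import requests

resp = requests.post(
    "http://127.0.0.1:10002/v1/chat/completions",
    json={
        "model": "Hunyuan-A13B-Instruct",
        "messages": [{"role": "user", "content": "Tell me about the Siberian Forest Cat"}],
        "temperature": 0.7,
        "max_tokens": 200,
        "stream": False,
    },
    timeout=600,
)
print(resp.json()["choices"][0]["message"]["content"])
```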


---
5 changes: 5 additions & 0 deletions ktransformers/local_chat.py
@@ -28,6 +28,7 @@
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.models.modeling_hunyuan import HunYuanMoEV1ForCausalLM
from ktransformers.util.utils import prefill_and_generate, get_compute_capability, xpu_fp16_model
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
@@ -39,6 +40,7 @@
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
"LlamaForCausalLM": LlamaForCausalLM,
"MixtralForCausalLM": MixtralForCausalLM,
"HunYuanMoEV1ForCausalLM": HunYuanMoEV1ForCausalLM,
}

ktransformer_rules_dir = (
@@ -50,6 +52,7 @@
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
"LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
"MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
"HunYuanMoEV1ForCausalLM": ktransformer_rules_dir + "Hunyuan-serve.yaml",
}


@@ -96,6 +99,8 @@ def local_chat(
config._attn_implementation = "eager"
if "Mixtral" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
if "HunYuan" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
if torch.xpu.is_available():
config._attn_implementation = "eager"
model = custom_models[config.architectures[0]](config)