From a71d166cb4d8fa73c5d969e3057cc220222265ab Mon Sep 17 00:00:00 2001 From: lxc Date: Wed, 3 Dec 2025 17:18:22 +0800 Subject: [PATCH 1/2] 1 --- .../S1/MultiModal/545yyds/README.md | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 2025-Ascend-Innovation-Contest/S1/MultiModal/545yyds/README.md diff --git a/2025-Ascend-Innovation-Contest/S1/MultiModal/545yyds/README.md b/2025-Ascend-Innovation-Contest/S1/MultiModal/545yyds/README.md new file mode 100644 index 00000000..21329e92 --- /dev/null +++ b/2025-Ascend-Innovation-Contest/S1/MultiModal/545yyds/README.md @@ -0,0 +1,88 @@ +# MindNLP 模型优化详细说明 (janus_model & Qwen) + +本文档详细记录了针对 janus_model 和 Qwen 模型的关键性能优化点,并附带了相应的核心代码实现。 + +## 1. janus_model 模型优化 + +### 1.1 Attention 计算算子优化 优化痛点: + +优化痛点:原有的 Attention 实现中使用 `ops.transpose` 结合 `view` 对 Query、Key、Value 状态进行维度变换。在 MindSpore 的 Ascend 后端执行时,`ops.transpose` 可能会产生额外的算子开销或内存重排效率不如专用算子库。 + +改进方案:将通用的 `ops.transpose` 替换为 MindSpore Mint 模块下的 `mindspore.mint.swapaxes`。Mint 系列算子通常针对 PyTorch 语义对齐及底层硬件(如 NPU)进行了更深度的适配和融合优化,能够提升维度交换操作的效率。 + +**源码实现** (`mindnlp/transformers/models/llama/modeling_llama.py`): + +**Python** + +```python +# 修改前: +query_states = ops.transpose(query_states.view(bsz, q_len, self.num_heads, self.head_dim), 1, 2) +key_states = ops.transpose(key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2) +value_states = ops.transpose(value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2) + +# 修改后: +query_states = mindspore.mint.swapaxes(query_states.view(bsz, q_len, self.num_heads, self.head_dim), 1, 2) +key_states = mindspore.mint.swapaxes(key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2) +value_states = mindspore.mint.swapaxes(value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2) +``` + + + +## 2. Qwen2-VL 模型优化 + +### 2.1 RoPE 与 Attention 逻辑优化 (针对 Qwen2-VL) + +优化痛点: + +1. 
**RoPE 切片操作**: 在旋转位置编码(RoPE)的 `rotate_half` 函数中,使用切片索引 `x[..., :half]` 和 `x[..., half:]` 可能会导致非连续内存访问,增加数据搬运开销。
+2. **Attention 掩码拷贝**: 在准备 causal mask 时,`causal_mask.copy()` 执行了深拷贝,导致不必要的内存占用和拷贝耗时。
+3. **算子选择**: 同样存在使用 Tensor 自身的 `.swapaxes` 方法而非高性能 `mint` 算子的问题。
+
+改进方案:
+
+1. **RoPE**: 使用 `ops.split` 替代手动切片,在图编译层面更易优化,减少 strided slice 带来的开销。
+2. **Mask**: 去除 `.copy()` 操作,直接在原 Tensor 上操作或广播,减少内存拷贝。
+3. **Attention**: 全面将 `.swapaxes` 替换为 `mindspore.mint.swapaxes`。
+
+**源码实现** (`mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py`):
+
+**Python**
+
+```python
+# 1. RoPE 优化 (rotate_half)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    # 修改前: x1 = x[..., : x.shape[-1] // 2]; x2 = x[..., x.shape[-1] // 2 :]
+    # 修改后: 使用 ops.split
+    x1, x2 = ops.split(x, x.shape[-1] // 2, dim=-1)
+    return ops.cat((-x2, x1), dim=-1)
+
+# 2. Attention 算子与 Mask 优化 (Qwen2VLAttention)
+# 修改前:
+causal_mask = causal_mask.copy()  # copy to contiguous memory for in-place edit
+# 修改后: 注释掉 copy,去除深拷贝,直接使用原 Tensor
+# causal_mask = causal_mask.copy()
+
+# 3. Attention 维度交换优化
+# 修改前:
+query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
+attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
+
+# 修改后:
+query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
+query_states = mindspore.mint.swapaxes(query_states, 1, 2)
+# ... 
(key/value 同理) +attn_weights = ops.matmul(query_states, mindspore.mint.swapaxes(key_states, 2, 3)) / math.sqrt(self.head_dim) +``` + + + + +## 评测结果 + +| 评测指标 | 平均得分 | +|---------|---------| +| 峰值显存得分 | 100 | +| Prefill时延得分 | 106.8524 | +| Decode时延得分 | 111.755 | +| **总分** | **106.2025** | \ No newline at end of file From a16e4bcd63ec2d81eba0a3c0029d6cf7e7d51a79 Mon Sep 17 00:00:00 2001 From: lxc666-buaa <783273683@qq.com> Date: Wed, 3 Dec 2025 17:24:28 +0800 Subject: [PATCH 2/2] Add files via upload --- .../S1/MultiModal/545yyds/patches.zip | Bin 0 -> 2134 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 2025-Ascend-Innovation-Contest/S1/MultiModal/545yyds/patches.zip diff --git a/2025-Ascend-Innovation-Contest/S1/MultiModal/545yyds/patches.zip b/2025-Ascend-Innovation-Contest/S1/MultiModal/545yyds/patches.zip new file mode 100644 index 0000000000000000000000000000000000000000..f3ca8066ce9abb4f4c0bbe3bf055a048e014152c GIT binary patch literal 2134 zcmZ|QdpOg5AHeb747YRKVS1>X4|5tNB8S^))X1^0BBz{bw&jqvMnl%EP!3^M%&E*_ zJu9RLVWH9%ZbRrPDTjq*F}Lcup69;0uD;*v_x*hT_+78-``_E`2v|@I__l)XMIgUl z{(Fc3O2C;wOmKK8+R)oe6absuk&Zxaf7nEE03@?DUbN=*~| z4FKf+1pxB@5MVHvkv=9q$>2NrKku9Qlh@`OgC4st!ygQ2ocU_&t?uHH9~u>TGGHZ@kuB9;h6zV^k?}0|;GD-dsF+;oABwh;Ad&QM9Z3AwcS;w=y$Q>w(}nS%1|g z?fCl6K6V$@FvZ&bz>R=x$41kHws0$Q*V3i1rCsPH&DSlSo1jUo$nh2lvvr*{?w>1S^;?_*20rz+4Yrt4T*8xVH2x@(JQ-UnaVPpcC4AyyBY2Xilck^?TZir@&|wa6WSU4kz|Qa zy*p^{$1NJ*!^dBlNZclf1TrsPxRUjsFdZY$?0xk9^{#T`?haP?ULDSir?NUR=Ypbi z;giAhS0Nk4&4^nf^yJv*a(?u?UF4DEM}>iL#65@g{1}M_N^R^C@%>|YgXSG@v;pqk zYZa%QAf#e1)4QQ!Onok4)iN{px|mE>@gKo8^eS5E>A6g~Zm$^b{&SWI7=4CTKpZ-; zy%?I<>raq6pO>8&a&U~2xbuxb8a2*l;yEX2uUWMVj3%CBc?#SEczy$+-o-!trP2JaCaC%I=XY+wKj z+E%E7g+YCLfAI;w<1nwz3J@KyLz}eXGc|O{{FYTbgoR68J9HG5>Z38qz*aRacvX*r zU!>j+7tPoEQ^cu&_s;pgfjfuZArVV;dq7m(R;E7AC*2iGuVtx~hlfVJRh--^{&S8h$=F4^H?P%&5L+KtB!h_gm) z_`{=dDh2W*84jyP?*(WMaj0RRp&P7r)D-XH%g#vUQ~|eIuu+(hFvMND$)jQa?4Gct z!d?v*;t>gkFFP8+HqP3~_gW`=KewHIWaAqb2Hbnj<~|6+a^~w8CKGoj7rZ_*D7=Kj 
zWkvf8%}BPIAvUPM!6JzQf)ftDo5SE+C}70W7T$WfTt)c~3q9*VVRpIpE^DEIYn?39 zt0j&e&Du#ir--`#hrD6Mqe)C#!OOpocI71FlF_R%^OR?G@hNiQb^4~h)w+56N-Nja z&-*nOTj_VxOG))#%;6Z0?V7g+Sgt%6U%^u|N#wh_aBbtiE_|NV_bVTkop0lyDLTm{ z)lCYOEIv`Gy@<46iL^aQsGpz_ub*Sp3Nc#hzGB^~>S&9R zCET#_CkVpoGOy*!?Zqvhch%fPL}_;v&6;Ww5)wIncEJjwFXsOiZgbO}h3a(o)?$E0cXg?Ol0r}D1#>j7J?4qt zDc+s%60td|7)ys{8`NW!TpZ8&z1)e5@I0C#ytUw!&g{Yuwp?zlI{%>dBWQZ<_;vQ? zxzTCH!$>vzamNM7tBn4%@fMGJyoaX8q^R8FD{$+jY0GD~DxeEn>AU0+7Wc_p)OJIm zBgU7lYO){B&yS5EvvU0eTUQ)hBy*i+@^Y~Zdr#A)$1l>RDX&i#w;XET)tp8%=vy=@ z|Eu9lt+~BVS-G`EFo%4h&^@f=mCu>ip)k}kfhzD@ z!onRa_>M+`s*U+gXje^1b{>)589Q=7;o;0trC}If-0cWRU=R3L zT~^qx?YAWj%>MCh{=d>fKPvrGQGaN^YV;2cwCzv`w*Sz6F7~f%A>nUt5ZK