diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md" new file mode 100644 index 00000000..3215394e --- /dev/null +++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md" @@ -0,0 +1,44 @@ + +# 2025年昇腾AI创新大赛-昇思模型开发挑战赛(S1赛季)--MultiModal赛题--好树花生队提交说明 + +## 运行时长如下: + + + +## 优化点分为以下方面: + +1.qwenvl模型中有大量api替换:用mindspore.ops.rms_norm替换原来的RMSNorm实现;另外把大量直接从tensor调用的方法(比如tensor.broadcast_to、reshape、swapaxes、unsqueeze等)替换成ops/mint中对应的函数(如ops.broadcast_to、mint.reshape、ops.transpose、ops.unsqueeze) + +2.qwenvl模型中的conv3d操作,原本调用的方法速度慢,改成mindspore.mint.nn.functional.conv3d后速度能够明显提升 + + + +3.两个模型中的softmax操作,原本显式指定了fp32数据类型,改成直接沿用输入的数据类型(bf16),速度能够有轻微提升 + + + +4.janus模型的预处理阶段,在processing_vlm.py文件中,通过tokenizer获取tag对应id的4个属性(image_id、image_start_id、image_end_id、pad_id)很耗时,每次访问都会重复查询,而每次查询的结果都是一样的,所以改成在首次访问时查询并缓存到实例属性中,后面直接使用缓存的值,这样能够大大降低预处理的时间 + + + + +5.janus模型中的modeling_vlm.py文件里有大量打印,发现删除那些print后,速度偶尔有轻微提升 + +6.janus的siglip_vit.py文件中,对forward_features方法中的blocks运算加jit,速度能明显提升,forward_head方法也加了jit,但速度提升不明显,有时感觉有轻微提升,有时却一点都没有提升 + + + +7.qwenvl模型中vision模块的attention_mask在每个layer中都重复计算了,将其提取到layer的for循环之外,保证只执行一次 + + + + +以上就是主要的几个修改点 + +# 最终优化结果: +| 评测指标 | 平均得分 | +|---------|---------| +| 峰值显存得分 | 116.6667 | +| Prefill时延得分 | 382.3937 | +| Decode时延得分 | 154.2476 | +| **总分** | **217.7693** | \ No newline at end of file diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png" new file mode 100644 index 00000000..f62690a8 Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png" differ diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png" 
"b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png" new file mode 100644 index 00000000..a66767ea Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png" differ diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png" new file mode 100644 index 00000000..fe355fbb Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png" differ diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png" new file mode 100644 index 00000000..0647a182 Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png" differ diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png" new file mode 100644 index 00000000..a46e889f Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png" differ diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png" new file mode 100644 index 00000000..423e7a3d Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png" differ diff --git 
"a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png" new file mode 100644 index 00000000..98f4ea1a Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png" differ diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png" new file mode 100644 index 00000000..0551ce7f Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png" differ diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch" new file mode 100644 index 00000000..6513ba0e --- /dev/null +++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch" @@ -0,0 +1,141 @@ +From 4fffb31ea6073e689089caa02e0a17cf4b6f73af Mon Sep 17 00:00:00 2001 +From: Alice <1215876379@qq.com> +Date: Tue, 4 Nov 2025 22:41:54 +0800 +Subject: [PATCH 1/3] =?UTF-8?q?=E5=A4=9A=E6=A8=A1=E6=80=81=E8=B0=83?= + =?UTF-8?q?=E4=BC=98=E6=8F=90=E4=BA=A4?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +--- + .../janus_pro/janus/models/processing_vlm.py | 28 ++++++++++++++++--- + .../models/qwen2_vl/modeling_qwen2_vl.py | 14 ++++++---- + 2 files changed, 33 insertions(+), 9 deletions(-) + +diff --git a/llm/inference/janus_pro/janus/models/processing_vlm.py b/llm/inference/janus_pro/janus/models/processing_vlm.py +index 7f881e39..c93bfd92 100644 
+--- a/llm/inference/janus_pro/janus/models/processing_vlm.py ++++ b/llm/inference/janus_pro/janus/models/processing_vlm.py +@@ -110,9 +110,13 @@ class VLChatProcessor(ProcessorMixin): + print(f"Add image tag = {image_tag} to the tokenizer") + + self.image_tag = image_tag ++ self.image_tag_id = None + self.image_start_tag = image_start_tag ++ self.image_start_tag_id = None + self.image_end_tag = image_end_tag ++ self.image_end_tag_id = None + self.pad_tag = pad_tag ++ self.pad_tag_id = None + + self.num_image_tokens = num_image_tokens + self.add_special_token = add_special_token +@@ -185,17 +189,29 @@ class VLChatProcessor(ProcessorMixin): + + @property + def image_id(self): +- image_id = self.tokenizer.vocab.get(self.image_tag) ++ if self.image_tag_id is None: ++ image_id = self.tokenizer.vocab.get(self.image_tag) ++ self.image_tag_id = image_id ++ else: ++ image_id = self.image_tag_id + return image_id + + @property + def image_start_id(self): +- image_start_id = self.tokenizer.vocab.get(self.image_start_tag) ++ if self.image_start_tag_id is None: ++ image_start_id = self.tokenizer.vocab.get(self.image_start_tag) ++ self.image_start_tag_id = image_start_id ++ else: ++ image_start_id = self.image_start_tag_id + return image_start_id + + @property + def image_end_id(self): +- image_end_id = self.tokenizer.vocab.get(self.image_end_tag) ++ if self.image_end_tag_id is None: ++ image_end_id = self.tokenizer.vocab.get(self.image_end_tag) ++ self.image_end_tag_id = image_end_id ++ else: ++ image_end_id = self.image_end_tag_id + return image_end_id + + @property +@@ -208,7 +224,11 @@ class VLChatProcessor(ProcessorMixin): + + @property + def pad_id(self): +- pad_id = self.tokenizer.vocab.get(self.pad_tag) ++ if self.pad_tag_id is None: ++ pad_id = self.tokenizer.vocab.get(self.pad_tag) ++ self.pad_tag_id = pad_id ++ else: ++ pad_id = self.pad_tag_id + # pad_id = self.tokenizer.pad_token_id + # if pad_id is None: + # pad_id = self.tokenizer.eos_token_id +diff --git 
a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py +index d059dcbe..ffb100cf 100644 +--- a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py ++++ b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py +@@ -265,6 +265,7 @@ class PatchEmbed(nn.Module): + self.embed_dim = embed_dim + + kernel_size = (temporal_patch_size, patch_size, patch_size) ++ self.kernel_size = kernel_size + self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False) + + def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: +@@ -272,7 +273,10 @@ class PatchEmbed(nn.Module): + hidden_states = hidden_states.view( + -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size + ) +- hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim) ++ ++ hidden_states = mindspore.mint.nn.functional.conv3d(hidden_states.to(dtype=target_dtype), self.proj.weight, ++ stride=self.kernel_size).view(-1, self.embed_dim) ++ + return hidden_states + + +@@ -330,7 +334,7 @@ class VisionAttention(nn.Module): + v = v.swapaxes(0, 1) + attn_weights = ops.matmul(q, k.swapaxes(1, 2)) / math.sqrt(self.head_dim) + attn_weights = attn_weights + attention_mask +- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(q.dtype) ++ attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_output = ops.matmul(attn_weights, v) + attn_output = attn_output.swapaxes(0, 1) + attn_output = attn_output.reshape(seq_length, -1) +@@ -559,7 +563,7 @@ class Qwen2VLAttention(nn.Module): + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 +- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype) ++ attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, 
training=self.training) + attn_output = ops.matmul(attn_weights, value_states) + +@@ -637,7 +641,7 @@ class Qwen2VLDecoderLayer(nn.Module): + + residual = hidden_states + +- hidden_states = self.input_layernorm(hidden_states) ++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.input_layernorm.weight, self.input_layernorm.variance_epsilon) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( +@@ -654,7 +658,7 @@ class Qwen2VLDecoderLayer(nn.Module): + + # Fully Connected + residual = hidden_states +- hidden_states = self.post_attention_layernorm(hidden_states) ++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.post_attention_layernorm.weight, self.post_attention_layernorm.variance_epsilon) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + +-- +2.47.1.windows.2 + diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch" new file mode 100644 index 00000000..8fc3923e --- /dev/null +++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch" @@ -0,0 +1,183 @@ +From a9b4c9b85f237fcae52ea396cf7be606cdf16410 Mon Sep 17 00:00:00 2001 +From: Alice <1215876379@qq.com> +Date: Thu, 6 Nov 2025 16:39:55 +0800 +Subject: [PATCH 2/3] =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E4=BC=98=E5=8C=96?= + =?UTF-8?q?=EF=BC=8C=E5=8A=A0=E5=85=A5rmsnorm,jit?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +--- + .../janus_pro/janus/models/clip_encoder.py | 6 +++-- + .../janus_pro/janus/models/modeling_vlm.py | 27 ++++++++----------- + .../janus_pro/janus/models/siglip_vit.py | 11 +++++--- + .../janus_pro/janus/models/timm_layers.py | 2 +- + .../models/llama/modeling_llama.py | 6 
++--- + 5 files changed, 27 insertions(+), 25 deletions(-) + +diff --git a/llm/inference/janus_pro/janus/models/clip_encoder.py b/llm/inference/janus_pro/janus/models/clip_encoder.py +index a0620cfe..acb6ea3f 100644 +--- a/llm/inference/janus_pro/janus/models/clip_encoder.py ++++ b/llm/inference/janus_pro/janus/models/clip_encoder.py +@@ -56,6 +56,7 @@ class CLIPVisionTower(nn.Module): + self.vision_tower, self.forward_kwargs = self.build_vision_tower( + vision_tower_params + ) ++ self.vision_tower.jit() + + if pixel_mean is not None and pixel_std is not None: + image_norm = Normalize( +@@ -112,10 +113,11 @@ class CLIPVisionTower(nn.Module): + Returns: + image_features (torch.Tensor): [b, n_patch, d] + """ +- ++ + if self.image_norm is not None: + images = self.image_norm(images) +- ++ + image_forward_outs = self.vision_tower(images, **self.forward_kwargs) ++ + image_features = self.feature_select(image_forward_outs) + return image_features +diff --git a/llm/inference/janus_pro/janus/models/modeling_vlm.py b/llm/inference/janus_pro/janus/models/modeling_vlm.py +index 7178c398..3d2d2d74 100644 +--- a/llm/inference/janus_pro/janus/models/modeling_vlm.py ++++ b/llm/inference/janus_pro/janus/models/modeling_vlm.py +@@ -241,12 +241,16 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel): + Returns: + input_embeds (torch.Tensor): [b, T, D] + """ +- ++ ++ + bs, n = pixel_values.shape[0:2] + # "b n c h w -> (b n) c h w" + images = ops.reshape( + pixel_values, (bs * n, pixel_values.shape[2], pixel_values.shape[3], pixel_values.shape[4])) +- images_embeds = self.aligner(self.vision_model(images)) ++ ++ vr = self.vision_model(images) ++ ++ images_embeds = self.aligner(vr) + + # "(b n) t d -> b (n t) d" + images_embeds = ops.reshape( +@@ -259,33 +263,24 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel): + condition = input_ids < 0 + input_ids = (1-condition) * input_ids + condition * \ + 0 # ignore the image embeddings ++ + inputs_embeds = 
self.language_model.get_input_embeddings()(input_ids) ++ + + # replace with the image embeddings + # 627 576 + # inputs_embeds[images_seq_mask] = images_embeds[images_emb_mask] +- print("inputs_embeds:", inputs_embeds.shape) +- print("images_embeds[images_emb_mask].dtype", images_embeds[images_emb_mask].dtype) +- print("inputs_embeds.dtype", inputs_embeds.dtype) ++ ++ + padding_size = images_seq_mask.shape[1] - images_emb_mask.shape[1] + padding = Tensor(np.full((images_seq_mask.shape[0], padding_size), False), dtype=images_emb_mask.dtype) + padded_images_emb_mask = ops.concat((images_emb_mask, padding), dim=1) +- print("padded_images_emb_mask.shape:",padded_images_emb_mask.shape) +- print("images_embeds.shape:",images_embeds.shape) +- print("images_seq_mask.shape:",images_seq_mask.shape) + first_true = images_seq_mask.nonzero().squeeze()[0][1] # 42 + last_true = images_seq_mask.nonzero().squeeze()[-1][1] # 42 +- print("first_true:",first_true) +- print("last_true:",last_true) + left = inputs_embeds[:,:first_true] +- print(left.shape) + right = inputs_embeds[:, last_true+1:] +- print(right.shape) + inputs_embeds = ops.cat((left, images_embeds, right),1) +- print("inputs_embeds.shape:",inputs_embeds.shape) +- print("inputs_embeds.dtype:",inputs_embeds.dtype) +- +- ++ + + # inputs_embeds = images_embeds[padded_images_emb_mask] * images_seq_mask + inputs_embeds * (1 - images_seq_mask) + return inputs_embeds +diff --git a/llm/inference/janus_pro/janus/models/siglip_vit.py b/llm/inference/janus_pro/janus/models/siglip_vit.py +index 56a6f299..d896eeb2 100644 +--- a/llm/inference/janus_pro/janus/models/siglip_vit.py ++++ b/llm/inference/janus_pro/janus/models/siglip_vit.py +@@ -580,7 +580,11 @@ class VisionTransformer(nn.Module): + if return_prefix_tokens: + return tuple(zip(outputs, prefix_tokens)) + return tuple(outputs) +- ++ ++ @mindspore.jit(backend='GE') ++ def run_blocks_jit(self, x: mindspore.Tensor) -> mindspore.Tensor: ++ return self.blocks(x) ++ + def 
forward_features(self, x: mindspore.Tensor) -> mindspore.Tensor: + x = self.patch_embed(x) + x = self._pos_embed(x) +@@ -590,10 +594,11 @@ class VisionTransformer(nn.Module): + # x = checkpoint_seq(self.blocks, x) + # else: + # x = self.blocks(x) +- x = self.blocks(x) ++ x = self.run_blocks_jit(x) + x = self.norm(x) + return x +- ++ ++ @mindspore.jit(backend='GE') + def forward_head(self, x: mindspore.Tensor, pre_logits: bool = False) -> mindspore.Tensor: + if self.attn_pool is not None: + x = self.attn_pool(x) +diff --git a/llm/inference/janus_pro/janus/models/timm_layers.py b/llm/inference/janus_pro/janus/models/timm_layers.py +index 8960d256..61e68c54 100644 +--- a/llm/inference/janus_pro/janus/models/timm_layers.py ++++ b/llm/inference/janus_pro/janus/models/timm_layers.py +@@ -46,7 +46,7 @@ class Mlp(nn.Module): + + def forward(self, x): + x = self.fc1(x) +- x = self.act(x) ++ x = mindspore.ops.gelu(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) +diff --git a/mindnlp/transformers/models/llama/modeling_llama.py b/mindnlp/transformers/models/llama/modeling_llama.py +index 9c5cb555..c8c55492 100644 +--- a/mindnlp/transformers/models/llama/modeling_llama.py ++++ b/mindnlp/transformers/models/llama/modeling_llama.py +@@ -429,7 +429,7 @@ class LlamaAttention(nn.Module): + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 +- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype) ++ attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = ops.matmul(attn_weights, value_states) + +@@ -508,7 +508,7 @@ class LlamaDecoderLayer(nn.Module): + """ + residual = hidden_states + +- hidden_states = self.input_layernorm(hidden_states) ++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.input_layernorm.weight, self.input_layernorm.variance_epsilon) + + # Self 
Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( +@@ -526,7 +526,7 @@ class LlamaDecoderLayer(nn.Module): + + # Fully Connected + residual = hidden_states +- hidden_states = self.post_attention_layernorm(hidden_states) ++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.post_attention_layernorm.weight, self.post_attention_layernorm.variance_epsilon) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + +-- +2.47.1.windows.2 + diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch" new file mode 100644 index 00000000..c60c9b59 --- /dev/null +++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch" @@ -0,0 +1,286 @@ +From dba3c2bd641feb6526a82881e1ef395890711a9d Mon Sep 17 00:00:00 2001 +From: Alice <1215876379@qq.com> +Date: Fri, 7 Nov 2025 00:13:55 +0800 +Subject: [PATCH 3/3] =?UTF-8?q?=E5=8F=82=E8=80=83janus=E4=B8=AD=E7=9A=84ll?= + =?UTF-8?q?ama=EF=BC=8C=E4=BC=98=E5=8C=96qwen2=5Fvl=E7=9A=84=E6=8E=A5?= + =?UTF-8?q?=E5=8F=A3=E8=B0=83=E7=94=A8?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +--- + .../models/qwen2_vl/modeling_qwen2_vl.py | 104 +++++++++--------- + 1 file changed, 53 insertions(+), 51 deletions(-) + +diff --git a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py +index ffb100cf..4ed07c37 100644 +--- a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py ++++ b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py +@@ -151,7 +151,7 @@ class Qwen2VLRotaryEmbedding(nn.Module): + self.max_seq_len_cached = self.original_max_seq_len + + @no_grad() 
+- def forward(self, x, position_ids): ++ def forward(self, x, position_ids, mrope_section, unsqueeze_dim=1): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids) + +@@ -161,7 +161,7 @@ class Qwen2VLRotaryEmbedding(nn.Module): + position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions) + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + # with ops.autocast(device_type=device_type, enabled=False): +- freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).swapaxes(2, 3) ++ freqs = ops.transpose((inv_freq_expanded.float() @ position_ids_expanded.float()), 2, 3) + emb = ops.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() +@@ -169,6 +169,14 @@ class Qwen2VLRotaryEmbedding(nn.Module): + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling ++ ++ mrope_section = mrope_section * 2 ++ cos = ops.unsqueeze(ops.cat([m[i % 3] for i, m in enumerate(ops.split(cos, mrope_section, dim=-1))], dim=-1), ++ unsqueeze_dim ++ ) ++ sin = ops.unsqueeze(ops.cat([m[i % 3] for i, m in enumerate(ops.split(sin, mrope_section, dim=-1))], dim=-1), ++ unsqueeze_dim ++ ) + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + +@@ -176,12 +184,11 @@ class Qwen2VLRotaryEmbedding(nn.Module): + # Copied from transformers.models.llama.modeling_llama.rotate_half + def rotate_half(x): + """Rotates half the hidden dims of the input.""" +- x1 = x[..., : x.shape[-1] // 2] +- x2 = x[..., x.shape[-1] // 2 :] ++ x1, x2 = ops.split(x, x.shape[-1] // 2, dim=-1) + return ops.cat((-x2, x1), dim=-1) + + +-def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1): ++def apply_multimodal_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): + """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors 
(https://qwenlm.github.io/blog/qwen2-vl/). + + Explanation: +@@ -213,14 +220,6 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim + Returns: + `tuple(mindspore.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ +- mrope_section = mrope_section * 2 +- cos = ops.cat([m[i % 3] for i, m in enumerate(ops.split(cos, mrope_section, dim=-1))], dim=-1).unsqueeze( +- unsqueeze_dim +- ) +- sin = ops.cat([m[i % 3] for i, m in enumerate(ops.split(sin, mrope_section, dim=-1))], dim=-1).unsqueeze( +- unsqueeze_dim +- ) +- + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed +@@ -229,10 +228,10 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim + def apply_rotary_pos_emb_vision(tensor: mindspore.Tensor, freqs: mindspore.Tensor) -> mindspore.Tensor: + orig_dtype = tensor.dtype + tensor = tensor.float() +- cos = freqs.cos() +- sin = freqs.sin() +- cos = cos.unsqueeze(1).tile((1, 1, 2)).unsqueeze(0).float() +- sin = sin.unsqueeze(1).tile((1, 1, 2)).unsqueeze(0).float() ++ cos = ops.cos(freqs) ++ sin = ops.sin(freqs) ++ cos = ops.unsqueeze(ops.unsqueeze(cos,1).tile((1, 1, 2)), 0).float() ++ sin = ops.unsqueeze(ops.unsqueeze(sin, 1).tile((1, 1, 2)), 0).float() + output = (tensor * cos) + (rotate_half(tensor) * sin) + output = output.to(orig_dtype) + return output +@@ -316,28 +315,22 @@ class VisionAttention(nn.Module): + self.proj = nn.Linear(dim, dim) + + def forward( +- self, hidden_states: mindspore.Tensor, cu_seqlens: mindspore.Tensor, rotary_pos_emb: mindspore.Tensor = None ++ self, hidden_states: mindspore.Tensor, attention_mask: mindspore.Tensor, rotary_pos_emb: mindspore.Tensor = None + ) -> mindspore.Tensor: + seq_length = hidden_states.shape[0] +- q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) ++ q, k, v = 
mindspore.mint.reshape(self.qkv(hidden_states), (seq_length, 3, self.num_heads, -1)).permute(1, 0, 2, 3).unbind(0) + q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0) + k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0) + +- attention_mask = ops.full( +- [1, seq_length, seq_length], float(ops.finfo(q.dtype).min), dtype=q.dtype +- ) +- for i in range(1, len(cu_seqlens)): +- attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0 +- +- q = q.swapaxes(0, 1) +- k = k.swapaxes(0, 1) +- v = v.swapaxes(0, 1) +- attn_weights = ops.matmul(q, k.swapaxes(1, 2)) / math.sqrt(self.head_dim) ++ q = ops.transpose(q, 0, 1) ++ k = ops.transpose(k, 0, 1) ++ v = ops.transpose(v, 0, 1) ++ attn_weights = ops.matmul(q, ops.transpose(k, 1, 2)) / math.sqrt(self.head_dim) + attn_weights = attn_weights + attention_mask + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_output = ops.matmul(attn_weights, v) +- attn_output = attn_output.swapaxes(0, 1) +- attn_output = attn_output.reshape(seq_length, -1) ++ attn_output = ops.transpose(attn_output, 0, 1) ++ attn_output = mindspore.mint.reshape(attn_output, (seq_length, -1)) + attn_output = self.proj(attn_output) + return attn_output + +@@ -358,9 +351,9 @@ class Qwen2VLVisionBlock(nn.Module): + ) + self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act) + +- def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> mindspore.Tensor: ++ def forward(self, hidden_states, attention_mask, rotary_pos_emb) -> mindspore.Tensor: + hidden_states = hidden_states + self.attn( +- self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb ++ self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states +@@ -403,7 +396,7 @@ def 
_prepare_4d_causal_attention_mask_with_cache_position( + causal_mask = ops.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype) + if sequence_length != 1: + causal_mask = ops.triu(causal_mask, diagonal=1) +- causal_mask *= ops.arange(target_length) > cache_position.reshape(-1, 1) ++ causal_mask *= ops.arange(target_length) > mindspore.mint.reshape(cache_position, (-1, 1)) + causal_mask = causal_mask[None, None, :, :].broadcast_to((batch_size, 1, -1, -1)) + if attention_mask is not None: + causal_mask = causal_mask.copy() # copy to contiguous memory for in-place edit +@@ -462,8 +455,8 @@ def repeat_kv(hidden_states: mindspore.Tensor, n_rep: int) -> mindspore.Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states +- hidden_states = hidden_states[:, :, None, :, :].broadcast_to((batch, num_key_value_heads, n_rep, slen, head_dim)) +- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) ++ hidden_states = ops.broadcast_to(ops.unsqueeze(hidden_states, 2), (batch, num_key_value_heads, n_rep, slen, head_dim)) ++ return mindspore.mint.reshape(hidden_states, (batch, num_key_value_heads * n_rep, slen, head_dim)) + + + class Qwen2VLAttention(nn.Module): +@@ -527,9 +520,9 @@ class Qwen2VLAttention(nn.Module): + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + +- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2) +- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2) +- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2) ++ query_states = ops.transpose(query_states.view(bsz, q_len, self.num_heads, self.head_dim), 1, 2) ++ key_states = ops.transpose(key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2) ++ value_states = ops.transpose(value_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim), 1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: +@@ -545,7 +538,7 @@ class Qwen2VLAttention(nn.Module): + else: + cos, sin = position_embeddings + query_states, key_states = apply_multimodal_rotary_pos_emb( +- query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] ++ query_states, key_states, cos, sin + ) + + if past_key_value is not None: +@@ -556,10 +549,10 @@ class Qwen2VLAttention(nn.Module): + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + +- attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim) ++ attn_weights = ops.matmul(query_states, ops.transpose(key_states, 2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it +- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] ++ causal_mask = ops.narrow(attention_mask, 3, 0, key_states.shape[-2]) + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 +@@ -573,8 +566,8 @@ class Qwen2VLAttention(nn.Module): + f" {attn_output.shape}" + ) + +- attn_output = attn_output.swapaxes(1, 2) +- attn_output = attn_output.reshape(bsz, q_len, -1) ++ attn_output = ops.transpose(attn_output, 1, 2) ++ attn_output = mindspore.mint.reshape(attn_output, (bsz, q_len, -1)) + + attn_output = self.o_proj(attn_output) + +@@ -727,22 +720,22 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel): + for t, h, w in grid_thw: + t, h, w = t.item(), h.item(), w.item() + hpos_ids = ops.arange(h).unsqueeze(1).broadcast_to((-1, w)) +- hpos_ids = hpos_ids.reshape( ++ hpos_ids = mindspore.mint.reshape(hpos_ids, ( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, +- ) ++ )) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = 
ops.arange(w).unsqueeze(0).broadcast_to((h, -1)) +- wpos_ids = wpos_ids.reshape( ++ wpos_ids = mindspore.mint.reshape(wpos_ids, ( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, +- ) ++ )) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(ops.stack([hpos_ids, wpos_ids], dim=-1).tile((t, 1))) +@@ -761,9 +754,16 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel): + dim=0, dtype=mindspore.int32 + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) ++ ++ seq_length = hidden_states.shape[0] ++ attention_mask = ops.full( ++ [1, seq_length, seq_length], float(ops.finfo(hidden_states.dtype).min), dtype=hidden_states.dtype ++ ) ++ for i in range(1, len(cu_seqlens)): ++ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0 + + for blk in self.blocks: +- hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb) ++ hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb) + + return self.merger(hidden_states) + +@@ -785,6 +785,8 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel): + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() ++ ++ self.rope_scaling = config.rope_scaling + + def get_input_embeddings(self): + return self.embed_tokens +@@ -847,7 +849,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel): + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers +- position_embeddings = self.rotary_emb(hidden_states, position_ids) ++ position_embeddings = self.rotary_emb(hidden_states, position_ids, self.rope_scaling["mrope_section"]) + + # decoder layers + all_hidden_states = () if output_hidden_states else None +@@ -1120,7 +1122,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel): + text_len = len(input_tokens) - st + 
llm_pos_ids_list.append(ops.arange(text_len).view(1, -1).broadcast_to((3, -1)) + st_idx) + +- llm_positions = ops.cat(llm_pos_ids_list, dim=1).reshape(3, -1) ++ llm_positions = mindspore.mint.reshape(ops.cat(llm_pos_ids_list, dim=1), (3, -1)) + position_ids[..., i, attention_mask[i] == 1] = llm_positions + mrope_position_deltas.append(llm_positions.max().item() + 1 - len(total_input_ids[i])) + mrope_position_deltas = mindspore.tensor(mrope_position_deltas).unsqueeze(1) +-- +2.47.1.windows.2 +