diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md"
new file mode 100644
index 00000000..3215394e
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md"
@@ -0,0 +1,44 @@
+
+# 2025年昇腾AI创新大赛-昇思模型开发挑战赛(S1赛季)--MultiModal赛题--好树花生队提交说明
+
+## 运行时长如下:
+
+
+
+## 优化点分为以下方面:
+
+1.qwenvl模型中有大量API替换:使用mindspore.ops.rms_norm替换原来的RMSNorm实现;同时将大量直接从tensor调用的方法(比如tensor.broadcast,以及transpose、unsqueeze等)替换成mint/ops中的对应函数
+
+2.qwenvl模型中的conv3d操作,原本调用的方法速度慢,改成mindspore.mint.nn.functional.conv3d后能够明显提升
+
+
+
+3.两个模型中softmax的操作,原本会强制上转换为fp32数据类型计算,改为直接使用输入的bf16数据类型计算,能够有轻微提升
+
+
+
+4.qwenvl模型的预处理阶段,在processing_vlm.py文件中,其使用tokenizer获得tag的4个方法操作很耗时,每次都重复执行,且每次操作都是一样的,所以改成在类初始化时就获取,后面直接使用现成的值,这样能够大大降低预处理的时间
+
+
+
+
+5.janus模型中的modeling_vlm.py文件里有大量打印,发现删除那些print后,速度偶尔有轻微提升
+
+6.janus的siglip_vit.py文件中,对forward_features方法中的blocks运算加jit,速度能明显提升,forward_head方法也加了jit,但速度提升不明显,有时感觉有轻微提升,有时却一点都没有提升
+
+
+
+7.qwenvl模型中vision模块的attention_mask重复计算了,将其提取到layer的for循环之外,保证就执行一次
+
+
+
+
+以上就是主要的几个修改点
+
+# 最终优化结果:
+| 评测指标 | 平均得分 |
+|---------|---------|
+| 峰值显存得分 | 116.6667 |
+| Prefill时延得分 | 382.3937 |
+| Decode时延得分 | 154.2476 |
+| **总分** | **217.7693** |
\ No newline at end of file
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png"
new file mode 100644
index 00000000..f62690a8
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png"
new file mode 100644
index 00000000..a66767ea
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png"
new file mode 100644
index 00000000..fe355fbb
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png"
new file mode 100644
index 00000000..0647a182
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png"
new file mode 100644
index 00000000..a46e889f
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png"
new file mode 100644
index 00000000..423e7a3d
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png"
new file mode 100644
index 00000000..98f4ea1a
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png"
new file mode 100644
index 00000000..0551ce7f
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch"
new file mode 100644
index 00000000..6513ba0e
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch"
@@ -0,0 +1,141 @@
+From 4fffb31ea6073e689089caa02e0a17cf4b6f73af Mon Sep 17 00:00:00 2001
+From: Alice <1215876379@qq.com>
+Date: Tue, 4 Nov 2025 22:41:54 +0800
+Subject: [PATCH 1/3] =?UTF-8?q?=E5=A4=9A=E6=A8=A1=E6=80=81=E8=B0=83?=
+ =?UTF-8?q?=E4=BC=98=E6=8F=90=E4=BA=A4?=
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+---
+ .../janus_pro/janus/models/processing_vlm.py | 28 ++++++++++++++++---
+ .../models/qwen2_vl/modeling_qwen2_vl.py | 14 ++++++----
+ 2 files changed, 33 insertions(+), 9 deletions(-)
+
+diff --git a/llm/inference/janus_pro/janus/models/processing_vlm.py b/llm/inference/janus_pro/janus/models/processing_vlm.py
+index 7f881e39..c93bfd92 100644
+--- a/llm/inference/janus_pro/janus/models/processing_vlm.py
++++ b/llm/inference/janus_pro/janus/models/processing_vlm.py
+@@ -110,9 +110,13 @@ class VLChatProcessor(ProcessorMixin):
+ print(f"Add image tag = {image_tag} to the tokenizer")
+
+ self.image_tag = image_tag
++ self.image_tag_id = None
+ self.image_start_tag = image_start_tag
++ self.image_start_tag_id = None
+ self.image_end_tag = image_end_tag
++ self.image_end_tag_id = None
+ self.pad_tag = pad_tag
++ self.pad_tag_id = None
+
+ self.num_image_tokens = num_image_tokens
+ self.add_special_token = add_special_token
+@@ -185,17 +189,29 @@ class VLChatProcessor(ProcessorMixin):
+
+ @property
+ def image_id(self):
+- image_id = self.tokenizer.vocab.get(self.image_tag)
++ if self.image_tag_id is None:
++ image_id = self.tokenizer.vocab.get(self.image_tag)
++ self.image_tag_id = image_id
++ else:
++ image_id = self.image_tag_id
+ return image_id
+
+ @property
+ def image_start_id(self):
+- image_start_id = self.tokenizer.vocab.get(self.image_start_tag)
++ if self.image_start_tag_id is None:
++ image_start_id = self.tokenizer.vocab.get(self.image_start_tag)
++ self.image_start_tag_id = image_start_id
++ else:
++ image_start_id = self.image_start_tag_id
+ return image_start_id
+
+ @property
+ def image_end_id(self):
+- image_end_id = self.tokenizer.vocab.get(self.image_end_tag)
++ if self.image_end_tag_id is None:
++ image_end_id = self.tokenizer.vocab.get(self.image_end_tag)
++ self.image_end_tag_id = image_end_id
++ else:
++ image_end_id = self.image_end_tag_id
+ return image_end_id
+
+ @property
+@@ -208,7 +224,11 @@ class VLChatProcessor(ProcessorMixin):
+
+ @property
+ def pad_id(self):
+- pad_id = self.tokenizer.vocab.get(self.pad_tag)
++ if self.pad_tag_id is None:
++ pad_id = self.tokenizer.vocab.get(self.pad_tag)
++ self.pad_tag_id = pad_id
++ else:
++ pad_id = self.pad_tag_id
+ # pad_id = self.tokenizer.pad_token_id
+ # if pad_id is None:
+ # pad_id = self.tokenizer.eos_token_id
+diff --git a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+index d059dcbe..ffb100cf 100644
+--- a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
++++ b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+@@ -265,6 +265,7 @@ class PatchEmbed(nn.Module):
+ self.embed_dim = embed_dim
+
+ kernel_size = (temporal_patch_size, patch_size, patch_size)
++ self.kernel_size = kernel_size
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
+
+ def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor:
+@@ -272,7 +273,10 @@ class PatchEmbed(nn.Module):
+ hidden_states = hidden_states.view(
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
+ )
+- hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
++
++ hidden_states = mindspore.mint.nn.functional.conv3d(hidden_states.to(dtype=target_dtype), self.proj.weight,
++ stride=self.kernel_size).view(-1, self.embed_dim)
++
+ return hidden_states
+
+
+@@ -330,7 +334,7 @@ class VisionAttention(nn.Module):
+ v = v.swapaxes(0, 1)
+ attn_weights = ops.matmul(q, k.swapaxes(1, 2)) / math.sqrt(self.head_dim)
+ attn_weights = attn_weights + attention_mask
+- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(q.dtype)
++ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+ attn_output = ops.matmul(attn_weights, v)
+ attn_output = attn_output.swapaxes(0, 1)
+ attn_output = attn_output.reshape(seq_length, -1)
+@@ -559,7 +563,7 @@ class Qwen2VLAttention(nn.Module):
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype)
++ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+ attn_output = ops.matmul(attn_weights, value_states)
+
+@@ -637,7 +641,7 @@ class Qwen2VLDecoderLayer(nn.Module):
+
+ residual = hidden_states
+
+- hidden_states = self.input_layernorm(hidden_states)
++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.input_layernorm.weight, self.input_layernorm.variance_epsilon)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+@@ -654,7 +658,7 @@ class Qwen2VLDecoderLayer(nn.Module):
+
+ # Fully Connected
+ residual = hidden_states
+- hidden_states = self.post_attention_layernorm(hidden_states)
++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.post_attention_layernorm.weight, self.post_attention_layernorm.variance_epsilon)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+--
+2.47.1.windows.2
+
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch"
new file mode 100644
index 00000000..8fc3923e
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch"
@@ -0,0 +1,183 @@
+From a9b4c9b85f237fcae52ea396cf7be606cdf16410 Mon Sep 17 00:00:00 2001
+From: Alice <1215876379@qq.com>
+Date: Thu, 6 Nov 2025 16:39:55 +0800
+Subject: [PATCH 2/3] =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E4=BC=98=E5=8C=96?=
+ =?UTF-8?q?=EF=BC=8C=E5=8A=A0=E5=85=A5rmsnorm,jit?=
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+---
+ .../janus_pro/janus/models/clip_encoder.py | 6 +++--
+ .../janus_pro/janus/models/modeling_vlm.py | 27 ++++++++-----------
+ .../janus_pro/janus/models/siglip_vit.py | 11 +++++---
+ .../janus_pro/janus/models/timm_layers.py | 2 +-
+ .../models/llama/modeling_llama.py | 6 ++---
+ 5 files changed, 27 insertions(+), 25 deletions(-)
+
+diff --git a/llm/inference/janus_pro/janus/models/clip_encoder.py b/llm/inference/janus_pro/janus/models/clip_encoder.py
+index a0620cfe..acb6ea3f 100644
+--- a/llm/inference/janus_pro/janus/models/clip_encoder.py
++++ b/llm/inference/janus_pro/janus/models/clip_encoder.py
+@@ -56,6 +56,7 @@ class CLIPVisionTower(nn.Module):
+ self.vision_tower, self.forward_kwargs = self.build_vision_tower(
+ vision_tower_params
+ )
++ self.vision_tower.jit()
+
+ if pixel_mean is not None and pixel_std is not None:
+ image_norm = Normalize(
+@@ -112,10 +113,11 @@ class CLIPVisionTower(nn.Module):
+ Returns:
+ image_features (torch.Tensor): [b, n_patch, d]
+ """
+-
++
+ if self.image_norm is not None:
+ images = self.image_norm(images)
+-
++
+ image_forward_outs = self.vision_tower(images, **self.forward_kwargs)
++
+ image_features = self.feature_select(image_forward_outs)
+ return image_features
+diff --git a/llm/inference/janus_pro/janus/models/modeling_vlm.py b/llm/inference/janus_pro/janus/models/modeling_vlm.py
+index 7178c398..3d2d2d74 100644
+--- a/llm/inference/janus_pro/janus/models/modeling_vlm.py
++++ b/llm/inference/janus_pro/janus/models/modeling_vlm.py
+@@ -241,12 +241,16 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
+ Returns:
+ input_embeds (torch.Tensor): [b, T, D]
+ """
+-
++
++
+ bs, n = pixel_values.shape[0:2]
+ # "b n c h w -> (b n) c h w"
+ images = ops.reshape(
+ pixel_values, (bs * n, pixel_values.shape[2], pixel_values.shape[3], pixel_values.shape[4]))
+- images_embeds = self.aligner(self.vision_model(images))
++
++ vr = self.vision_model(images)
++
++ images_embeds = self.aligner(vr)
+
+ # "(b n) t d -> b (n t) d"
+ images_embeds = ops.reshape(
+@@ -259,33 +263,24 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
+ condition = input_ids < 0
+ input_ids = (1-condition) * input_ids + condition * \
+ 0 # ignore the image embeddings
++
+ inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
++
+
+ # replace with the image embeddings
+ # 627 576
+ # inputs_embeds[images_seq_mask] = images_embeds[images_emb_mask]
+- print("inputs_embeds:", inputs_embeds.shape)
+- print("images_embeds[images_emb_mask].dtype", images_embeds[images_emb_mask].dtype)
+- print("inputs_embeds.dtype", inputs_embeds.dtype)
++
++
+ padding_size = images_seq_mask.shape[1] - images_emb_mask.shape[1]
+ padding = Tensor(np.full((images_seq_mask.shape[0], padding_size), False), dtype=images_emb_mask.dtype)
+ padded_images_emb_mask = ops.concat((images_emb_mask, padding), dim=1)
+- print("padded_images_emb_mask.shape:",padded_images_emb_mask.shape)
+- print("images_embeds.shape:",images_embeds.shape)
+- print("images_seq_mask.shape:",images_seq_mask.shape)
+ first_true = images_seq_mask.nonzero().squeeze()[0][1] # 42
+ last_true = images_seq_mask.nonzero().squeeze()[-1][1] # 42
+- print("first_true:",first_true)
+- print("last_true:",last_true)
+ left = inputs_embeds[:,:first_true]
+- print(left.shape)
+ right = inputs_embeds[:, last_true+1:]
+- print(right.shape)
+ inputs_embeds = ops.cat((left, images_embeds, right),1)
+- print("inputs_embeds.shape:",inputs_embeds.shape)
+- print("inputs_embeds.dtype:",inputs_embeds.dtype)
+-
+-
++
+
+ # inputs_embeds = images_embeds[padded_images_emb_mask] * images_seq_mask + inputs_embeds * (1 - images_seq_mask)
+ return inputs_embeds
+diff --git a/llm/inference/janus_pro/janus/models/siglip_vit.py b/llm/inference/janus_pro/janus/models/siglip_vit.py
+index 56a6f299..d896eeb2 100644
+--- a/llm/inference/janus_pro/janus/models/siglip_vit.py
++++ b/llm/inference/janus_pro/janus/models/siglip_vit.py
+@@ -580,7 +580,11 @@ class VisionTransformer(nn.Module):
+ if return_prefix_tokens:
+ return tuple(zip(outputs, prefix_tokens))
+ return tuple(outputs)
+-
++
++ @mindspore.jit(backend='GE')
++ def run_blocks_jit(self, x: mindspore.Tensor) -> mindspore.Tensor:
++ return self.blocks(x)
++
+ def forward_features(self, x: mindspore.Tensor) -> mindspore.Tensor:
+ x = self.patch_embed(x)
+ x = self._pos_embed(x)
+@@ -590,10 +594,11 @@ class VisionTransformer(nn.Module):
+ # x = checkpoint_seq(self.blocks, x)
+ # else:
+ # x = self.blocks(x)
+- x = self.blocks(x)
++ x = self.run_blocks_jit(x)
+ x = self.norm(x)
+ return x
+-
++
++ @mindspore.jit(backend='GE')
+ def forward_head(self, x: mindspore.Tensor, pre_logits: bool = False) -> mindspore.Tensor:
+ if self.attn_pool is not None:
+ x = self.attn_pool(x)
+diff --git a/llm/inference/janus_pro/janus/models/timm_layers.py b/llm/inference/janus_pro/janus/models/timm_layers.py
+index 8960d256..61e68c54 100644
+--- a/llm/inference/janus_pro/janus/models/timm_layers.py
++++ b/llm/inference/janus_pro/janus/models/timm_layers.py
+@@ -46,7 +46,7 @@ class Mlp(nn.Module):
+
+ def forward(self, x):
+ x = self.fc1(x)
+- x = self.act(x)
++ x = mindspore.ops.gelu(x)
+ x = self.drop1(x)
+ x = self.norm(x)
+ x = self.fc2(x)
+diff --git a/mindnlp/transformers/models/llama/modeling_llama.py b/mindnlp/transformers/models/llama/modeling_llama.py
+index 9c5cb555..c8c55492 100644
+--- a/mindnlp/transformers/models/llama/modeling_llama.py
++++ b/mindnlp/transformers/models/llama/modeling_llama.py
+@@ -429,7 +429,7 @@ class LlamaAttention(nn.Module):
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype)
++ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+ attn_output = ops.matmul(attn_weights, value_states)
+
+@@ -508,7 +508,7 @@ class LlamaDecoderLayer(nn.Module):
+ """
+ residual = hidden_states
+
+- hidden_states = self.input_layernorm(hidden_states)
++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.input_layernorm.weight, self.input_layernorm.variance_epsilon)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+@@ -526,7 +526,7 @@ class LlamaDecoderLayer(nn.Module):
+
+ # Fully Connected
+ residual = hidden_states
+- hidden_states = self.post_attention_layernorm(hidden_states)
++ hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.post_attention_layernorm.weight, self.post_attention_layernorm.variance_epsilon)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+--
+2.47.1.windows.2
+
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch"
new file mode 100644
index 00000000..c60c9b59
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch"
@@ -0,0 +1,286 @@
+From dba3c2bd641feb6526a82881e1ef395890711a9d Mon Sep 17 00:00:00 2001
+From: Alice <1215876379@qq.com>
+Date: Fri, 7 Nov 2025 00:13:55 +0800
+Subject: [PATCH 3/3] =?UTF-8?q?=E5=8F=82=E8=80=83janus=E4=B8=AD=E7=9A=84ll?=
+ =?UTF-8?q?ama=EF=BC=8C=E4=BC=98=E5=8C=96qwen2=5Fvl=E7=9A=84=E6=8E=A5?=
+ =?UTF-8?q?=E5=8F=A3=E8=B0=83=E7=94=A8?=
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+---
+ .../models/qwen2_vl/modeling_qwen2_vl.py | 104 +++++++++---------
+ 1 file changed, 53 insertions(+), 51 deletions(-)
+
+diff --git a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+index ffb100cf..4ed07c37 100644
+--- a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
++++ b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+@@ -151,7 +151,7 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+ self.max_seq_len_cached = self.original_max_seq_len
+
+ @no_grad()
+- def forward(self, x, position_ids):
++ def forward(self, x, position_ids, mrope_section, unsqueeze_dim=1):
+ if "dynamic" in self.rope_type:
+ self._dynamic_frequency_update(position_ids)
+
+@@ -161,7 +161,7 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+ position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+ # with ops.autocast(device_type=device_type, enabled=False):
+- freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).swapaxes(2, 3)
++ freqs = ops.transpose((inv_freq_expanded.float() @ position_ids_expanded.float()), 2, 3)
+ emb = ops.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+@@ -169,6 +169,14 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
++
++ mrope_section = mrope_section * 2
++ cos = ops.unsqueeze(ops.cat([m[i % 3] for i, m in enumerate(ops.split(cos, mrope_section, dim=-1))], dim=-1),
++ unsqueeze_dim
++ )
++ sin = ops.unsqueeze(ops.cat([m[i % 3] for i, m in enumerate(ops.split(sin, mrope_section, dim=-1))], dim=-1),
++ unsqueeze_dim
++ )
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+@@ -176,12 +184,11 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
+ def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+- x1 = x[..., : x.shape[-1] // 2]
+- x2 = x[..., x.shape[-1] // 2 :]
++ x1, x2 = ops.split(x, x.shape[-1] // 2, dim=-1)
+ return ops.cat((-x2, x1), dim=-1)
+
+
+-def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
++def apply_multimodal_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
+
+ Explanation:
+@@ -213,14 +220,6 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
+ Returns:
+ `tuple(mindspore.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+- mrope_section = mrope_section * 2
+- cos = ops.cat([m[i % 3] for i, m in enumerate(ops.split(cos, mrope_section, dim=-1))], dim=-1).unsqueeze(
+- unsqueeze_dim
+- )
+- sin = ops.cat([m[i % 3] for i, m in enumerate(ops.split(sin, mrope_section, dim=-1))], dim=-1).unsqueeze(
+- unsqueeze_dim
+- )
+-
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+@@ -229,10 +228,10 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
+ def apply_rotary_pos_emb_vision(tensor: mindspore.Tensor, freqs: mindspore.Tensor) -> mindspore.Tensor:
+ orig_dtype = tensor.dtype
+ tensor = tensor.float()
+- cos = freqs.cos()
+- sin = freqs.sin()
+- cos = cos.unsqueeze(1).tile((1, 1, 2)).unsqueeze(0).float()
+- sin = sin.unsqueeze(1).tile((1, 1, 2)).unsqueeze(0).float()
++ cos = ops.cos(freqs)
++ sin = ops.sin(freqs)
++ cos = ops.unsqueeze(ops.unsqueeze(cos,1).tile((1, 1, 2)), 0).float()
++ sin = ops.unsqueeze(ops.unsqueeze(sin, 1).tile((1, 1, 2)), 0).float()
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
+ output = output.to(orig_dtype)
+ return output
+@@ -316,28 +315,22 @@ class VisionAttention(nn.Module):
+ self.proj = nn.Linear(dim, dim)
+
+ def forward(
+- self, hidden_states: mindspore.Tensor, cu_seqlens: mindspore.Tensor, rotary_pos_emb: mindspore.Tensor = None
++ self, hidden_states: mindspore.Tensor, attention_mask: mindspore.Tensor, rotary_pos_emb: mindspore.Tensor = None
+ ) -> mindspore.Tensor:
+ seq_length = hidden_states.shape[0]
+- q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
++ q, k, v = mindspore.mint.reshape(self.qkv(hidden_states), (seq_length, 3, self.num_heads, -1)).permute(1, 0, 2, 3).unbind(0)
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
+
+- attention_mask = ops.full(
+- [1, seq_length, seq_length], float(ops.finfo(q.dtype).min), dtype=q.dtype
+- )
+- for i in range(1, len(cu_seqlens)):
+- attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+-
+- q = q.swapaxes(0, 1)
+- k = k.swapaxes(0, 1)
+- v = v.swapaxes(0, 1)
+- attn_weights = ops.matmul(q, k.swapaxes(1, 2)) / math.sqrt(self.head_dim)
++ q = ops.transpose(q, 0, 1)
++ k = ops.transpose(k, 0, 1)
++ v = ops.transpose(v, 0, 1)
++ attn_weights = ops.matmul(q, ops.transpose(k, 1, 2)) / math.sqrt(self.head_dim)
+ attn_weights = attn_weights + attention_mask
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+ attn_output = ops.matmul(attn_weights, v)
+- attn_output = attn_output.swapaxes(0, 1)
+- attn_output = attn_output.reshape(seq_length, -1)
++ attn_output = ops.transpose(attn_output, 0, 1)
++ attn_output = mindspore.mint.reshape(attn_output, (seq_length, -1))
+ attn_output = self.proj(attn_output)
+ return attn_output
+
+@@ -358,9 +351,9 @@ class Qwen2VLVisionBlock(nn.Module):
+ )
+ self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
+
+- def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> mindspore.Tensor:
++ def forward(self, hidden_states, attention_mask, rotary_pos_emb) -> mindspore.Tensor:
+ hidden_states = hidden_states + self.attn(
+- self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
++ self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb
+ )
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+ return hidden_states
+@@ -403,7 +396,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
+ causal_mask = ops.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
+ if sequence_length != 1:
+ causal_mask = ops.triu(causal_mask, diagonal=1)
+- causal_mask *= ops.arange(target_length) > cache_position.reshape(-1, 1)
++ causal_mask *= ops.arange(target_length) > mindspore.mint.reshape(cache_position, (-1, 1))
+ causal_mask = causal_mask[None, None, :, :].broadcast_to((batch_size, 1, -1, -1))
+ if attention_mask is not None:
+ causal_mask = causal_mask.copy() # copy to contiguous memory for in-place edit
+@@ -462,8 +455,8 @@ def repeat_kv(hidden_states: mindspore.Tensor, n_rep: int) -> mindspore.Tensor:
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+- hidden_states = hidden_states[:, :, None, :, :].broadcast_to((batch, num_key_value_heads, n_rep, slen, head_dim))
+- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
++ hidden_states = ops.broadcast_to(ops.unsqueeze(hidden_states, 2), (batch, num_key_value_heads, n_rep, slen, head_dim))
++ return mindspore.mint.reshape(hidden_states, (batch, num_key_value_heads * n_rep, slen, head_dim))
+
+
+ class Qwen2VLAttention(nn.Module):
+@@ -527,9 +520,9 @@ class Qwen2VLAttention(nn.Module):
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+- query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
+- key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
+- value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
++ query_states = ops.transpose(query_states.view(bsz, q_len, self.num_heads, self.head_dim), 1, 2)
++ key_states = ops.transpose(key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2)
++ value_states = ops.transpose(value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+@@ -545,7 +538,7 @@ class Qwen2VLAttention(nn.Module):
+ else:
+ cos, sin = position_embeddings
+ query_states, key_states = apply_multimodal_rotary_pos_emb(
+- query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
++ query_states, key_states, cos, sin
+ )
+
+ if past_key_value is not None:
+@@ -556,10 +549,10 @@ class Qwen2VLAttention(nn.Module):
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+- attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
++ attn_weights = ops.matmul(query_states, ops.transpose(key_states, 2, 3)) / math.sqrt(self.head_dim)
+
+ if attention_mask is not None: # no matter the length, we just slice it
+- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
++ causal_mask = ops.narrow(attention_mask, 3, 0, key_states.shape[-2])
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+@@ -573,8 +566,8 @@ class Qwen2VLAttention(nn.Module):
+ f" {attn_output.shape}"
+ )
+
+- attn_output = attn_output.swapaxes(1, 2)
+- attn_output = attn_output.reshape(bsz, q_len, -1)
++ attn_output = ops.transpose(attn_output, 1, 2)
++ attn_output = mindspore.mint.reshape(attn_output, (bsz, q_len, -1))
+
+ attn_output = self.o_proj(attn_output)
+
+@@ -727,22 +720,22 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
+ for t, h, w in grid_thw:
+ t, h, w = t.item(), h.item(), w.item()
+ hpos_ids = ops.arange(h).unsqueeze(1).broadcast_to((-1, w))
+- hpos_ids = hpos_ids.reshape(
++ hpos_ids = mindspore.mint.reshape(hpos_ids, (
+ h // self.spatial_merge_size,
+ self.spatial_merge_size,
+ w // self.spatial_merge_size,
+ self.spatial_merge_size,
+- )
++ ))
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+ hpos_ids = hpos_ids.flatten()
+
+ wpos_ids = ops.arange(w).unsqueeze(0).broadcast_to((h, -1))
+- wpos_ids = wpos_ids.reshape(
++ wpos_ids = mindspore.mint.reshape(wpos_ids, (
+ h // self.spatial_merge_size,
+ self.spatial_merge_size,
+ w // self.spatial_merge_size,
+ self.spatial_merge_size,
+- )
++ ))
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+ wpos_ids = wpos_ids.flatten()
+ pos_ids.append(ops.stack([hpos_ids, wpos_ids], dim=-1).tile((t, 1)))
+@@ -761,9 +754,16 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
+ dim=0, dtype=mindspore.int32
+ )
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
++
++ seq_length = hidden_states.shape[0]
++ attention_mask = ops.full(
++ [1, seq_length, seq_length], float(ops.finfo(hidden_states.dtype).min), dtype=hidden_states.dtype
++ )
++ for i in range(1, len(cu_seqlens)):
++ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+
+ for blk in self.blocks:
+- hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
++ hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
+
+ return self.merger(hidden_states)
+
+@@ -785,6 +785,8 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
++
++ self.rope_scaling = config.rope_scaling
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+@@ -847,7 +849,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
+ hidden_states = inputs_embeds
+
+ # create position embeddings to be shared across the decoder layers
+- position_embeddings = self.rotary_emb(hidden_states, position_ids)
++ position_embeddings = self.rotary_emb(hidden_states, position_ids, self.rope_scaling["mrope_section"])
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+@@ -1120,7 +1122,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
+ text_len = len(input_tokens) - st
+ llm_pos_ids_list.append(ops.arange(text_len).view(1, -1).broadcast_to((3, -1)) + st_idx)
+
+- llm_positions = ops.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
++ llm_positions = mindspore.mint.reshape(ops.cat(llm_pos_ids_list, dim=1), (3, -1))
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions
+ mrope_position_deltas.append(llm_positions.max().item() + 1 - len(total_input_ids[i]))
+ mrope_position_deltas = mindspore.tensor(mrope_position_deltas).unsqueeze(1)
+--
+2.47.1.windows.2
+