diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md"
new file mode 100644
index 00000000..3215394e
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/README.md"
@@ -0,0 +1,44 @@
+
+# 2025年昇腾AI创新大赛-昇思模型开发挑战赛（S1赛季）--MultiModal赛题--好树花生队提交说明
+
+## 运行时长如下：
+
+<img src="./imgs/results.png"></img>
+
+## 优化点分为以下方面：
+
+1.qwenvl模型中有大量api替换：用mindspore.ops.rms_norm替换原来的RMSNorm实现；另外把大量直接从tensor调用的方法（比如tensor.broadcast_to、swapaxes、unsqueeze、reshape等）替换成ops/mint中对应的函数式接口（比如ops.broadcast_to、ops.transpose、ops.unsqueeze、mindspore.mint.reshape等）
+
+2.qwenvl模型中的conv3d操作，原本通过nn.Conv3d模块（self.proj）调用，速度慢，改成直接调用mindspore.mint.nn.functional.conv3d（复用self.proj.weight）后速度能够明显提升
+
+<img src="./imgs/2.png"></img>
+
+3.两个模型中softmax的操作，原实现通过dtype=mindspore.float32强制转成fp32数据类型计算后再转回原类型，现在去掉该参数，直接使用输入本身的数据类型（bf16）计算，能够有轻微提升
+
+<img src="./imgs/3.png"></img>
+
+4.janus模型的预处理阶段，在processing_vlm.py文件中，其使用tokenizer获得tag id的4个属性方法（image_id、image_start_id、image_end_id、pad_id）很耗时，每次调用都重复查询tokenizer.vocab，且每次结果都是一样的，所以改成在类初始化时预留缓存字段，首次访问时获取并缓存，后面直接使用缓存的值，这样能够大大降低预处理的时间
+
+<img src="./imgs/4_1.png"></img>
+<img src="./imgs/4_2.png"></img>
+
+5.janus模型中的modeling_vlm.py文件里有大量打印，发现删除那些print后，速度偶尔有轻微提升
+
+6.janus的siglip_vit.py文件中，对forward_features方法中的blocks运算加jit，速度能明显提升，forward_head方法也加了jit，但速度提升不明显，有时感觉有轻微提升，有时却一点都没有提升
+
+<img src="./imgs/6.png"></img>
+
+7.qwenvl模型中vision模块的attention_mask原本在每个block的VisionAttention.forward里都重复计算一次，将其提取到遍历blocks的for循环之外，保证每次前向只计算一次
+
+<img src="./imgs/7_1.png"></img>
+<img src="./imgs/7_2.png"></img>
+
+以上就是主要的几个修改点
+
+## 最终优化结果：
+| 评测指标 | 平均得分 |
+|---------|---------|
+| 峰值显存得分 | 116.6667 |
+| Prefill时延得分 | 382.3937    |
+| Decode时延得分 | 154.2476     |
+| **总分** | **217.7693** |
\ No newline at end of file
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png"
new file mode 100644
index 00000000..f62690a8
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/2.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png"
new file mode 100644
index 00000000..a66767ea
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/3.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png"
new file mode 100644
index 00000000..fe355fbb
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_1.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png"
new file mode 100644
index 00000000..0647a182
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/4_2.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png"
new file mode 100644
index 00000000..a46e889f
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/6.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png"
new file mode 100644
index 00000000..423e7a3d
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_1.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png"
new file mode 100644
index 00000000..98f4ea1a
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/7_2.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png"
new file mode 100644
index 00000000..0551ce7f
Binary files /dev/null and "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/imgs/results.png" differ
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch"
new file mode 100644
index 00000000..6513ba0e
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0001-.patch"
@@ -0,0 +1,141 @@
+From 4fffb31ea6073e689089caa02e0a17cf4b6f73af Mon Sep 17 00:00:00 2001
+From: Alice <1215876379@qq.com>
+Date: Tue, 4 Nov 2025 22:41:54 +0800
+Subject: [PATCH 1/3] =?UTF-8?q?=E5=A4=9A=E6=A8=A1=E6=80=81=E8=B0=83?=
+ =?UTF-8?q?=E4=BC=98=E6=8F=90=E4=BA=A4?=
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+---
+ .../janus_pro/janus/models/processing_vlm.py  | 28 ++++++++++++++++---
+ .../models/qwen2_vl/modeling_qwen2_vl.py      | 14 ++++++----
+ 2 files changed, 33 insertions(+), 9 deletions(-)
+
+diff --git a/llm/inference/janus_pro/janus/models/processing_vlm.py b/llm/inference/janus_pro/janus/models/processing_vlm.py
+index 7f881e39..c93bfd92 100644
+--- a/llm/inference/janus_pro/janus/models/processing_vlm.py
++++ b/llm/inference/janus_pro/janus/models/processing_vlm.py
+@@ -110,9 +110,13 @@ class VLChatProcessor(ProcessorMixin):
+             print(f"Add image tag = {image_tag} to the tokenizer")
+ 
+         self.image_tag = image_tag
++        self.image_tag_id = None
+         self.image_start_tag = image_start_tag
++        self.image_start_tag_id = None
+         self.image_end_tag = image_end_tag
++        self.image_end_tag_id = None
+         self.pad_tag = pad_tag
++        self.pad_tag_id = None
+ 
+         self.num_image_tokens = num_image_tokens
+         self.add_special_token = add_special_token
+@@ -185,17 +189,29 @@ class VLChatProcessor(ProcessorMixin):
+ 
+     @property
+     def image_id(self):
+-        image_id = self.tokenizer.vocab.get(self.image_tag)
++        if self.image_tag_id is None:
++            image_id = self.tokenizer.vocab.get(self.image_tag)
++            self.image_tag_id = image_id
++        else:
++            image_id = self.image_tag_id
+         return image_id
+ 
+     @property
+     def image_start_id(self):
+-        image_start_id = self.tokenizer.vocab.get(self.image_start_tag)
++        if self.image_start_tag_id is None:
++            image_start_id = self.tokenizer.vocab.get(self.image_start_tag)
++            self.image_start_tag_id = image_start_id
++        else:
++            image_start_id = self.image_start_tag_id
+         return image_start_id
+ 
+     @property
+     def image_end_id(self):
+-        image_end_id = self.tokenizer.vocab.get(self.image_end_tag)
++        if self.image_end_tag_id is None:
++            image_end_id = self.tokenizer.vocab.get(self.image_end_tag)
++            self.image_end_tag_id = image_end_id
++        else:
++            image_end_id = self.image_end_tag_id
+         return image_end_id
+ 
+     @property
+@@ -208,7 +224,11 @@ class VLChatProcessor(ProcessorMixin):
+ 
+     @property
+     def pad_id(self):
+-        pad_id = self.tokenizer.vocab.get(self.pad_tag)
++        if self.pad_tag_id is None:
++            pad_id = self.tokenizer.vocab.get(self.pad_tag)
++            self.pad_tag_id = pad_id
++        else:
++            pad_id = self.pad_tag_id
+         # pad_id = self.tokenizer.pad_token_id
+         # if pad_id is None:
+         #     pad_id = self.tokenizer.eos_token_id
+diff --git a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+index d059dcbe..ffb100cf 100644
+--- a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
++++ b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+@@ -265,6 +265,7 @@ class PatchEmbed(nn.Module):
+         self.embed_dim = embed_dim
+ 
+         kernel_size = (temporal_patch_size, patch_size, patch_size)
++        self.kernel_size = kernel_size
+         self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
+ 
+     def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor:
+@@ -272,7 +273,10 @@ class PatchEmbed(nn.Module):
+         hidden_states = hidden_states.view(
+             -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
+         )
+-        hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
++        
++        hidden_states = mindspore.mint.nn.functional.conv3d(hidden_states.to(dtype=target_dtype), self.proj.weight, 
++                                                            stride=self.kernel_size).view(-1, self.embed_dim)
++        
+         return hidden_states
+ 
+ 
+@@ -330,7 +334,7 @@ class VisionAttention(nn.Module):
+         v = v.swapaxes(0, 1)
+         attn_weights = ops.matmul(q, k.swapaxes(1, 2)) / math.sqrt(self.head_dim)
+         attn_weights = attn_weights + attention_mask
+-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(q.dtype)
++        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+         attn_output = ops.matmul(attn_weights, v)
+         attn_output = attn_output.swapaxes(0, 1)
+         attn_output = attn_output.reshape(seq_length, -1)
+@@ -559,7 +563,7 @@ class Qwen2VLAttention(nn.Module):
+             attn_weights = attn_weights + causal_mask
+ 
+         # upcast attention to fp32
+-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype)
++        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+         attn_output = ops.matmul(attn_weights, value_states)
+ 
+@@ -637,7 +641,7 @@ class Qwen2VLDecoderLayer(nn.Module):
+ 
+         residual = hidden_states
+ 
+-        hidden_states = self.input_layernorm(hidden_states)
++        hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.input_layernorm.weight, self.input_layernorm.variance_epsilon)
+ 
+         # Self Attention
+         hidden_states, self_attn_weights, present_key_value = self.self_attn(
+@@ -654,7 +658,7 @@ class Qwen2VLDecoderLayer(nn.Module):
+ 
+         # Fully Connected
+         residual = hidden_states
+-        hidden_states = self.post_attention_layernorm(hidden_states)
++        hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.post_attention_layernorm.weight, self.post_attention_layernorm.variance_epsilon)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+ 
+-- 
+2.47.1.windows.2
+
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch"
new file mode 100644
index 00000000..8fc3923e
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0002-rmsnorm-jit.patch"
@@ -0,0 +1,183 @@
+From a9b4c9b85f237fcae52ea396cf7be606cdf16410 Mon Sep 17 00:00:00 2001
+From: Alice <1215876379@qq.com>
+Date: Thu, 6 Nov 2025 16:39:55 +0800
+Subject: [PATCH 2/3] =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E4=BC=98=E5=8C=96?=
+ =?UTF-8?q?=EF=BC=8C=E5=8A=A0=E5=85=A5rmsnorm,jit?=
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+---
+ .../janus_pro/janus/models/clip_encoder.py    |  6 +++--
+ .../janus_pro/janus/models/modeling_vlm.py    | 27 ++++++++-----------
+ .../janus_pro/janus/models/siglip_vit.py      | 11 +++++---
+ .../janus_pro/janus/models/timm_layers.py     |  2 +-
+ .../models/llama/modeling_llama.py            |  6 ++---
+ 5 files changed, 27 insertions(+), 25 deletions(-)
+
+diff --git a/llm/inference/janus_pro/janus/models/clip_encoder.py b/llm/inference/janus_pro/janus/models/clip_encoder.py
+index a0620cfe..acb6ea3f 100644
+--- a/llm/inference/janus_pro/janus/models/clip_encoder.py
++++ b/llm/inference/janus_pro/janus/models/clip_encoder.py
+@@ -56,6 +56,7 @@ class CLIPVisionTower(nn.Module):
+         self.vision_tower, self.forward_kwargs = self.build_vision_tower(
+             vision_tower_params
+         )
++        self.vision_tower.jit()
+ 
+         if pixel_mean is not None and pixel_std is not None:
+             image_norm = Normalize(
+@@ -112,10 +113,11 @@ class CLIPVisionTower(nn.Module):
+         Returns:
+             image_features (torch.Tensor): [b, n_patch, d]
+         """
+-
++        
+         if self.image_norm is not None:
+             images = self.image_norm(images)
+-
++        
+         image_forward_outs = self.vision_tower(images, **self.forward_kwargs)
++        
+         image_features = self.feature_select(image_forward_outs)
+         return image_features
+diff --git a/llm/inference/janus_pro/janus/models/modeling_vlm.py b/llm/inference/janus_pro/janus/models/modeling_vlm.py
+index 7178c398..3d2d2d74 100644
+--- a/llm/inference/janus_pro/janus/models/modeling_vlm.py
++++ b/llm/inference/janus_pro/janus/models/modeling_vlm.py
+@@ -241,12 +241,16 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
+         Returns:
+             input_embeds (torch.Tensor): [b, T, D]
+         """
+-
++        
++        
+         bs, n = pixel_values.shape[0:2]
+         # "b n c h w -> (b n) c h w"
+         images = ops.reshape(
+             pixel_values, (bs * n, pixel_values.shape[2], pixel_values.shape[3], pixel_values.shape[4]))
+-        images_embeds = self.aligner(self.vision_model(images))
++        
++        vr = self.vision_model(images)
++        
++        images_embeds = self.aligner(vr)
+ 
+         # "(b n) t d -> b (n t) d"
+         images_embeds = ops.reshape(
+@@ -259,33 +263,24 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
+         condition = input_ids < 0
+         input_ids = (1-condition) * input_ids + condition * \
+             0  # ignore the image embeddings
++        
+         inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
++        
+ 
+         # replace with the image embeddings
+         # 627                               576
+         # inputs_embeds[images_seq_mask] = images_embeds[images_emb_mask]
+-        print("inputs_embeds:", inputs_embeds.shape)
+-        print("images_embeds[images_emb_mask].dtype", images_embeds[images_emb_mask].dtype)
+-        print("inputs_embeds.dtype", inputs_embeds.dtype)
++        
++        
+         padding_size = images_seq_mask.shape[1] - images_emb_mask.shape[1]
+         padding = Tensor(np.full((images_seq_mask.shape[0], padding_size), False), dtype=images_emb_mask.dtype)
+         padded_images_emb_mask = ops.concat((images_emb_mask, padding), dim=1)
+-        print("padded_images_emb_mask.shape:",padded_images_emb_mask.shape)
+-        print("images_embeds.shape:",images_embeds.shape)
+-        print("images_seq_mask.shape:",images_seq_mask.shape)
+         first_true = images_seq_mask.nonzero().squeeze()[0][1] # 42
+         last_true = images_seq_mask.nonzero().squeeze()[-1][1] # 42
+-        print("first_true:",first_true)
+-        print("last_true:",last_true)
+         left = inputs_embeds[:,:first_true]
+-        print(left.shape)
+         right = inputs_embeds[:, last_true+1:]
+-        print(right.shape)
+         inputs_embeds = ops.cat((left, images_embeds, right),1)
+-        print("inputs_embeds.shape:",inputs_embeds.shape)
+-        print("inputs_embeds.dtype:",inputs_embeds.dtype)
+-
+-
++        
+ 
+         # inputs_embeds = images_embeds[padded_images_emb_mask] * images_seq_mask + inputs_embeds * (1 - images_seq_mask)
+         return inputs_embeds
+diff --git a/llm/inference/janus_pro/janus/models/siglip_vit.py b/llm/inference/janus_pro/janus/models/siglip_vit.py
+index 56a6f299..d896eeb2 100644
+--- a/llm/inference/janus_pro/janus/models/siglip_vit.py
++++ b/llm/inference/janus_pro/janus/models/siglip_vit.py
+@@ -580,7 +580,11 @@ class VisionTransformer(nn.Module):
+         if return_prefix_tokens:
+             return tuple(zip(outputs, prefix_tokens))
+         return tuple(outputs)
+-
++    
++    @mindspore.jit(backend='GE')
++    def run_blocks_jit(self, x: mindspore.Tensor) -> mindspore.Tensor:
++        return self.blocks(x)
++    
+     def forward_features(self, x: mindspore.Tensor) -> mindspore.Tensor:
+         x = self.patch_embed(x)
+         x = self._pos_embed(x)
+@@ -590,10 +594,11 @@ class VisionTransformer(nn.Module):
+         #     x = checkpoint_seq(self.blocks, x)
+         # else:
+         #     x = self.blocks(x)
+-        x = self.blocks(x)
++        x = self.run_blocks_jit(x)
+         x = self.norm(x)
+         return x
+-
++    
++    @mindspore.jit(backend='GE')
+     def forward_head(self, x: mindspore.Tensor, pre_logits: bool = False) -> mindspore.Tensor:
+         if self.attn_pool is not None:
+             x = self.attn_pool(x)
+diff --git a/llm/inference/janus_pro/janus/models/timm_layers.py b/llm/inference/janus_pro/janus/models/timm_layers.py
+index 8960d256..61e68c54 100644
+--- a/llm/inference/janus_pro/janus/models/timm_layers.py
++++ b/llm/inference/janus_pro/janus/models/timm_layers.py
+@@ -46,7 +46,7 @@ class Mlp(nn.Module):
+ 
+     def forward(self, x):
+         x = self.fc1(x)
+-        x = self.act(x)
++        x = mindspore.ops.gelu(x)
+         x = self.drop1(x)
+         x = self.norm(x)
+         x = self.fc2(x)
+diff --git a/mindnlp/transformers/models/llama/modeling_llama.py b/mindnlp/transformers/models/llama/modeling_llama.py
+index 9c5cb555..c8c55492 100644
+--- a/mindnlp/transformers/models/llama/modeling_llama.py
++++ b/mindnlp/transformers/models/llama/modeling_llama.py
+@@ -429,7 +429,7 @@ class LlamaAttention(nn.Module):
+             attn_weights = attn_weights + causal_mask
+ 
+         # upcast attention to fp32
+-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype)
++        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+         attn_output = ops.matmul(attn_weights, value_states)
+ 
+@@ -508,7 +508,7 @@ class LlamaDecoderLayer(nn.Module):
+         """
+         residual = hidden_states
+ 
+-        hidden_states = self.input_layernorm(hidden_states)
++        hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.input_layernorm.weight, self.input_layernorm.variance_epsilon)
+ 
+         # Self Attention
+         hidden_states, self_attn_weights, present_key_value = self.self_attn(
+@@ -526,7 +526,7 @@ class LlamaDecoderLayer(nn.Module):
+ 
+         # Fully Connected
+         residual = hidden_states
+-        hidden_states = self.post_attention_layernorm(hidden_states)
++        hidden_states, _ = mindspore.ops.rms_norm(hidden_states, self.post_attention_layernorm.weight, self.post_attention_layernorm.variance_epsilon)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+ 
+-- 
+2.47.1.windows.2
+
diff --git "a/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch" "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch"
new file mode 100644
index 00000000..c60c9b59
--- /dev/null
+++ "b/2025-Ascend-Innovation-Contest/S1/MultiModal/\345\245\275\346\240\221\350\212\261\347\224\237/patches/0003-janus-llama-qwen2_vl.patch"
@@ -0,0 +1,286 @@
+From dba3c2bd641feb6526a82881e1ef395890711a9d Mon Sep 17 00:00:00 2001
+From: Alice <1215876379@qq.com>
+Date: Fri, 7 Nov 2025 00:13:55 +0800
+Subject: [PATCH 3/3] =?UTF-8?q?=E5=8F=82=E8=80=83janus=E4=B8=AD=E7=9A=84ll?=
+ =?UTF-8?q?ama=EF=BC=8C=E4=BC=98=E5=8C=96qwen2=5Fvl=E7=9A=84=E6=8E=A5?=
+ =?UTF-8?q?=E5=8F=A3=E8=B0=83=E7=94=A8?=
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+---
+ .../models/qwen2_vl/modeling_qwen2_vl.py      | 104 +++++++++---------
+ 1 file changed, 53 insertions(+), 51 deletions(-)
+
+diff --git a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+index ffb100cf..4ed07c37 100644
+--- a/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
++++ b/mindnlp/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+@@ -151,7 +151,7 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+             self.max_seq_len_cached = self.original_max_seq_len
+ 
+     @no_grad()
+-    def forward(self, x, position_ids):
++    def forward(self, x, position_ids, mrope_section, unsqueeze_dim=1):
+         if "dynamic" in self.rope_type:
+             self._dynamic_frequency_update(position_ids)
+ 
+@@ -161,7 +161,7 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
+         # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+         # with ops.autocast(device_type=device_type, enabled=False):
+-        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).swapaxes(2, 3)
++        freqs = ops.transpose((inv_freq_expanded.float() @ position_ids_expanded.float()), 2, 3)
+         emb = ops.cat((freqs, freqs), dim=-1)
+         cos = emb.cos()
+         sin = emb.sin()
+@@ -169,6 +169,14 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+         # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+         cos = cos * self.attention_scaling
+         sin = sin * self.attention_scaling
++        
++        mrope_section = mrope_section * 2
++        cos = ops.unsqueeze(ops.cat([m[i % 3] for i, m in enumerate(ops.split(cos, mrope_section, dim=-1))], dim=-1),
++            unsqueeze_dim
++        )
++        sin = ops.unsqueeze(ops.cat([m[i % 3] for i, m in enumerate(ops.split(sin, mrope_section, dim=-1))], dim=-1),
++            unsqueeze_dim
++        )
+ 
+         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+ 
+@@ -176,12 +184,11 @@ class Qwen2VLRotaryEmbedding(nn.Module):
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
+ def rotate_half(x):
+     """Rotates half the hidden dims of the input."""
+-    x1 = x[..., : x.shape[-1] // 2]
+-    x2 = x[..., x.shape[-1] // 2 :]
++    x1, x2 = ops.split(x, x.shape[-1] // 2, dim=-1)
+     return ops.cat((-x2, x1), dim=-1)
+ 
+ 
+-def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
++def apply_multimodal_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+     """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
+ 
+     Explanation:
+@@ -213,14 +220,6 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
+     Returns:
+         `tuple(mindspore.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+     """
+-    mrope_section = mrope_section * 2
+-    cos = ops.cat([m[i % 3] for i, m in enumerate(ops.split(cos, mrope_section, dim=-1))], dim=-1).unsqueeze(
+-        unsqueeze_dim
+-    )
+-    sin = ops.cat([m[i % 3] for i, m in enumerate(ops.split(sin, mrope_section, dim=-1))], dim=-1).unsqueeze(
+-        unsqueeze_dim
+-    )
+-
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+     return q_embed, k_embed
+@@ -229,10 +228,10 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
+ def apply_rotary_pos_emb_vision(tensor: mindspore.Tensor, freqs: mindspore.Tensor) -> mindspore.Tensor:
+     orig_dtype = tensor.dtype
+     tensor = tensor.float()
+-    cos = freqs.cos()
+-    sin = freqs.sin()
+-    cos = cos.unsqueeze(1).tile((1, 1, 2)).unsqueeze(0).float()
+-    sin = sin.unsqueeze(1).tile((1, 1, 2)).unsqueeze(0).float()
++    cos = ops.cos(freqs)
++    sin = ops.sin(freqs)
++    cos = ops.unsqueeze(ops.unsqueeze(cos,1).tile((1, 1, 2)), 0).float()
++    sin = ops.unsqueeze(ops.unsqueeze(sin, 1).tile((1, 1, 2)), 0).float()
+     output = (tensor * cos) + (rotate_half(tensor) * sin)
+     output = output.to(orig_dtype)
+     return output
+@@ -316,28 +315,22 @@ class VisionAttention(nn.Module):
+         self.proj = nn.Linear(dim, dim)
+ 
+     def forward(
+-        self, hidden_states: mindspore.Tensor, cu_seqlens: mindspore.Tensor, rotary_pos_emb: mindspore.Tensor = None
++        self, hidden_states: mindspore.Tensor, attention_mask: mindspore.Tensor, rotary_pos_emb: mindspore.Tensor = None
+     ) -> mindspore.Tensor:
+         seq_length = hidden_states.shape[0]
+-        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
++        q, k, v = mindspore.mint.reshape(self.qkv(hidden_states), (seq_length, 3, self.num_heads, -1)).permute(1, 0, 2, 3).unbind(0)
+         q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
+         k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
+ 
+-        attention_mask = ops.full(
+-            [1, seq_length, seq_length], float(ops.finfo(q.dtype).min), dtype=q.dtype
+-        )
+-        for i in range(1, len(cu_seqlens)):
+-            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+-
+-        q = q.swapaxes(0, 1)
+-        k = k.swapaxes(0, 1)
+-        v = v.swapaxes(0, 1)
+-        attn_weights = ops.matmul(q, k.swapaxes(1, 2)) / math.sqrt(self.head_dim)
++        q = ops.transpose(q, 0, 1)
++        k = ops.transpose(k, 0, 1)
++        v = ops.transpose(v, 0, 1)
++        attn_weights = ops.matmul(q, ops.transpose(k, 1, 2)) / math.sqrt(self.head_dim)
+         attn_weights = attn_weights + attention_mask
+         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+         attn_output = ops.matmul(attn_weights, v)
+-        attn_output = attn_output.swapaxes(0, 1)
+-        attn_output = attn_output.reshape(seq_length, -1)
++        attn_output = ops.transpose(attn_output, 0, 1)
++        attn_output = mindspore.mint.reshape(attn_output, (seq_length, -1))
+         attn_output = self.proj(attn_output)
+         return attn_output
+ 
+@@ -358,9 +351,9 @@ class Qwen2VLVisionBlock(nn.Module):
+         )
+         self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
+ 
+-    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> mindspore.Tensor:
++    def forward(self, hidden_states, attention_mask, rotary_pos_emb) -> mindspore.Tensor:
+         hidden_states = hidden_states + self.attn(
+-            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
++            self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb
+         )
+         hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+         return hidden_states
+@@ -403,7 +396,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
+         causal_mask = ops.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
+         if sequence_length != 1:
+             causal_mask = ops.triu(causal_mask, diagonal=1)
+-        causal_mask *= ops.arange(target_length) > cache_position.reshape(-1, 1)
++        causal_mask *= ops.arange(target_length) > mindspore.mint.reshape(cache_position, (-1, 1))
+         causal_mask = causal_mask[None, None, :, :].broadcast_to((batch_size, 1, -1, -1))
+         if attention_mask is not None:
+             causal_mask = causal_mask.copy()  # copy to contiguous memory for in-place edit
+@@ -462,8 +455,8 @@ def repeat_kv(hidden_states: mindspore.Tensor, n_rep: int) -> mindspore.Tensor:
+     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+     if n_rep == 1:
+         return hidden_states
+-    hidden_states = hidden_states[:, :, None, :, :].broadcast_to((batch, num_key_value_heads, n_rep, slen, head_dim))
+-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
++    hidden_states = ops.broadcast_to(ops.unsqueeze(hidden_states, 2), (batch, num_key_value_heads, n_rep, slen, head_dim))
++    return mindspore.mint.reshape(hidden_states, (batch, num_key_value_heads * n_rep, slen, head_dim))
+ 
+ 
+ class Qwen2VLAttention(nn.Module):
+@@ -527,9 +520,9 @@ class Qwen2VLAttention(nn.Module):
+         key_states = self.k_proj(hidden_states)
+         value_states = self.v_proj(hidden_states)
+ 
+-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
+-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
+-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
++        query_states = ops.transpose(query_states.view(bsz, q_len, self.num_heads, self.head_dim), 1, 2)
++        key_states = ops.transpose(key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2)
++        value_states = ops.transpose(value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2)
+ 
+         kv_seq_len = key_states.shape[-2]
+         if past_key_value is not None:
+@@ -545,7 +538,7 @@ class Qwen2VLAttention(nn.Module):
+         else:
+             cos, sin = position_embeddings
+         query_states, key_states = apply_multimodal_rotary_pos_emb(
+-            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
++            query_states, key_states, cos, sin
+         )
+ 
+         if past_key_value is not None:
+@@ -556,10 +549,10 @@ class Qwen2VLAttention(nn.Module):
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+ 
+-        attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
++        attn_weights = ops.matmul(query_states, ops.transpose(key_states, 2, 3)) / math.sqrt(self.head_dim)
+ 
+         if attention_mask is not None:  # no matter the length, we just slice it
+-            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
++            causal_mask = ops.narrow(attention_mask, 3, 0, key_states.shape[-2])
+             attn_weights = attn_weights + causal_mask
+ 
+         # upcast attention to fp32
+@@ -573,8 +566,8 @@ class Qwen2VLAttention(nn.Module):
+                 f" {attn_output.shape}"
+             )
+ 
+-        attn_output = attn_output.swapaxes(1, 2)
+-        attn_output = attn_output.reshape(bsz, q_len, -1)
++        attn_output = ops.transpose(attn_output, 1, 2)
++        attn_output = mindspore.mint.reshape(attn_output, (bsz, q_len, -1))
+ 
+         attn_output = self.o_proj(attn_output)
+ 
+@@ -727,22 +720,22 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
+         for t, h, w in grid_thw:
+             t, h, w = t.item(), h.item(), w.item()
+             hpos_ids = ops.arange(h).unsqueeze(1).broadcast_to((-1, w))
+-            hpos_ids = hpos_ids.reshape(
++            hpos_ids = mindspore.mint.reshape(hpos_ids, (
+                 h // self.spatial_merge_size,
+                 self.spatial_merge_size,
+                 w // self.spatial_merge_size,
+                 self.spatial_merge_size,
+-            )
++            ))
+             hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+             hpos_ids = hpos_ids.flatten()
+ 
+             wpos_ids = ops.arange(w).unsqueeze(0).broadcast_to((h, -1))
+-            wpos_ids = wpos_ids.reshape(
++            wpos_ids = mindspore.mint.reshape(wpos_ids, (
+                 h // self.spatial_merge_size,
+                 self.spatial_merge_size,
+                 w // self.spatial_merge_size,
+                 self.spatial_merge_size,
+-            )
++            ))
+             wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+             wpos_ids = wpos_ids.flatten()
+             pos_ids.append(ops.stack([hpos_ids, wpos_ids], dim=-1).tile((t, 1)))
+@@ -761,9 +754,16 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
+             dim=0, dtype=mindspore.int32
+         )
+         cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
++        
++        seq_length = hidden_states.shape[0]
++        attention_mask = ops.full(
++            [1, seq_length, seq_length], float(ops.finfo(hidden_states.dtype).min), dtype=hidden_states.dtype
++        )
++        for i in range(1, len(cu_seqlens)):
++            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+ 
+         for blk in self.blocks:
+-            hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
++            hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
+ 
+         return self.merger(hidden_states)
+ 
+@@ -785,6 +785,8 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
+         self.gradient_checkpointing = False
+         # Initialize weights and apply final processing
+         self.post_init()
++        
++        self.rope_scaling = config.rope_scaling
+ 
+     def get_input_embeddings(self):
+         return self.embed_tokens
+@@ -847,7 +849,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
+         hidden_states = inputs_embeds
+ 
+         # create position embeddings to be shared across the decoder layers
+-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
++        position_embeddings = self.rotary_emb(hidden_states, position_ids, self.rope_scaling["mrope_section"])
+ 
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+@@ -1120,7 +1122,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
+                     text_len = len(input_tokens) - st
+                     llm_pos_ids_list.append(ops.arange(text_len).view(1, -1).broadcast_to((3, -1)) + st_idx)
+ 
+-                llm_positions = ops.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
++                llm_positions = mindspore.mint.reshape(ops.cat(llm_pos_ids_list, dim=1), (3, -1))
+                 position_ids[..., i, attention_mask[i] == 1] = llm_positions
+                 mrope_position_deltas.append(llm_positions.max().item() + 1 - len(total_input_ids[i]))
+             mrope_position_deltas = mindspore.tensor(mrope_position_deltas).unsqueeze(1)
+-- 
+2.47.1.windows.2
+