From 9d8e2723f370874f40bd8e99944f98af379d76b1 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Tue, 23 Dec 2025 14:11:41 -0800 Subject: [PATCH 01/26] Pick model runner change related to PR30475. Also overwrite qwen3_vl function to use _merge_multimodal_embeddings with index copy. --- vllm_gaudi/models/__init__.py | 5 ++- vllm_gaudi/models/utils.py | 3 +- vllm_gaudi/v1/worker/hpu_model_runner.py | 45 ++++++++++++++---------- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/vllm_gaudi/models/__init__.py b/vllm_gaudi/models/__init__.py index c38c1af6f..27bce1048 100644 --- a/vllm_gaudi/models/__init__.py +++ b/vllm_gaudi/models/__init__.py @@ -1,6 +1,5 @@ from vllm.model_executor.models.registry import ModelRegistry - def register_model(): from vllm_gaudi.models.gemma3_mm import HpuGemma3ForConditionalGeneration # noqa: F401 @@ -11,3 +10,7 @@ def register_model(): from vllm_gaudi.models.qwen2_5_vl import HpuQwen2_5_VLForConditionalGeneration # noqa: F401 ModelRegistry.register_model("Qwen2_5_VLForConditionalGeneration", "vllm_gaudi.models.qwen2_5_vl:HpuQwen2_5_VLForConditionalGeneration") + + from vllm_gaudi.models.qwen3_vl import HpuQwen3_VLForConditionalGeneration # noqa: F401 + ModelRegistry.register_model("Qwen3_VLForConditionalGeneration", + "vllm_gaudi.models.qwen3_vl:HpuQwen3_VLForConditionalGeneration") diff --git a/vllm_gaudi/models/utils.py b/vllm_gaudi/models/utils.py index c1bbfd0e4..03d67119d 100644 --- a/vllm_gaudi/models/utils.py +++ b/vllm_gaudi/models/utils.py @@ -3,7 +3,6 @@ from vllm.model_executor.models import utils from vllm.model_executor.models.utils import (_embedding_count_expression, _flatten_embeddings) - # TODO: Replaced masked_scatter with torch.where to avoid HPU performance issues # with non_zero_i8 ops in TPC kernel. However, torch.where creates dynamic operations # causing recompilation on each run. Need to find a static operation alternative. 
@@ -29,6 +28,8 @@ def _merge_multimodal_embeddings( mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) input_dtype = inputs_embeds.dtype + if is_multimodal.dtype == torch.int64: + return inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) try: # For debugging # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 21c729e71..1f18e9d5f 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -75,7 +75,7 @@ from vllm.model_executor.models.interfaces_base import (VllmModelForPooling, is_pooling_model, is_text_generation_model) from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.transformers_utils.config import is_interleaved -from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) +from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata @@ -665,7 +665,7 @@ def __init__( self.head_size = self.model_config.get_head_size() self.hidden_size = self.model_config.get_hidden_size() self.is_pooling_model = (model_config.runner_type == 'pooling') - logger.debug("model config: ", self.model_config) + logger.debug("model config: %s", self.model_config) self.attn_backend = get_attn_backend( self.head_size, @@ -1247,12 +1247,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput", req_ids: list if req_id not in self.encoder_cache: self.encoder_cache[req_id] = {} - self.encoder_cache[mm_hash] = scatter_mm_placeholders( - output, - is_embed=pos_info.is_embed.to( - device=output.device) if pos_info.is_embed is not None else pos_info.is_embed, - ) - + self.encoder_cache[mm_hash] = output # modified from: vllm/v1/worker/gpu_model_runner.py def _gather_mm_embeddings( self, @@ -1269,6 +1264,7 @@ def _gather_mm_embeddings( req_start_idx = 0 for req_id in req_ids: + mm_embeds_req: list[torch.Tensor] = [] num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id] req_state = self.requests[req_id] num_computed_tokens = \ @@ -1293,6 +1289,13 @@ def _gather_mm_embeddings( start_idx = max(num_computed_tokens - start_pos, 0) end_idx = min(num_computed_tokens - start_pos + num_scheduled_tokens, num_encoder_tokens) assert start_idx < end_idx + curr_embeds_start, curr_embeds_end = ( + pos_info.get_embeds_indices_in_range(start_idx, end_idx) + ) + # If there are no embeddings in the current range, we skip + # gathering the embeddings. 
+ if curr_embeds_start == curr_embeds_end: + continue mm_hash = mm_feature.identifier encoder_output = self.encoder_cache.get(mm_hash, None) assert encoder_output is not None,\ @@ -1301,21 +1304,24 @@ def _gather_mm_embeddings( if (is_embed := pos_info.is_embed) is not None: is_embed = is_embed[start_idx:end_idx] + mm_embeds_item = encoder_output[curr_embeds_start:curr_embeds_end] + else: + mm_embeds_item = encoder_output[start_idx:end_idx] - mm_embeds_item = gather_mm_placeholders( - encoder_output[start_idx:end_idx], - is_embed=is_embed, - ) req_start_pos = req_start_idx + start_pos - num_computed_tokens - is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ - = True - - # Only whole mm items are processed - mm_embeds.append(mm_embeds_item) + is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = ( + True if is_embed is None else is_embed + ) + mm_embeds_req.append(mm_embeds_item) + mm_embeds.extend(mm_embeds_req) req_start_idx += num_scheduled_tokens - is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) - + # Convert bool tensor to index tensor for merge embedding statically if optimized mm + if self.uses_mrope: + is_mm_embed_index = torch.nonzero(is_mm_embed[:total_num_scheduled_tokens], as_tuple=True)[0] + is_mm_embed = is_mm_embed_index.to(self.device) + else: + is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) return mm_embeds, is_mm_embed def _get_model_mm_inputs( @@ -3710,6 +3716,7 @@ def load_model(self) -> None: self._maybe_compile(self.model) self.model_memory_usage = m.consumed_device_memory logger.info("Compilation took %.4f GB", self.model_memory_usage / float(2**30)) + self.is_mm_optimized = is_mm_optimized(self.model) def _maybe_compile(self, *args, **kwargs): """Entrypoint for a torch.compilation of the model""" From 49d76333802cf5d2d8900f90f907d58a86771d0f Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Tue, 23 Dec 2025 15:03:52 -0800 Subject: [PATCH 02/26] add qwen3_vl.py functions --- vllm_gaudi/models/qwen3_vl.py | 89 +++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 vllm_gaudi/models/qwen3_vl.py diff --git a/vllm_gaudi/models/qwen3_vl.py b/vllm_gaudi/models/qwen3_vl.py new file mode 100644 index 000000000..85f7a8745 --- /dev/null +++ b/vllm_gaudi/models/qwen3_vl.py @@ -0,0 +1,89 @@ +import torch +from .utils import _merge_multimodal_embeddings +from vllm.model_executor.models.interfaces import MultiModalEmbeddings +from vllm.model_executor.models.qwen3_vl import Qwen3VLForConditionalGeneration + +class HpuQwen3_VLForConditionalGeneration(Qwen3VLForConditionalGeneration): + def _compute_deepstack_embeds( + self, + inputs_embeds: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings, + is_multimodal: torch.Tensor, + ) -> tuple[torch.Tensor, MultiModalEmbeddings]: + visual_lens = [len(x) for x in multimodal_embeddings] + multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0) + + ( + multimodal_embeddings_main, + multimodal_embeddings_multiscale, + ) = torch.split( + multimodal_embeddings_cat, + [self.visual_dim, self.multiscale_dim], + dim=-1, + ) + + multimodal_embeddings = torch.split( + multimodal_embeddings_main, visual_lens, dim=0 + ) + multimodal_embeddings_multiscale = torch.split( + multimodal_embeddings_multiscale, visual_lens, dim=0 + ) + + deepstack_input_embeds = inputs_embeds.new_zeros( + inputs_embeds.size(0), self.deepstack_num_level * inputs_embeds.size(1) + ) + + deepstack_input_embeds = _merge_multimodal_embeddings( + 
inputs_embeds=deepstack_input_embeds, + multimodal_embeddings=multimodal_embeddings_multiscale, + is_multimodal=is_multimodal, + ) + deepstack_input_embeds = deepstack_input_embeds.view( + inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim + ) + deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2) + + return deepstack_input_embeds, multimodal_embeddings + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = False, + ) -> torch.Tensor: + inputs_embeds = self._embed_text_input_ids( + input_ids, + self.language_model.embed_input_ids, + is_multimodal=is_multimodal, + handle_oov_mm_token=handle_oov_mm_token, + ) + + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: + return inputs_embeds + + is_multimodal = _require_is_multimodal(is_multimodal) + + if self.use_deepstack: + ( + deepstack_input_embeds, + multimodal_embeddings, + ) = self._compute_deepstack_embeds( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) + else: + deepstack_input_embeds = None + + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) + + if deepstack_input_embeds is not None: + self._set_deepstack_input_embeds(deepstack_input_embeds) + + return inputs_embeds From c6526de66ec87f384aebb2d335741d84c00cc643 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Tue, 23 Dec 2025 22:30:14 -0800 Subject: [PATCH 03/26] precomit fix --- vllm_gaudi/v1/worker/hpu_model_runner.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 1f18e9d5f..1b1947b57 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -1289,9 +1289,7 @@ def _gather_mm_embeddings( start_idx = max(num_computed_tokens - start_pos, 0) end_idx = min(num_computed_tokens - start_pos + num_scheduled_tokens, num_encoder_tokens) assert start_idx < end_idx - curr_embeds_start, curr_embeds_end = ( - pos_info.get_embeds_indices_in_range(start_idx, end_idx) - ) + curr_embeds_start, curr_embeds_end = (pos_info.get_embeds_indices_in_range(start_idx, end_idx)) # If there are no embeddings in the current range, we skip # gathering the embeddings. 
if curr_embeds_start == curr_embeds_end: @@ -1309,9 +1307,8 @@ def _gather_mm_embeddings( mm_embeds_item = encoder_output[start_idx:end_idx] req_start_pos = req_start_idx + start_pos - num_computed_tokens - is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = ( - True if is_embed is None else is_embed - ) + is_mm_embed[req_start_pos + start_idx : req_start_pos + + end_idx] = (True if is_embed is None else is_embed) mm_embeds_req.append(mm_embeds_item) mm_embeds.extend(mm_embeds_req) req_start_idx += num_scheduled_tokens From 7c6329efc6589bef0121e6a59e45e92857bc94ee Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Wed, 24 Dec 2025 23:04:56 -0800 Subject: [PATCH 04/26] precommit fix and fix use_window_sdpa --- vllm_gaudi/extension/ops.py | 1 + vllm_gaudi/models/__init__.py | 1 + vllm_gaudi/models/qwen3_vl.py | 24 ++++++++++-------------- vllm_gaudi/models/utils.py | 1 + vllm_gaudi/v1/worker/hpu_model_runner.py | 8 ++++++-- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py index a40388d3e..1daf8ea41 100644 --- a/vllm_gaudi/extension/ops.py +++ b/vllm_gaudi/extension/ops.py @@ -364,6 +364,7 @@ def _fsdpa_prompt_attention(query: torch.Tensor, query, key, value, attn_bias, 0.0, is_causal, scale, softmax_mode, recompute_mode, valid_seq_lengths, padding_side ] + args += [window_size] if window_size else [] attn_weights = fsdpa_op(*args) diff --git a/vllm_gaudi/models/__init__.py b/vllm_gaudi/models/__init__.py index 27bce1048..df5cb1b78 100644 --- a/vllm_gaudi/models/__init__.py +++ b/vllm_gaudi/models/__init__.py @@ -1,5 +1,6 @@ from vllm.model_executor.models.registry import ModelRegistry + def register_model(): from vllm_gaudi.models.gemma3_mm import HpuGemma3ForConditionalGeneration # noqa: F401 diff --git a/vllm_gaudi/models/qwen3_vl.py b/vllm_gaudi/models/qwen3_vl.py index 85f7a8745..49b388713 100644 --- a/vllm_gaudi/models/qwen3_vl.py +++ b/vllm_gaudi/models/qwen3_vl.py @@ -2,8 +2,10 @@ from .utils import _merge_multimodal_embeddings from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.qwen3_vl import Qwen3VLForConditionalGeneration - + + class HpuQwen3_VLForConditionalGeneration(Qwen3VLForConditionalGeneration): + def _compute_deepstack_embeds( self, inputs_embeds: torch.Tensor, @@ -22,29 +24,23 @@ def _compute_deepstack_embeds( dim=-1, ) - multimodal_embeddings = torch.split( - multimodal_embeddings_main, visual_lens, dim=0 - ) - multimodal_embeddings_multiscale = torch.split( - multimodal_embeddings_multiscale, visual_lens, dim=0 - ) + multimodal_embeddings = torch.split(multimodal_embeddings_main, visual_lens, dim=0) + multimodal_embeddings_multiscale = torch.split(multimodal_embeddings_multiscale, visual_lens, dim=0) - deepstack_input_embeds = inputs_embeds.new_zeros( - inputs_embeds.size(0), self.deepstack_num_level * inputs_embeds.size(1) - ) + deepstack_input_embeds = inputs_embeds.new_zeros(inputs_embeds.size(0), + self.deepstack_num_level * inputs_embeds.size(1)) deepstack_input_embeds = _merge_multimodal_embeddings( inputs_embeds=deepstack_input_embeds, multimodal_embeddings=multimodal_embeddings_multiscale, is_multimodal=is_multimodal, ) - deepstack_input_embeds = deepstack_input_embeds.view( - inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim - ) + deepstack_input_embeds = deepstack_input_embeds.view(inputs_embeds.shape[0], self.deepstack_num_level, + self.visual_dim) deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 
2) return deepstack_input_embeds, multimodal_embeddings - + def embed_input_ids( self, input_ids: torch.Tensor, diff --git a/vllm_gaudi/models/utils.py b/vllm_gaudi/models/utils.py index 03d67119d..38dbc92c0 100644 --- a/vllm_gaudi/models/utils.py +++ b/vllm_gaudi/models/utils.py @@ -3,6 +3,7 @@ from vllm.model_executor.models import utils from vllm.model_executor.models.utils import (_embedding_count_expression, _flatten_embeddings) + # TODO: Replaced masked_scatter with torch.where to avoid HPU performance issues # with non_zero_i8 ops in TPC kernel. However, torch.where creates dynamic operations # causing recompilation on each run. Need to find a static operation alternative. diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 1b1947b57..21e8ad408 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -1248,6 +1248,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput", req_ids: list self.encoder_cache[req_id] = {} self.encoder_cache[mm_hash] = output + # modified from: vllm/v1/worker/gpu_model_runner.py def _gather_mm_embeddings( self, @@ -1307,7 +1308,7 @@ def _gather_mm_embeddings( mm_embeds_item = encoder_output[start_idx:end_idx] req_start_pos = req_start_idx + start_pos - num_computed_tokens - is_mm_embed[req_start_pos + start_idx : req_start_pos + + is_mm_embed[req_start_pos + start_idx:req_start_pos + end_idx] = (True if is_embed is None else is_embed) mm_embeds_req.append(mm_embeds_item) mm_embeds.extend(mm_embeds_req) @@ -1316,6 +1317,9 @@ def _gather_mm_embeddings( # Convert bool tensor to index tensor for merge embedding statically if optimized mm if self.uses_mrope: is_mm_embed_index = torch.nonzero(is_mm_embed[:total_num_scheduled_tokens], as_tuple=True)[0] + # Bounds validation on CPU + if len(is_mm_embed_index) > 0 and is_mm_embed_index.max() >= total_num_scheduled_tokens: + raise ValueError(f"Index {is_mm_embed_index.max()} exceeds tensor size {total_num_scheduled_tokens}") is_mm_embed = is_mm_embed_index.to(self.device) else: is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) @@ -5321,7 +5325,7 @@ def __init__( if self.interleaved_sliding_window: self.use_window_sdpa = with_default(get_config().PT_HPU_SDPA_QKV_SLICE_MODE_FWD, False) #os.getenv("PT_HPU_SDPA_QKV_SLICE_MODE_FWD", "false").strip().lower() in ("1", "true") - self.slice_size = with_default(get_config().PT_HPU_SDPA_BC_FACTOR, False) + self.slice_size = int(with_default(get_config().PT_HPU_SDPA_BC_FACTOR, "1024")) # int(os.getenv("PT_HPU_SDPA_BC_FACTOR", "1024")) self.slice_thld = int(os.environ.get('VLLM_FUSEDSDPA_SLIDE_THLD', '8192')) From bff3cf58f11d7f2678daf8f129b6792f6a4f73d3 Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Mon, 29 Dec 2025 10:21:47 +0100 Subject: [PATCH 05/26] Update qwen3_vl.py Format update --- vllm_gaudi/models/qwen3_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/models/qwen3_vl.py b/vllm_gaudi/models/qwen3_vl.py index 49b388713..d186464db 100644 --- a/vllm_gaudi/models/qwen3_vl.py +++ b/vllm_gaudi/models/qwen3_vl.py @@ -28,7 +28,7 @@ def _compute_deepstack_embeds( multimodal_embeddings_multiscale = torch.split(multimodal_embeddings_multiscale, visual_lens, dim=0) deepstack_input_embeds = inputs_embeds.new_zeros(inputs_embeds.size(0), - self.deepstack_num_level * inputs_embeds.size(1)) + self.deepstack_num_level * inputs_embeds.size(1)) deepstack_input_embeds = _merge_multimodal_embeddings( 
inputs_embeds=deepstack_input_embeds, From 625d9c28c0451f876d82d15a4b7774cc1113126f Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Mon, 29 Dec 2025 10:22:29 +0100 Subject: [PATCH 06/26] Update qwen3_vl.py format --- vllm_gaudi/models/qwen3_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/models/qwen3_vl.py b/vllm_gaudi/models/qwen3_vl.py index d186464db..7499ee743 100644 --- a/vllm_gaudi/models/qwen3_vl.py +++ b/vllm_gaudi/models/qwen3_vl.py @@ -36,7 +36,7 @@ def _compute_deepstack_embeds( is_multimodal=is_multimodal, ) deepstack_input_embeds = deepstack_input_embeds.view(inputs_embeds.shape[0], self.deepstack_num_level, - self.visual_dim) + self.visual_dim) deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2) return deepstack_input_embeds, multimodal_embeddings From bb3ac24cfd0b58b08f1225caffcbbdedf3fd53a5 Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Tue, 30 Dec 2025 11:27:58 +0100 Subject: [PATCH 07/26] Update qwen3_vl.py update for pre-commit error: vllm_gaudi/models/qwen3_vl.py:62:25: F821 Undefined name `_require_is_multimodal` --- vllm_gaudi/models/qwen3_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_gaudi/models/qwen3_vl.py b/vllm_gaudi/models/qwen3_vl.py index 7499ee743..394d559a3 100644 --- a/vllm_gaudi/models/qwen3_vl.py +++ b/vllm_gaudi/models/qwen3_vl.py @@ -2,6 +2,7 @@ from .utils import _merge_multimodal_embeddings from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.qwen3_vl import Qwen3VLForConditionalGeneration +from vllm.model_executor.models.interfaces import _require_is_multimodal class HpuQwen3_VLForConditionalGeneration(Qwen3VLForConditionalGeneration): From 48a96db5280cb4d1413173694cf3ce274dc5c55e Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 15 Jan 2026 11:26:53 -0800 Subject: [PATCH 08/26] fix test failure --- vllm_gaudi/models/interfaces.py | 2 +- vllm_gaudi/models/utils.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm_gaudi/models/interfaces.py b/vllm_gaudi/models/interfaces.py index a4ea2327e..f45d0263f 100644 --- a/vllm_gaudi/models/interfaces.py +++ b/vllm_gaudi/models/interfaces.py @@ -38,4 +38,4 @@ def _embed_text_input_ids( return embed_input_ids(input_ids) -SupportsMultiModal._embed_text_input_ids = _embed_text_input_ids +#SupportsMultiModal._embed_text_input_ids = _embed_text_input_ids diff --git a/vllm_gaudi/models/utils.py b/vllm_gaudi/models/utils.py index 38dbc92c0..031334523 100644 --- a/vllm_gaudi/models/utils.py +++ b/vllm_gaudi/models/utils.py @@ -34,15 +34,11 @@ def _merge_multimodal_embeddings( try: # For debugging # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) - # htcore.mark_step() + # NOTE: This can avoid D2H sync (#22105), but fails to # raise an error if is_multimodal.sum() < len(mm_embeds_flat) - # inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), - # mm_embeds_flat.to(dtype=input_dtype)) - - multimodal_positions = torch.where(is_multimodal)[0][:mm_embeds_flat.shape[0]] - inputs_embeds[0, multimodal_positions] = mm_embeds_flat.to(dtype=input_dtype) - + inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), + mm_embeds_flat.to(dtype=input_dtype)) except RuntimeError as e: num_actual_tokens = len(mm_embeds_flat) num_expected_tokens = is_multimodal.sum().item() From db1054888d60227cae0ed42d881cd7c3c031d397 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 15 Jan 2026 11:36:14 -0800 Subject: [PATCH 09/26] fix precommit issue --- 
vllm_gaudi/models/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_gaudi/models/utils.py b/vllm_gaudi/models/utils.py index 031334523..e7c090040 100644 --- a/vllm_gaudi/models/utils.py +++ b/vllm_gaudi/models/utils.py @@ -37,8 +37,7 @@ def _merge_multimodal_embeddings( # NOTE: This can avoid D2H sync (#22105), but fails to # raise an error if is_multimodal.sum() < len(mm_embeds_flat) - inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), - mm_embeds_flat.to(dtype=input_dtype)) + inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), mm_embeds_flat.to(dtype=input_dtype)) except RuntimeError as e: num_actual_tokens = len(mm_embeds_flat) num_expected_tokens = is_multimodal.sum().item() From 40d7635620c3d149e3187c5ceae13d5348024011 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 15 Jan 2026 15:06:09 -0800 Subject: [PATCH 10/26] Update interfaces.py for precommit fix --- vllm_gaudi/models/interfaces.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_gaudi/models/interfaces.py b/vllm_gaudi/models/interfaces.py index f45d0263f..26ead7f83 100644 --- a/vllm_gaudi/models/interfaces.py +++ b/vllm_gaudi/models/interfaces.py @@ -1,7 +1,6 @@ from collections.abc import Callable import torch from torch import Tensor -from vllm.model_executor.models.interfaces import SupportsMultiModal def _embed_text_input_ids( From e23e6d2077b4f86645b78859f6b265be4726b7ad Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 15 Jan 2026 17:41:30 -0800 Subject: [PATCH 11/26] Update hpu_model_runner.py to match with upstream for MultiModalBudget --- vllm_gaudi/v1/worker/hpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 7ff61d9d7..8df95edde 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -4616,8 +4616,7 @@ def warmup_multimodal_graphs(self, buckets): phase = 'Graph/Multimodal' from vllm.v1.worker.utils import MultiModalBudget self.mm_budget = MultiModalBudget( - self.model_config, - self.scheduler_config, + self.vllm_config, self.mm_registry, ) if self.supports_mm_inputs else None aspect_ratios = [(1, 1)] # 1:1 square From 4089adff19fc67b650bfa0bd06642a10b3780d3f Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Mon, 19 Jan 2026 14:06:50 -0800 Subject: [PATCH 12/26] Update qwen3_vl.py for precommit fix --- vllm_gaudi/models/qwen3_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/models/qwen3_vl.py b/vllm_gaudi/models/qwen3_vl.py index eb127cb46..87089c26c 100644 --- a/vllm_gaudi/models/qwen3_vl.py +++ b/vllm_gaudi/models/qwen3_vl.py @@ -176,4 +176,4 @@ def embed_input_ids( if deepstack_input_embeds is not None: self._set_deepstack_input_embeds(deepstack_input_embeds) - return inputs_embeds \ No newline at end of file + return inputs_embeds From 79d90a416bdf487676cf78ebe4906d725dbe4c01 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Mon, 19 Jan 2026 14:22:46 -0800 Subject: [PATCH 13/26] Update qwen3_vl.py for precommit fix --- vllm_gaudi/models/qwen3_vl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_gaudi/models/qwen3_vl.py b/vllm_gaudi/models/qwen3_vl.py index 87089c26c..82a72d3eb 100644 --- a/vllm_gaudi/models/qwen3_vl.py +++ b/vllm_gaudi/models/qwen3_vl.py @@ -1,10 +1,9 @@ import torch from .utils import _merge_multimodal_embeddings +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import 
get_act_fn from vllm.model_executor.models.interfaces import MultiModalEmbeddings -from vllm.model_executor.models.qwen3_vl import Qwen3VLForConditionalGeneration from vllm.model_executor.models.interfaces import _require_is_multimodal -from vllm.model_executor.layers.activation import get_act_fn -from vllm.config import VllmConfig from vllm.model_executor.models.qwen3_vl import ( Qwen3VLForConditionalGeneration, Qwen3_VisionTransformer, @@ -86,6 +85,7 @@ def __init__( class HpuQwen3_VLForConditionalGeneration(Qwen3VLForConditionalGeneration): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) @@ -100,6 +100,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) + def _compute_deepstack_embeds( self, inputs_embeds: torch.Tensor, From 07f40c9170b10d0418f3b3bfb60cd4c308869238 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Wed, 21 Jan 2026 08:59:03 -0800 Subject: [PATCH 14/26] add back warmup with ratio and video warmup --- vllm_gaudi/v1/worker/hpu_model_runner.py | 95 +++++++++++++----------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 36a527e5a..0e913bc7b 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import collections +import copy import contextlib import functools from functools import partial @@ -42,13 +43,14 @@ from vllm.v1.attention.selector import get_attn_backend from vllm.config import (VllmConfig, update_config) +from vllm.config.multimodal import ImageDummyOptions from vllm.distributed.kv_transfer import (get_kv_transfer_group, has_kv_transfer_group) from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.vocab_parallel_embedding import (VocabParallelEmbedding) from vllm.model_executor.model_loader import get_model, get_model_loader -from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem) from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -704,10 +706,12 @@ def __init__( # Mult-modal-related. 
self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.model_config_copy = None self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(model_config) if self.supports_mm_inputs: self.is_mm_embed = self._make_buffer(self.max_num_tokens, dtype=torch.bool) + self.model_config_copy = copy.deepcopy(self.model_config) self.is_multimodal_raw_input_supported = (model_config.is_multimodal_raw_input_only_model) # Lazy initialization @@ -3887,12 +3891,13 @@ def log_warmup(self, phase, i, max_i, first_dim, second_dim, third_dim, causal=F f"free_mem:{free_mem}") tqdm.write(msg) - def log_warmup_multimodal(self, phase, i, max_i, batch_size, seq_len, img_args): + def log_warmup_multimodal(self, phase, i, max_i, batch_size, seq_len, w, h, f): free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory()) msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " f"batch_size:{batch_size} " f"seq_len:{seq_len} " - f"img_args:{img_args} " + f"resolution:{w}X{h} " + f"frames:{f} " f"free_mem:{free_mem}") logger.info(msg) @@ -4574,51 +4579,44 @@ def _get_mm_dummy_batch( ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" assert self.mm_budget is not None - img_count = 1 - batch = image_args if self.get_model().vision_bucket_manager.is_batch_based else img_count - '''if self.get_model().vision_bucket_manager.is_batch_based: + count = 1 + num_frames = 0 + batch = image_args if self.get_model().vision_bucket_manager.is_batch_based else count + if self.get_model().vision_bucket_manager.is_batch_based: # Create ImageDummyOptions for Gemma3 - #image_options = ImageDummyOptions( - # width=896, # pixels as in gemma3 config - # height=896 # pixels as in gemma3 config - #) + w=896, # pixels as in gemma3 config + h=896 # pixels as in gemma3 config batch = image_args else: - #patch_size = int(self.get_patch_size_from_model()) + patch_size = int(self.get_patch_size_from_model()) # Calculate width and height to maintain aspect ratio and patch count # Total patches = (width/patch_size) * (height/patch_size) # We want: (w/ps) * (h/ps) = num_patch where num_patch is image_args # And: w/h = ratio_w/ratio_h - #grid_w = int(math.sqrt(image_args * ratio_w / ratio_h)) - #grid_h = int(image_args / grid_w) - #w = grid_w * patch_size - #h = grid_h * patch_size - #image_options = ImageDummyOptions( - # width=w, # Custom width in pixels - # height=h # Custom height in pixels - #) - batch = img_count - - processor = self.mm_registry.create_processor(model_config=self.model_config, cache=self.mm_budget.cache) - dummy_data = processor.dummy_inputs.get_decoder_dummy_data(processor, - seq_len=4096, - mm_counts={"image": img_count}, - mm_options={"image": image_options}), - - dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs( - seq_len=4096, - mm_counts={"image": img_count}, - ) - ''' - - assert modality == 'image' - # Result in the maximum GPU consumption of the model - dummy_mm_inputs = self.mm_registry.get_dummy_mm_inputs( - self.model_config, - mm_counts={modality: 1}, - cache=self.mm_budget.cache, - ) + grid_w = int(math.sqrt(image_args * ratio_w / ratio_h)) + grid_h = int(image_args / grid_w) + w = grid_w * patch_size + h = grid_h * patch_size + batch = count + self.model_config_copy.max_model_len = 4096 + if modality == 'image': + self.model_config_copy.limit_mm_per_prompt = { + "image": {"count": count, "width": w, "height": h} + } + elif modality == 'video': + video_options = 
self.model_config_copy.get_multimodal_config().get_dummy_options("video") + num_frames = video_options.num_frames if video_options and hasattr(video_options, 'num_frames') else 100 + w = video_options.width if video_options and hasattr(video_options, 'width') else w + h = video_options.height if video_options and hasattr(video_options, 'height') else h + count = video_options.count if video_options and hasattr(video_options, 'count') else 1 + self.model_config_copy.limit_mm_per_prompt = { + "video": {"count": count, "num_frames": num_frames, "width": w, "height": h} + } + else: + raise NotImplementedError(f"Modality '{modality}' is not supported") + dummy_mm_inputs = MultiModalRegistry().get_dummy_mm_inputs(self.model_config_copy, + mm_counts={modality: count}) dummy_mm_item = dummy_mm_inputs["mm_kwargs"][modality][0] # We use the cache so that the item is saved to the cache, # but not read from the cache @@ -4630,7 +4628,7 @@ def _get_mm_dummy_batch( dummy_mm_items, device=self.device, pin_memory=self.pin_memory, - )) + )), w, h, num_frames def warmup_multimodal_graphs(self, buckets): @@ -4651,15 +4649,22 @@ def warmup_multimodal_graphs(self, buckets): (9, 16), # 9:16 portrait ] aspect_ratios.extend(aspect_ratio_ext) + is_video_warmup = True if self.model_config.get_multimodal_config() is not None and \ + self.model_config.get_multimodal_config().get_dummy_options("video") is not None \ + and self.mm_budget.mm_limits['video'] != 999 else False + + is_image_warmup = True if self.model_config.get_multimodal_config() is not None and \ + self.model_config.get_multimodal_config().get_dummy_options("image") is not None \ + and self.mm_budget.mm_limits['image'] != 0 else False for modality, max_items in self.mm_budget.mm_limits.items(): - if modality == 'video': - logger.warning_once("Warming up for video is not implemented") + if modality == 'image' and is_image_warmup == False or modality == 'video' \ + and is_video_warmup == False: continue phase = f'Graph/Multimodal({modality})' num_candidates = len(buckets) for idx, img_arg in enumerate(buckets): for (ratio_w, ratio_h) in aspect_ratios: - batched_dummy_mm_inputs = self._get_mm_dummy_batch(modality, img_arg, ratio_w, ratio_h) + batched_dummy_mm_inputs, w, h, f = self._get_mm_dummy_batch(modality, img_arg, ratio_w, ratio_h) dummy_encoder_outputs = \ self.model.embed_multimodal( **batched_dummy_mm_inputs) @@ -4670,7 +4675,7 @@ def warmup_multimodal_graphs(self, buckets): ) self.graphed_buckets.add(img_arg) - self.log_warmup_multimodal(phase, idx, num_candidates, 1, 0, img_arg) + self.log_warmup_multimodal(phase, idx, num_candidates, 1, 0, w, h, f) def _maybe_profile_unified_attn(self): unified_cfg_str = os.environ.get('VLLM_PROFILE_UNIFIED', None) From 9db6b783ca119ff4b047a7210d42891a86ea3291 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Wed, 21 Jan 2026 11:20:00 -0800 Subject: [PATCH 15/26] Update ops.py with removing uncessary change --- vllm_gaudi/extension/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py index cd0ef7d1b..96e38f3bc 100644 --- a/vllm_gaudi/extension/ops.py +++ b/vllm_gaudi/extension/ops.py @@ -366,7 +366,6 @@ def _fsdpa_prompt_attention(query: torch.Tensor, query, key, value, attn_bias, 0.0, is_causal, scale, softmax_mode, recompute_mode, valid_seq_lengths, padding_side ] - args += [window_size] if window_size else [] attn_weights = fsdpa_op(*args) From 9be005620c376f299439e41f2d65352c52fcb06e Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Wed, 21 Jan 2026 
23:39:57 -0800 Subject: [PATCH 16/26] Update hpu_model_runner.py for precommit fix --- vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 0e913bc7b..460f3f826 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -4584,7 +4584,7 @@ def _get_mm_dummy_batch( batch = image_args if self.get_model().vision_bucket_manager.is_batch_based else count if self.get_model().vision_bucket_manager.is_batch_based: # Create ImageDummyOptions for Gemma3 - w=896, # pixels as in gemma3 config + w=896 # pixels as in gemma3 config h=896 # pixels as in gemma3 config batch = image_args else: From b4f2e6c3933406080c0df304bee4886796463dd7 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Wed, 21 Jan 2026 23:46:50 -0800 Subject: [PATCH 17/26] Update hpu_model_runner.py for precommit fix --- vllm_gaudi/v1/worker/hpu_model_runner.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 8422fe50a..8dacaf346 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -739,7 +739,6 @@ def __init__( # Mult-modal-related. self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - self.model_config_copy = None self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(model_config) if self.supports_mm_inputs: @@ -4682,13 +4681,13 @@ def warmup_multimodal_graphs(self, buckets): (9, 16), # 9:16 portrait ] aspect_ratios.extend(aspect_ratio_ext) - is_video_warmup = True if self.model_config.get_multimodal_config() is not None and \ + is_video_warmup = self.model_config.get_multimodal_config() is not None and \ self.model_config.get_multimodal_config().get_dummy_options("video") is not None \ - and self.mm_budget.mm_limits['video'] != 999 else False + and self.mm_budget.mm_limits['video'] != 999 - is_image_warmup = True if self.model_config.get_multimodal_config() is not None and \ + is_image_warmup = self.model_config.get_multimodal_config() is not None and \ self.model_config.get_multimodal_config().get_dummy_options("image") is not None \ - and self.mm_budget.mm_limits['image'] != 0 else False + and self.mm_budget.mm_limits['image'] != 0 for modality, max_items in self.mm_budget.mm_limits.items(): if modality == 'image' and is_image_warmup == False or modality == 'video' \ and is_video_warmup == False: From 02c239bff07cae16c203aef2c10c8682560fe33a Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Wed, 21 Jan 2026 23:54:00 -0800 Subject: [PATCH 18/26] Update hpu_model_runner.py for precommit fix --- vllm_gaudi/v1/worker/hpu_model_runner.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 8dacaf346..c4d0fa165 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -43,7 +43,6 @@ from vllm.v1.attention.selector import get_attn_backend from vllm.config import (VllmConfig, update_config) -from vllm.config.multimodal import ImageDummyOptions from vllm.distributed.kv_transfer import (get_kv_transfer_group, has_kv_transfer_group) from vllm.forward_context import get_forward_context, set_forward_context from vllm.model_executor.layers.fused_moe.layer import FusedMoE @@ -4616,8 +4615,8 @@ 
def _get_mm_dummy_batch( batch = image_args if self.get_model().vision_bucket_manager.is_batch_based else count if self.get_model().vision_bucket_manager.is_batch_based: # Create ImageDummyOptions for Gemma3 - w=896 # pixels as in gemma3 config - h=896 # pixels as in gemma3 config + w = 896 # pixels as in gemma3 config + h = 896 # pixels as in gemma3 config batch = image_args else: patch_size = int(self.get_patch_size_from_model()) @@ -4632,23 +4631,18 @@ def _get_mm_dummy_batch( batch = count self.model_config_copy.max_model_len = 4096 if modality == 'image': - self.model_config_copy.limit_mm_per_prompt = { - "image": {"count": count, "width": w, "height": h} - } + self.model_config_copy.limit_mm_per_prompt = {"image": {"count": count, "width": w, "height": h}} elif modality == 'video': video_options = self.model_config_copy.get_multimodal_config().get_dummy_options("video") num_frames = video_options.num_frames if video_options and hasattr(video_options, 'num_frames') else 100 w = video_options.width if video_options and hasattr(video_options, 'width') else w h = video_options.height if video_options and hasattr(video_options, 'height') else h count = video_options.count if video_options and hasattr(video_options, 'count') else 1 - self.model_config_copy.limit_mm_per_prompt = { - "video": {"count": count, "num_frames": num_frames, "width": w, "height": h} - } + self.model_config_copy.limit_mm_per_prompt = {"video": {"count": count, "num_frames": num_frames, "width": w, "height": h}} else: raise NotImplementedError(f"Modality '{modality}' is not supported") - dummy_mm_inputs = MultiModalRegistry().get_dummy_mm_inputs(self.model_config_copy, - mm_counts={modality: count}) + dummy_mm_inputs = MultiModalRegistry().get_dummy_mm_inputs(self.model_config_copy, mm_counts={modality: count}) dummy_mm_item = dummy_mm_inputs["mm_kwargs"][modality][0] # We use the cache so that the item is saved to the cache, # but not read from the cache From 3dd1f5cf3d7cd7836c3994ddd794c686b356545a Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Wed, 21 Jan 2026 23:59:18 -0800 Subject: [PATCH 19/26] Update hpu_model_runner.py for precommit fix --- vllm_gaudi/v1/worker/hpu_model_runner.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index c4d0fa165..f90534f8d 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -4638,7 +4638,14 @@ def _get_mm_dummy_batch( w = video_options.width if video_options and hasattr(video_options, 'width') else w h = video_options.height if video_options and hasattr(video_options, 'height') else h count = video_options.count if video_options and hasattr(video_options, 'count') else 1 - self.model_config_copy.limit_mm_per_prompt = {"video": {"count": count, "num_frames": num_frames, "width": w, "height": h}} + self.model_config_copy.limit_mm_per_prompt = { + "video": { + "count": count, + "num_frames": num_frames, + "width": w, + "height": h + } + } else: raise NotImplementedError(f"Modality '{modality}' is not supported") From 7757e80d6af1d2f7150f57ec448d42aa3fa7c995 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 22 Jan 2026 00:01:57 -0800 Subject: [PATCH 20/26] Update hpu_model_runner.py for precommit fix --- vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 
f90534f8d..3b973311c 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -4690,8 +4690,8 @@ def warmup_multimodal_graphs(self, buckets): self.model_config.get_multimodal_config().get_dummy_options("image") is not None \ and self.mm_budget.mm_limits['image'] != 0 for modality, max_items in self.mm_budget.mm_limits.items(): - if modality == 'image' and is_image_warmup == False or modality == 'video' \ - and is_video_warmup == False: + if modality == 'image' and not is_image_warmup or modality == 'video' \ + and not is_video_warmup: continue phase = f'Graph/Multimodal({modality})' num_candidates = len(buckets) From 913176ae052158d496da2d775c418b85a6204d93 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 22 Jan 2026 16:58:54 -0800 Subject: [PATCH 21/26] fix qwen2.5vl unified attn test failure --- tests/full_tests/model_cards/qwen2.5-vl-7b.yaml | 2 +- vllm_gaudi/models/utils.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml b/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml index 5c5fc51e3..45e504c24 100644 --- a/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml +++ b/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml @@ -1,4 +1,4 @@ -model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +model_name: "/software/data/pytorch/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5/" test_config: # List of test configurations. - modality is test for - modality: image # modality (currently supports image and video) extra_engine_args: # Optional extra arguments for the engine diff --git a/vllm_gaudi/models/utils.py b/vllm_gaudi/models/utils.py index e7c090040..667090f5e 100644 --- a/vllm_gaudi/models/utils.py +++ b/vllm_gaudi/models/utils.py @@ -29,7 +29,12 @@ def _merge_multimodal_embeddings( mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) input_dtype = inputs_embeds.dtype - if is_multimodal.dtype == torch.int64: + if inputs_embeds.ndim == 3 and mm_embeds_flat.ndim == 2: + original_shape = inputs_embeds.shape + inputs_embeds = inputs_embeds.view(-1, inputs_embeds.shape[-1]) + result = inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) + return inputs_embeds.view(original_shape) + else: return inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) try: # For debugging From 091c5fe3d7bc575339ae223461e355d9cc139184 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 22 Jan 2026 17:21:45 -0800 Subject: [PATCH 22/26] precommit fix --- vllm_gaudi/models/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/models/utils.py b/vllm_gaudi/models/utils.py index 667090f5e..1784b276c 100644 --- a/vllm_gaudi/models/utils.py +++ b/vllm_gaudi/models/utils.py @@ -34,7 +34,7 @@ def _merge_multimodal_embeddings( inputs_embeds = inputs_embeds.view(-1, inputs_embeds.shape[-1]) result = inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) return inputs_embeds.view(original_shape) - else: + else: return inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) try: # For debugging From f0613fdce6666c495c12c2467a0152823e8c3ab5 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 22 Jan 2026 17:29:51 -0800 Subject: [PATCH 23/26] precommit fix --- vllm_gaudi/models/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/models/utils.py b/vllm_gaudi/models/utils.py index 1784b276c..4912cb3dd 100644 --- a/vllm_gaudi/models/utils.py +++ b/vllm_gaudi/models/utils.py @@ 
-32,7 +32,7 @@ def _merge_multimodal_embeddings( if inputs_embeds.ndim == 3 and mm_embeds_flat.ndim == 2: original_shape = inputs_embeds.shape inputs_embeds = inputs_embeds.view(-1, inputs_embeds.shape[-1]) - result = inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) + inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) return inputs_embeds.view(original_shape) else: return inputs_embeds.index_copy_(0, is_multimodal, mm_embeds_flat) From ec827b8a93383c410b26b2eeb20db44787bfdfa4 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 22 Jan 2026 23:52:39 -0800 Subject: [PATCH 24/26] add more mm bucket --- vllm_gaudi/extension/bucketing/vision.py | 5 +++-- vllm_gaudi/v1/worker/hpu_model_runner.py | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/vllm_gaudi/extension/bucketing/vision.py b/vllm_gaudi/extension/bucketing/vision.py index 78037de45..903359ee3 100644 --- a/vllm_gaudi/extension/bucketing/vision.py +++ b/vllm_gaudi/extension/bucketing/vision.py @@ -21,8 +21,9 @@ }, 'qwen3_vl': { 'is_batch_based': False, - #coverage for lmarena-ai/VisionArena-Chat - 'buckets': [512, 1024, 2048, 3072, 4096, 5120, 6144, 7168, 8192, 9216, 10240, 11264, 12288, 131076] + 'buckets': + [256, 512, 1024, 1350, 1602, 2048, 3072, 4096, 5120, 6144, 7168, 8192, 9216, 10240, 11264, 12288, 131076] + } } diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py index 3b973311c..92003c420 100644 --- a/vllm_gaudi/v1/worker/hpu_model_runner.py +++ b/vllm_gaudi/v1/worker/hpu_model_runner.py @@ -4672,16 +4672,16 @@ def warmup_multimodal_graphs(self, buckets): self.mm_registry, ) if self.supports_mm_inputs else None aspect_ratios = [(1, 1)] # 1:1 square - sanity_check = False - if self.get_model().vision_bucket_manager.is_batch_based: - sanity_check = True - aspect_ratio_ext = [ - (4, 3), # 4:3 landscape - (3, 4), # 3:4 portrait - (16, 9), # 16:9 widescreen - (9, 16), # 9:16 portrait - ] - aspect_ratios.extend(aspect_ratio_ext) + sanity_check = self.get_model().vision_bucket_manager.is_batch_based + + aspect_ratios = [ + (1, 1), # 1:1 square + (4, 3), # 4:3 landscape + (3, 4), # 3:4 portrait + (16, 9), # 16:9 widescreen + (9, 16), # 9:16 portrait + ] + is_video_warmup = self.model_config.get_multimodal_config() is not None and \ self.model_config.get_multimodal_config().get_dummy_options("video") is not None \ and self.mm_budget.mm_limits['video'] != 999 From 4cf5cb1baaae7f5ba1cb2db8b2fb82b5dc287d6b Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Fri, 23 Jan 2026 00:00:11 -0800 Subject: [PATCH 25/26] precommit fix --- vllm_gaudi/extension/bucketing/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_gaudi/extension/bucketing/vision.py b/vllm_gaudi/extension/bucketing/vision.py index 903359ee3..084526259 100644 --- a/vllm_gaudi/extension/bucketing/vision.py +++ b/vllm_gaudi/extension/bucketing/vision.py @@ -23,7 +23,6 @@ 'is_batch_based': False, 'buckets': [256, 512, 1024, 1350, 1602, 2048, 3072, 4096, 5120, 6144, 7168, 8192, 9216, 10240, 11264, 12288, 131076] - } } From f46b48db065599597e8ba5f674a1c612ca5cb349 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Fri, 23 Jan 2026 09:28:44 -0800 Subject: [PATCH 26/26] Update qwen2.5-vl-7b.yaml to revert change --- tests/full_tests/model_cards/qwen2.5-vl-7b.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml b/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml index 45e504c24..5c5fc51e3 100644 --- 
a/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml +++ b/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml @@ -1,4 +1,4 @@ -model_name: "/software/data/pytorch/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5/" +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" test_config: # List of test configurations. - modality is test for - modality: image # modality (currently supports image and video) extra_engine_args: # Optional extra arguments for the engine
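
Note: the central change across this series is that _merge_multimodal_embeddings can now take an index tensor (built once with torch.nonzero in _gather_mm_embeddings when mrope is used) and merge with index_copy_ instead of a boolean-mask masked_scatter_. Below is a minimal standalone sketch of that merge; the shapes and tensor names are made up for illustration and are not the exact vllm_gaudi code.

    import torch

    # Hypothetical example: 8 scheduled tokens, hidden size 4, 3 multimodal tokens.
    hidden_size = 4
    inputs_embeds = torch.zeros(8, hidden_size)   # text embeddings, one row per token
    mm_embeds_flat = torch.ones(3, hidden_size)   # flattened multimodal encoder outputs

    # Boolean placeholder mask, as gathered per request...
    is_mm_embed = torch.tensor([False, True, True, False, False, True, False, False])

    # ...converted once into an index tensor, so the merge itself is a static
    # index_copy_ rather than a data-dependent masked_scatter_ / torch.where.
    mm_positions = torch.nonzero(is_mm_embed, as_tuple=True)[0]
    inputs_embeds.index_copy_(0, mm_positions, mm_embeds_flat)

As noted in the TODO in vllm_gaudi/models/utils.py, the earlier mask-based paths either hit HPU TPC performance issues (masked_scatter_) or introduced dynamic shapes causing recompilation (torch.where); building the indices before handing them to the device keeps the scatter static with respect to the mask contents.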