
Commit 1c3a154

committed
only llava models are modified
1 parent f408d55 commit 1c3a154

19 files changed: +198, -1029 lines

src/transformers/models/llava/modeling_llava.py

Lines changed: 2 additions & 63 deletions
@@ -461,85 +461,24 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )

-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)

-            # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
-            legacy_processing = (
-                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
-            ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
-
-        image_features = None
         if pixel_values is not None:
             image_features = self.get_image_features(
                 pixel_values=pixel_values,
                 vision_feature_layer=vision_feature_layer,
                 vision_feature_select_strategy=vision_feature_select_strategy,
             )

-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image tokens in LLaVa should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
-            )
-            # prefill stage vs decoding stage (legacy behavior copied)
-            if input_ids.shape[1] != 1:
-                inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
-                    image_features, inputs_embeds, input_ids, attention_mask, labels
-                )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states
-                # that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
-                # Get the target length
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        elif image_features is not None:
             n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
             n_image_features = image_features.shape[0] * image_features.shape[1]
-
             if n_image_tokens != n_image_features:
                 raise ValueError(
                     f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                 )
-            special_image_mask = (
-                (input_ids == self.config.image_token_index)
-                .unsqueeze(-1)
-                .expand_as(inputs_embeds)
-                .to(inputs_embeds.device)
-            )
+            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
             image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
             inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
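
The retained path assumes the processor has already expanded each image token into one placeholder per visual feature, so the model only has to overwrite those placeholder positions in the text embeddings. A minimal sketch of that mechanism, using toy shapes and a made-up image token id rather than a real checkpoint:

import torch

# Toy stand-in for the retained path: image placeholder tokens in input_ids are
# overwritten in the text embeddings via masked_scatter. The token id, shapes
# and tensors below are invented for illustration.
image_token_index = 32000
input_ids = torch.tensor([[1, 32000, 32000, 32000, 9]])   # 1 text token, 3 image slots, 1 text token
inputs_embeds = torch.randn(1, 5, 8)                       # (batch, seq_len, hidden)
image_features = torch.randn(1, 3, 8)                      # (num_images, tokens_per_image, hidden)

n_image_tokens = (input_ids == image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
assert n_image_tokens == n_image_features, "prompt expansion and vision tower disagree"

special_image_mask = (input_ids == image_token_index).unsqueeze(-1)
special_image_mask = special_image_mask.expand_as(inputs_embeds)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features.to(inputs_embeds.dtype))
print(inputs_embeds.shape)  # torch.Size([1, 5, 8]); the three image positions now hold vision features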

src/transformers/models/llava/processing_llava.py

Lines changed: 11 additions & 21 deletions
@@ -154,27 +154,17 @@ def __call__(
         # try to expand inputs in processing if we have the necessary parts
         prompt_strings = text
         if image_inputs.get("pixel_values") is not None:
-            if self.patch_size is not None and self.vision_feature_select_strategy is not None:
-                # Replace the image token with the expanded image token sequence
-                pixel_values = image_inputs["pixel_values"]
-                height, width = get_image_size(to_numpy_array(pixel_values[0]))
-                num_image_tokens = (height // self.patch_size) * (
-                    width // self.patch_size
-                ) + self.num_additional_image_tokens
-                if self.vision_feature_select_strategy == "default":
-                    num_image_tokens -= self.num_additional_image_tokens
-
-                prompt_strings = []
-                for sample in text:
-                    sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
-                    prompt_strings.append(sample)
-            else:
-                logger.warning_once(
-                    "Expanding inputs for image tokens in LLaVa should be done in processing. "
-                    "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                    "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                    "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
-                )
+            # Replace the image token with the expanded image token sequence
+            pixel_values = image_inputs["pixel_values"]
+            height, width = get_image_size(to_numpy_array(pixel_values[0]))
+            num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
+            if self.vision_feature_select_strategy == "default":
+                num_image_tokens -= 1
+
+            prompt_strings = []
+            for sample in text:
+                sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
+                prompt_strings.append(sample)

         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
         return BatchFeature(data={**text_inputs, **image_inputs})
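
For a standard LLaVA-1.5-style setup this expansion is easy to check by hand. The sketch below assumes a 336x336 input, patch_size=14 and the "default" feature-selection strategy; these values come from the common checkpoints, not from this diff:

# Assumed LLaVA-1.5-style values: 336x336 pixel_values, patch_size=14,
# vision_feature_select_strategy="default", image token "<image>".
patch_size = 14
height, width = 336, 336
vision_feature_select_strategy = "default"
image_token = "<image>"

num_image_tokens = (height // patch_size) * (width // patch_size) + 1  # 24 * 24 + 1 = 577 (CLS slot included)
if vision_feature_select_strategy == "default":
    num_image_tokens -= 1                                              # "default" drops the CLS slot -> 576

prompt = "USER: <image>\nWhat is in the picture? ASSISTANT:"
expanded = prompt.replace(image_token, image_token * num_image_tokens)
print(num_image_tokens)             # 576
print(expanded.count(image_token))  # 576 placeholders for the tokenizer to map to image_token_index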

src/transformers/models/llava_next/modeling_llava_next.py

Lines changed: 5 additions & 68 deletions
@@ -689,7 +689,9 @@ def pack_image_features(self, image_features, image_sizes, vision_feature_select
                    image_feature = torch.cat(
                        (
                            image_feature,
-                            image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.dtype),
+                            image_newline[:, None, None]
+                            .expand(*image_feature.shape[:-1], 1)
+                            .to(image_feature.device, image_feature.dtype),
                        ),
                        dim=-1,
                    )
@@ -835,18 +837,9 @@ def forward(
                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
             )

-        legacy_processing = False
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)

-            # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
-            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
-            # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
-            legacy_processing = (
-                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
-            ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
-
-        image_features = None
         if pixel_values is not None and pixel_values.size(0) > 0:
             image_features = self.get_image_features(
                 pixel_values,
@@ -863,70 +856,14 @@ def forward(
                 image_newline=self.image_newline,
             )

-        if legacy_processing:
-            logger.warning_once(
-                "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
-            )
-            if input_ids.shape[1] != 1:
-                inputs_embeds = inputs_embeds.to(image_features.dtype)
-                inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
-                    image_features,
-                    feature_lens,
-                    inputs_embeds,
-                    input_ids,
-                    attention_mask,
-                    position_ids,
-                    labels=labels,
-                )
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
-            else:
-                # Retrieve the first layer to inspect the logits and mask out the hidden states
-                # that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
-                # Get the target length
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-                attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
-                cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
-
-        # TODO: @raushan retain only the new behavior after v4.47
-        elif image_features is not None:
             n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
             n_image_features = image_features.shape[0]
             if n_image_tokens != n_image_features:
                 raise ValueError(
                     f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                 )
-            special_image_mask = (
-                (input_ids == self.config.image_token_index)
-                .unsqueeze(-1)
-                .expand_as(inputs_embeds)
-                .to(inputs_embeds.device)
-            )
+            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
             image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
             inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
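
Besides dropping the legacy path, the one functional change in pack_image_features is that the learned newline embedding is now moved to the image features' device as well as their dtype, which matters when the parameter and the features end up on different devices. A toy sketch of that concatenation, with invented shapes and values:

import torch

# Toy shapes: image_feature is (hidden, rows, cols) so the newline column can be
# appended along the last dim, mirroring the pack_image_features hunk above.
hidden, rows, cols = 8, 4, 6
image_feature = torch.randn(hidden, rows, cols, dtype=torch.float16)
image_newline = torch.randn(hidden)  # learned "end of row" embedding, possibly on another device/dtype

image_feature = torch.cat(
    (
        image_feature,
        image_newline[:, None, None]
        .expand(*image_feature.shape[:-1], 1)
        .to(image_feature.device, image_feature.dtype),  # the commit adds the device target here
    ),
    dim=-1,
)
print(image_feature.shape)  # torch.Size([8, 4, 7]); one extra "newline" column per row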

src/transformers/models/llava_next/processing_llava_next.py

Lines changed: 13 additions & 24 deletions
@@ -149,30 +149,19 @@ def __call__(

         prompt_strings = text
         if image_inputs:
-            if self.patch_size is None or self.vision_feature_select_strategy is None:
-                logger.warning_once(
-                    "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
-                    "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                    "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                    "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
-                )
-            else:
-                image_sizes = iter(image_inputs["image_sizes"])
-                height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
-                prompt_strings = []
-                for sample in text:
-                    while self.image_token in sample:
-                        image_size = next(image_sizes)
-                        if not isinstance(image_size, (list, tuple)):
-                            # cast to list to avoid numerical precision errors when calculating unpadding
-                            image_size = image_size.tolist()
-                        orig_height, orig_width = image_size
-                        num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
-                        if self.vision_feature_select_strategy == "default":
-                            num_image_tokens -= self.num_additional_image_tokens
-                        sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
-                    prompt_strings.append(sample)
-                prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
+            image_sizes = iter(image_inputs["image_sizes"])
+            height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
+            prompt_strings = []
+            for sample in text:
+                while self.image_token in sample:
+                    image_size = next(image_sizes)
+                    orig_height, orig_width = image_size
+                    num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
+                    if self.vision_feature_select_strategy == "default":
+                        num_image_tokens -= 1
+                    sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
+                prompt_strings.append(sample)
+            prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]

         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
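
Because every image in a LLaVA-NeXT batch can expand to a different number of tokens, the processor replaces one image token at a time with a throwaway "<placeholder>" string and only swaps the real token back at the end, so the while-loop never re-matches tokens it has just inserted. A self-contained sketch of that two-pass replacement, with a made-up token count standing in for _get_number_of_features:

# Stand-alone sketch; get_num_tokens is a hypothetical substitute for the
# processor's _get_number_of_features computation.
def get_num_tokens(orig_height: int, orig_width: int) -> int:
    return 4 if orig_height >= orig_width else 3  # arbitrary toy values

image_token = "<image>"
text = ["USER: <image> and <image>\nCompare them. ASSISTANT:"]
image_sizes = iter([(672, 336), (336, 672)])

prompt_strings = []
for sample in text:
    while image_token in sample:
        orig_height, orig_width = next(image_sizes)
        num_image_tokens = get_num_tokens(orig_height, orig_width)
        # Insert placeholders first so the expanded run is not matched again by the while-loop.
        sample = sample.replace(image_token, "<placeholder>" * num_image_tokens, 1)
    prompt_strings.append(sample)
prompt_strings = [s.replace("<placeholder>", image_token) for s in prompt_strings]

print(prompt_strings[0].count(image_token))  # 7 == 4 + 3, one run per image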
