From 4cff439dcf7e97c5b9ea02dbea41f749ebe7e79a Mon Sep 17 00:00:00 2001 From: Eustache Le Bihan Date: Tue, 22 Apr 2025 18:11:44 +0200 Subject: [PATCH 1/2] handle default kwargs --- src/transformers/processing_utils.py | 34 ++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index dc6bd4bf1428..fec424a38052 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1432,14 +1432,36 @@ def apply_chat_template( "template_kwargs": {}, } + # Get the kwargs type annotation from __call__, if any + typing_processor_kwargs_class = self.__call__.__annotations__.get("kwargs") + processor_defaults_kwargs = {} + + # Retrieve processor default kwargs + if typing_processor_kwargs_class: + processor_kwargs_class = typing_processor_kwargs_class.__args__[0] + processor_defaults = getattr(processor_kwargs_class, "_defaults", {}) + + # This combines all default values from different categories (like text_kwargs, images_kwargs, etc.) + processor_defaults_kwargs = {k: v for values in processor_defaults.values() for k, v in values.items()} + for kwarg_type in processed_kwargs: for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys(): kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type] - default_value = getattr(kwarg_type_defaults, key, None) + default_value = processor_defaults_kwargs.get(key, getattr(kwarg_type_defaults, key, None)) value = kwargs.pop(key, default_value) + if value is not None and not isinstance(value, dict): processed_kwargs[kwarg_type][key] = value + # If the key is in the processor defaults, we need to pass it to the processor + if key in processor_defaults_kwargs: + kwargs[key] = value + + # handle the two naming conventions for fps: video_fps is load kwargs and fps is processor kwargs + if key == "video_fps": + key = "fps" + kwargs[key] = value + if isinstance(conversation, (list, tuple)) and ( isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content") ): @@ -1504,12 +1526,16 @@ def apply_chat_template( ) else: # TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added - video, metadata = self._load_video_for_model( - fname, + load_video_kwargs = kwargs.copy() + load_video_kwargs.update( num_frames=mm_load_kwargs.get("num_frames", None), fps=mm_load_kwargs.get("video_fps", None), backend=mm_load_kwargs["video_load_backend"], - **kwargs, + ) + + video, metadata = self._load_video_for_model( + fname, + **load_video_kwargs, ) videos.append(video) video_metadata.append(metadata) From 7e90ed8ce2ac39230eb5f1962008430656c9f502 Mon Sep 17 00:00:00 2001 From: Eustache Le Bihan Date: Tue, 22 Apr 2025 19:13:20 +0200 Subject: [PATCH 2/2] handle video_fps --- src/transformers/processing_utils.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index fec424a38052..931328050fe5 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1447,20 +1447,18 @@ def apply_chat_template( for kwarg_type in processed_kwargs: for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys(): kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type] - default_value = processor_defaults_kwargs.get(key, getattr(kwarg_type_defaults, key, None)) + + # handle the two naming conventions for fps: video_fps in load kwargs and fps in processor kwargs + processor_key = key if key != "video_fps" else "fps" + default_value = processor_defaults_kwargs.get(processor_key, getattr(kwarg_type_defaults, key, None)) value = kwargs.pop(key, default_value) if value is not None and not isinstance(value, dict): processed_kwargs[kwarg_type][key] = value # If the key is in the processor defaults, we need to pass it to the processor - if key in processor_defaults_kwargs: - kwargs[key] = value - - # handle the two naming conventions for fps: video_fps is load kwargs and fps is processor kwargs - if key == "video_fps": - key = "fps" - kwargs[key] = value + if processor_key in processor_defaults_kwargs and processor_key not in kwargs: + kwargs[processor_key] = value if isinstance(conversation, (list, tuple)) and ( isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content") @@ -1526,16 +1524,12 @@ def apply_chat_template( ) else: # TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added - load_video_kwargs = kwargs.copy() - load_video_kwargs.update( + video, metadata = self._load_video_for_model( + fname, num_frames=mm_load_kwargs.get("num_frames", None), fps=mm_load_kwargs.get("video_fps", None), backend=mm_load_kwargs["video_load_backend"], - ) - - video, metadata = self._load_video_for_model( - fname, - **load_video_kwargs, + **{k: v for k, v in kwargs.items() if k != "fps"}, ) videos.append(video) video_metadata.append(metadata)