From 558bc71e2f722d7ea9c43a3ff47649f0399baeab Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Tue, 2 Dec 2025 15:04:21 +0100 Subject: [PATCH 01/10] Update HuggingFaceLocalChatGenerator default model to Qwen/Qwen3-0.6B --- .../generators/chat/hugging_face_local.py | 7 ++++--- .../generators/chat/test_hugging_face_local.py | 14 +++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index 2296b4f7f1..772cd56968 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -95,7 +95,7 @@ class HuggingFaceLocalChatGenerator: Generates chat responses using models from Hugging Face that run locally. Use this component with chat-based models, - such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`. + such as `Qwen/Qwen3-0.6B` or `meta-llama/Llama-2-7b-chat-hf`. LLMs running locally may need powerful hardware. ### Usage example @@ -104,7 +104,7 @@ class HuggingFaceLocalChatGenerator: from haystack.components.generators.chat import HuggingFaceLocalChatGenerator from haystack.dataclasses import ChatMessage - generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta") + generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B") generator.warm_up() messages = [ChatMessage.from_user("What's Natural Language Processing? Be brief.")] print(generator.run(messages)) @@ -129,7 +129,7 @@ class HuggingFaceLocalChatGenerator: def __init__( # pylint: disable=too-many-positional-arguments self, - model: str = "HuggingFaceH4/zephyr-7b-beta", + model: str = "Qwen/Qwen3-0.6B", task: Optional[Literal["text-generation", "text2text-generation"]] = None, device: Optional[ComponentDevice] = None, token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), @@ -590,6 +590,7 @@ def _prepare_inputs( chat_template=self.chat_template, add_generation_prompt=True, tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None, + enable_thinking=False, ) # prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating assert isinstance(prepared_prompt, str) diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py index 78f24073a4..7a6ae926b2 100644 --- a/test/components/generators/chat/test_hugging_face_local.py +++ b/test/components/generators/chat/test_hugging_face_local.py @@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock): ) assert generator.huggingface_pipeline_kwargs == { - "model": "HuggingFaceH4/zephyr-7b-beta", + "model": "Qwen/Qwen3-0.6B", "task": "text2text-generation", "token": None, "device": "cpu", @@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock): ) assert generator.huggingface_pipeline_kwargs == { - "model": "HuggingFaceH4/zephyr-7b-beta", + "model": "Qwen/Qwen3-0.6B", "task": "text2text-generation", "token": None, "device": "cpu", @@ -490,9 +490,7 @@ def test_live_run(self, monkeypatch): monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")] - llm = HuggingFaceLocalChatGenerator( - model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50} - ) + llm = 
HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}) llm.warm_up() result = llm.run(messages) @@ -513,7 +511,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools): ) def test_run_with_tools(self, model_info_mock, tools): - generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools) + generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools) # Mock pipeline and tokenizer mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}]) @@ -805,9 +803,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None: streaming_chunks.append(chunk) llm = HuggingFaceLocalChatGenerator( - model="Qwen/Qwen2.5-0.5B-Instruct", - generation_kwargs={"max_new_tokens": 50}, - streaming_callback=streaming_callback, + model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback ) llm.warm_up() From b7ed04712d65eba7278f6ad9f7c36d7020037633 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Mon, 8 Dec 2025 11:46:16 +0100 Subject: [PATCH 02/10] Add enable_thinking init parameter --- haystack/components/generators/chat/hugging_face_local.py | 8 +++++++- .../components/generators/chat/test_hugging_face_local.py | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index 772cd56968..4ecfef9b23 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -141,6 +141,7 @@ def __init__( # pylint: disable=too-many-positional-arguments tools: Optional[ToolsType] = None, tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None, async_executor: Optional[ThreadPoolExecutor] = None, + enable_thinking: bool = False, ) -> None: """ Initializes the HuggingFaceLocalChatGenerator component. @@ -186,6 +187,9 @@ def __init__( # pylint: disable=too-many-positional-arguments :param async_executor: Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be initialized and used + :param enable_thinking: + Whether to enable thinking mode in the chat template. When enabled, the model can generate + intermediate reasoning steps before producing the final response. Defaults to False. 
""" torch_and_transformers_import.check() @@ -243,6 +247,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self.streaming_callback = streaming_callback self.pipeline: Optional[HfPipeline] = None self.tools = tools + self.enable_thinking = enable_thinking self._owns_executor = async_executor is None self.executor = ( @@ -308,6 +313,7 @@ def to_dict(self) -> dict[str, Any]: chat_template=self.chat_template, tools=serialize_tools_or_toolset(self.tools), tool_parsing_function=serialize_callable(self.tool_parsing_function), + enable_thinking=self.enable_thinking, ) huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"] @@ -590,7 +596,7 @@ def _prepare_inputs( chat_template=self.chat_template, add_generation_prompt=True, tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None, - enable_thinking=False, + enable_thinking=self.enable_thinking, ) # prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating assert isinstance(prepared_prompt, str) diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py index 7a6ae926b2..8a8ab53aab 100644 --- a/test/components/generators/chat/test_hugging_face_local.py +++ b/test/components/generators/chat/test_hugging_face_local.py @@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools): streaming_callback=None, chat_template="irrelevant", tools=tools, + enable_thinking=True, ) # Call the to_dict method @@ -191,6 +192,7 @@ def test_to_dict(self, model_info_mock, tools): assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]} assert init_params["streaming_callback"] is None assert init_params["chat_template"] == "irrelevant" + assert init_params["enable_thinking"] is True assert init_params["tools"] == [ { "type": "haystack.tools.tool.Tool", @@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools): streaming_callback=None, chat_template="irrelevant", tools=tools, + enable_thinking=True, ) # Call the to_dict method result = generator.to_dict() @@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools): assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]} assert generator_2.streaming_callback is None assert generator_2.chat_template == "irrelevant" + assert generator_2.enable_thinking is True assert len(generator_2.tools) == 1 assert generator_2.tools[0].name == "weather" assert generator_2.tools[0].description == "useful to determine the weather in a given location" From 0d478af3ad82d17e94c896183e2f41da46c86969 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 10 Dec 2025 13:08:53 +0100 Subject: [PATCH 03/10] Pydoc wording --- haystack/components/generators/chat/hugging_face_local.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index 4ecfef9b23..40483aa4d2 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -188,8 +188,8 @@ def __init__( # pylint: disable=too-many-positional-arguments Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be initialized and used :param enable_thinking: - Whether to enable thinking mode in the chat template. 
When enabled, the model can generate
-        intermediate reasoning steps before producing the final response. Defaults to False.
+        Whether to enable thinking mode in the chat template for thinking-capable models.
+        When enabled, the model generates intermediate reasoning before the final response. Defaults to False.
         """
         torch_and_transformers_import.check()
 

From 285fea8a9030f8f08b7c9a374b8ab93b30eaac42 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Wed, 10 Dec 2025 13:15:02 +0100
Subject: [PATCH 04/10] Format test

---
 test/components/generators/chat/test_hugging_face_local.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index b914631924..ac295461c8 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -494,9 +494,7 @@ def test_live_run(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
-        llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}
-        )
+        llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
 
         result = llm.run(messages)
 

From 87c36b31b803d5719a220888d5667f27bb496f1e Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Wed, 10 Dec 2025 14:50:48 +0100
Subject: [PATCH 05/10] Add tests for enable_thinking flag

---
 .../chat/test_hugging_face_local.py           | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index ac295461c8..b604ecf3d8 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -502,6 +502,54 @@ def test_live_run(self, monkeypatch):
         assert isinstance(result["replies"][0], ChatMessage)
         assert "climate change" in result["replies"][0].text.lower()
 
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_with_enable_thinking(self, monkeypatch):
+        """Test that enable_thinking works with the default Qwen3 model in a live run."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        messages = [ChatMessage.from_user("What is 2+2?")]
+
+        llm = HuggingFaceLocalChatGenerator(
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
+        )
+
+        result = llm.run(messages)
+
+        assert "replies" in result
+        assert isinstance(result["replies"][0], ChatMessage)
+        reply_text = result["replies"][0].text
+
+        assert reply_text is not None
+        assert "<think>" in reply_text
+        assert "</think>" in reply_text
+        assert len(reply_text) > 0
+        assert "4" in reply_text.lower()
+
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_without_enable_thinking(self, monkeypatch):
+        """Test that enable_thinking=False prevents thinking tags in the response."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        messages = [ChatMessage.from_user("What is 2+2?")]
+
+        llm = HuggingFaceLocalChatGenerator(
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=False
+        )
+
+        result = llm.run(messages)
+
+        assert "replies" in result
+        assert isinstance(result["replies"][0], ChatMessage)
+        reply_text = result["replies"][0].text
+
+        assert reply_text is not None
+        assert "<think>" not in reply_text
+        assert "</think>" not in reply_text
+        assert len(reply_text) > 0
+        assert "4" in reply_text.lower()
+
     def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
         duplicate_tools = [tools[0], tools[0]]
         with pytest.raises(ValueError, match="Duplicate tool names found"):

From 41081f6c6020c718768f9d1d56a928cc7df4f175 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Wed, 10 Dec 2025 14:54:22 +0100
Subject: [PATCH 06/10] Add reno note for HuggingFaceLocalChatGenerator updates

---
 .../huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml

diff --git a/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
new file mode 100644
index 0000000000..276942b95a
--- /dev/null
+++ b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
@@ -0,0 +1,6 @@
+---
+upgrade:
+  - |
+    `HuggingFaceLocalChatGenerator` now uses `Qwen/Qwen3-0.6B` as the default model, replacing the previous default.
+    Additionally, a new `enable_thinking` parameter has been added to enable thinking mode in chat templates for
+    thinking-capable models, allowing them to generate intermediate reasoning steps before producing final responses.

From 530a35fde6011e3f21a0f8359d259737d9638988 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:00:51 +0100
Subject: [PATCH 07/10] Update haystack/components/generators/chat/hugging_face_local.py

Co-authored-by: Stefano Fiorucci

---
 haystack/components/generators/chat/hugging_face_local.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py
index b83225cfa6..03e66d0abd 100644
--- a/haystack/components/generators/chat/hugging_face_local.py
+++ b/haystack/components/generators/chat/hugging_face_local.py
@@ -141,6 +141,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         tools: Optional[ToolsType] = None,
         tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None,
         async_executor: Optional[ThreadPoolExecutor] = None,
+        *,
         enable_thinking: bool = False,
     ) -> None:
         """

From 5e002203ebdb412d9908f31c679106f8bf9c83e6 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:05:30 +0100
Subject: [PATCH 08/10] Update release notes for HuggingFaceLocalChatGenerator

Updated the release notes to reflect changes in the HuggingFaceLocalChatGenerator, including the new default model and the addition of the enable_thinking parameter.

---
 ...uggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
index 276942b95a..a408a85b94 100644
--- a/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
+++ b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
@@ -1,6 +1,9 @@
 ---
 upgrade:
   - |
-    `HuggingFaceLocalChatGenerator` now uses `Qwen/Qwen3-0.6B` as the default model, replacing the previous default.
-    Additionally, a new `enable_thinking` parameter has been added to enable thinking mode in chat templates for
-    thinking-capable models, allowing them to generate intermediate reasoning steps before producing final responses.
+    ``HuggingFaceLocalChatGenerator`` now uses ``Qwen/Qwen3-0.6B`` as the default model, replacing the previous default.
+enhancements:
+  - |
+    A new ``enable_thinking`` parameter has been added to enable thinking mode in chat templates for thinking-capable models,
+    allowing them to generate intermediate reasoning steps before producing final responses.
+

From 936a1498a519d18aaf1530e6272283ee77186370 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:12:54 +0100
Subject: [PATCH 09/10] Simplify test_live_run with/out enable_thinking

---
 .../chat/test_hugging_face_local.py           | 44 ++++++---------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index b604ecf3d8..8dc011ae6a 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -491,65 +491,37 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
     @pytest.mark.slow
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
     def test_live_run(self, monkeypatch):
+        """Test live run with default behavior and enable_thinking."""
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
-        messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
+        # Test 1: Default behavior (no enable_thinking parameter) - should not include thinking tags
+        messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
         llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
-
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
-        assert "climate change" in result["replies"][0].text.lower()
+        reply_text = result["replies"][0].text
+        assert "climate change" in reply_text.lower()
+        assert "<think>" not in reply_text
+        assert "</think>" not in reply_text
 
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    def test_live_run_with_enable_thinking(self, monkeypatch):
-        """Test that enable_thinking works with the default Qwen3 model in a live run."""
-        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        # Test 2: With enable_thinking=True - should include thinking tags
         messages = [ChatMessage.from_user("What is 2+2?")]
-
         llm = HuggingFaceLocalChatGenerator(
             model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
         )
-
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
         reply_text = result["replies"][0].text
-
         assert reply_text is not None
         assert "<think>" in reply_text
        assert "</think>" in reply_text
         assert len(reply_text) > 0
         assert "4" in reply_text.lower()
 
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    def test_live_run_without_enable_thinking(self, monkeypatch):
-        """Test that enable_thinking=False prevents thinking tags in the response."""
-        monkeypatch.delenv("HF_API_TOKEN", raising=False)
-        messages = [ChatMessage.from_user("What is 2+2?")]
-
-        llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=False
-        )
-
-        result = llm.run(messages)
-
-        assert "replies" in result
-        assert isinstance(result["replies"][0], ChatMessage)
-        reply_text = result["replies"][0].text
-
-        assert reply_text is not None
-        assert "<think>" not in reply_text
-        assert "</think>" not in reply_text
-        assert len(reply_text) > 0
-        assert "4" in reply_text.lower()
-
     def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
         duplicate_tools = [tools[0], tools[0]]
         with pytest.raises(ValueError, match="Duplicate tool names found"):

From 642d7ad7b6a5ffcccd646e3c0ba6b7fe1bd54b21 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:27:14 +0100
Subject: [PATCH 10/10] Test shuffle

---
 .../chat/test_hugging_face_local.py           | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index 8dc011ae6a..0fe229591f 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -491,26 +491,30 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
     @pytest.mark.slow
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
     def test_live_run(self, monkeypatch):
-        """Test live run with default behavior and enable_thinking."""
+        """Test live run with default behavior (no thinking)."""
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
-
-        # Test 1: Default behavior (no enable_thinking parameter) - should not include thinking tags
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
+
         llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
+
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
-        reply_text = result["replies"][0].text
-        assert "climate change" in reply_text.lower()
-        assert "<think>" not in reply_text
-        assert "</think>" not in reply_text
+        assert "climate change" in result["replies"][0].text.lower()
 
-        # Test 2: With enable_thinking=True - should include thinking tags
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_thinking(self, monkeypatch):
+        """Test live run with enable_thinking=True."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
         messages = [ChatMessage.from_user("What is 2+2?")]
+
         llm = HuggingFaceLocalChatGenerator(
             model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
        )
+
         result = llm.run(messages)
 
         assert "replies" in result
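
For reference, the end-to-end behavior this series introduces can be exercised as in the sketch below. It is a minimal usage example based only on the API visible in the diffs above: `enable_thinking` is keyword-only as of PATCH 07 and defaults to `False`, and with the default `Qwen/Qwen3-0.6B` model the reply contains a `<think>...</think>` block when thinking is enabled, as the live tests assert. The `strip_thinking` helper is an illustrative name introduced here, not part of the patch.

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

# enable_thinking is a keyword-only argument (PATCH 07); it defaults to False.
llm = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",
    generation_kwargs={"max_new_tokens": 450},
    enable_thinking=True,
)
llm.warm_up()

result = llm.run([ChatMessage.from_user("What is 2+2?")])
# Per the live tests, the raw reply includes the <think>...</think> block
# followed by the final answer.
reply = result["replies"][0].text


def strip_thinking(text: str) -> str:
    """Illustrative helper (not part of the patch): drop the <think> block,
    which the component returns verbatim, keeping only the final answer."""
    _, sep, rest = text.partition("</think>")
    return rest.strip() if sep else text.strip()


print(strip_thinking(reply))
```

Making `enable_thinking` keyword-only keeps the already long positional signature stable, so callers must opt in explicitly with `enable_thinking=True`.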