From 558bc71e2f722d7ea9c43a3ff47649f0399baeab Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Tue, 2 Dec 2025 15:04:21 +0100 Subject: [PATCH 01/10] Update HuggingFaceLocalChatGenerator default model to Qwen/Qwen3-0.6B --- .../generators/chat/hugging_face_local.py | 7 ++++--- .../generators/chat/test_hugging_face_local.py | 14 +++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index 2296b4f7f1..772cd56968 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -95,7 +95,7 @@ class HuggingFaceLocalChatGenerator: Generates chat responses using models from Hugging Face that run locally. Use this component with chat-based models, - such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`. + such as `Qwen/Qwen3-0.6B` or `meta-llama/Llama-2-7b-chat-hf`. LLMs running locally may need powerful hardware. ### Usage example @@ -104,7 +104,7 @@ class HuggingFaceLocalChatGenerator: from haystack.components.generators.chat import HuggingFaceLocalChatGenerator from haystack.dataclasses import ChatMessage - generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta") + generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B") generator.warm_up() messages = [ChatMessage.from_user("What's Natural Language Processing? Be brief.")] print(generator.run(messages)) @@ -129,7 +129,7 @@ class HuggingFaceLocalChatGenerator: def __init__( # pylint: disable=too-many-positional-arguments self, - model: str = "HuggingFaceH4/zephyr-7b-beta", + model: str = "Qwen/Qwen3-0.6B", task: Optional[Literal["text-generation", "text2text-generation"]] = None, device: Optional[ComponentDevice] = None, token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), @@ -590,6 +590,7 @@ def _prepare_inputs( chat_template=self.chat_template, add_generation_prompt=True, tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None, + enable_thinking=False, ) # prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating assert isinstance(prepared_prompt, str) diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py index 78f24073a4..7a6ae926b2 100644 --- a/test/components/generators/chat/test_hugging_face_local.py +++ b/test/components/generators/chat/test_hugging_face_local.py @@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock): ) assert generator.huggingface_pipeline_kwargs == { - "model": "HuggingFaceH4/zephyr-7b-beta", + "model": "Qwen/Qwen3-0.6B", "task": "text2text-generation", "token": None, "device": "cpu", @@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock): ) assert generator.huggingface_pipeline_kwargs == { - "model": "HuggingFaceH4/zephyr-7b-beta", + "model": "Qwen/Qwen3-0.6B", "task": "text2text-generation", "token": None, "device": "cpu", @@ -490,9 +490,7 @@ def test_live_run(self, monkeypatch): monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")] - llm = HuggingFaceLocalChatGenerator( - model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50} - ) + llm = 
HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}) llm.warm_up() result = llm.run(messages) @@ -513,7 +511,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools): ) def test_run_with_tools(self, model_info_mock, tools): - generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools) + generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools) # Mock pipeline and tokenizer mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}]) @@ -805,9 +803,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None: streaming_chunks.append(chunk) llm = HuggingFaceLocalChatGenerator( - model="Qwen/Qwen2.5-0.5B-Instruct", - generation_kwargs={"max_new_tokens": 50}, - streaming_callback=streaming_callback, + model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback ) llm.warm_up() From b7ed04712d65eba7278f6ad9f7c36d7020037633 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Mon, 8 Dec 2025 11:46:16 +0100 Subject: [PATCH 02/10] Add enable_thinking init parameter --- haystack/components/generators/chat/hugging_face_local.py | 8 +++++++- .../components/generators/chat/test_hugging_face_local.py | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index 772cd56968..4ecfef9b23 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -141,6 +141,7 @@ def __init__( # pylint: disable=too-many-positional-arguments tools: Optional[ToolsType] = None, tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None, async_executor: Optional[ThreadPoolExecutor] = None, + enable_thinking: bool = False, ) -> None: """ Initializes the HuggingFaceLocalChatGenerator component. @@ -186,6 +187,9 @@ def __init__( # pylint: disable=too-many-positional-arguments :param async_executor: Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be initialized and used + :param enable_thinking: + Whether to enable thinking mode in the chat template. When enabled, the model can generate + intermediate reasoning steps before producing the final response. Defaults to False. 
""" torch_and_transformers_import.check() @@ -243,6 +247,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self.streaming_callback = streaming_callback self.pipeline: Optional[HfPipeline] = None self.tools = tools + self.enable_thinking = enable_thinking self._owns_executor = async_executor is None self.executor = ( @@ -308,6 +313,7 @@ def to_dict(self) -> dict[str, Any]: chat_template=self.chat_template, tools=serialize_tools_or_toolset(self.tools), tool_parsing_function=serialize_callable(self.tool_parsing_function), + enable_thinking=self.enable_thinking, ) huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"] @@ -590,7 +596,7 @@ def _prepare_inputs( chat_template=self.chat_template, add_generation_prompt=True, tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None, - enable_thinking=False, + enable_thinking=self.enable_thinking, ) # prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating assert isinstance(prepared_prompt, str) diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py index 7a6ae926b2..8a8ab53aab 100644 --- a/test/components/generators/chat/test_hugging_face_local.py +++ b/test/components/generators/chat/test_hugging_face_local.py @@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools): streaming_callback=None, chat_template="irrelevant", tools=tools, + enable_thinking=True, ) # Call the to_dict method @@ -191,6 +192,7 @@ def test_to_dict(self, model_info_mock, tools): assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]} assert init_params["streaming_callback"] is None assert init_params["chat_template"] == "irrelevant" + assert init_params["enable_thinking"] is True assert init_params["tools"] == [ { "type": "haystack.tools.tool.Tool", @@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools): streaming_callback=None, chat_template="irrelevant", tools=tools, + enable_thinking=True, ) # Call the to_dict method result = generator.to_dict() @@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools): assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]} assert generator_2.streaming_callback is None assert generator_2.chat_template == "irrelevant" + assert generator_2.enable_thinking is True assert len(generator_2.tools) == 1 assert generator_2.tools[0].name == "weather" assert generator_2.tools[0].description == "useful to determine the weather in a given location" From 0d478af3ad82d17e94c896183e2f41da46c86969 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 10 Dec 2025 13:08:53 +0100 Subject: [PATCH 03/10] Pydoc wording --- haystack/components/generators/chat/hugging_face_local.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index 4ecfef9b23..40483aa4d2 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -188,8 +188,8 @@ def __init__( # pylint: disable=too-many-positional-arguments Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be initialized and used :param enable_thinking: - Whether to enable thinking mode in the chat template. 
When enabled, the model can generate
-        intermediate reasoning steps before producing the final response. Defaults to False.
+        Whether to enable thinking mode in the chat template for thinking-capable models.
+        When enabled, the model generates intermediate reasoning before the final response. Defaults to False.
         """
         torch_and_transformers_import.check()
 

From 285fea8a9030f8f08b7c9a374b8ab93b30eaac42 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Wed, 10 Dec 2025 13:15:02 +0100
Subject: [PATCH 04/10] Format test

---
 test/components/generators/chat/test_hugging_face_local.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index b914631924..ac295461c8 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -494,9 +494,7 @@ def test_live_run(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
-        llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}
-        )
+        llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
 
         result = llm.run(messages)
 

From 87c36b31b803d5719a220888d5667f27bb496f1e Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Wed, 10 Dec 2025 14:50:48 +0100
Subject: [PATCH 05/10] Add tests for enable_thinking flag

---
 .../chat/test_hugging_face_local.py           | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index ac295461c8..b604ecf3d8 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -502,6 +502,54 @@ def test_live_run(self, monkeypatch):
         assert isinstance(result["replies"][0], ChatMessage)
         assert "climate change" in result["replies"][0].text.lower()
 
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_with_enable_thinking(self, monkeypatch):
+        """Test that enable_thinking works with the default Qwen3 model in a live run."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        messages = [ChatMessage.from_user("What is 2+2?")]
+
+        llm = HuggingFaceLocalChatGenerator(
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
+        )
+
+        result = llm.run(messages)
+
+        assert "replies" in result
+        assert isinstance(result["replies"][0], ChatMessage)
+        reply_text = result["replies"][0].text
+
+        assert reply_text is not None
+        assert "<think>" in reply_text
+        assert "</think>" in reply_text
+        assert len(reply_text) > 0
+        assert "4" in reply_text.lower()
+
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_without_enable_thinking(self, monkeypatch):
+        """Test that enable_thinking=False prevents thinking tags in the response."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        messages = [ChatMessage.from_user("What is 2+2?")]
+
+        llm = HuggingFaceLocalChatGenerator(
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=False
+        )
+
+        result = llm.run(messages)
+
+        assert "replies" in result
+        assert isinstance(result["replies"][0], ChatMessage)
+        reply_text = result["replies"][0].text
+
+        assert reply_text is not None
+        assert "<think>" not in reply_text
+        assert "</think>" not in reply_text
+        assert len(reply_text) > 0
+        assert "4" in reply_text.lower()
+
     def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
         duplicate_tools = [tools[0], tools[0]]
         with pytest.raises(ValueError, match="Duplicate tool names found"):

From 41081f6c6020c718768f9d1d56a928cc7df4f175 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Wed, 10 Dec 2025 14:54:22 +0100
Subject: [PATCH 06/10] Add reno note for HuggingFaceLocalChatGenerator updates

---
 .../huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml

diff --git a/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
new file mode 100644
index 0000000000..276942b95a
--- /dev/null
+++ b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
@@ -0,0 +1,6 @@
+---
+upgrade:
+  - |
+    `HuggingFaceLocalChatGenerator` now uses `Qwen/Qwen3-0.6B` as the default model, replacing the previous default.
+    Additionally, a new `enable_thinking` parameter has been added to enable thinking mode in chat templates for
+    thinking-capable models, allowing them to generate intermediate reasoning steps before producing final responses.

From 530a35fde6011e3f21a0f8359d259737d9638988 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:00:51 +0100
Subject: [PATCH 07/10] Update haystack/components/generators/chat/hugging_face_local.py

Co-authored-by: Stefano Fiorucci

---
 haystack/components/generators/chat/hugging_face_local.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py
index b83225cfa6..03e66d0abd 100644
--- a/haystack/components/generators/chat/hugging_face_local.py
+++ b/haystack/components/generators/chat/hugging_face_local.py
@@ -141,6 +141,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         tools: Optional[ToolsType] = None,
         tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None,
         async_executor: Optional[ThreadPoolExecutor] = None,
+        *,
         enable_thinking: bool = False,
     ) -> None:
         """

From 5e002203ebdb412d9908f31c679106f8bf9c83e6 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:05:30 +0100
Subject: [PATCH 08/10] Update release notes for HuggingFaceLocalChatGenerator

Updated the release notes to reflect changes in the HuggingFaceLocalChatGenerator, including the new default model and the addition of the enable_thinking parameter.

---
 ...uggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
index 276942b95a..a408a85b94 100644
--- a/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
+++ b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml
@@ -1,6 +1,9 @@
 ---
 upgrade:
   - |
-    `HuggingFaceLocalChatGenerator` now uses `Qwen/Qwen3-0.6B` as the default model, replacing the previous default.
-    Additionally, a new `enable_thinking` parameter has been added to enable thinking mode in chat templates for
-    thinking-capable models, allowing them to generate intermediate reasoning steps before producing final responses.
+    ``HuggingFaceLocalChatGenerator`` now uses ``Qwen/Qwen3-0.6B`` as the default model, replacing the previous default.
+enhancements:
+  - |
+    A new ``enable_thinking`` parameter has been added to enable thinking mode in chat templates for thinking-capable models,
+    allowing them to generate intermediate reasoning steps before producing final responses.
+

From 936a1498a519d18aaf1530e6272283ee77186370 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:12:54 +0100
Subject: [PATCH 09/10] Simplify test_live_run with/out enable_thinking

---
 .../chat/test_hugging_face_local.py           | 44 ++++++---------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index b604ecf3d8..8dc011ae6a 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -491,65 +491,37 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
     @pytest.mark.slow
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
     def test_live_run(self, monkeypatch):
+        """Test live run with default behavior and enable_thinking."""
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
-        messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
+        # Test 1: Default behavior (no enable_thinking parameter) - should not include thinking tags
+        messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
         llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
-
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
-        assert "climate change" in result["replies"][0].text.lower()
+        reply_text = result["replies"][0].text
+        assert "climate change" in reply_text.lower()
+        assert "<think>" not in reply_text
+        assert "</think>" not in reply_text
 
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    def test_live_run_with_enable_thinking(self, monkeypatch):
-        """Test that enable_thinking works with the default Qwen3 model in a live run."""
-        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        # Test 2: With enable_thinking=True - should include thinking tags
         messages = [ChatMessage.from_user("What is 2+2?")]
-
         llm = HuggingFaceLocalChatGenerator(
             model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
         )
-
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
         reply_text = result["replies"][0].text
-
         assert reply_text is not None
         assert "<think>" in reply_text
        assert "</think>" in reply_text
         assert len(reply_text) > 0
         assert "4" in reply_text.lower()
 
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    def test_live_run_without_enable_thinking(self, monkeypatch):
-        """Test that enable_thinking=False prevents thinking tags in the response."""
-        monkeypatch.delenv("HF_API_TOKEN", raising=False)
-        messages = [ChatMessage.from_user("What is 2+2?")]
-
-        llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=False
-        )
-
-        result = llm.run(messages)
-
-        assert "replies" in result
-        assert isinstance(result["replies"][0], ChatMessage)
-        reply_text = result["replies"][0].text
-
-        assert reply_text is not None
-        assert "<think>" not in reply_text
-        assert "</think>" not in reply_text
-        assert len(reply_text) > 0
-        assert "4" in reply_text.lower()
-
     def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
         duplicate_tools = [tools[0], tools[0]]
         with pytest.raises(ValueError, match="Duplicate tool names found"):

From 642d7ad7b6a5ffcccd646e3c0ba6b7fe1bd54b21 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Thu, 11 Dec 2025 11:27:14 +0100
Subject: [PATCH 10/10] Test shuffle

---
 .../chat/test_hugging_face_local.py           | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index 8dc011ae6a..0fe229591f 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -491,26 +491,30 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
     @pytest.mark.slow
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
     def test_live_run(self, monkeypatch):
-        """Test live run with default behavior and enable_thinking."""
+        """Test live run with default behavior (no thinking)."""
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
-
-        # Test 1: Default behavior (no enable_thinking parameter) - should not include thinking tags
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
+
         llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
+
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
-        reply_text = result["replies"][0].text
-        assert "climate change" in reply_text.lower()
-        assert "<think>" not in reply_text
-        assert "</think>" not in reply_text
+        assert "climate change" in result["replies"][0].text.lower()
 
-        # Test 2: With enable_thinking=True - should include thinking tags
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_thinking(self, monkeypatch):
+        """Test live run with enable_thinking=True."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
         messages = [ChatMessage.from_user("What is 2+2?")]
+
         llm = HuggingFaceLocalChatGenerator(
             model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
        )
+
         result = llm.run(messages)
 
         assert "replies" in result
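
For reference, the end-to-end behavior this series introduces can be exercised as in the sketch below. It is a minimal usage example based only on the API visible in the diffs above: `enable_thinking` is keyword-only as of PATCH 07 and defaults to `False`, and with the default `Qwen/Qwen3-0.6B` model the reply contains a `<think>...</think>` block when thinking is enabled, as the live tests assert. The `strip_thinking` helper is an illustrative name introduced here, not part of the patch.

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

# enable_thinking is a keyword-only argument (PATCH 07); it defaults to False.
llm = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",
    generation_kwargs={"max_new_tokens": 450},
    enable_thinking=True,
)
llm.warm_up()

result = llm.run([ChatMessage.from_user("What is 2+2?")])
# Per the live tests, the raw reply includes the <think>...</think> block
# followed by the final answer.
reply = result["replies"][0].text


def strip_thinking(text: str) -> str:
    """Illustrative helper (not part of the patch): drop the <think> block,
    which the component returns verbatim, keeping only the final answer."""
    _, sep, rest = text.partition("</think>")
    return rest.strip() if sep else text.strip()


print(strip_thinking(reply))
```

Making `enable_thinking` keyword-only keeps the already long positional signature stable, so callers must opt in explicitly with `enable_thinking=True`.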