diff --git a/flexeval/core/evaluate_chat_response.py b/flexeval/core/evaluate_chat_response.py
index 906b569f..4180c6d1 100644
--- a/flexeval/core/evaluate_chat_response.py
+++ b/flexeval/core/evaluate_chat_response.py
@@ -229,6 +229,8 @@ def evaluate_chat_response(  # noqa: C901, PLR0912
         }
         if output["lm_output"].raw_text:
             restructured_output["raw_lm_output"] = output["lm_output"].raw_text
+        if output["lm_output"].reasoning_text:
+            restructured_output["reasoning_text"] = output["lm_output"].reasoning_text
         restructured_outputs.append(restructured_output)
 
     return metrics_summary_dict, restructured_outputs
diff --git a/tests/core/test_evaluate_chat_response.py b/tests/core/test_evaluate_chat_response.py
index d451b38d..eea393f4 100644
--- a/tests/core/test_evaluate_chat_response.py
+++ b/tests/core/test_evaluate_chat_response.py
@@ -66,6 +66,8 @@ def test_evaluate_chat_response(
     # Therefore, in any case the system message should be in the first turn.
     assert outputs[0]["extra_info"]["messages"][0]["role"] == "system"
 
+    assert outputs[0]["reasoning_text"] == "reasoning_text"
+
     if use_tools:
         assert isinstance(outputs[0]["extra_info"]["tool_calls"], list)
         assert isinstance(outputs[0]["extra_info"]["tools"], list)
diff --git a/tests/dummy_modules/lm.py b/tests/dummy_modules/lm.py
index 2790b6d1..7f2fdbda 100644
--- a/tests/dummy_modules/lm.py
+++ b/tests/dummy_modules/lm.py
@@ -45,6 +45,7 @@ def _batch_generate_chat_response(
     return [
         LMOutput(
            text=f"This is response to `{messages[-1]['content']}` with kwargs {kwargs}",
+            reasoning_text="reasoning_text",
            finish_reason="length",
            tool_calls=tc,
            tool_call_validation_result="CompleteToolCall" if tc else "TextOnly",