Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions flexeval/core/evaluate_chat_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def execute_conversation_flow(
if lm_output.tool_call_validation_result
else {}
)
| ({"reasoning_text": lm_output.reasoning_text} if lm_output.reasoning_text else {})
)
current_chat_history[batch_i].insert(response_context_indices[batch_i], lm_output_message)
last_lm_outputs[batch_i] = lm_output
Expand Down Expand Up @@ -216,6 +217,8 @@ def evaluate_chat_response( # noqa: C901, PLR0912
restructured_outputs: list[dict[str, Any]] = []
for output in outputs:
extra_info = output["chat_instance"].extra_info | {"messages": output["messages"]}
if output["lm_output"].reasoning_text:
extra_info["reasoning_text"] = output["lm_output"].reasoning_text
if output["lm_output"].tool_calls:
extra_info["tool_calls"] = output["lm_output"].tool_calls
if output["chat_instance"].tools:
Expand Down
2 changes: 2 additions & 0 deletions tests/core/test_evaluate_chat_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ def test_evaluate_chat_response(
# Therefore, in any case the system message should be in the first turn.
assert outputs[0]["extra_info"]["messages"][0]["role"] == "system"

assert outputs[0]["extra_info"]["reasoning_text"] == "reasoning_text"

if use_tools:
assert isinstance(outputs[0]["extra_info"]["tool_calls"], list)
assert isinstance(outputs[0]["extra_info"]["tools"], list)
Expand Down
1 change: 1 addition & 0 deletions tests/dummy_modules/lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def _batch_generate_chat_response(
return [
LMOutput(
text=f"This is response to `{messages[-1]['content']}` with kwargs {kwargs}",
reasoning_text="reasoning_text",
finish_reason="length",
tool_calls=tc,
tool_call_validation_result="CompleteToolCall" if tc else "TextOnly",
Expand Down