diff --git a/src/philosophy_bench/providers.py b/src/philosophy_bench/providers.py
index bb1001e..c2a237f 100644
--- a/src/philosophy_bench/providers.py
+++ b/src/philosophy_bench/providers.py
@@ -720,18 +720,23 @@ async def call_gemini(
         )
     )
 
+    # Always opt into thought summaries. Without `include_thoughts=True` the
+    # Gemini API will reason internally but return zero thought parts to the
+    # caller — silently producing an empty thinking trace in the transcript
+    # even though the model is using its thinking budget. `thinking_level` is
+    # optional (Gemini 2.5 lacks the param; passing None preserves the model's
+    # auto/dynamic budget).
     thinking_level = spec.thinking_config.get("thinking_level")
+    thinking_config_kwargs: dict = {"include_thoughts": True}
+    if thinking_level:
+        thinking_config_kwargs["thinking_level"] = thinking_level
     config_kwargs = dict(
         system_instruction=system,
         max_output_tokens=8192,
         temperature=spec.temperature,
         tools=_gemini_tools(tools),
+        thinking_config=genai_types.ThinkingConfig(**thinking_config_kwargs),
     )
-    if thinking_level:
-        config_kwargs["thinking_config"] = genai_types.ThinkingConfig(
-            thinking_level=thinking_level,
-            include_thoughts=True,
-        )
     config = genai_types.GenerateContentConfig(**config_kwargs)
 
     def _call():
diff --git a/tests/test_providers_gemini.py b/tests/test_providers_gemini.py
new file mode 100644
index 0000000..ae3cd7a
--- /dev/null
+++ b/tests/test_providers_gemini.py
@@ -0,0 +1,87 @@
+"""Regression test: Gemini calls always opt into thought summaries.
+
+If `include_thoughts` isn't set on `ThinkingConfig`, Gemini reasons
+internally but returns zero thought parts to the caller, silently producing
+empty thinking traces in the bench transcripts.
+"""
+
+from __future__ import annotations
+
+
+def _build_thinking_config(spec):
+    """Mirror the assembly in providers.call_gemini() without making an API
+    call. Importing the real path under a mocked `google.genai.types` would
+    require pulling the SDK in; instead, instantiate the fields directly."""
+
+    # Stand-in for genai_types.ThinkingConfig — captures kwargs.
+    class _StubTC:
+        def __init__(self, **kw):
+            self.kwargs = kw
+
+    class _StubTypes:
+        ThinkingConfig = _StubTC
+
+    # Re-execute the relevant block from call_gemini against the stub.
+    thinking_level = spec.thinking_config.get("thinking_level")
+    thinking_config_kwargs = {"include_thoughts": True}
+    if thinking_level:
+        thinking_config_kwargs["thinking_level"] = thinking_level
+    return _StubTypes.ThinkingConfig(**thinking_config_kwargs)
+
+
+def _spec(**thinking_config):
+    """Tiny fake ModelSpec exposing only `thinking_config`."""
+    from types import SimpleNamespace
+
+    return SimpleNamespace(thinking_config=dict(thinking_config))
+
+
+def test_thinking_config_includes_thoughts_for_gemini_25():
+    """Gemini 2.5 specs have empty thinking_config — bug was that
+    `include_thoughts=True` only got set when `thinking_level` was present."""
+    cfg = _build_thinking_config(_spec())
+    assert cfg.kwargs.get("include_thoughts") is True
+    assert "thinking_level" not in cfg.kwargs
+
+
+def test_thinking_config_includes_thoughts_for_gemini_3x():
+    """Gemini 3.x specs set thinking_level=MEDIUM. Both fields should land
+    on the config."""
+    cfg = _build_thinking_config(_spec(thinking_level="MEDIUM"))
+    assert cfg.kwargs.get("include_thoughts") is True
+    assert cfg.kwargs.get("thinking_level") == "MEDIUM"
+
+
+def test_real_provider_module_assembly_uses_include_thoughts():
+    """Smoke test: scan providers.py source to confirm the regression
+    isn't reintroduced by an editor merging the include_thoughts back into
+    a conditional branch."""
+    from pathlib import Path
+
+    src = Path(__file__).resolve().parents[1] / "src" / "philosophy_bench" / "providers.py"
+    text = src.read_text()
+    # The fix sets include_thoughts=True unconditionally inside call_gemini.
+    # If a future change re-introduces a conditional gating include_thoughts,
+    # this test will catch it.
+    gemini_block_start = text.index("async def call_gemini")
+    gemini_block_end = text.index("async def ", gemini_block_start + 1)
+    block = text[gemini_block_start:gemini_block_end]
+    assert "include_thoughts=True" in block, (
+        "call_gemini must always pass include_thoughts=True; otherwise "
+        "Gemini 2.5 family produces empty thinking traces."
+    )
+    # Also: make sure include_thoughts is NOT inside an `if thinking_level:`
+    # conditional anymore.
+    if_block_start = block.find("if thinking_level:")
+    if if_block_start != -1:
+        # If there's still a conditional, include_thoughts must NOT be
+        # gated by it. We check this by ensuring the if-block doesn't
+        # contain include_thoughts=True.
+        if_block_end = block.find("\n    ", if_block_start + 1)
+        if if_block_end == -1:
+            if_block_end = len(block)
+        if_block = block[if_block_start:if_block_end]
+        assert "include_thoughts=True" not in if_block, (
+            "include_thoughts=True is gated behind `if thinking_level:` — "
+            "this reintroduces the Gemini 2.5 empty-thought-trace bug."
+        )